In [1]:
import xmltodict
import pandas as pd
import os
import tarfile
import shutil

In [78]:
def servicesContractNotice(doc):
    d = dict()
    if 'EN' in doc['TECHNICAL_SECTION']['FORM_LG_LIST']:
        EN_POS = doc['TECHNICAL_SECTION']['FORM_LG_LIST'].split(' ').index('EN')
    else:
        EN_POS = 0

    d['DOC_ID'] = doc['@DOC_ID']
    
    if '@VERSION' in doc.keys():
        d['VERSION'] = doc['@VERSION']
    elif '@VERSION' in doc['FORM_SECTION']['OTH_NOT'][EN_POS].keys():
        d['VERSION'] = doc['FORM_SECTION']['OTH_NOT'][EN_POS]['@VERSION']
        
    d['EDITION'] = doc['@EDITION']
    
    # CODED_DATA_SECTION - REF_OJS
    d['COLL_OJ'] = doc['CODED_DATA_SECTION']['REF_OJS']['COLL_OJ']
    d['No_OJ'] = doc['CODED_DATA_SECTION']['REF_OJS']['NO_OJ']
    d['DATE_PUB'] = doc['CODED_DATA_SECTION']['REF_OJS']['DATE_PUB']
    
    # CODED_DATA_SECTION - NOTICE_DATA
    d['NO_DOC_OJS'] = doc['CODED_DATA_SECTION']['NOTICE_DATA']['NO_DOC_OJS']
    d['URI_LIST'] = doc['CODED_DATA_SECTION']['NOTICE_DATA']['URI_LIST']['URI_DOC'][EN_POS]['#text']
    d['LG_ORIG'] = doc['CODED_DATA_SECTION']['NOTICE_DATA']['LG_ORIG']
    d['ISO_COUNTRY'] = doc['CODED_DATA_SECTION']['NOTICE_DATA']['ISO_COUNTRY']['@VALUE']
    d['ORIGINAL_CPV_CODE'] = ';'.join(cpv['@CODE'] for cpv in doc['CODED_DATA_SECTION']['NOTICE_DATA']['ORIGINAL_CPV']) if isinstance(doc['CODED_DATA_SECTION']['NOTICE_DATA']['ORIGINAL_CPV'], list) else doc['CODED_DATA_SECTION']['NOTICE_DATA']['ORIGINAL_CPV']['@CODE']
    d['ORIGINAL_CPV'] = ';'.join(cpv['#text'] for cpv in doc['CODED_DATA_SECTION']['NOTICE_DATA']['ORIGINAL_CPV']) if isinstance(doc['CODED_DATA_SECTION']['NOTICE_DATA']['ORIGINAL_CPV'], list) else doc['CODED_DATA_SECTION']['NOTICE_DATA']['ORIGINAL_CPV']['#text']
    d['n2016-PERFORMANCE_NUTS_CODE'] = doc['CODED_DATA_SECTION']['NOTICE_DATA']['n2016:PERFORMANCE_NUTS']['@CODE'] if 'n2016:PERFORMANCE_NUTS' in doc['CODED_DATA_SECTION']['NOTICE_DATA'].keys() else ''
    d['n2016-PERFORMANCE_NUTS'] = doc['CODED_DATA_SECTION']['NOTICE_DATA']['n2016:PERFORMANCE_NUTS']['#text'] if 'n2016:PERFORMANCE_NUTS' in doc['CODED_DATA_SECTION']['NOTICE_DATA'].keys() else ''
    d['n2016-CA_CE_NUTS_CODE'] = doc['CODED_DATA_SECTION']['NOTICE_DATA']['n2016:CA_CE_NUTS']['@CODE'] if 'n2016:CA_CE_NUTS' in doc['CODED_DATA_SECTION']['NOTICE_DATA'].keys() else ''
    d['n2016-CA_CE_NUTS'] = doc['CODED_DATA_SECTION']['NOTICE_DATA']['n2016:CA_CE_NUTS']['#text'] if 'n2016:CA_CE_NUTS' in doc['CODED_DATA_SECTION']['NOTICE_DATA'].keys() else ''
    
    # CODED_DATA_SECTION - CODIF_DATA
    d['DS_DATE_DISPATCH'] = doc['CODED_DATA_SECTION']['CODIF_DATA']['DS_DATE_DISPATCH']
    d['DT_DATE_FOR_SUBMISSION'] = doc['CODED_DATA_SECTION']['CODIF_DATA']['DT_DATE_FOR_SUBMISSION']
    d['AA_AUTHORITY_TYPE'] = doc['CODED_DATA_SECTION']['CODIF_DATA']['AA_AUTHORITY_TYPE']['#text']
    d['TD_DOCUMENT_TYPE'] = doc['CODED_DATA_SECTION']['CODIF_DATA']['TD_DOCUMENT_TYPE']['#text']
    d['NC_CONTRACT_NATURE'] = doc['CODED_DATA_SECTION']['CODIF_DATA']['NC_CONTRACT_NATURE']['#text']
    d['PR_PROC'] = doc['CODED_DATA_SECTION']['CODIF_DATA']['PR_PROC']['#text']
    d['RP_REGULATION'] = doc['CODED_DATA_SECTION']['CODIF_DATA']['RP_REGULATION']['#text']
    d['TY_TYPE_BID'] = doc['CODED_DATA_SECTION']['CODIF_DATA']['TY_TYPE_BID']['#text']
    d['AC_AWARD_CRIT'] = doc['CODED_DATA_SECTION']['CODIF_DATA']['AC_AWARD_CRIT']['#text']
    d['MA_MAIN_ACTIVITIES'] = doc['CODED_DATA_SECTION']['CODIF_DATA']['MA_MAIN_ACTIVITIES']['#text'] if 'MA_MAIN_ACTIVITIES' in doc['CODED_DATA_SECTION']['CODIF_DATA'].keys() else ''
    d['HEADING'] = doc['CODED_DATA_SECTION']['CODIF_DATA']['HEADING']
    d['INITIATOR'] = doc['CODED_DATA_SECTION']['CODIF_DATA']['INITIATOR'] if 'INITIATOR' in doc['CODED_DATA_SECTION']['CODIF_DATA'].keys() else ''
    
    # TRANSLATION_SECTION - ML_TITLES
    d['TITLE'] = doc['TRANSLATION_SECTION']['ML_TITLES']['ML_TI_DOC'][EN_POS]['TI_TEXT']['P']
    d['CITY'] = doc['TRANSLATION_SECTION']['ML_TITLES']['ML_TI_DOC'][EN_POS]['TI_CY']
    d['TOWN'] = doc['TRANSLATION_SECTION']['ML_TITLES']['ML_TI_DOC'][EN_POS]['TI_TOWN']
    
    # TRANSLATION_SECTION - ML_AA_NAMES
    d['CONTRACTING AUTHORITY'] = doc['TRANSLATION_SECTION']['ML_AA_NAMES']['AA_NAME'][EN_POS]['#text']
    
    # FORM_SECTION
    # (F02_2014)
    if 'F02_2014' in doc['FORM_SECTION'].keys():
        d['FORM'] = doc['FORM_SECTION']['F02_2014'][EN_POS]['@FORM']
        d['LEGAL_BASIS'] = doc['FORM_SECTION']['F02_2014'][EN_POS]['LEGAL_BASIS']['@VALUE']
        d['CONTRACTING_BODY'] = doc['FORM_SECTION']['F02_2014'][EN_POS]['CONTRACTING_BODY']['ADDRESS_CONTRACTING_BODY']['OFFICIALNAME']
        d['CONTRACTING_BODY_ADDRESS'] = doc['FORM_SECTION']['F02_2014'][EN_POS]['CONTRACTING_BODY']['ADDRESS_CONTRACTING_BODY']['ADDRESS']
        d['CONTRACTING_BODY_TOWN'] = doc['FORM_SECTION']['F02_2014'][EN_POS]['CONTRACTING_BODY']['ADDRESS_CONTRACTING_BODY']['TOWN']
        d['CONTRACTING_BODY_POSTAL_CODE'] = doc['FORM_SECTION']['F02_2014'][EN_POS]['CONTRACTING_BODY']['ADDRESS_CONTRACTING_BODY']['POSTAL_CODE']
        d['CONTRACTING_BODY_COUONTRY'] = doc['FORM_SECTION']['F02_2014'][EN_POS]['CONTRACTING_BODY']['ADDRESS_CONTRACTING_BODY']['COUNTRY']['@VALUE']
        d['n2016-NUTS'] = doc['FORM_SECTION']['F02_2014'][EN_POS]['CONTRACTING_BODY']['ADDRESS_CONTRACTING_BODY']['n2016:NUTS']['@CODE']
        d['CA_TYPE'] = doc['FORM_SECTION']['F02_2014'][EN_POS]['CONTRACTING_BODY']['CA_TYPE']['@VALUE']
        d['CA_ACTIVITY'] = doc['FORM_SECTION']['F02_2014'][EN_POS]['CONTRACTING_BODY']['CA_ACTIVITY']['@VALUE']
        d['TITLE'] = doc['FORM_SECTION']['F02_2014'][EN_POS]['OBJECT_CONTRACT']['TITLE']['P']
        d['REFERENCE_NUMBER'] = doc['FORM_SECTION']['F02_2014'][EN_POS]['OBJECT_CONTRACT']['REFERENCE_NUMBER']
        d['CPV_MAIN'] = doc['FORM_SECTION']['F02_2014'][EN_POS]['OBJECT_CONTRACT']['CPV_MAIN']['CPV_CODE']['@CODE']
        d['TYPE_CONTRACT'] = doc['FORM_SECTION']['F02_2014'][EN_POS]['OBJECT_CONTRACT']['TYPE_CONTRACT']['@CTYPE']
        d['SHORT_DESCR'] = ' '.join(doc['FORM_SECTION']['F02_2014'][EN_POS]['OBJECT_CONTRACT']['SHORT_DESCR']['P'])
        d['NO_LOT_DIVISION'] = doc['FORM_SECTION']['F02_2014'][EN_POS]['OBJECT_CONTRACT']['NO_LOT_DIVISION']
        d['OBJECT_DESCR_CPV_ADDITIONAL'] = doc['FORM_SECTION']['F02_2014'][EN_POS]['OBJECT_CONTRACT']['OBJECT_DESCR']['CPV_ADDITIONAL']['CPV_CODE']['@CODE']
        d['OBJECT_DESCR_n2016-NUTS'] = doc['FORM_SECTION']['F02_2014'][EN_POS]['OBJECT_CONTRACT']['OBJECT_DESCR']['n2016:NUTS']['@CODE']
        d['OBJECT_DESCR_SHORT_DESCR'] = ' '.join(doc['FORM_SECTION']['F02_2014'][EN_POS]['OBJECT_CONTRACT']['OBJECT_DESCR']['SHORT_DESCR']['P'])
        d['OBJECT_DESCR_DURATION'] = doc['FORM_SECTION']['F02_2014'][EN_POS]['OBJECT_CONTRACT']['OBJECT_DESCR']['DURATION']['#text'] + ' ' + doc['FORM_SECTION']['F02_2014'][EN_POS]['OBJECT_CONTRACT']['OBJECT_DESCR']['DURATION']['@TYPE']
        d['PROCEDURE_DATE_RECEIPT_TENDERS'] = doc['FORM_SECTION']['F02_2014'][EN_POS]['PROCEDURE']['DATE_RECEIPT_TENDERS']
        d['PROCEDURE_TIME_RECEIPT_TENDERS'] = doc['FORM_SECTION']['F02_2014'][EN_POS]['PROCEDURE']['TIME_RECEIPT_TENDERS']
        d['PROCEDURE_DURATION_TENDER_VALID'] = doc['FORM_SECTION']['F02_2014'][EN_POS]['PROCEDURE']['DURATION_TENDER_VALID']['#text'] + ' ' + doc['FORM_SECTION']['F02_2014'][EN_POS]['PROCEDURE']['DURATION_TENDER_VALID']['@TYPE']
        d['PROCEDURE_DATE_OPENING_TENDERS'] = doc['FORM_SECTION']['F02_2014'][EN_POS]['PROCEDURE']['OPENING_CONDITION']['DATE_OPENING_TENDERS']
        d['PROCEDURE_TIME_OPENING_TENDERS'] = doc['FORM_SECTION']['F02_2014'][EN_POS]['PROCEDURE']['OPENING_CONDITION']['TIME_OPENING_TENDERS']
        d['DATE_DISPATCH_NOTICE'] = doc['FORM_SECTION']['F02_2014'][EN_POS]['COMPLEMENTARY_INFO']['DATE_DISPATCH_NOTICE']
    # FORM_SECTION
    # (OTH_NOT)
    elif 'OTH_NOT' in doc['FORM_SECTION'].keys():
        for content in doc['FORM_SECTION']['OTH_NOT'][EN_POS]['FD_OTH_NOT']['CONTENTS']['GR_SEQ']:
            for each in content['BLK_BTX_SEQ']['MARK_LIST']['MLI_OCCUR']:
                if each['TI_MARK'] == 'Contracting Authority':
                    text = []
                    for element in each['TXT_MARK']['P']:
                        if isinstance(element, dict):
                            for k, val in element.items():
                                if isinstance(val, dict):
                                    for v in list(val.values()):
                                        if isinstance(v, list):
                                            text.append(''.join(v))
                                        else:
                                            text.append(v)
                                else:
                                    text.append(v)
                        elif isinstance(element, list):
                            for l in element:
                                text.append(l)
                        else:
                            text.append(element)
                    d[each['TI_MARK']] = ''.join(text)
                else:
                    d[each['TI_MARK']] = ''.join(each['TXT_MARK']['P'])

    return d

In [79]:
# RUNNER >>
directory = 'FTP_Data/2019/2019-01/20190102_001/'
df = pd.DataFrame()
L = []
for filename in os.listdir(directory):
    if filename.endswith(".xml"):
        print(filename)
        with open(os.path.join(directory, filename), encoding='utf8') as f:
            file = xmltodict.parse(f.read())['TED_EXPORT']
            if file['CODED_DATA_SECTION']['CODIF_DATA']['NC_CONTRACT_NATURE']['#text'] == 'Services' and file['CODED_DATA_SECTION']['CODIF_DATA']['TD_DOCUMENT_TYPE']['#text'] == 'Contract notice':
                print('-->> Services Contract notice')
                L.append(servicesContractNotice(file))
                df1 = pd.DataFrame(L).set_index('DOC_ID')
                df = df.append(df1, sort=True)
            else:
                pass

000001_2019.xml
-->> Services Contract notice
000002_2019.xml
000003_2019.xml
000004_2019.xml
000005_2019.xml
000006_2019.xml
000007_2019.xml
000008_2019.xml
000009_2019.xml
000010_2019.xml
000011_2019.xml
000012_2019.xml
000013_2019.xml
000014_2019.xml
000015_2019.xml
-->> Services Contract notice
000016_2019.xml
000017_2019.xml
000018_2019.xml
000019_2019.xml
000020_2019.xml
000021_2019.xml
-->> Services Contract notice


KeyError: 'CPV_ADDITIONAL'