# Preprocessing of TED CAN (contract award notices) CSV files

In [2]:
from pyspark import SparkContext, SparkConf
from pyspark.sql import SQLContext, Row
from pyspark.sql import HiveContext
import json
import time
import sys
from datetime import datetime
from pyspark.sql.types import *
import re

# Number of local worker threads Spark is started with.
nPartitions = 4

# Run Spark in local mode: the master URL is "local[<nPartitions>]".
conf = SparkConf().setMaster("local[" + str(nPartitions) + "]")

# If a SparkContext is still alive in this kernel, stop it first with sc.stop().
sc = SparkContext(conf=conf)
sqlContext = SQLContext(sc)

# Have Spark SQL read Parquet binary columns back as strings.
sqlContext.sql("SET spark.sql.parquet.binaryAsString=true")


DataFrame[key: string, value: string]

## Load country code file
Used to match country ISO codes to country full names

In [12]:
# Read the country-code file into a list of lines without trailing whitespace.
# The `with` block closes the file handle as soon as the lines are read.
with open('code/data/countrycodes.csv') as country_file:
    countryCodesRaw = [line.rstrip() for line in country_file]

In [13]:
# Build the lookup {country code: country name} from lines of the form
# "<code> - <name>".
countryCodes = {}
for entry in countryCodesRaw:
    if not entry:
        # Tolerate blank lines (e.g. an empty last line) instead of failing
        # with an IndexError on the missing name part.
        continue
    code_country = entry.split(' - ')
    countryCodes[code_country[0]] = code_country[1]
# An empty code resolves to an empty name, so rows without a country code
# can still be looked up.
countryCodes[''] = ''

## Load CPV meaning file

In [9]:
# Read the CPV meaning file into a list of lines without trailing whitespace.
# The `with` block closes the file handle as soon as the lines are read.
with open('code/data/CPVmeaning.csv') as cpv_file:
    CPVmeaningRaw = [line.rstrip() for line in cpv_file]

In [31]:
# Build the lookup {CPV code: meaning} from ';'-separated lines whose first
# column is the code and whose second column is the meaning.
CPVmeanings = {}
for entry in CPVmeaningRaw:
    if not entry:
        # Tolerate blank lines (e.g. an empty last line) instead of failing
        # with an IndexError on the missing meaning column.
        continue
    CPV_meaning = entry.split(';')
    # Only the first 8 characters of the code column are used as the key.
    CPVmeanings[CPV_meaning[0][0:8]] = CPV_meaning[1]
# An empty code resolves to an empty meaning, so rows without a CPV code
# can still be looked up.
CPVmeanings[''] = ''

## CSV row processor and schema
Convert CSV rows to nicer format and schema

In [138]:
# Date layout of the CSV date columns (e.g. '05-Jan-09') and of the output.
_TED_DATE_FORMAT = '%d-%b-%y'
_ISO_DATE_FORMAT = '%Y-%m-%d'


def _clean_text(value):
    """Strip surrounding whitespace from a text cell; a None cell stays None."""
    if value is None:
        return None
    return value.strip()


def _clean_name(value):
    """Like _clean_text, but underscores are replaced by hyphens as well."""
    if value is None:
        return None
    return value.replace("_", "-").strip()


def _to_int(value):
    """Parse a numeric cell to int, dropping any fractional part.

    Blank, whitespace-only and None cells give None.
    """
    if hasattr(value, 'strip'):
        value = value.strip()
    if value is None or value == '':
        return None
    # Go through float() so that values such as '1234.50' are accepted too.
    return int(float(value))


def _to_iso_date(value):
    """Convert a 'dd-Mon-yy' cell to 'YYYY-MM-DD'; blank or None cells give None."""
    text = _clean_text(value)
    if not text:
        return None
    return datetime.strptime(text, _TED_DATE_FORMAT).strftime(_ISO_DATE_FORMAT)


# Columns copied to the output under their own name, in output order, each
# paired with the converter applied to the raw cell.
_DETAIL_COLUMNS = (
    ('YEAR', _to_int),
    ('ID_TYPE', _clean_text),
    ('XSD_VERSION', _clean_text),
    ('CANCELLED', _clean_text),

    ('CAE_NATIONALID', _clean_text),
    ('CAE_ADDRESS', _clean_text),
    ('CAE_TOWN', _clean_text),
    ('CAE_POSTAL_CODE', _clean_text),

    ('CAE_TYPE', _clean_text),
    ('MAIN_ACTIVITY', _clean_text),
    ('B_ON_BEHALF', _clean_text),
    ('TYPE_OF_CONTRACT', _clean_text),
    ('TAL_LOCATION_NUTS', _clean_text),
    ('B_FRA_AGREEMENT', _clean_text),
    ('B_DYN_PURCH_SYST', _clean_text),
    ('ADDITIONAL_CPVS', _clean_text),
    ('B_GPA', _clean_text),
    ('VALUE_EURO_FIN_1', _to_int),
    ('VALUE_EURO_FIN_2', _to_int),
    ('TOP_TYPE', _clean_text),
    ('CRIT_CODE', _clean_text),
    ('CRIT_CRITERIA', _clean_text),
    ('CRIT_WEIGHTS', _clean_text),
    ('B_ELECTRONIC_AUCTION', _clean_text),
    ('NUMBER_AWARDS', _to_int),

    ('WIN_ADDRESS', _clean_text),
    ('WIN_TOWN', _clean_text),
    ('WIN_POSTAL_CODE', _clean_text),

    ('ID_AWARD', _clean_text),
    ('CONTRACT_NUMBER', _clean_text),
    ('LOT_NUMBER', _clean_text),
    ('TITLE', _clean_text),
    ('NUMBER_OFFERS_ELECTR', _to_int),
    ('AWARD_EST_VALUE_EURO', _to_int),
    ('AWARD_VALUE_EURO', _to_int),
    ('VALUE_EURO_FIN_1_1', _to_int),
    ('B_SUBCONTRACTED', _clean_text),
    ('B_EU_FUNDS', _clean_text),
    ('DT_AWARD', _to_iso_date),
)


def process(row):
    """Convert one raw CSV row into a cleaned 49-element tuple.

    The tuple starts with ten derived values (contracting authority country
    and name, dispatch date, CPV meaning, contractor country and name,
    contract value, number of offers, CPV code, HTML link to the notice) and
    continues with the columns listed in _DETAIL_COLUMNS, in that order.

    Text cells are stripped of surrounding whitespace, numeric cells are
    truncated to int, dates are rewritten as 'YYYY-MM-DD', and every value
    that ends up as an empty string is replaced by None.

    Args:
        row: record indexable by CSV column name (e.g. row['CPV']).

    Returns:
        Tuple of 49 values (str, int or None).

    Raises:
        KeyError: if a country code is not in countryCodes or the CPV code
            is not in CPVmeanings.
        ValueError: if a non-blank date or number cannot be parsed, or if
            DT_DISPATCH is blank.
    """
    # Rearrange the notice id: its first four characters move behind the
    # rest, joined by '-' (presumably a year prefix - verify against the data).
    notice_id = row['ID_NOTICE_CAN'].strip()
    notice_id = notice_id[4:] + '-' + notice_id[0:4]
    notice_link = ("<a href='http://ted.europa.eu/udl?uri=TED:NOTICE:"
                   + notice_id
                   + ":TEXT:EN:HTML' target='_blank'>"
                   + notice_id
                   + "</a>")

    # The dispatch date is mandatory: unlike DT_AWARD, a blank value raises.
    dispatch_date = datetime.strptime(
        row['DT_DISPATCH'].strip(), _TED_DATE_FORMAT
    ).strftime(_ISO_DATE_FORMAT)

    # Missing codes are looked up under '' (both lookup tables map '' to '').
    cpv_code = _clean_text(row['CPV']) or ''
    authority_code = _clean_text(row['ISO_COUNTRY_CODE']) or ''
    contractor_code = _clean_text(row['WIN_COUNTRY_CODE']) or ''

    headline = (
        countryCodes[authority_code],
        _clean_name(row['CAE_NAME']),
        dispatch_date,
        CPVmeanings[cpv_code],
        countryCodes[contractor_code],
        _clean_name(row['WIN_NAME']),
        _to_int(row['AWARD_VALUE_EURO']),
        _to_int(row['NUMBER_OFFERS']),
        cpv_code,
        notice_link,
    )
    details = tuple(convert(row[column]) for column, convert in _DETAIL_COLUMNS)

    # Empty strings carry no information: store them as None.
    return tuple(None if value == "" else value for value in headline + details)

In [139]:
# Schema of the tuples returned by process(), in the same order.
# process() turns blank CSV cells into None, so every column that can be blank
# is declared nullable (third argument True); declaring them non-nullable makes
# newer PySpark versions reject rows containing None. Only the dispatch date
# and the notice link are always filled in by process().
# NOTE(review): IntegerType is 32-bit; euro amounts above 2**31 - 1 would not
# fit - confirm the value range before relying on these columns.
schema = StructType([
    StructField("Contracting_Authority_Country", StringType(), True),
    StructField("Contracting_Authority_Name", StringType(), True),
    StructField("Dispatch_Date", StringType(), False),
    StructField("CPV_Code_Meaning", StringType(), True),
    StructField("Contractor_Country", StringType(), True),
    StructField("Contractor_Name", StringType(), True),
    StructField("Contract_Value_Euros", IntegerType(), True),
    StructField("Number_Offers_Received", IntegerType(), True),
    StructField("CPV_Code", StringType(), True),
    StructField("Award_Notice_Id_Link", StringType(), False),

    StructField("YEAR", IntegerType(), True),
    StructField("ID_TYPE", StringType(), True),
    StructField("XSD_VERSION", StringType(), True),
    StructField("CANCELLED", StringType(), True),

    StructField("CAE_NATIONALID", StringType(), True),
    StructField("CAE_ADDRESS", StringType(), True),
    StructField("CAE_TOWN", StringType(), True),
    StructField("CAE_POSTAL_CODE", StringType(), True),

    StructField("CAE_TYPE", StringType(), True),
    StructField("MAIN_ACTIVITY", StringType(), True),
    StructField("B_ON_BEHALF", StringType(), True),
    StructField("TYPE_OF_CONTRACT", StringType(), True),
    StructField("TAL_LOCATION_NUTS", StringType(), True),
    StructField("B_FRA_AGREEMENT", StringType(), True),
    StructField("B_DYN_PURCH_SYST", StringType(), True),
    StructField("ADDITIONAL_CPVS", StringType(), True),
    StructField("B_GPA", StringType(), True),
    StructField("VALUE_EURO_FIN_1", IntegerType(), True),
    StructField("VALUE_EURO_FIN_2", IntegerType(), True),
    StructField("TOP_TYPE", StringType(), True),
    StructField("CRIT_CODE", StringType(), True),
    StructField("CRIT_CRITERIA", StringType(), True),
    StructField("CRIT_WEIGHTS", StringType(), True),
    StructField("B_ELECTRONIC_AUCTION", StringType(), True),
    StructField("NUMBER_AWARDS", IntegerType(), True),

    StructField("WIN_ADDRESS", StringType(), True),
    StructField("WIN_TOWN", StringType(), True),
    StructField("WIN_POSTAL_CODE", StringType(), True),

    StructField("ID_AWARD", StringType(), True),
    StructField("CONTRACT_NUMBER", StringType(), True),
    StructField("LOT_NUMBER", StringType(), True),
    StructField("TITLE", StringType(), True),
    StructField("NUMBER_OFFERS_ELECTR", IntegerType(), True),
    StructField("AWARD_EST_VALUE_EURO", IntegerType(), True),
    StructField("AWARD_VALUE_EURO", IntegerType(), True),
    StructField("VALUE_EURO_FIN_1_1", IntegerType(), True),
    StructField("B_SUBCONTRACTED", StringType(), True),
    StructField("B_EU_FUNDS", StringType(), True),
    StructField("DT_AWARD", StringType(), True)
    ])


## Load CSV data, convert, and save to parquet
CSV files are in ../data

Four files need to be processed:
* TED_CAN_2006.csv
* TED_CAN_2007.csv
* TED_CAN_2008.csv
* TED_CAN_2009_2015.csv

which can be downloaded from https://data.europa.eu/euodp/en/data/dataset/ted-csv


In [140]:
# CSV file converted by the cells below; change pathData and re-run them for
# each input file.
pathData='data/TED_CAN_2009_2015.csv'
# Alternative input file:
#pathData='data/TED_CAN_2008.csv'


### Load

In [141]:
# Read pathData through the spark-csv data source, taking the column names
# from the header line and letting the reader infer the column types.
# NOTE(review): process() calls .strip() on the cells, which assumes they are
# strings - confirm that schema inference leaves these columns as strings.
# (A .repartition(2000) after load() was tried here and left disabled.)
csvfile = (
    sqlContext.read
    .format('com.databricks.spark.csv')
    .options(header='true', inferschema='true')
    .load(pathData)
)

# Make the raw rows queryable from Spark SQL under the name "csvData".
csvfile.registerTempTable("csvData")


### Process

In [142]:
# Convert every CSV row with process(), then attach column names and types.
# Mapping over .rdd is equivalent to DataFrame.map() on Spark 1.x and keeps
# working on Spark 2.x+, where DataFrame no longer has a map() method.
processedData = csvfile.rdd.map(process)
df = processedData.toDF(schema)

### Save

In [143]:
# Append to the Parquet dataset 'ted.parquet' so the rows of successive input
# files accumulate. Re-running this cell for the same input writes its rows a
# second time.
df.write.mode('append').parquet('ted.parquet')