In [2]:
import numpy as np
import pandas as pd
from pandas.core.dtypes.common import pandas_dtype as dtype

import dask.bag as db

import json

# Reading in and exploring Digiwhist data (available for download from https://opentender.eu/at/download)
# Downloaded NDJSON version of All data, January 2023

In [154]:
#read all ndjson files to dask bag

bag = db.read_text('C:/Hertie/Thesis/data-all-ndjson/data-all-*.ndjson', blocksize="200MiB").map(json.loads)

In [202]:
bag.take(1)

({'created': '2020-09-18T07:45:00.725379',
  'modified': '2021-03-05T23:05:24.27887',
  'metaData': {'opentender': True},
  'persistentId': 'RO_45fb967156738c42b1adfc64e6f7fc6babdc7314ce8f86c630f1af7db2509b96_8368',
  'processingOrder': '2019-11-18 13:32:47.55390400',
  'title': 'TUB NEON FLUORESCENT',
  'cpvs': [{'code': '31532910-6', 'isMain': True}],
  'indicators': [{'type': 'INTEGRITY_SINGLE_BID',
    'status': 'INSUFFICIENT_DATA'},
   {'type': 'ADMINISTRATIVE_CENTRALIZED_PROCUREMENT',
    'status': 'INSUFFICIENT_DATA'},
   {'type': 'INTEGRITY_ADVERTISEMENT_PERIOD',
    'status': 'CALCULATED',
    'value': 100},
   {'type': 'INTEGRITY_DECISION_PERIOD', 'status': 'CALCULATED', 'value': 50},
   {'type': 'ADMINISTRATIVE_COVERED_BY_GPA', 'status': 'INSUFFICIENT_DATA'},
   {'type': 'ADMINISTRATIVE_ELECTRONIC_AUCTION',
    'status': 'INSUFFICIENT_DATA'},
   {'type': 'ADMINISTRATIVE_FRAMEWORK_AGREEMENT',
    'status': 'INSUFFICIENT_DATA'},
   {'type': 'INTEGRITY_NEW_COMPANY', 'status': '

In [248]:
bag.filter(lambda record: record['cpvs'][0]['code'].startswith("48")).take(1) 


({'created': '2020-09-18T04:11:34.408506',
  'modified': '2021-03-05T21:08:14.924809',
  'metaData': {'opentender': True},
  'persistentId': 'RO_d0818d42bfaa20a7902386f22a16f6a8b7dd9c203c0b5ea3e5f84df4d296716b_9916',
  'processingOrder': '2019-11-18 13:32:31.38997400',
  'title': 'SWITCH 8 PORTURI',
  'cpvs': [{'code': '48219500-1', 'isMain': True}],
  'indicators': [{'type': 'INTEGRITY_SINGLE_BID',
    'status': 'INSUFFICIENT_DATA'},
   {'type': 'ADMINISTRATIVE_CENTRALIZED_PROCUREMENT',
    'status': 'INSUFFICIENT_DATA'},
   {'type': 'INTEGRITY_ADVERTISEMENT_PERIOD',
    'status': 'CALCULATED',
    'value': 100},
   {'type': 'INTEGRITY_DECISION_PERIOD', 'status': 'CALCULATED', 'value': 50},
   {'type': 'ADMINISTRATIVE_COVERED_BY_GPA', 'status': 'INSUFFICIENT_DATA'},
   {'type': 'ADMINISTRATIVE_ELECTRONIC_AUCTION',
    'status': 'INSUFFICIENT_DATA'},
   {'type': 'ADMINISTRATIVE_FRAMEWORK_AGREEMENT',
    'status': 'INSUFFICIENT_DATA'},
   {'type': 'INTEGRITY_NEW_COMPANY', 'status': 'INS

In [155]:
#If using exeption handling, entire rows are passed when a single missing key is encountered.
#This is not the desired behaviour, so I will use the get method instead.

def flatten(record):
    return {
            'persistend_id' : record.get('persistentId', None),    
            'title': record.get('title', None),
            'title_english': record.get('titleEnglish', None),
            'description': record.get('description', None),
            'cpv' : [list.get("code", None) for list in record.get("cpvs", {})],
            'country' : record.get('country', None),
            'price' : record.get('digiwhistPrice', {}).get('netAmountEur', None),
            'cpv_2' : record.get('ot', {}).get('cpv', None),
            'date' : record.get('ot', {}).get('date', None),
            'procedure' : record.get('procedureType', None),
            'size' : record.get('size', None),
            'buyer_type' : [list.get("buyerType", None) for list in record.get("buyers", {})],                                  
            'buyer_activity' : [x for list in record.get("buyers", {}) for x in list.get("mainActivities", {})],
        }

DTYPES = {
     'persistend_id': dtype('O'),
     'title': dtype('string'),
     'title_english': dtype('string'),
     'description': dtype('string'),
     'cpv': dtype('O'),
     'country': dtype('string'),
     'price': dtype('float'),
     'cpv_2' : dtype('O'),
     'date' : dtype('datetime64[ns]'),
     'size' : dtype('O'),
     'procedure' : dtype('string'),
     'size' : dtype('string'),
     'buyer_type' : dtype('string'),
     'buyer_activity' : dtype('O') 
}  

# Filtering digiwhist data:
# 1. By column cpv_2
# 2. By column cpv

In [156]:
# 1. By column cpv_2

#filter rows that have None in cpv_2 column
#df_cpv_code = df_cpv_code[df_cpv_code['cpv_2'].notnull()]

df_1 = (bag.map(flatten)
         .filter(lambda x: x['cpv_2'] is not None)
         .filter(lambda x: x['cpv_2'].startswith('48') or x['cpv_2'].startswith('72'))
         .to_dataframe(meta=DTYPES)
         .compute()
          )


In [221]:
df_1.tail(10)

Unnamed: 0,persistend_id,title,title_english,description,cpv,country,price,cpv_2,date,size,procedure,buyer_type,buyer_activity
984,PL_560656f510e56287c76098ff594b4795ab0c1b4858f...,Dostawa i zainstalowanie sprzętu komputerowego,,Przedmiotem zamówienia jest dostawa i zainstal...,"[302620003, 302332202]",PL,42795.0,48822000,NaT,BELOW_THE_THRESHOLD,APPROACHING_BIDDERS,['OTHER'],[]
985,PL_3c40be92c7870f6fc13ba1cad63b6f1d59d6f9c41ad...,Dostawa sprzętu komputerowego i oprogramowania...,,Przedmiotem zamówienia jest dostawa sprzętu i ...,"[302620003, 302311008, 302410000]",PL,71865.0,48822000,NaT,BELOW_THE_THRESHOLD,APPROACHING_BIDDERS,['REGIONAL_AUTHORITY'],[]
986,HU_9bf803a6f9ceda65dada21f7e8bf0d643d2ebc14b6e...,Az MNB-ben jelenleg működő SAP R/3 4.6C verzió...,,,[722000007],HU,,72200000,NaT,,OTHER,['OTHER'],"[OTHER, ECONOMIC_AND_FINANCIAL_AFFAIRS, WATER]"
987,HU_d64989fe15ac6cb9d827a0130420f4e06e9c5219f77...,Levéltári dokumentumok (minisztertanácsi jegyz...,,,[72313000-2],HU,,72313000,NaT,,OTHER,['OTHER'],"[OTHER, WATER]"
988,PL_98644105c0eacd162932772cd194bf48fde35d4f7ba...,Dostawa licencji oprogramowania Systemu Biblio...,,Rozszerzenie licencji na potrzeby istniejącej ...,[481600007],PL,135736.0,48160000,NaT,BELOW_THE_THRESHOLD,NEGOTIATED_WITHOUT_PUBLICATION,['OTHER'],[]
989,PL_98b7705ffae33e4c886b1fe20e8f8559451fd32c1dc...,Postępowanie o udzielenie zamówienia publiczne...,,Dostawa i instalacja systemu wspomagania organ...,"[489000007, 453143200, 453153001, 480000008]",PL,155409.0,48900000,NaT,BELOW_THE_THRESHOLD,OPEN,['OTHER'],[]
990,PL_7cf28203824013eda502c62f77334d8ede73501c7fd...,Dostawa i montaż dwóch ekranów LED (Pawilon A-...,,dostawa i montaż dwóch ekranów LED (Pawilon A-...,[302543100],PL,,48813100,NaT,BELOW_THE_THRESHOLD,OPEN,['PUBLIC_BODY'],[]
991,PL_66518c26f5b0c019627f386aaa298c441a050eae8ec...,Zakup aktualizacji na 12 miesięcy 2009 roku na...,,Zakup aktualizacji na 12 miesięcy 2009 roku na...,[722680001],PL,126048.0,72268000,NaT,BELOW_THE_THRESHOLD,NEGOTIATED_WITHOUT_PUBLICATION,['REGIONAL_AUTHORITY'],[]
992,PL_732ecd399e28302d55b25c780a74e130c7c59a2058f...,"ZAKUP OPROGRAMOWANIA, AKCESORIÓW KOMPUTEROWYCH...",,"oprogramowanie, akceria komputerowe, zestawy k...","[487000000, 302131006, 302313100, 302321201, 3...",PL,44002.0,48700000,NaT,BELOW_THE_THRESHOLD,OPEN,['OTHER'],[]
993,PL_9a85a764caf3346b51c19e779861532d2acd4b8da17...,Rozszerzenie oprogramowania INFOMEDICA o kolej...,,Przedmiotem umowy jest: a) dostawa Oprogramowa...,[302494000],PL,131147.54,48442000,NaT,BELOW_THE_THRESHOLD,NEGOTIATED_WITHOUT_PUBLICATION,['PUBLIC_BODY'],[]


In [158]:
len(df_1)

158105

In [224]:
# 2. By column cpv
# @Note: Column cpv is a list of cpv codes. To filter on the list, I will create a list of all cpv codes that start with 48 or 72 and then filter on that list.

# read only value on the first position of data/cpv-codes.csv and save as list
cpv_codes = []
with open('../data/cpv-codes.csv', 'r') as f:
    next(f)
    for line in f:
        cpv = line.split(',')[0]
        cpv_codes.append(cpv)

# for cpv in cpv_codes: 
# remove dash and append to list

cpv_codes_nodash = [cpv.replace('-', '') for cpv in cpv_codes]

# for cpv code in cpv_codes:
# remove dash and everything after it and append to list

cpv_codes_short = [cpv.split('-')[0] for cpv in cpv_codes]

# append all lists to one list

cpv_codes_all = cpv_codes + cpv_codes_nodash + cpv_codes_short

# remove duplicates from cpv_codes_all

cpv_codes_all = list(dict.fromkeys(cpv_codes_all))

len(cpv_codes_all)

In [239]:
df_2 = (bag.map(flatten)
           .filter(lambda x: x['cpv'] is not None)
           .filter(lambda x: np.any(np.in1d(x["cpv"], cpv_codes)))
           .to_dataframe(meta=DTYPES)
           .compute()
        )

In [240]:
df_2.tail()

Unnamed: 0,persistend_id,title,title_english,description,cpv,country,price,cpv_2,date,size,procedure,buyer_type,buyer_activity
144,EU_d0bba8b9a09b01ff6c93d57a445c18149b4931e8a69...,,,,[72000000],FR,1662780.0,72000000,NaT,ABOVE_THE_THRESHOLD,OPEN,['NATIONAL_AUTHORITY'],[OTHER]
145,EU_a0609a26734d15f24b93d033a0d7c2a19446623ce07...,,,,[72000000],FR,2327413.0,72000000,NaT,ABOVE_THE_THRESHOLD,OPEN,['NATIONAL_AUTHORITY'],"[OTHER, GENERAL_PUBLIC_SERVICES]"
146,EU_a0609a26734d15f24b93d033a0d7c2a19446623ce07...,,,,"[72000000, 72521100]",ES,46599940.0,72000000,NaT,ABOVE_THE_THRESHOLD,OPEN,['REGIONAL_AUTHORITY'],"[HEALTH, GENERAL_PUBLIC_SERVICES]"
147,EU_a0609a26734d15f24b93d033a0d7c2a19446623ce07...,,,,"[30200000, 30241000, 50312000, 50312500, 72000...",FR,54300000.0,30200000,NaT,ABOVE_THE_THRESHOLD,OPEN,['NATIONAL_AUTHORITY'],[GENERAL_PUBLIC_SERVICES]
148,HU_d64989fe15ac6cb9d827a0130420f4e06e9c5219f77...,Levéltári dokumentumok (minisztertanácsi jegyz...,,,[72313000-2],HU,,72313000,NaT,,OTHER,['OTHER'],"[OTHER, WATER]"


In [241]:
# no of rows df_2
len(df_2)

107541

In [242]:
# concat rows from df_2 to df_1 and keep only unique rows based on persistent_id, title and description
df = pd.concat([df_1, df_2], ignore_index=True)
df.head()


Unnamed: 0,persistend_id,title,title_english,description,cpv,country,price,cpv_2,date,size,procedure,buyer_type,buyer_activity
0,RO_d0818d42bfaa20a7902386f22a16f6a8b7dd9c203c0...,SWITCH 8 PORTURI,,,[48219500-1],RO,218.24,48219500,2009-05-29,BELOW_THE_THRESHOLD,OUTRIGHT_AWARD,[None],[]
1,RO_beff4a52b9c1e030c5b893d11957ffe7e2414238658...,Achizitie soft,,,[48517000-5],RO,1019.2,48517000,2009-05-22,BELOW_THE_THRESHOLD,OUTRIGHT_AWARD,[None],[]
2,RO_37cdc8af7044078a9b7282f60df5baf17432c0c2fc1...,Bitdefender for File Server 2010,,,[48761000-0],RO,137.8,48761000,2009-11-23,BELOW_THE_THRESHOLD,OUTRIGHT_AWARD,['REGIONAL_AUTHORITY'],[EDUCATION]
3,RO_87fac2096a66de8fead815de15c90f3bbb9985753fb...,Office Standard 2007 Academic,,,[48900000-7],RO,,48900000,2009-11-25,,OUTRIGHT_AWARD,[None],[]
4,RO_9a2b37224f3370dcc972a4f2a969a2c9dfaa8686178...,Software antivirus Kaspersky Internet Security,,,[48761000-0],RO,,48761000,2009-10-22,,OUTRIGHT_AWARD,[None],[]


In [243]:
# drop duplicates based on persistent_id, title, description, country, price, cpv_2, date, size, procedure
df = df.drop_duplicates(subset=['persistend_id', 'title', 'description', 'country', 'price', 'cpv_2', 'date', 'size', 'procedure'], keep='first')

In [249]:
#save df_1 to csv in data folder as digiwhist_1.csv
df_1.to_csv('../data/digiwhist_1.csv', index=False)

#save df_2 to csv in data folder as digiwhist_2.csv
df_2.to_csv('../data/digiwhist_2.csv', index=False)

# save df to csv in data folder as digiwhist.csv
df.to_csv('../data/digiwhist.csv', index=False)

# Exploring Digiwhist df

In [250]:
# open digiwhist.csv and save as dataframe
df = pd.read_csv('../data/digiwhist.csv')

In [251]:
len(df)

168110

In [179]:
df.head()

Unnamed: 0,persistend_id,title,title_english,description,cpv,country,price,cpv_2,date,size,procedure,buyer_type,buyer_activity
0,RO_d0818d42bfaa20a7902386f22a16f6a8b7dd9c203c0...,SWITCH 8 PORTURI,,,['48219500-1'],RO,218.24,48219500.0,2009-05-29,BELOW_THE_THRESHOLD,OUTRIGHT_AWARD,[None],[]
1,RO_beff4a52b9c1e030c5b893d11957ffe7e2414238658...,Achizitie soft,,,['48517000-5'],RO,1019.2,48517000.0,2009-05-22,BELOW_THE_THRESHOLD,OUTRIGHT_AWARD,[None],[]
2,RO_37cdc8af7044078a9b7282f60df5baf17432c0c2fc1...,Bitdefender for File Server 2010,,,['48761000-0'],RO,137.8,48761000.0,2009-11-23,BELOW_THE_THRESHOLD,OUTRIGHT_AWARD,['REGIONAL_AUTHORITY'],['EDUCATION']
3,RO_87fac2096a66de8fead815de15c90f3bbb9985753fb...,Office Standard 2007 Academic,,,['48900000-7'],RO,,48900000.0,2009-11-25,,OUTRIGHT_AWARD,[None],[]
4,RO_9a2b37224f3370dcc972a4f2a969a2c9dfaa8686178...,Software antivirus Kaspersky Internet Security,,,['48761000-0'],RO,,48761000.0,2009-10-22,,OUTRIGHT_AWARD,[None],[]


In [181]:
#group by country and frequency
df.groupby('country').size().sort_values(ascending=False)

country
RO    67558
PL    17543
ES    15080
FR    13836
DE     5309
UK     3839
NO     3601
CZ     3521
NL     2563
HU     2528
IE     2523
AT     2313
LT     2191
IT     2127
EE     1771
HR     1677
SE     1637
BE     1467
LV     1464
CH     1307
PT     1182
GE     1170
FI     1136
SI     1042
BG     1038
SK     1000
DK      705
GR      609
LU      175
CY      106
MT       80
MK       73
IS       35
MD       12
LI        5
RS        2
GI        2
TR        2
MG        1
NP        1
KE        1
IN        1
SD        1
ET        1
GF        1
JP        1
dtype: int64

In [253]:
# table that shows a count of rows per country in rows and year from date in columns
df['year'] = pd.DatetimeIndex(df['date']).year
df.groupby(['country', 'year']).size().unstack(fill_value=0)

year,2009.0,2010.0,2011.0,2012.0,2013.0,2014.0,2015.0,2016.0,2017.0,2018.0,2019.0,2020.0,2021.0,2022.0
country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
AT,34,22,29,15,27,25,33,30,30,37,276,464,730,507
BE,122,105,112,102,74,85,88,85,88,98,93,89,128,87
BG,72,51,41,48,61,59,55,43,65,63,78,94,140,96
CH,53,46,39,58,55,39,113,76,99,119,111,148,203,125
CY,11,8,3,6,7,2,4,5,5,8,9,6,10,8
CZ,226,225,187,334,398,293,285,184,128,205,213,189,196,121
DE,282,306,235,229,233,224,280,346,323,415,431,539,599,432
DK,43,63,47,46,57,50,61,39,44,41,52,49,42,24
EE,143,146,104,110,116,122,129,140,109,96,85,86,115,57
ES,458,450,193,280,320,357,473,624,899,1327,2270,2357,2872,1860


In [254]:
#change description column of df_cpv_code to lower case
df['description'] = df['description'].str.lower()

In [255]:
saas = df[df['description'].str.contains('saas') | df['description'].str.contains('software as a service')]

In [257]:
# table that shows a count of rows per country in rows and year from date in columns
saas.groupby(['country', 'year']).size().unstack(fill_value=0)

year,2009.0,2011.0,2012.0,2013.0,2014.0,2015.0,2016.0,2017.0,2018.0,2019.0,2020.0,2021.0,2022.0
country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
AT,0,0,0,0,0,0,0,0,0,0,2,1,1
BE,0,1,1,4,1,0,1,1,1,1,4,6,2
CH,0,0,0,0,1,3,0,1,2,1,4,5,0
CZ,0,0,0,0,1,0,0,0,0,2,0,1,0
DE,0,0,1,0,1,2,2,3,4,5,7,13,7
DK,0,0,0,1,0,0,3,4,2,2,0,1,3
EE,1,2,1,1,1,0,1,0,0,1,3,1,0
ES,0,0,0,0,1,0,2,0,0,1,2,2,4
FI,0,2,1,3,2,3,4,3,12,12,17,22,9
FR,0,5,5,4,6,9,5,13,15,19,15,13,17
