In [1]:
import pandas as pd
import numpy as np
from helper.missing_analysis import missing_values_table

# Missing data analysis.

This notebook explores missing data at the tender level.

In [2]:
# import cleaned data
df = pd.read_csv('../data/cleaned_recoded_tender_data.csv')
df.head()

Unnamed: 0.1,Unnamed: 0,id,year,title,cpv,contract_type,description,country,buyer_name,buyer_address,buyer_town,contracting_authority_type,contracting_authority_activity,government_procurement_agreement,is_framework_agreement,is_dynamic,is_open,value_eur,saas
0,0,1-2021,2020,cyber threat intelligence infrastructure – des...,['72000000'],Services,enisa seeks to contract service providers for ...,Greece,European Union Agency for Cybersecurity,1 Vasilissis Sofias Street,Maroussi,Other,Other,No,Yes,No,Yes,3150000.0,No
1,1,1000-2015,2014,ds-rempart.,['72910000'],Services,maîtrise d'oeuvre du secours informatique de l...,France,Ministère de l'économie,Secrétariat général – SEP 1a – 18 avenue Léon ...,Paris Cedex 20,Ministry or any other national or federal auth...,General public services,Yes,No,No,Yes,2205140.0,No
2,2,100000-2015,2015,"service de téléphonie filaire, de télécommunic...","['64210000', '64212000', '64221000', '72400000']",Services,"services de téléphonie filaire, de télécommuni...",France,Communauté d'agglomération du Bassin d'Arcacho...,"2 allée d'Espagne, BP 147",Arcachon Cedex,Regional or local authority,Environment,Yes,Yes,No,Yes,,No
3,3,100004-2015,2015,,"['48810000', '48000000']",Services,les prestations objet de la consultation conce...,France,Cerema,Bât. 4 — 25 avenue François Mitterrand — CS 92...,Bron Cedex,Body governed by public law,Environment,Yes,No,No,Yes,450000.0,No
4,4,100004-2018,2018,framework agreement for microsoft licences.,['48000000'],Supplies,"sund, øygarden og fjell kommune hereby invites...",Norway,Sund kommune,Sund Senter,Skogsvåg,Regional or local authority,General public services,Yes,Yes,No,Yes,1227646.6,No


In [3]:
# save sum missing values of contracting authority type and activity as a variable

sum_missing_contracting_authority_type_pre = df["contracting_authority_type"].isnull().sum()
sum_missing_contracting_authority_activity_pre = df["contracting_authority_activity"].isnull().sum()


In [4]:
# create a nested dictionary called buyers from df with buyer_name as main key, and address = buyer_address, 
# town = buyer_town, type = contracting_authority_type as value, activity = contracting_authority_activity as subkeys.

print("Creating a dictionary of contracting authorities...")

buyers = {}
for index, row in df.iterrows():
    if row["buyer_name"] not in buyers:
        buyers[row["buyer_name"]] = {"address": row["buyer_address"], "town": row["buyer_town"], "type": row["contracting_authority_type"], "activity": row["contracting_authority_activity"]}
    else:
        continue

Creating a dictionary of contracting authorities...


In [5]:
# function to replace missing data for contracting authority.
# if contracting authority type is missing, check if buyer_name is in buyers dictionary. If it is, copy the
# value of "type" from the dictionary to the df. if contracting authority activity is missing, check if buyer_name is in buyers dictionary. If it is, copy the
# value of "activity" from the dictionary to the df.

def fill_missing_contracting_authority_values(row):

    if pd.isnull(row["contracting_authority_type"]):
        if row["buyer_name"] in buyers:
            row["contracting_authority_type"] = buyers[row["buyer_name"]]["type"]
        else:
            pass
    else:
        pass
    
    if pd.isnull(row["contracting_authority_activity"]):
        if row["buyer_name"] in buyers:
            row["contracting_authority_activity"] = buyers[row["buyer_name"]]["activity"]
        else:
            pass
    else:
        pass
    
    return row

print("Filling missing values in contracting authority type and activity...\n")

# apply function to df
df = df.apply(fill_missing_contracting_authority_values, axis=1)

Filling missing values in contracting authority type and activity...



In [6]:
sum_missing_contracting_authority_type_post = df["contracting_authority_type"].isnull().sum()
sum_missing_contracting_authority_activity_post = df["contracting_authority_activity"].isnull().sum()

In [7]:
# print difference in number of missing values before and after function

print("Values filled for contracting authority type: ", sum_missing_contracting_authority_type_pre - sum_missing_contracting_authority_type_post)
print("Values filled for contracting authority activity: ", sum_missing_contracting_authority_activity_pre - sum_missing_contracting_authority_activity_post)



Values filled for contracting authority type:  20598
Values filled for contracting authority activity:  21214


In [3]:
missing = missing_values_table(df)
missing

Your selected dataframe has 19 columns.
There are 12 columns that have missing values.


Unnamed: 0,Missing Values,% of Total Values
value_eur,137125,46.5
contracting_authority_type,51306,17.4
is_framework_agreement,48702,16.5
is_dynamic,48702,16.5
is_open,46495,15.8
government_procurement_agreement,44262,15.0
contracting_authority_activity,31293,10.6
buyer_address,5796,2.0
title,1463,0.5
description,636,0.2


In [10]:
# print number of rows that have no missing in any of these columns: value_eur, is_open, government_procurement_agreement
# is_framework_agreement, is_dynamic, contracting_authority_type, contracting_authority_activity, 
# contract_type

print("Number of rows with no missing values in any of key columns: ", df[df["value_eur"].notnull() & df["is_open"].notnull() & df["government_procurement_agreement"].notnull() & df["is_framework_agreement"].notnull() & df["is_dynamic"].notnull() & df["contracting_authority_type"].notnull() & df["contracting_authority_activity"].notnull() & df["contract_type"].notnull()].shape[0])
print("Number of rows with no missing values in any of key columns, but tender value: ", df[df["is_open"].notnull() & df["government_procurement_agreement"].notnull() & df["is_framework_agreement"].notnull() & df["is_dynamic"].notnull() & df["contracting_authority_type"].notnull() & df["contracting_authority_activity"].notnull() & df["contract_type"].notnull()].shape[0])
print("Total number of rows: ", len(df))

Number of rows with no missing values in any of key columns:  133671
Number of rows with no missing values in any of key columns, but tender value:  210306
Total number of rows:  296894


# Missing data mechanism

Testing correlations between variables and missingness.

In [11]:
# create a new dataframe where for every column with missing values there is a variable with _missing extension,
# where value is 1 is value is missing, 0 if not.

df_missing = df.copy()

# removing unnecessary values

df_missing = df_missing[df_missing["country"].isin(df_missing["country"].value_counts()[df_missing["country"].value_counts() > 500].index)]
#remove year 2013
df_missing = df_missing[df_missing["year"] != 2013]

# from df_missing, drop id, title, cpv, contract_type, description, buyer_name, buyer_address, buyer_town,
# contracting_authority_type, contracting_authority_activity, is_open, is_dynamic, is_framework_agreement

for col in df_missing.columns:
    if df_missing[col].isnull().sum() > 0 and col not in ["title", "cpv", "description", "buyer_name", "buyer_address", "buyer_town"]:
        df_missing[col + "_missing"] = df_missing[col].isnull().astype(int)
    else:
        continue

df_missing = df_missing.drop(["id", "title", "cpv", "description", "buyer_name", "buyer_address", "buyer_town"], axis=1)


# create dummy variables from year, country, saas

df_missing = pd.get_dummies(df_missing, columns=["year", "country", "saas"])


### ADDED FOR SUPER LONG
##df_missing = pd.get_dummies(df_missing, columns=["contract_type", "contracting_authority_type", "contracting_authority_activity", "is_open", "is_dynamic", "is_framework_agreement", "government_procurement_agreement"])

In [12]:
# correlate all variables in df_missing that end with _missing with
# all columns that start with year, country, saas
# save in dataframe called "correlation_matrix"
# save correlation coefficients in column with subscript _r, p value in column with subscript _p

from scipy.stats import pearsonr

correlation_matrix = pd.DataFrame(columns=["contract_type_missing_r"])
for col in df_missing.columns:
    if col.endswith("_missing"):
        for col2 in df_missing.columns:
            if col2.startswith("year") or col2.startswith("country") or col2.startswith("saas"):
                
                p_value = pearsonr(df_missing[col2], df_missing[col])[1]         
               
                correlation_matrix.loc[col2, col + "_r"] = round(pearsonr(df_missing[col2], df_missing[col])[0], 2)
            else:
                continue
    else:
        continue

# make column in correlation_matrix with p value less than 0.05 bold

correlation_matrix


Unnamed: 0,contract_type_missing_r,contracting_authority_type_missing_r,contracting_authority_activity_missing_r,government_procurement_agreement_missing_r,is_framework_agreement_missing_r,is_dynamic_missing_r,is_open_missing_r,value_eur_missing_r
year_2014,0.02,-0.01,0.01,0.04,-0.06,-0.06,-0.03,0.03
year_2015,0.02,-0.0,0.02,0.05,-0.07,-0.07,-0.03,0.03
year_2016,0.01,-0.01,-0.0,0.02,-0.04,-0.04,-0.02,0.01
year_2017,-0.0,0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0
year_2018,-0.0,0.01,0.0,0.0,0.03,0.03,0.02,0.02
year_2019,-0.01,0.01,-0.0,-0.02,0.04,0.04,0.02,-0.0
year_2020,-0.01,0.01,0.01,0.0,0.06,0.06,0.04,0.01
year_2021,-0.01,0.01,-0.0,-0.02,0.03,0.03,0.01,-0.02
year_2022,-0.01,-0.01,-0.03,-0.05,-0.0,-0.0,-0.02,-0.05
country_Austria,-0.0,0.02,-0.01,-0.02,-0.0,-0.0,-0.01,0.01


In [13]:
correlation_matrix.to_csv("../outputs/correlation_matrix_wmissing.csv")