# **Title**

## **Overview**

## **Data Description**

## **Initial Assumptions and Predictions About the Data**

## **Exploratory Data Analysis**

### **Data Cleaning**

In [256]:
# importing the relevant libraries

# cleaning libraries
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from thefuzz import process
import pycountry_convert as pc

# visualization libraries
import matplotlib.pyplot as plt
from matplotlib.colors import LinearSegmentedColormap
import seaborn as sns
from tabulate import tabulate

# for clustering

# for hypothesis test

In [258]:
# Read every raw CSV once and keep an untouched "<name>_og" copy beside each working frame.

# indicator files that share one layout (the first four lines of each file are skipped)
data_names = [
    'inflation',
    'gdp',
    'gdp_growth',
    'exports',
    'imports',
    'current_account',
    'population',
    'unemployment',
    'fdi',
]

# bind each indicator frame, plus a pristine copy of it, to a module-level name
for name in data_names:
    frame = pd.read_csv(f"{name}.csv", skiprows=4)
    globals().update({name: frame, f"{name}_og": frame.copy()})

# HDI table and its backup
hdi = pd.read_csv("hdi.csv")
hdi_og = hdi.copy()

# COVID table and its backup
covid = pd.read_csv("covid.csv")
covid_og = covid.copy()

TypeError: read_csv() got an unexpected keyword argument 'float_format'

### **a) Economic Performance Indicators**

In [206]:
# For every indicator table, add "<name>_pre_covid" / "<name>_post_covid" row means
# and keep only the rows with enough post-period data.
for name in data_names:
    # Start from the untouched "<name>_og" copy made at load time rather than from the
    # working frame: this makes the cell idempotent. Re-running it on an already
    # processed frame would drop one of the derived columns and shift every
    # positional slice below.
    raw = globals()[f"{name}_og"]

    # Step 1: remove the last column
    # NOTE(review): presumably an empty trailing column in the CSV export — confirm.
    yearly = raw.iloc[:, :-1]

    # Step 2: row mean over column index 4 up to (but excluding) the last four columns
    mean_pre_covid = yearly.iloc[:, 4:-4].mean(axis=1)

    # Step 3: row mean over the last four columns
    # NOTE(review): assumed to be the most recent four years — confirm against the file layout.
    post_covid = yearly.iloc[:, -4:]
    mean_post_covid = post_covid.mean(axis=1)

    # Keep rows with at most two missing values in the post-period columns.
    # .copy() gives an independent frame so the column assignments below do not
    # write into a slice of `yearly` (avoids SettingWithCopyWarning).
    enough_post_data = post_covid.isnull().sum(axis=1) <= 2
    indicator = yearly.loc[enough_post_data].copy()

    # The means are aligned on the row index, so dropped rows are simply ignored
    indicator[f"{name}_pre_covid"] = mean_pre_covid
    indicator[f"{name}_post_covid"] = mean_post_covid

    # Save back the processed DataFrame under the indicator's name
    globals()[name] = indicator

In [207]:
# Combine the per-indicator averages into one wide table keyed by country code.
first_name = data_names[0]
econ_indicators = globals()[first_name][
    ["Country Code", "Country Name", f"{first_name}_pre_covid", f"{first_name}_post_covid"]
]

for name in data_names[1:]:
    indicator = globals()[name][["Country Code", f"{name}_pre_covid", f"{name}_post_covid"]]

    # inner join: only countries present in every indicator table survive
    econ_indicators = econ_indicators.merge(indicator, on="Country Code", how="inner")

econ_indicators

Unnamed: 0,Country Code,Country Name,inflation_pre_covid,inflation_post_covid,gdp_pre_covid,gdp_post_covid,gdp_growth_pre_covid,gdp_growth_post_covid,exports_pre_covid,exports_post_covid,imports_pre_covid,imports_post_covid,current_account_pre_covid,current_account_post_covid,population_pre_covid,population_post_covid,unemployment_pre_covid,unemployment_post_covid,fdi_pre_covid,fdi_post_covid
0,AGO,Angola,381.436505,20.756307,3.729440e+10,7.605777e+10,3.694286,-0.098247,52.223582,42.827060,37.812202,26.457762,0.096506,7.657086,1.371106e+07,35092124.00,16.578207,15.40700,3.969016,-4.804052
1,ALB,Albania,16.303718,3.786831,6.182332e+09,1.895947e+10,2.945241,3.604788,21.012692,32.383986,39.574366,43.191705,-6.214363,-5.822428,2.708297e+06,2793294.00,16.572724,10.85225,4.212986,7.059969
2,ARM,Armenia,140.826284,4.754400,6.053604e+09,1.752997e+10,3.492637,4.875000,28.263119,43.837123,49.939059,48.733906,-9.637468,-2.363960,2.946195e+06,2970975.00,12.612517,15.06700,4.484251,2.628302
3,AUS,Australia,4.751805,3.975482,4.514917e+11,1.576016e+12,3.440447,2.418989,16.695656,24.603561,17.598784,20.399912,-4.194437,1.642050,1.707379e+07,26002001.75,6.574552,4.74200,2.368805,2.355060
4,AUT,Austria,3.252999,5.127395,1.776161e+11,4.745809e+11,2.759937,0.700001,39.599335,57.260922,38.942242,55.783541,2.331303,1.390939,7.844922e+06,9011568.25,4.893724,5.47900,1.338260,1.198459
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
140,VUT,Vanuatu,4.339635,4.783980,3.838185e+08,1.002087e+09,2.493616,-0.603947,43.232998,13.456422,53.576085,51.727416,-5.492992,-8.922381,1.566333e+05,309545.25,6.907483,4.75925,6.993214,2.623239
141,WSM,Samoa,6.385071,4.175391,3.304428e+08,8.709892e+08,2.196606,-1.727369,29.343690,20.475458,49.316866,53.027829,-4.430853,-6.728470,1.677424e+05,214411.75,5.613345,6.01800,0.870636,0.600371
142,ZAF,South Africa,7.903933,5.233836,1.551726e+11,3.866202e+11,2.951881,0.349020,24.518488,31.201372,22.558080,27.987995,-1.149598,0.914699,3.800774e+07,61913944.50,23.762862,32.14750,0.757135,3.442005
143,ZMB,Zambia,38.223546,14.907891,7.264963e+09,2.424398e+10,3.305469,3.507258,32.255681,44.994100,34.567751,33.244541,-6.831698,6.331545,8.633757e+06,19884976.25,11.970276,5.78300,3.521270,0.806051


### **b) Quality of Life Indicators**

In [210]:
# Country codes present in the economic table but absent from the HDI table
set(econ_indicators["Country Code"]).difference(hdi["ISO3"])

{'MAC'}

In [212]:
def indic_mean_calculate(data, indicator, max_year=2019):
    """Add an "<indicator> (Mean)" column holding the row mean of the indicator's yearly columns.

    A column takes part in the mean when its name starts with ``indicator`` and the
    text between its last "(" and its final character parses as an integer year
    that is ``<= max_year``.

    Columns whose trailing text is not a year are skipped. This includes the
    "<indicator> (Mean)" column created by a previous call, so the function can be
    re-run on the same frame without raising.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to read from; it is modified in place.
    indicator : str
        Column-name prefix that identifies the indicator.
    max_year : int, default 2019
        Latest year (inclusive) included in the mean.

    Returns
    -------
    None
    """
    columns = []
    for col in data.columns:
        if not col.startswith(indicator):
            continue
        try:
            year = int(col.split("(")[-1][:-1])
        except ValueError:
            # no year suffix, e.g. a previously computed "(Mean)" column
            continue
        if year <= max_year:
            columns.append(col)

    data[f"{indicator} (Mean)"] = data[columns].mean(axis=1)

# Indicators from the HDI table to average
indicators = [
    "Human Development Index", "Life Expectancy at Birth", "Expected Years of Schooling",
]

# Each call appends an "<indicator> (Mean)" column to hdi in place
for indicator in indicators:
    indic_mean_calculate(hdi, indicator)

# Show the country code next to the three new mean columns
hdi.loc[:, ["ISO3",
            "Human Development Index (Mean)",
            "Life Expectancy at Birth (Mean)",
            "Expected Years of Schooling (Mean)"]]

Unnamed: 0,ISO3,Human Development Index (Mean),Life Expectancy at Birth (Mean),Expected Years of Schooling (Mean)
0,AFG,0.389200,57.156018,6.968946
1,AGO,0.495476,51.710503,7.026754
2,ALB,0.711533,76.516130,12.182366
3,AND,0.851350,81.803716,11.616034
4,ARE,0.815067,76.298737,12.267581
...,...,...,...,...
190,WSM,0.698400,71.095580,12.144760
191,YEM,0.459467,63.952524,8.235236
192,ZAF,0.665167,60.173008,13.193520
193,ZMB,0.478000,52.072360,9.545210


In [213]:
# Keep only the join key and the three mean columns from the HDI table
hdi_indicators = hdi[[
    "ISO3",
    "Human Development Index (Mean)",
    "Life Expectancy at Birth (Mean)",
    "Expected Years of Schooling (Mean)",
]]

# Inner-join on the country code, then discard the duplicate key column
econ_indicators = (
    econ_indicators
    .merge(hdi_indicators, left_on="Country Code", right_on="ISO3", how="inner")
    .drop(columns=["ISO3"])
)

In [215]:
# Display the economic table now that the HDI mean columns have been merged in
econ_indicators

Unnamed: 0,Country Code,Country Name,inflation_pre_covid,inflation_post_covid,gdp_pre_covid,gdp_post_covid,gdp_growth_pre_covid,gdp_growth_post_covid,exports_pre_covid,exports_post_covid,...,current_account_post_covid,population_pre_covid,population_post_covid,unemployment_pre_covid,unemployment_post_covid,fdi_pre_covid,fdi_post_covid,Human Development Index (Mean),Life Expectancy at Birth (Mean),Expected Years of Schooling (Mean)
0,AGO,Angola,381.436505,20.756307,3.729440e+10,7.605777e+10,3.694286,-0.098247,52.223582,42.827060,...,7.657086,1.371106e+07,35092124.00,16.578207,15.40700,3.969016,-4.804052,0.495476,51.710503,7.026754
1,ALB,Albania,16.303718,3.786831,6.182332e+09,1.895947e+10,2.945241,3.604788,21.012692,32.383986,...,-5.822428,2.708297e+06,2793294.00,16.572724,10.85225,4.212986,7.059969,0.711533,76.516130,12.182366
2,ARM,Armenia,140.826284,4.754400,6.053604e+09,1.752997e+10,3.492637,4.875000,28.263119,43.837123,...,-2.363960,2.946195e+06,2970975.00,12.612517,15.06700,4.484251,2.628302,0.700867,71.666627,11.942683
3,AUS,Australia,4.751805,3.975482,4.514917e+11,1.576016e+12,3.440447,2.418989,16.695656,24.603561,...,1.642050,1.707379e+07,26002001.75,6.574552,4.74200,2.368805,2.355060,0.906767,80.526569,20.830761
4,AUT,Austria,3.252999,5.127395,1.776161e+11,4.745809e+11,2.759937,0.700001,39.599335,57.260922,...,1.390939,7.844922e+06,9011568.25,4.893724,5.47900,1.338260,1.198459,0.881100,79.060641,15.410155
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
139,VUT,Vanuatu,4.339635,4.783980,3.838185e+08,1.002087e+09,2.493616,-0.603947,43.232998,13.456422,...,-8.922381,1.566333e+05,309545.25,6.907483,4.75925,6.993214,2.623239,0.593000,69.125666,10.857024
140,WSM,Samoa,6.385071,4.175391,3.304428e+08,8.709892e+08,2.196606,-1.727369,29.343690,20.475458,...,-6.728470,1.677424e+05,214411.75,5.613345,6.01800,0.870636,0.600371,0.698400,71.095580,12.144760
141,ZAF,South Africa,7.903933,5.233836,1.551726e+11,3.866202e+11,2.951881,0.349020,24.518488,31.201372,...,0.914699,3.800774e+07,61913944.50,23.762862,32.14750,0.757135,3.442005,0.665167,60.173008,13.193520
142,ZMB,Zambia,38.223546,14.907891,7.264963e+09,2.424398e+10,3.305469,3.507258,32.255681,44.994100,...,6.331545,8.633757e+06,19884976.25,11.970276,5.78300,3.521270,0.806051,0.478000,52.072360,9.545210


### **c) Covid Indicators**

In [220]:
# Country names in the economic table with no exact counterpart in the COVID table
unmatched_countries = set(econ_indicators["Country Name"]).difference(covid["Country/Other"])
unmatched_countries

{'Bahamas, The',
 'Brunei Darussalam',
 'Congo, Rep.',
 "Cote d'Ivoire",
 'Egypt, Arab Rep.',
 'Gambia, The',
 'Hong Kong SAR, China',
 'Korea, Rep.',
 'Kyrgyz Republic',
 'Russian Federation',
 'Slovak Republic',
 'Turkiye',
 'United Kingdom',
 'United States',
 'Viet Nam',
 'West Bank and Gaza'}

In [221]:
def match_countries(country, covid_country, threshold = 60):
    """Return the best fuzzy match for ``country`` among ``covid_country``.

    Parameters
    ----------
    country : str
        Name to look up.
    covid_country : iterable of str
        Candidate names to match against.
    threshold : int, default 60
        Minimum score from ``process.extractOne`` for a match to be accepted.

    Returns
    -------
    str or None
        The best-scoring candidate, or ``None`` when its score is below
        ``threshold`` or when ``process.extractOne`` finds no candidate at all.
    """
    best = process.extractOne(country, covid_country)
    if best is None:
        # extractOne yields None when there is nothing to choose from
        return None

    # Index instead of unpacking: for dict-like choices the result carries a
    # third element (the key), which would break a two-name unpack.
    match, score = best[0], best[1]
    return match if score >= threshold else None


# Build the candidate list once; computing .unique() inside the lambda would
# repeat the same work for every row of econ_indicators.
covid_country_names = covid["Country/Other"].unique()

# Best fuzzy match in the COVID table for each country (None when no match clears the threshold)
econ_indicators["Matched Country"] = econ_indicators["Country Name"].apply(
    lambda x: match_countries(x, covid_country_names))

In [222]:
# Rows whose name had no exact counterpart, shown with the fuzzy match they received
had_no_exact_match = econ_indicators["Country Name"].isin(unmatched_countries)
matched_countries = econ_indicators.loc[had_no_exact_match]
matched_countries[['Country Name', 'Matched Country']]

Unnamed: 0,Country Name,Matched Country
13,"Bahamas, The",Bahamas
19,Brunei Darussalam,Brunei
26,Cote d'Ivoire,Ivory Coast
28,"Congo, Rep.",Congo
40,"Egypt, Arab Rep.",Egypt
47,United Kingdom,Brunei
51,"Gambia, The",Gambia
55,"Hong Kong SAR, China",China
71,Kyrgyz Republic,Kyrgyzstan
73,"Korea, Rep.",S. Korea


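Some countries need a manual mapping to the COVID table's naming, so they are overridden by hand below: United Kingdom → UK, West Bank and Gaza → Palestine, United States → USA, and Hong Kong SAR, China → Hong Kong.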

In [227]:
# Hand-picked COVID-table names that replace the fuzzy match for these countries
country_dict = {
    'United Kingdom': 'UK',
    'Hong Kong SAR, China': 'Hong Kong',
    'West Bank and Gaza': 'Palestine',
    'United States': 'USA',
}

needs_override = econ_indicators['Country Name'].isin(country_dict.keys())
manual_names = econ_indicators['Country Name'].map(country_dict)

# .loc aligns on the row index, so only the flagged rows receive their mapped name
econ_indicators.loc[needs_override, 'Matched Country'] = manual_names

In [228]:
# Drop the last column of the COVID table.
# Slice the untouched covid_og copy rather than covid itself: `covid = covid.iloc[:, :-1]`
# removes one more column every time the cell is re-run.
covid = covid_og.iloc[:, :-1].copy()

In [229]:
# Attach the COVID figures via the matched name, then discard both join-key columns
country_data = (
    econ_indicators
    .merge(covid, left_on="Matched Country", right_on="Country/Other", how="inner")
    .drop(columns=["Matched Country", "Country/Other"])
)

### **Adding Continent Column**

In [234]:
def find_continent(iso3):
    """Return the continent code for an ISO3 country code, or None if a lookup fails.

    The code is converted ISO3 -> ISO2 -> continent code with pycountry_convert;
    a KeyError raised by either step yields None.
    """
    try:
        return pc.country_alpha2_to_continent_code(
            pc.country_alpha3_to_country_alpha2(iso3)
        )
    except KeyError:
        # code unknown to one of the two lookups
        return None

# Look up a continent code for every country code
country_data["Continent"] = country_data["Country Code"].map(find_continent)

### **Dealing with Missing Data**

In [237]:
# Number of missing values per column
country_data.isna().sum()

Country Code                          0
Country Name                          0
inflation_pre_covid                   0
inflation_post_covid                  0
gdp_pre_covid                         0
gdp_post_covid                        0
gdp_growth_pre_covid                  0
gdp_growth_post_covid                 0
exports_pre_covid                     0
exports_post_covid                    0
imports_pre_covid                     0
imports_post_covid                    0
current_account_pre_covid             0
current_account_post_covid            0
population_pre_covid                  0
population_post_covid                 0
unemployment_pre_covid                0
unemployment_post_covid               0
fdi_pre_covid                         0
fdi_post_covid                        0
Human Development Index (Mean)        0
Life Expectancy at Birth (Mean)       0
Expected Years of Schooling (Mean)    0
Total Cases                           0
Total Deaths                          0


In [238]:
# Rows for which no continent code could be resolved
country_data.loc[country_data['Continent'].isna()]

Unnamed: 0,Country Code,Country Name,inflation_pre_covid,inflation_post_covid,gdp_pre_covid,gdp_post_covid,gdp_growth_pre_covid,gdp_growth_post_covid,exports_pre_covid,exports_post_covid,...,Expected Years of Schooling (Mean),Total Cases,Total Deaths,Total Recovered,Active Cases,Tot Cases/ 1M pop,Deaths/ 1M pop,Total Tests,Tests/ 1M pop,Continent
128,TLS,Timor-Leste,4.843522,4.90477,725746131.4,2769005000.0,5.865969,-0.403297,7.58101,44.679726,...,12.184832,23460,138,23102.0,220.0,17131,101,278529.0,203391.0,


In [241]:
# Continent codes for countries the automatic lookup could not resolve.
# An explicit per-country mapping replaces a blanket fillna("AS"), which would
# silently label *any* unresolved country as Asian if the data ever changes.
continent_overrides = {"TLS": "AS"}  # Timor-Leste

country_data.loc[:, "Continent"] = country_data.loc[:, "Continent"].fillna(
    country_data["Country Code"].map(continent_overrides))

# Fail loudly if a country is still missing a continent instead of letting it
# drop out of the continent-level aggregates further down.
assert country_data["Continent"].notna().all(), (
    "Unresolved continent for: "
    + ", ".join(country_data.loc[country_data["Continent"].isna(), "Country Code"])
)

In [243]:
# Per-country ratios first, then the median of each ratio within every continent
country_rates = pd.DataFrame({
    "Recovery Rate": country_data["Total Recovered"] / country_data["Total Cases"],
    "Active Case Rate": country_data["Active Cases"] / country_data["population_post_covid"],
    "Test Rate": country_data["Total Tests"] / country_data["population_post_covid"],
})
continent_rates = country_rates.groupby(country_data["Continent"]).median()

continent_rates

Unnamed: 0_level_0,Recovery Rate,Active Case Rate,Test Rate
Continent,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
AF,0.978674,4.8e-05,0.064666
AS,0.983165,0.000305,0.83625
EU,0.990287,0.000339,2.180759
,0.9697,0.001849,0.538612
OC,0.984531,0.00074,1.193373
SA,0.970631,5.8e-05,0.556605


In [245]:
# (column to fill, continent-level rate, column the rate is multiplied by)
imputation_plan = [
    ("Total Recovered", "Recovery Rate", "Total Cases"),
    ("Active Cases", "Active Case Rate", "population_post_covid"),
    ("Total Tests", "Test Rate", "population_post_covid"),
]

for col, rate_col, base_col in imputation_plan:
    # the continent's median rate scaled by the country's own base figure
    continent_rate = country_data["Continent"].map(continent_rates[rate_col])
    estimate = continent_rate * country_data[base_col]
    country_data.loc[:, col] = country_data.loc[:, col].fillna(estimate)

# Missing tests-per-million are derived from the (possibly imputed) total tests
tests_per_million = (country_data["Total Tests"] / country_data["population_post_covid"]) * 1000000
country_data.loc[:, "Tests/ 1M pop"] = country_data.loc[:, "Tests/ 1M pop"].fillna(tests_per_million)

In [264]:
def _sci_or_fixed(x):
    """Format floats above one million in magnitude in scientific notation, others with two decimals."""
    if abs(x) > 1e6:
        return f"{x:.2e}"
    return f"{x:.2f}"


# Applies to every float displayed from here on
pd.set_option("display.float_format", _sci_or_fixed)
country_data.describe().round(2)

Unnamed: 0,inflation_pre_covid,inflation_post_covid,gdp_pre_covid,gdp_post_covid,gdp_growth_pre_covid,gdp_growth_post_covid,exports_pre_covid,exports_post_covid,imports_pre_covid,imports_post_covid,...,Life Expectancy at Birth (Mean),Expected Years of Schooling (Mean),Total Cases,Total Deaths,Total Recovered,Active Cases,Tot Cases/ 1M pop,Deaths/ 1M pop,Total Tests,Tests/ 1M pop
count,144.0,144.0,144.0,144.0,144.0,144.0,144.0,144.0,144.0,144.0,...,144.0,144.0,144.0,144.0,144.0,144.0,144.0,144.0,144.0,144.0
mean,25.77,10.61,202000000000.0,654000000000.0,11826.84,2.06,36.67,43.95,42.5,48.07,...,69.4,12.23,4550000.0,45393.42,4450000.0,48185.87,181900.85,1398.74,46200000.0,1880000.0
std,56.51,30.35,723000000000.0,2570000000000.0,141876.06,2.31,24.71,33.25,23.82,28.28,...,8.75,3.11,12100000.0,132024.57,11900000.0,169641.71,193280.12,1428.63,143000000.0,3260000.0
min,1.38,0.19,214000000.0,511000000.0,-1.07,-9.67,7.58,2.51,9.75,2.3,...,49.57,3.9,7762.0,13.0,1605.0,0.0,347.0,3.0,24976.0,5093.0
25%,4.32,3.35,6160000000.0,18300000000.0,2.81,1.14,21.56,23.56,26.84,28.91,...,63.83,10.25,163540.0,1452.75,134824.75,303.25,17282.5,184.0,1850000.0,199615.75
50%,7.28,4.84,20600000000.0,70700000000.0,3.7,2.06,29.86,36.49,36.41,41.38,...,71.66,12.49,770588.5,7604.0,757533.15,1486.5,114666.0,898.0,5290000.0,742477.5
75%,15.36,7.26,97700000000.0,367000000000.0,4.85,3.38,46.51,51.89,55.32,59.46,...,76.08,14.28,2980000.0,25843.0,2970000.0,12837.5,297308.25,2205.5,25200000.0,1910000.0
max,381.44,253.48,7470000000000.0,24700000000000.0,1700000.0,6.63,168.33,210.04,160.77,184.81,...,81.84,20.83,108000000.0,1170000.0,106000000.0,1060000.0,697127.0,6595.0,1190000000.0,23300000.0
