## World Bank Project

#### Importing libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# Import libraries for running ANOVA and post-hoc test
from statsmodels.formula.api import ols
from statsmodels.stats.anova import anova_lm
from statsmodels.stats.multicomp import pairwise_tukeyhsd

#### Loading and Merging data

In [2]:
df1=pd.read_csv("world_bank_development_indicators.csv")
df2=pd.read_excel("income.xlsx")
df = pd.merge(df1, df2, left_on='country', right_on='Economy', how="inner")

#### Data Inspection

In [3]:
df.head()

Unnamed: 0,country,date,agricultural_land%,forest_land%,land_area,avg_precipitation,trade_in_services%,control_of_corruption_estimate,control_of_corruption_std,access_to_electricity%,...,population,rural_population,voice_and_accountability_estimate,voice_and_accountability_std,intentional_homicides,Economy,Code,Region,Income group,Lending category
0,Afghanistan,1960-01-01,,,,,,,,,...,8622466.0,7898093.0,,,,Afghanistan,AFG,"Middle East, North Africa, Afghanistan & Pakistan",Low income,IDA
1,Afghanistan,1961-01-01,57.878356,,652230.0,327.0,,,,,...,8790140.0,8026804.0,,,,Afghanistan,AFG,"Middle East, North Africa, Afghanistan & Pakistan",Low income,IDA
2,Afghanistan,1962-01-01,57.955016,,652230.0,327.0,,,,,...,8969047.0,8163985.0,,,,Afghanistan,AFG,"Middle East, North Africa, Afghanistan & Pakistan",Low income,IDA
3,Afghanistan,1963-01-01,58.031676,,652230.0,327.0,,,,,...,9157465.0,8308019.0,,,,Afghanistan,AFG,"Middle East, North Africa, Afghanistan & Pakistan",Low income,IDA
4,Afghanistan,1964-01-01,58.116002,,652230.0,327.0,,,,,...,9355514.0,8458694.0,,,,Afghanistan,AFG,"Middle East, North Africa, Afghanistan & Pakistan",Low income,IDA


In [4]:
df.sample(8)

Unnamed: 0,country,date,agricultural_land%,forest_land%,land_area,avg_precipitation,trade_in_services%,control_of_corruption_estimate,control_of_corruption_std,access_to_electricity%,...,population,rural_population,voice_and_accountability_estimate,voice_and_accountability_std,intentional_homicides,Economy,Code,Region,Income group,Lending category
689,Argentina,2009-01-01,45.430209,11.155958,2736690.0,591.0,6.931803,-0.445475,0.153026,97.792206,...,40684338.0,3787305.0,0.280091,0.113063,6.493899,Argentina,ARG,Latin America & Caribbean,Upper middle income,IBRD
11243,Norway,1974-01-01,2.466844,,365244.0,1414.0,,,,,...,3985258.0,1287119.0,,,,Norway,NOR,Europe & Central Asia,High income,
14241,Suriname,1997-01-01,0.564103,98.409603,156000.0,2331.0,,,,,...,452887.0,152972.0,,,,Suriname,SUR,Latin America & Caribbean,Upper middle income,Blend
5768,Grenada,1970-01-01,64.705882,,340.0,2350.0,,,,,...,98794.0,66970.0,,,,Grenada,GRD,Latin America & Caribbean,Upper middle income,Blend
5818,Grenada,2020-01-01,23.529412,52.058824,340.0,2350.0,57.403937,0.357018,0.204005,93.355568,...,123663.0,78480.0,0.687311,0.205946,11.32109,Grenada,GRD,Latin America & Caribbean,Upper middle income,Blend
13193,Slovenia,2004-01-01,24.428997,61.499503,20140.0,1162.0,17.656365,1.015796,0.147816,100.0,...,1997012.0,972385.0,1.112389,0.161861,1.352217,Slovenia,SVN,Europe & Central Asia,High income,
4856,European Union,2018-01-01,40.967514,39.750997,3996556.579,,26.037636,,,99.971837,...,447001100.0,113909579.0,,,,European Union,EUU,,,
1053,"Bahamas, The",1989-01-01,1.098901,,10010.0,1292.0,65.19595,,,,...,265493.0,55000.0,,,,"Bahamas, The",BHS,Latin America & Caribbean,High income,


In [5]:
# Information about the dataset
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16219 entries, 0 to 16218
Data columns (total 55 columns):
 #   Column                                     Non-Null Count  Dtype  
---  ------                                     --------------  -----  
 0   country                                    16219 non-null  object 
 1   date                                       16219 non-null  object 
 2   agricultural_land%                         13921 non-null  float64
 3   forest_land%                               7751 non-null   float64
 4   land_area                                  14126 non-null  float64
 5   avg_precipitation                          9726 non-null   float64
 6   trade_in_services%                         8776 non-null   float64
 7   control_of_corruption_estimate             4639 non-null   float64
 8   control_of_corruption_std                  4639 non-null   float64
 9   access_to_electricity%                     6989 non-null   float64
 10  renewvable_energy_cons

In [6]:
# Dataset Shape
print("Dataset Shape:", df.shape)

Dataset Shape: (16219, 55)


In [7]:
# columns in the dataset 
print("Columns in the dataset:")
for col in df.columns:
    print(col)

Columns in the dataset:
country
date
agricultural_land%
forest_land%
land_area
avg_precipitation
trade_in_services%
control_of_corruption_estimate
control_of_corruption_std
access_to_electricity%
renewvable_energy_consumption%
electric_power_consumption
CO2_emisions
other_greenhouse_emisions
population_density
inflation_annual%
real_interest_rate
risk_premium_on_lending
research_and_development_expenditure%
central_goverment_debt%
tax_revenue%
expense%
goverment_effectiveness_estimate
goverment_effectiveness_std
human_capital_index
doing_business
time_to_get_operation_license
statistical_performance_indicators
individuals_using_internet%
logistic_performance_index
military_expenditure%
GDP_current_US
political_stability_estimate
political_stability_std
rule_of_law_estimate
rule_of_law_std
regulatory_quality_estimate
regulatory_quality_std
government_expenditure_on_education%
government_health_expenditure%
multidimensional_poverty_headcount_ratio%
gini_index
birth_rate
death_rate
life_e

In [8]:
# percentage of missing values in each column
missing_percentage = df.isnull().mean() * 100
print("Percentage of missing values in each column:")
print(missing_percentage)

Percentage of missing values in each column:
country                                       0.000000
date                                          0.000000
agricultural_land%                           14.168568
forest_land%                                 52.210371
land_area                                    12.904618
avg_precipitation                            40.033294
trade_in_services%                           45.890622
control_of_corruption_estimate               71.397743
control_of_corruption_std                    71.397743
access_to_electricity%                       56.908564
renewvable_energy_consumption%               52.900919
electric_power_consumption                   54.904741
CO2_emisions                                 56.618780
other_greenhouse_emisions                    56.618780
population_density                           13.083421
inflation_annual%                            36.796350
real_interest_rate                           73.235095
risk_premium_on_lend

In [9]:
# Checking for missing values
df.isnull().sum()

country                                          0
date                                             0
agricultural_land%                            2298
forest_land%                                  8468
land_area                                     2093
avg_precipitation                             6493
trade_in_services%                            7443
control_of_corruption_estimate               11580
control_of_corruption_std                    11580
access_to_electricity%                        9230
renewvable_energy_consumption%                8580
electric_power_consumption                    8905
CO2_emisions                                  9183
other_greenhouse_emisions                     9183
population_density                            2122
inflation_annual%                             5968
real_interest_rate                           11878
risk_premium_on_lending                      13869
research_and_development_expenditure%        13447
central_goverment_debt%        

In [10]:
# decribe the dataset
df.describe()

Unnamed: 0,agricultural_land%,forest_land%,land_area,avg_precipitation,trade_in_services%,control_of_corruption_estimate,control_of_corruption_std,access_to_electricity%,renewvable_energy_consumption%,electric_power_consumption,...,multidimensional_poverty_headcount_ratio%,gini_index,birth_rate,death_rate,life_expectancy_at_birth,population,rural_population,voice_and_accountability_estimate,voice_and_accountability_std,intentional_homicides
count,13921.0,7751.0,14126.0,9726.0,8776.0,4639.0,4639.0,6989.0,7639.0,7314.0,...,433.0,2075.0,15169.0,15151.0,15003.0,15783.0,15657.0,4706.0,4706.0,4103.0
mean,36.76978,32.727641,4927244.0,1201.597779,21.38132,-0.012059,0.203455,80.623744,31.097157,2980.947331,...,26.85581,37.786795,28.035763,10.488044,64.347022,212538600.0,121526700.0,-0.004632,0.16719,7.964641
std,20.80688,23.513946,14298030.0,801.490248,23.592171,1.00491,0.087821,28.955287,29.813651,4195.312605,...,10.706123,8.893234,12.883225,5.381069,11.157735,722747000.0,415324200.0,0.995756,0.064463,12.079899
min,0.262821,0.0,2.027,18.1,0.621025,-1.936706,0.114062,0.533899,0.0,5.548906,...,2.37,20.7,5.0,0.795,11.995,2646.0,0.0,-2.313395,0.102437,0.0
25%,19.797589,12.543726,20720.0,589.0,8.764372,-0.79194,0.148039,68.321813,5.305,380.483549,...,18.5,31.2,16.222842,7.0,56.9945,900595.5,417615.0,-0.823601,0.126576,1.323974
50%,37.470223,31.101989,199810.0,1083.0,13.68893,-0.243968,0.172068,98.556229,21.117526,1394.407738,...,24.8,35.6,26.9,9.20178,66.883,6250510.0,2919496.0,0.032819,0.1422,3.327091
75%,50.738916,47.498275,1246700.0,1738.0,25.388801,0.695926,0.226898,100.0,53.537515,4027.279627,...,32.6,43.3,39.576,12.4,72.633219,37067930.0,16499970.0,0.905235,0.196929,9.208205
max,93.44075,98.574551,129950700.0,3240.0,327.166162,2.459118,0.942502,100.0,98.34,54799.174708,...,74.2,65.8,58.121,103.534,85.497561,7950947000.0,3435260000.0,1.800992,0.578548,138.773986


In [11]:
# describe the dataset object type
df.describe(include=['object'])

Unnamed: 0,country,date,Economy,Code,Region,Income group,Lending category
count,16219,16219,16219,16219,13471,13343,8991
unique,256,64,256,256,7,4,3
top,Afghanistan,1992-01-01,Afghanistan,AFG,Europe & Central Asia,High income,IBRD
freq,64,256,64,64,3648,5376,4191


In [12]:
# unique and number of unique values in country and Income group column and years
print("Number of Unique countries:", df['country'].nunique())
print("Number of Unique Years:", df['date'].nunique())
print("Number of  Unique indicators:", df['Income group'].nunique())
print("Countries:", df['country'].unique())
print("Indicators:", df['Income group'].unique())
print("Years:", df['date'].unique())

Number of Unique countries: 256
Number of Unique Years: 64
Number of  Unique indicators: 4
Countries: ['Afghanistan' 'Africa Eastern and Southern' 'Africa Western and Central'
 'Albania' 'Algeria' 'American Samoa' 'Andorra' 'Angola'
 'Antigua and Barbuda' 'Arab World' 'Argentina' 'Armenia' 'Aruba'
 'Australia' 'Austria' 'Azerbaijan' 'Bahamas, The' 'Bahrain' 'Bangladesh'
 'Barbados' 'Belarus' 'Belgium' 'Belize' 'Benin' 'Bermuda' 'Bhutan'
 'Bolivia' 'Bosnia and Herzegovina' 'Botswana' 'Brazil'
 'British Virgin Islands' 'Brunei Darussalam' 'Bulgaria' 'Burkina Faso'
 'Burundi' 'Cabo Verde' 'Cambodia' 'Cameroon' 'Canada'
 'Caribbean small states' 'Cayman Islands' 'Central African Republic'
 'Central Europe and the Baltics' 'Chad' 'Channel Islands' 'Chile' 'China'
 'Colombia' 'Comoros' 'Congo, Dem. Rep.' 'Congo, Rep.' 'Costa Rica'
 'Croatia' 'Cuba' 'Cyprus' 'Czechia' 'Denmark' 'Djibouti' 'Dominica'
 'Dominican Republic' 'Early-demographic dividend' 'East Asia & Pacific'
 'East Asia & Pacific

In [13]:
# duplicated rows in the dataset
df.duplicated().sum()
# There are no duplicated rows in the dataset

np.int64(0)

In [14]:
# number of entries for each income group
df['Income group'].value_counts()

Income group
High income            5376
Upper middle income    3392
Lower middle income    3039
Low income             1536
Name: count, dtype: int64

#### Data Cleaning 

In [15]:
def map_continent(region):
    if isinstance(region, str):
        if 'East Asia' in region or 'Pacific' in region or 'South Asia' in region \
           or 'Afghanistan & Pakistan' in region or 'Asia' in region:
            return 'Asia'
        elif 'Europe' in region:
            return 'Europe'
        elif 'Latin America' in region or 'Caribbean' in region or 'South America' in region:
            return 'South America'
        elif 'Africa' in region or 'Middle East' in region:
            return 'Africa'
        elif 'North America' in region:
            return 'North America'
    return 'Other'  # for NaN or unknown values

# Apply mapping
df['continent'] = df['Region'].apply(map_continent)

In [16]:
df['continent'].unique()

array(['Asia', 'Other', 'Africa', 'South America', 'North America'],
      dtype=object)

In [17]:
# countries names in country column edited to remove leading and trailing spaces
df['country'] = df['country'].str.strip()

In [18]:
df['country'].unique()

array(['Afghanistan', 'Africa Eastern and Southern',
       'Africa Western and Central', 'Albania', 'Algeria',
       'American Samoa', 'Andorra', 'Angola', 'Antigua and Barbuda',
       'Arab World', 'Argentina', 'Armenia', 'Aruba', 'Australia',
       'Austria', 'Azerbaijan', 'Bahamas, The', 'Bahrain', 'Bangladesh',
       'Barbados', 'Belarus', 'Belgium', 'Belize', 'Benin', 'Bermuda',
       'Bhutan', 'Bolivia', 'Bosnia and Herzegovina', 'Botswana',
       'Brazil', 'British Virgin Islands', 'Brunei Darussalam',
       'Bulgaria', 'Burkina Faso', 'Burundi', 'Cabo Verde', 'Cambodia',
       'Cameroon', 'Canada', 'Caribbean small states', 'Cayman Islands',
       'Central African Republic', 'Central Europe and the Baltics',
       'Chad', 'Channel Islands', 'Chile', 'China', 'Colombia', 'Comoros',
       'Congo, Dem. Rep.', 'Congo, Rep.', 'Costa Rica', 'Croatia', 'Cuba',
       'Cyprus', 'Czechia', 'Denmark', 'Djibouti', 'Dominica',
       'Dominican Republic', 'Early-demographic div

In [20]:
# rename dictionary for standardizing country names
rename_dict = {
    'Swaziland': 'Eswatini',
    'Congo, Dem. Rep.': 'Democratic Republic of the Congo',
    'Congo, Rep.': 'Republic of the Congo',
    'Russian Federation': 'Russia',
    'Syrian Arab Republic': 'Syria',
    'Iran, Islamic Rep.': 'Iran',
    'Egypt, Arab Rep.': 'Egypt',
    'Bahamas, The': 'Bahamas',
    'Gambia, The': 'Gambia',
    'Venezuela, RB': 'Venezuela',
    "Korea, Dem. People's Rep.": 'North Korea',
    "Korea, Rep.": 'South Korea',
    "Hong Kong SAR, China": 'Hong Kong',
    "Sint Maarten (Dutch part)": 'Sint Maarten',
    "Virgin Islands (U.S.)": 'U.S. Virgin Islands'
}

# exclude keywords list for filtering out non-country entries
exclude_keywords = [
    'income', 'IDA', 'IBRD', 'small states', 'World', 'Euro area', 
    'OECD', 'demographic dividend', 'Arab World', 'Fragile', 'Post-', 
    'Pre-', 'Early-', 'Late-', 'Sub-Saharan Africa', 'East Asia & Pacific', 
    'Latin America & Caribbean', 'North America', 'Africa Eastern', 'Africa Western', 
    'Europe & Central Asia', 'Central Europe and the Baltics', 'European Union'
]

# non_countries list for direct exclusion
non_countries = [
    'European Union',
    'South Asia',
    'Heavily indebted poor countries (HIPC)',
    'Least developed countries: UN classification'
]

def clean_country(name):
    if not isinstance(name, str):
        return np.nan
    for word in exclude_keywords:
        if word.lower() in name.lower():
            return np.nan
    if name in non_countries:
        return np.nan
    return rename_dict.get(name, name)

df['country_clean'] = df['country'].apply(clean_country)

print(df['country_clean'].unique())



['Afghanistan' nan 'Albania' 'Algeria' 'American Samoa' 'Andorra' 'Angola'
 'Antigua and Barbuda' 'Argentina' 'Armenia' 'Aruba' 'Australia' 'Austria'
 'Azerbaijan' 'Bahamas' 'Bahrain' 'Bangladesh' 'Barbados' 'Belarus'
 'Belgium' 'Belize' 'Benin' 'Bermuda' 'Bhutan' 'Bolivia'
 'Bosnia and Herzegovina' 'Botswana' 'Brazil' 'British Virgin Islands'
 'Brunei Darussalam' 'Bulgaria' 'Burkina Faso' 'Burundi' 'Cabo Verde'
 'Cambodia' 'Cameroon' 'Canada' 'Cayman Islands'
 'Central African Republic' 'Chad' 'Channel Islands' 'Chile' 'China'
 'Colombia' 'Comoros' 'Democratic Republic of the Congo'
 'Republic of the Congo' 'Costa Rica' 'Croatia' 'Cuba' 'Cyprus' 'Czechia'
 'Denmark' 'Djibouti' 'Dominica' 'Dominican Republic' 'Ecuador' 'Egypt'
 'El Salvador' 'Equatorial Guinea' 'Eritrea' 'Estonia' 'Eswatini'
 'Ethiopia' 'Faroe Islands' 'Fiji' 'Finland' 'France' 'French Polynesia'
 'Gabon' 'Gambia' 'Georgia' 'Germany' 'Ghana' 'Gibraltar' 'Greece'
 'Greenland' 'Grenada' 'Guam' 'Guatemala' 'Guinea' 'Guine

In [21]:
# rename the cleaned country column to country and drop the old country column as well as region column
df['country'] = df['country_clean']
df = df.drop(columns=['country_clean'])
df = df.drop(columns=['Region'])
df.sample(7)

Unnamed: 0,country,date,agricultural_land%,forest_land%,land_area,avg_precipitation,trade_in_services%,control_of_corruption_estimate,control_of_corruption_std,access_to_electricity%,...,population,rural_population,voice_and_accountability_estimate,voice_and_accountability_std,intentional_homicides,Economy,Code,Income group,Lending category,continent
1799,Botswana,1967-01-01,45.87899,,566730.0,416.0,,,,,...,578723.0,549069.0,,,,Botswana,BWA,Upper middle income,IBRD,Africa
3772,Dominica,2020-01-01,33.333333,63.826667,750.0,2083.0,38.784883,0.557267,0.226407,99.997215,...,71995.0,20814.0,0.873136,0.210281,20.834925,Dominica,DMA,Upper middle income,Blend,South America
7605,Jordan,2015-01-01,11.89795,1.09822,88780.0,111.0,28.243097,0.244418,0.13599,99.892761,...,9494246.0,925119.0,-0.769377,0.129909,1.590437,Jordan,JOR,Lower middle income,IBRD,Asia
15690,Vanuatu,1974-01-01,9.844135,,12190.0,2000.0,,,,,...,98790.0,85774.0,,,,Vanuatu,VUT,Lower middle income,IDA,Asia
1061,Bahamas,1997-01-01,0.999001,50.935065,10010.0,1292.0,38.356947,,,100.0,...,310171.0,57704.0,,,14.830553,"Bahamas, The",BHS,High income,,South America
6409,Hong Kong,1971-01-01,11.764706,,1020.0,,,,,,...,4045300.0,483858.0,,,,"Hong Kong SAR, China",HKG,High income,,Asia
15951,West Bank and Gaza,2012-01-01,66.551944,1.665449,6020.0,402.0,14.469814,-0.257588,0.257313,99.8,...,3979998.0,1010163.0,-0.93501,0.153488,0.69309,West Bank and Gaza,PSE,Lower middle income,,Asia


In [41]:
# Edit columns names 
df.columns = [col.strip().lower().replace(' ', '_') for col in df.columns]
#change % in column names to _percent
df.columns = [col.replace('%', '_percent') for col in df.columns]
df.columns

Index(['country', 'date', 'agricultural_land_percent', 'forest_land_percent',
       'avg_precipitation', 'trade_in_services_percent',
       'control_of_corruption_estimate', 'access_to_electricity_percent',
       'renewvable_energy_consumption_percent', 'co2_emisions',
       'inflation_annual_percent', 'real_interest_rate',
       'risk_premium_on_lending',
       'research_and_development_expenditure_percent',
       'central_goverment_debt_percent', 'tax_revenue_percent',
       'expense_percent', 'goverment_effectiveness_estimate',
       'human_capital_index', 'doing_business',
       'time_to_get_operation_license', 'statistical_performance_indicators',
       'individuals_using_internet_percent', 'logistic_performance_index',
       'military_expenditure_percent', 'gdp_current_us',
       'political_stability_estimate', 'rule_of_law_estimate',
       'regulatory_quality_estimate',
       'government_expenditure_on_education_percent',
       'government_health_expenditure_perc

In [28]:
# drop unnecessary columns for ANOVA test
# irrelevant columns and standard deviation columns
columns_to_drop = ['economy','code','lending_category']
std_cols = [c for c in df.columns if c.endswith('_std')]
# land_area, population_density, rural_population are irrelevant
# electric_power_consumption and other_greenhouse_emisions is raw emmisions and we have co2 emissions per capita
# birth_rate and death_rate are strongly correlated with life_expectancy
candidate_drop = [
    'rural_population','land_area','population_density',
    'birth_rate','death_rate',
    'electric_power_consumption','other_greenhouse_emisions'
]
df = df.drop(columns=columns_to_drop + std_cols + candidate_drop)


In [29]:
df.sample(5)

Unnamed: 0,country,date,agricultural_land%,forest_land%,avg_precipitation,trade_in_services%,control_of_corruption_estimate,access_to_electricity%,renewvable_energy_consumption%,co2_emisions,...,government_expenditure_on_education%,government_health_expenditure%,multidimensional_poverty_headcount_ratio%,gini_index,life_expectancy_at_birth,population,voice_and_accountability_estimate,intentional_homicides,income_group,continent
13371,South Africa,1990-01-01,78.807013,14.955271,495.0,5.75706,,,16.63,247614.7,...,4.69768,,,,63.375,39877570.0,,,Upper middle income,Africa
7263,Isle of Man,1993-01-01,71.578947,6.070175,,,,100.0,0.0,,...,,,,,72.31,70710.0,,,High income,Asia
2343,Cambodia,1999-01-01,26.285973,61.202011,1904.0,16.649675,,8.820037,81.53,1907.1,...,,,,,57.69,11899006.0,,4.882761,Lower middle income,Asia
3166,Democratic Republic of the Congo,1990-01-01,11.451005,66.442734,1543.0,,,,92.05,3179.0,...,,,,,48.601,35987541.0,,,Low income,Africa
4380,Eritrea,2021-01-01,62.722637,8.692108,,,-1.257142,52.51405,,,...,,,,,66.536,3620312.0,-1.93016,,Low income,Africa


In [30]:
df.columns

Index(['country', 'date', 'agricultural_land%', 'forest_land%',
       'avg_precipitation', 'trade_in_services%',
       'control_of_corruption_estimate', 'access_to_electricity%',
       'renewvable_energy_consumption%', 'co2_emisions', 'inflation_annual%',
       'real_interest_rate', 'risk_premium_on_lending',
       'research_and_development_expenditure%', 'central_goverment_debt%',
       'tax_revenue%', 'expense%', 'goverment_effectiveness_estimate',
       'human_capital_index', 'doing_business',
       'time_to_get_operation_license', 'statistical_performance_indicators',
       'individuals_using_internet%', 'logistic_performance_index',
       'military_expenditure%', 'gdp_current_us',
       'political_stability_estimate', 'rule_of_law_estimate',
       'regulatory_quality_estimate', 'government_expenditure_on_education%',
       'government_health_expenditure%',
       'multidimensional_poverty_headcount_ratio%', 'gini_index',
       'life_expectancy_at_birth', 'population'

In [42]:
df[['gdp_per_capita','co2_per_capita']] = df[['gdp_current_us','co2_emisions']].div(df['population'].replace(0, np.nan), axis=0)
# drop the raw  columns
df = df.drop(columns=['gdp_current_us','co2_emisions'])
df.head()

Unnamed: 0,country,date,agricultural_land_percent,forest_land_percent,avg_precipitation,trade_in_services_percent,control_of_corruption_estimate,access_to_electricity_percent,renewvable_energy_consumption_percent,inflation_annual_percent,...,multidimensional_poverty_headcount_ratio_percent,gini_index,life_expectancy_at_birth,population,voice_and_accountability_estimate,intentional_homicides,income_group,continent,gdp_per_capita,co2_per_capita
0,Afghanistan,1960-01-01,,,,,,,,,...,,,32.535,8622466.0,,,Low income,Asia,62.369375,
1,Afghanistan,1961-01-01,57.878356,,327.0,,,,,,...,,,33.068,8790140.0,,,Low income,Asia,62.443703,
2,Afghanistan,1962-01-01,57.955016,,327.0,,,,,,...,,,33.547,8969047.0,,,Low income,Asia,60.950364,
3,Afghanistan,1963-01-01,58.031676,,327.0,,,,,,...,,,34.016,9157465.0,,,Low income,Asia,82.021738,
4,Afghanistan,1964-01-01,58.116002,,327.0,,,,,,...,,,34.494,9355514.0,,,Low income,Asia,85.511073,


In [53]:
# check which years is suitable for Anova
df.groupby('date')[['gdp_per_capita','co2_per_capita']].apply(lambda x: x.isna().sum()).sort_values(by='gdp_per_capita').head(15)


Unnamed: 0_level_0,gdp_per_capita,co2_per_capita
date,Unnamed: 1_level_1,Unnamed: 2_level_1
2011-01-01,9,29
2021-01-01,10,251
2014-01-01,10,29
2013-01-01,11,29
2012-01-01,11,29
2010-01-01,11,29
2019-01-01,12,29
2018-01-01,12,29
2017-01-01,12,29
2016-01-01,12,29
