In [1]:
import pandas as pd

In [2]:
# Read the raw GDP-per-capita export into the working frame `df`.
# The path is relative to the notebook's working directory.
df = pd.read_csv(
    filepath_or_buffer='data/GDP per Capita in USD - GDP per capita.csv',
)
df

Unnamed: 0,REF_AREA,REF_AREA_LABEL,INDICATOR_LABEL,TIME_PERIOD,OBS_VALUE,COMMENT_TS,UNIT_MULT,UNIT_MULT_LABEL,DATA_SOURCE,DATA_SOURCE_LABEL,...,UNIT_TYPE_LABEL,TIME_FORMAT,TIME_FORMAT_LABEL,COMMENT_OBS,OBS_STATUS,OBS_STATUS_LABEL,OBS_CONF,OBS_CONF_LABEL,Column 1,Column 2
0,AFE,Africa Eastern and Southern,GDP per capita (current US$),1960,186.121835,GDP per capita (current US$),0,Units,WB_WDI,World Development Indicators (WDI),...,Currency,P1Y,Annual,,A,Normal value,PU,Public,,
1,AFW,Africa Western and Central,GDP per capita (current US$),1960,121.939925,GDP per capita (current US$),0,Units,WB_WDI,World Development Indicators (WDI),...,Currency,P1Y,Annual,,A,Normal value,PU,Public,,
2,CSS,Caribbean small states,GDP per capita (current US$),1960,455.490442,GDP per capita (current US$),0,Units,WB_WDI,World Development Indicators (WDI),...,Currency,P1Y,Annual,,A,Normal value,PU,Public,,
3,EAR,Early-demographic dividend,GDP per capita (current US$),1960,149.209963,GDP per capita (current US$),0,Units,WB_WDI,World Development Indicators (WDI),...,Currency,P1Y,Annual,,A,Normal value,PU,Public,,
4,EAS,East Asia & Pacific,GDP per capita (current US$),1960,150.820305,GDP per capita (current US$),0,Units,WB_WDI,World Development Indicators (WDI),...,Currency,P1Y,Annual,,A,Normal value,PU,Public,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14536,SOM,Somalia,GDP per capita (current US$),2024,636.983478,GDP per capita (current US$),0,Units,WB_WDI,World Development Indicators (WDI),...,Currency,P1Y,Annual,,A,Normal value,PU,Public,,
14537,MDG,Madagascar,GDP per capita (current US$),2024,544.997303,GDP per capita (current US$),0,Units,WB_WDI,World Development Indicators (WDI),...,Currency,P1Y,Annual,,A,Normal value,PU,Public,,
14538,CAF,Central African Republic,GDP per capita (current US$),2024,516.170424,GDP per capita (current US$),0,Units,WB_WDI,World Development Indicators (WDI),...,Currency,P1Y,Annual,,A,Normal value,PU,Public,,
14539,MWI,Malawi,GDP per capita (current US$),2024,508.371273,GDP per capita (current US$),0,Units,WB_WDI,World Development Indicators (WDI),...,Currency,P1Y,Annual,,A,Normal value,PU,Public,,


### Normalization Step: Add ISO-alpha3 Code

In [None]:
# --- Normalization: Add ISO-alpha3 column ---
#
# Loads the UNSD methodology table, builds two lookup dictionaries from it
# (country name -> ISO-alpha3 and M49 code -> ISO-alpha3), copies
# df['REF_AREA'] into a new 'iso_alpha3' column and reports rows where that
# column is missing.

# 1. Load UNSD Methodology for mapping
# Try the path relative to the working directory first, then one level up.
try:
    unsd_df = pd.read_csv('data/UNSD — Methodology.csv', sep=';')
except FileNotFoundError:
    unsd_df = pd.read_csv('../data/UNSD — Methodology.csv', sep=';')

# 2. Create mappings
# NOTE(review): neither mapping is consumed by step 3 below; they are kept
# because other cells may rely on them -- confirm or remove.
unsd_df['Country or Area'] = unsd_df['Country or Area'].astype(str).str.strip()
name_to_iso3 = unsd_df.set_index('Country or Area')['ISO-alpha3 Code'].to_dict()

m49_to_iso3 = {}
# Walk only the two columns that are needed instead of building a Series per
# row with iterrows().
for raw_m49, iso3_code in zip(unsd_df['M49 Code'], unsd_df['ISO-alpha3 Code']):
    try:
        m49_to_iso3[int(raw_m49)] = iso3_code
    except (ValueError, TypeError):
        # Non-numeric or NaN M49 codes cannot become integer keys; skip them.
        continue

# 3. Apply mapping
print("Applying ISO-alpha3 normalization...")
if 'REF_AREA' in df.columns:
    # NOTE(review): REF_AREA is copied verbatim, so the check below can only
    # flag rows whose REF_AREA itself is missing; codes are not validated
    # against the UNSD table.
    df['iso_alpha3'] = df['REF_AREA']
    print("  Column copied to 'iso_alpha3'.")
else:
    print("  Warning: Source column 'REF_AREA' not found.")

# Check results
# Only inspect the column when it exists; previously a missing 'REF_AREA'
# printed the warning above and then failed here with a KeyError.
if 'iso_alpha3' in df.columns:
    missing_mask = df['iso_alpha3'].isna()
    missing_iso = missing_mask.sum()
    if missing_iso > 0:
        print(f"  Warning: {missing_iso} rows have missing ISO-alpha3 codes.")
        if 'REF_AREA' in df.columns:
            print(df[missing_mask][['REF_AREA']].head())


In [3]:
# Remove columns that hold at most one distinct value, since they do not
# distinguish any row from another.
uniques = df.nunique()
cols_to_drop = uniques.loc[uniques.le(1)].index
df = df_filtered = df.drop(columns=cols_to_drop)
df

Unnamed: 0,REF_AREA,REF_AREA_LABEL,TIME_PERIOD,OBS_VALUE
0,AFE,Africa Eastern and Southern,1960,186.121835
1,AFW,Africa Western and Central,1960,121.939925
2,CSS,Caribbean small states,1960,455.490442
3,EAR,Early-demographic dividend,1960,149.209963
4,EAS,East Asia & Pacific,1960,150.820305
...,...,...,...,...
14536,SOM,Somalia,2024,636.983478
14537,MDG,Madagascar,2024,544.997303
14538,CAF,Central African Republic,2024,516.170424
14539,MWI,Malawi,2024,508.371273


In [4]:
# Keep only rows whose TIME_PERIOD is 2015 or later, renumber the rows from
# zero, and give the value column a descriptive name.
column_to_check = 'TIME_PERIOD'
threshold_number = 2015
rows_to_keep_condition = df[column_to_check].ge(threshold_number)
df_result = df.loc[rows_to_keep_condition].reset_index(drop=True)
# `df` is bound to the same object as `df_result`, so the in-place rename is
# visible under both names.
df_result.rename(
    columns={'OBS_VALUE': 'GDP per capita (current US$)'},
    inplace=True,
)
df = df_result
df

Unnamed: 0,REF_AREA,REF_AREA_LABEL,TIME_PERIOD,GDP per capita (current US$)
0,AFE,Africa Eastern and Southern,2015,1479.615260
1,AFW,Africa Western and Central,2015,1860.727694
2,ARB,Arab World,2015,6262.041685
3,CSS,Caribbean small states,2015,14402.472580
4,CEB,Central Electricity Board (CEB),2015,12596.947510
...,...,...,...,...
2535,SOM,Somalia,2024,636.983478
2536,MDG,Madagascar,2024,544.997303
2537,CAF,Central African Republic,2024,516.170424
2538,MWI,Malawi,2024,508.371273


In [5]:
# Give the identifier and period columns reader-friendly names in one pass.
# The rename stays in place so every existing reference to this frame sees
# the new labels.
df.rename(
    columns={
        'REF_AREA': 'Country Code',
        'REF_AREA_LABEL': 'Country Name',
        'TIME_PERIOD': 'Year',
    },
    inplace=True,
)
df

Unnamed: 0,Country Code,Country Name,Year,GDP per capita (current US$)
0,AFE,Africa Eastern and Southern,2015,1479.615260
1,AFW,Africa Western and Central,2015,1860.727694
2,ARB,Arab World,2015,6262.041685
3,CSS,Caribbean small states,2015,14402.472580
4,CEB,Central Electricity Board (CEB),2015,12596.947510
...,...,...,...,...
2535,SOM,Somalia,2024,636.983478
2536,MDG,Madagascar,2024,544.997303
2537,CAF,Central African Republic,2024,516.170424
2538,MWI,Malawi,2024,508.371273
