In [3]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import pearsonr

# --- Input files (relative paths) ---
CO2_FILE = "Dataco2 emission.csv"
GDP_FILE = "gdp.csv"
HDI_FILE = "Human Development Index - Full.csv"

# --- Analysis window; both endpoints are included in YEARS ---
YEAR_START, YEAR_END = 2010, 2019
YEARS = [*range(YEAR_START, YEAR_END + 1)]

# --- Figure output folder; exist_ok=True makes re-running this cell harmless ---
FIG_DIR = "figures"
os.makedirs(FIG_DIR, exist_ok=True)

# --- Default plot style and figure size ---
sns.set(style="whitegrid")
plt.rcParams.update({"figure.figsize": (7, 5)})


Markdown explanation (new cell):
Imports and global configuration
In this cell, we import all required Python libraries:
os, numpy, and pandas for file handling and data manipulation
matplotlib and seaborn for plotting
pearsonr from scipy.stats for correlation analysis (used later)
We also:
Define file names for the CO₂, GDP, and HDI datasets
Set the analysis period to 2010–2019
Create a figures directory (if it does not exist) for saving plots
Configure seaborn/matplotlib style and default figure size

In [4]:
def safe_read_worldbank(path: str) -> pd.DataFrame:
    """Read a World Bank style CSV, with or without a 4-line preamble.

    The file is first parsed with ``skiprows=4``. If that parse yields a
    "Country Code" column the result is returned. Otherwise (the column is
    missing, or skipping four rows made the file unparseable / empty) the
    file is re-read from its first line with default settings.

    Parameters
    ----------
    path : str
        Location of the CSV file, passed straight to ``pd.read_csv``.

    Returns
    -------
    pd.DataFrame
        The parsed table. The fallback read is returned as-is, so the
        caller is still responsible for checking that the columns it needs
        are present.

    Raises
    ------
    Any exception raised by ``pd.read_csv`` that is not caused by the
    row skipping itself (e.g. ``FileNotFoundError``) propagates unchanged.
    """
    try:
        df = pd.read_csv(path, skiprows=4)
    except (pd.errors.ParserError, pd.errors.EmptyDataError):
        # Only layout problems introduced by skipping rows justify a retry;
        # anything else (missing file, permissions, ...) would fail again
        # on the second read, so it is allowed to surface immediately.
        return pd.read_csv(path)

    if "Country Code" in df.columns:
        return df

    # The skipped rows were real data/header lines: read the file plainly.
    return pd.read_csv(path)

def pearson_test(df, x, y, label, alpha=0.05, min_obs=5):
    """Run a Pearson correlation test between two columns and print a verdict.

    Rows with a missing value in either column are discarded before the
    test. When fewer than ``min_obs`` complete rows remain, nothing is
    computed and ``(nan, nan)`` is returned.

    Parameters
    ----------
    df : pd.DataFrame
        Table holding both columns.
    x, y : str
        Names of the two columns to correlate.
    label : str
        Prefix used in the printed report lines.
    alpha : float, default 0.05
        Significance level; the null hypothesis is reported as rejected
        when the p-value is strictly below this threshold.
    min_obs : int, default 5
        Minimum number of complete (non-missing) pairs required to run
        the test.

    Returns
    -------
    tuple
        ``(r, p)``: the correlation coefficient and its p-value as returned
        by ``scipy.stats.pearsonr``, or ``(np.nan, np.nan)`` when there are
        too few observations.
    """
    sub = df[[x, y]].dropna()
    n_obs = len(sub)

    if n_obs < min_obs:
        print(f"{label}: not enough observation (n={n_obs})")
        return np.nan, np.nan

    r, p = pearsonr(sub[x], sub[y])
    print(f"{label}: r = {r:.3f}, p-value = {p:.4g}, n = {n_obs}")

    # A NaN p-value (e.g. from a constant column) compares False here and is
    # therefore reported as "cannot be rejected".
    if p < alpha:
        print("   --> Statistically meaningful (H0 is rejected)")
    else:
        print("   --> Not meaningful (H0 cannot be rejected)")
    return r, p


Markdown explanation:
Helper functions for reading and testing
safe_read_worldbank(path):
The World Bank GDP CSV files often have a 4-line header before the actual data.
This function:
First tries to read the file with skiprows=4
Checks if "Country Code" exists
Falls back to a normal read_csv if needed
This makes the GDP loading robust to small format differences.
pearson_test(df, x, y, label):
A small utility that:
Drops rows with missing values in x or y
Computes the Pearson correlation coefficient and p-value
Prints the result with a label and whether H₀ is rejected at the 5% level
Returns (r, p) for further use
In this notebook we focus mainly on data preparation, but this function is ready for later statistical analysis.

In [5]:
# Load the CO2 file and shape it in one pass:
#   1. keep rows flagged as countries (drops everything whose Description differs)
#   2. harmonise column names with the rest of the notebook
#   3. keep only the columns used downstream
#   4. keep only the years inside the analysis window (inclusive on both ends)
co2 = (
    pd.read_csv(CO2_FILE)
    .loc[lambda d: d["Description"] == "Country"]
    .rename(
        columns={
            "Name": "Country",
            "year": "Year",
            "co2": "CO2_total",
            "co2_per_capita": "CO2_per_capita",
            "co2_per_gdp": "CO2_per_GDP",
        }
    )
    .loc[
        :,
        [
            "iso_code", "Country", "Year",
            "population", "CO2_total", "CO2_per_capita", "CO2_per_GDP",
        ],
    ]
    .loc[lambda d: d["Year"].between(YEAR_START, YEAR_END)]
)

co2.head()


Unnamed: 0,iso_code,Country,Year,population,CO2_total,CO2_per_capita,CO2_per_GDP
160,AFG,Afghanistan,2010,28284088.0,8.365,0.296,0.176
161,AFG,Afghanistan,2011,29347709.0,11.838,0.403,0.222
162,AFG,Afghanistan,2012,30560036.0,10.035,0.328,0.17
163,AFG,Afghanistan,2013,31622708.0,9.229,0.292,0.146
164,AFG,Afghanistan,2014,32792527.0,9.086,0.277,0.141


Markdown explanation:
CO₂ data preparation
We read the raw CO₂ emissions dataset from Dataco2 emission.csv.
We keep only rows where Description == "Country" to exclude regional totals and other aggregates.
We rename key columns to more descriptive and consistent names:
Name → Country
year → Year
co2 → CO2_total (total emissions)
co2_per_capita → CO2_per_capita
co2_per_gdp → CO2_per_GDP
We then select only the necessary columns and filter the data to the target period 2010–2019.
The final co2 dataframe now contains country-year CO₂ information with population and per-capita/per-GDP variants.

In [6]:
gdp_raw = safe_read_worldbank(GDP_FILE)

# Determine which of the requested year columns are actually present.
# World Bank wide files label year columns with the year as a string.
expected_year_cols = [str(y) for y in YEARS]
year_cols = [col for col in expected_year_cols if col in gdp_raw.columns]
if not year_cols:
    raise ValueError(
        f"{GDP_FILE}: none of the expected year columns {expected_year_cols} "
        f"were found. Columns present: {list(gdp_raw.columns)}"
    )

# The loader may fall back to a plain read, which does not guarantee the
# ISO code column exists; fail here with a message instead of an opaque
# KeyError from the column selection below.
if "Country Code" not in gdp_raw.columns:
    raise KeyError(
        f"{GDP_FILE}: required column 'Country Code' is missing. "
        f"Columns present: {list(gdp_raw.columns)}"
    )

# Keep only the ISO code and the year columns.
gdp = gdp_raw[["Country Code"] + year_cols].rename(
    columns={"Country Code": "iso_code"}
)

# Wide -> long format: one row per (iso_code, Year).
gdp = gdp.melt(
    id_vars="iso_code",
    value_vars=year_cols,
    var_name="Year",
    value_name="GDP"
)

# melt() leaves the former column labels as strings; the other tables use ints.
gdp["Year"] = gdp["Year"].astype(int)

gdp.head()


Unnamed: 0,iso_code,Year,GDP
0,ABW,2010,2453597000.0
1,AFE,2010,849409600000.0
2,AFG,2010,15856670000.0
3,AFW,2010,606280100000.0
4,AGO,2010,83799470000.0


Markdown explanation:
GDP data preparation
We load the GDP file gdp.csv using safe_read_worldbank, which handles the World Bank header structure.
We detect which year columns (2010–2019) are available in the file. If none exist, we raise an error.
We keep only two types of columns:
Country Code (renamed to iso_code)
Year columns (e.g., "2010", "2011", …)
We then reshape the dataset from wide to long format using melt, so that:
Each row corresponds to a specific pair (iso_code, Year)
The GDP value appears in a single column called "GDP"
Finally, we convert the "Year" column to integer for consistency.
The resulting gdp dataframe is now ready to be merged with the CO₂ data.

In [7]:
hdi_full = pd.read_csv(HDI_FILE)

# Output column name -> template of the per-year column name in the HDI file.
hdi_indicator_templates = {
    "HDI": "Human Development Index ({year})",
    "LifeExpectancy": "Life Expectancy at Birth ({year})",
    "GII": "Gender Inequality Index ({year})",
}

# Work on a 0..n-1 index so the per-year slices can be interleaved by
# position afterwards.
hdi_base = hdi_full.reset_index(drop=True)

# Build one slice per year column-wise instead of looping over every row.
# DataFrame.get() returns the default (NaN) when a year/indicator column is
# absent from the file, so missing columns become all-NaN indicators.
hdi_year_frames = [
    pd.DataFrame({
        "iso_code": hdi_base["ISO3"],
        "Country_hdi": hdi_base["Country"],
        "Year": year,
        **{
            out_col: hdi_base.get(template.format(year=year), np.nan)
            for out_col, template in hdi_indicator_templates.items()
        },
    })
    for year in YEARS
]

# Stable sort on the original row position groups each country's years
# together while keeping the years in ascending YEARS order.
hdi = (
    pd.concat(hdi_year_frames)
    .sort_index(kind="mergesort")
    .reset_index(drop=True)
)
hdi.head()


Unnamed: 0,iso_code,Country_hdi,Year,HDI,LifeExpectancy,GII
0,AFG,Afghanistan,2010,0.448,60.8508,0.753
1,AFG,Afghanistan,2011,0.456,61.4191,0.746
2,AFG,Afghanistan,2012,0.466,61.923,0.738
3,AFG,Afghanistan,2013,0.474,62.4167,0.728
4,AFG,Afghanistan,2014,0.479,62.5451,0.718


Markdown explanation:
HDI, Life Expectancy, and GII panel construction
The HDI file is structured with one row per country and separate columns for each year and indicator.
To make it compatible with our CO₂ and GDP data, we:
Iterate over each country (hdi_full.iterrows())
For each year from 2010 to 2019, we construct a dictionary with:
iso_code (ISO3 country code)
Country_hdi (country name from HDI file)
Year
HDI for that year
LifeExpectancy for that year
GII (Gender Inequality Index) for that year
All dictionaries are appended to a list and converted into a new dataframe hdi.
This dataframe is in long panel format, with one row per (iso_code, Year) and 3 key development indicators.

In [10]:
# Inner join on country code and year: only country-years present in BOTH
# the CO2 table and the GDP table survive.
panel = pd.merge(co2, gdp, how="inner", on=["iso_code", "Year"])

panel.head(), panel.shape


(  iso_code      Country  Year  population  CO2_total  CO2_per_capita  \
 0      AFG  Afghanistan  2010  28284088.0      8.365           0.296   
 1      AFG  Afghanistan  2011  29347709.0     11.838           0.403   
 2      AFG  Afghanistan  2012  30560036.0     10.035           0.328   
 3      AFG  Afghanistan  2013  31622708.0      9.229           0.292   
 4      AFG  Afghanistan  2014  32792527.0      9.086           0.277   
 
    CO2_per_GDP           GDP  
 0        0.176  1.585667e+10  
 1        0.222  1.780510e+10  
 2        0.170  1.990733e+10  
 3        0.146  2.014642e+10  
 4        0.141  2.049713e+10  ,
 (1940, 8))

Markdown explanation:
Merging CO₂ and GDP
We perform an inner join between the CO₂ dataset and the GDP dataset on:
iso_code
Year
This keeps only country-year pairs that appear in both datasets.
The resulting panel dataframe contains:
iso_code, Country, Year, population and CO₂ variables
GDP for each country-year combination
We quickly inspect the head and shape of the merged dataframe to ensure the merge worked as expected.

In [11]:
# Left join keeps every existing panel row and attaches the HDI-file
# indicators where a matching (iso_code, Year) exists. The country name from
# the CO2 data wins; the HDI name is only used to fill gaps, after which the
# helper column is discarded.
panel = (
    panel
    .merge(hdi, how="left", on=["iso_code", "Year"])
    .assign(Country=lambda d: d["Country"].fillna(d["Country_hdi"]))
    .drop(columns=["Country_hdi"])
)

panel.head(), panel.shape


(  iso_code      Country  Year  population  CO2_total  CO2_per_capita  \
 0      AFG  Afghanistan  2010  28284088.0      8.365           0.296   
 1      AFG  Afghanistan  2011  29347709.0     11.838           0.403   
 2      AFG  Afghanistan  2012  30560036.0     10.035           0.328   
 3      AFG  Afghanistan  2013  31622708.0      9.229           0.292   
 4      AFG  Afghanistan  2014  32792527.0      9.086           0.277   
 
    CO2_per_GDP           GDP    HDI  LifeExpectancy    GII  
 0        0.176  1.585667e+10  0.448         60.8508  0.753  
 1        0.222  1.780510e+10  0.456         61.4191  0.746  
 2        0.170  1.990733e+10  0.466         61.9230  0.738  
 3        0.146  2.014642e+10  0.474         62.4167  0.728  
 4        0.141  2.049713e+10  0.479         62.5451  0.718  ,
 (1940, 11))

Markdown explanation:
Adding HDI, Life Expectancy, and GII to the panel
We merge the hdi dataframe (HDI, Life Expectancy, GII) into the panel using a left join:
Keys: iso_code and Year
This keeps all rows from the existing panel and attaches HDI-related data where available.
In some cases, Country might be missing in the CO₂ data but present in the HDI file.
We therefore fill missing values in Country using Country_hdi.
After that, we drop Country_hdi to avoid duplication.
The updated panel now contains CO₂, GDP, HDI, Life Expectancy, and GII for each country-year pair as far as the data allows.

In [12]:
# Compute GDP per capita
panel["GDP_per_capita"] = panel["GDP"] / panel["population"]

# Drop rows where key variables are missing
panel = panel.dropna(
    subset=["CO2_total", "CO2_per_capita", "GDP", "HDI", "LifeExpectancy"]
)

print(f">> Panel data size: {panel.shape[0]} observation, {panel.shape[1]} variable")
panel.head()


>> Panel data size: 1871 observation, 12 variable


Unnamed: 0,iso_code,Country,Year,population,CO2_total,CO2_per_capita,CO2_per_GDP,GDP,HDI,LifeExpectancy,GII,GDP_per_capita
0,AFG,Afghanistan,2010,28284088.0,8.365,0.296,0.176,15856670000.0,0.448,60.8508,0.753,560.621525
1,AFG,Afghanistan,2011,29347709.0,11.838,0.403,0.222,17805100000.0,0.456,61.4191,0.746,606.694656
2,AFG,Afghanistan,2012,30560036.0,10.035,0.328,0.17,19907330000.0,0.466,61.923,0.738,651.417092
3,AFG,Afghanistan,2013,31622708.0,9.229,0.292,0.146,20146420000.0,0.474,62.4167,0.728,637.087019
4,AFG,Afghanistan,2014,32792527.0,9.086,0.277,0.141,20497130000.0,0.479,62.5451,0.718,625.054866


Markdown explanation:
Derived variable and final cleaning
We compute GDP_per_capita as:
$$\text{GDP per capita} = \frac{\text{GDP}}{\text{population}}$$
which makes income more comparable across countries of different sizes.
We then drop any rows that are missing critical variables:
CO2_total
CO2_per_capita
GDP
HDI
LifeExpectancy
This step ensures that the final dataset is ready for analysis without major gaps in the main indicators.
Finally, we print the number of observations and variables in the cleaned panel to understand the effective sample size.

In [13]:
# Write the cleaned panel to disk without the DataFrame index.
master_csv = "master_cross_section.csv"
panel.to_csv(master_csv, index=False)
print(f"{master_csv} has been saved.")


master_cross_section.csv has been saved.


Markdown explanation:
Saving the final master dataset
In this last step of the preparation notebook, we save the cleaned and merged panel to:
master_cross_section.csv
This file contains the 2010–2019 country-year panel dataset with:
CO₂ emissions (total, per capita, per GDP)
GDP and GDP per capita
HDI
Life Expectancy
GII
It will be used in the second notebook for:
Exploratory Data Analysis (EDA)
Visualizations
Correlation and hypothesis testing