In [None]:
import os
import pandas as pd

# Read the two input tables from disk
features_path = os.path.join("..", "clean_data.csv")
ghg_path = os.path.join("..", "data", "wdi_ghgs_co2e_unpivot.csv")
features_df = pd.read_csv(features_path)
ghg_country_df = pd.read_csv(ghg_path)
# Both tables carry an "Unnamed: 0" column that is not used afterwards
# (presumably a row index written by an earlier to_csv call - TODO confirm)
features_df = features_df.drop(columns=["Unnamed: 0"])
ghg_country_df = ghg_country_df.drop(columns=["Unnamed: 0"])

In [None]:
# The source data mixes individual countries with pre-computed aggregates
# (regions, income groups, "World", ...). The names below are the aggregate
# entries that must be excluded so that only individual countries remain.
aggregations = [
    "Africa Eastern and Southern",
    "Africa Western and Central",
    "Arab World",
    "Caribbean small states",
    "Central Europe and the Baltics",
    "Early-demographic dividend",
    "East Asia & Pacific",
    "East Asia & Pacific (excluding high income)",
    "East Asia & Pacific (IDA & IBRD countries)",
    "Euro area",
    "Europe & Central Asia",
    "Europe & Central Asia (excluding high income)",
    "Europe & Central Asia (IDA & IBRD countries)",
    "European Union",
    "Fragile and conflict affected situations",
    "Heavily indebted poor countries (HIPC)",
    "High income",
    "IBRD only",
    "IDA & IBRD total",
    "IDA blend",
    "IDA only",
    "IDA total",
    "Late-demographic dividend",
    "Latin America & Caribbean",
    "Latin America & Caribbean (excluding high income)",
    "Latin America & the Caribbean (IDA & IBRD countries)",
    "Least developed countries: UN classification",
    "Low & middle income",
    "Low income",
    "Lower middle income",
    "Middle East & North Africa",
    "Middle East & North Africa (excluding high income)",
    "Middle East & North Africa (IDA & IBRD countries)",
    "Middle income",
    "North America",
    "Not classified",
    "OECD members",
    "Other small states",
    "Pacific island small states",
    "Post-demographic dividend",
    "Pre-demographic dividend",
    "Small states",
    "South Asia",
    "South Asia (IDA & IBRD)",
    "Sub-Saharan Africa",
    "Sub-Saharan Africa (excluding high income)",
    "Sub-Saharan Africa (IDA & IBRD countries)",
    "Upper middle income",
    "World",
]
# Apply the same exclusion to both tables: keep the rows whose
# "Country Name" is NOT one of the aggregates
ghg_is_aggregate = ghg_country_df["Country Name"].isin(aggregations)
ghg_country_df = ghg_country_df.loc[~ghg_is_aggregate]
features_is_aggregate = features_df["Country Name"].isin(aggregations)
features_df = features_df.loc[~features_is_aggregate]

We first concentrate on creating the ideal GHG table for the time-series analysis (Part 1).

In [None]:
# Restrict the table to the years 1990 - 2018 and to the two indicators we
# need: total GHG emissions and total population
wanted_indicators = [
    "Total greenhouse gas emissions (kt of CO2 equivalent)",
    "Population, total",
]
in_year_window = ghg_country_df["Year"].gt(1989) & ghg_country_df["Year"].lt(2019)
is_wanted_indicator = ghg_country_df["Indicator Name_x"].isin(wanted_indicators)
ghg_country_df = ghg_country_df[in_year_window & is_wanted_indicator]

# Null analysis: count the rows with a missing value per year and indicator
rows_missing_value = ghg_country_df[ghg_country_df["value"].isna()]
rows_missing_value.groupby(["Year", "Indicator Name_x"]).count()

In [None]:
# The per-year summary shows that, in any given year, at most 2 countries lack
# "Population, total" and at most 31 countries lack the GHG total.
# Next we look at which countries these are, to judge whether they matter
# for our dataset: count the missing rows per country and indicator.
rows_missing_value = ghg_country_df.loc[ghg_country_df["value"].isna()]
rows_missing_value.groupby(["Country Name", "Indicator Name_x"]).count()

In [None]:
# Most of the countries listed above are small islands. They are not the main
# focus of our analysis and, given their limited surface area, cannot contribute
# significantly to GHG emissions compared to most other countries.
# With this in mind we remove them. Note that this drops EVERY row of any
# country that has at least one missing value, not only the NaN rows themselves.
has_missing_value = ghg_country_df["value"].isna()
country_names = ghg_country_df.loc[has_missing_value, "Country Name"].unique()
keep_row = ~ghg_country_df["Country Name"].isin(country_names)
ghg_country_df = ghg_country_df[keep_row]
# Remaining nulls per column
ghg_country_df.isna().sum()

In [None]:
# We can now create our two new variables: per capita and % of total.
# This cell builds the per-capita variable.

# Reshape from long to wide: one row per (country, year) and one column per
# indicator, filled from the "value" column
ghg_country_df_pivot = ghg_country_df.pivot(
    index=["Country Name", "Year"], columns=["Indicator Name_x"], values="value"
)
# Report how many (country, year) pairs are missing one of the two indicators.
# The count is printed explicitly: a bare expression in the middle of a cell
# is evaluated but never displayed, so the check would otherwise be invisible.
print(ghg_country_df_pivot.isna().sum())
# Per-capita emissions = total emissions / total population
ghg_country_df_pivot[
    "Total greenhouse gas emissions per capita (kt of CO2 equivalent per person)"
] = (
    ghg_country_df_pivot["Total greenhouse gas emissions (kt of CO2 equivalent)"]
    / ghg_country_df_pivot["Population, total"]
)
# Turn "Country Name" and "Year" back into regular columns
ghg_country_df_pivot = ghg_country_df_pivot.reset_index()

In [None]:
# % of total: each country's share of the emissions of its own year.
# The denominator must be the total over all countries *within the same year*.
# Summing the whole column would pool every year of the table together, so the
# shares of a single year would not add up to 100 and would not be comparable
# as a time series.
# "Total" here means the total over the countries still present in the table.
total_ghg_col = "Total greenhouse gas emissions (kt of CO2 equivalent)"
yearly_total_ghg = ghg_country_df_pivot.groupby("Year")[total_ghg_col].transform(
    "sum"
)
ghg_country_df_pivot[
    "Total greenhouse gas emissions as % of Total (kt of CO2 equivalent)"
] = (ghg_country_df_pivot[total_ghg_col] / yearly_total_ghg) * 100
# Nulls per column after adding the new variable
ghg_country_df_pivot.isna().sum()

In [None]:
# Attach "Region" and "Income Level" from the features table to obtain the
# time-series dataset for part 1. The left join keeps every row of the pivot.
join_keys = ["Country Name", "Year"]
country_attributes = features_df[join_keys + ["Region", "Income Level"]]
ghg_country_timeseries_df = ghg_country_df_pivot.merge(
    country_attributes, how="left", on=join_keys
)
# Sanity check: the row count must not grow (it would if the features table
# had several rows for the same country and year)
print(ghg_country_df_pivot.shape)
print(ghg_country_timeseries_df.shape)
# Nulls per column: ideally all zeros
ghg_country_timeseries_df.isna().sum()

We have the ideal data set for part 1

In [None]:
# Persist the part 1 dataset next to the other data files
timeseries_csv_path = os.path.join("..", "data", "ghg_country_timeseries_df.csv")
ghg_country_timeseries_df.to_csv(timeseries_csv_path)

We create the ideal dataset for part 2

In [None]:
# Part 2 only looks at a single year: keep the 2018 rows of the features table
is_2018 = features_df["Year"] == 2018
features_df = features_df.loc[is_2018]
# Nulls per feature for that year
features_df.isna().sum()

Since most of our analysis will be performed on each feature individually, we will not eliminate nulls at this point. Instead, every run will drop the nulls of the feature it uses, which lets us keep as much data as possible per run.

In [None]:
# Bring the three GHG variables of interest into the 2018 features table.
# The left join keeps every row of features_df; countries without GHG data
# end up with nulls in the new columns.
merge_keys = ["Country Name", "Year"]
ghg_variables = [
    "Total greenhouse gas emissions (kt of CO2 equivalent)",
    "Total greenhouse gas emissions per capita (kt of CO2 equivalent per person)",
    "Total greenhouse gas emissions as % of Total (kt of CO2 equivalent)",
]
ghg_country_2018_df = features_df.merge(
    ghg_country_timeseries_df[merge_keys + ghg_variables],
    how="left",
    on=merge_keys,
)
# Sanity check: the row count must not grow
print(features_df.shape)
print(ghg_country_2018_df.shape)
# Nulls per column
ghg_country_2018_df.isna().sum()

In [None]:
# The response variable has some nulls: list the countries they belong to
response_is_missing = ghg_country_2018_df[
    "Total greenhouse gas emissions (kt of CO2 equivalent)"
].isna()
ghg_country_2018_df.loc[response_is_missing, "Country Name"]

As before, we observe that these nulls mostly belong to small countries or islands, whose emissions we consider insignificant, so we can eliminate them from our analysis.

In [None]:
# Remove every row in which at least one of the three GHG variables is missing
response_columns = [
    "Total greenhouse gas emissions (kt of CO2 equivalent)",
    "Total greenhouse gas emissions per capita (kt of CO2 equivalent per person)",
    "Total greenhouse gas emissions as % of Total (kt of CO2 equivalent)",
]
ghg_country_2018_df = ghg_country_2018_df.dropna(how="any", subset=response_columns)
# Remaining nulls per column (only feature columns may still have some)
ghg_country_2018_df.isna().sum()

We have the ideal data set for part 2


In [None]:
# Persist the part 2 dataset next to the other data files
ghg_2018_csv_path = os.path.join("..", "data", "ghg_country_2018_df.csv")
ghg_country_2018_df.to_csv(ghg_2018_csv_path)