# Prepare Data

Import libraries

In [1]:
import pandas as pd


Load data

In [None]:
# Read the cleaned GDP table from the processed-data folder, then show its first five rows.
gdp = pd.read_csv(filepath_or_buffer="../data/processed/gdp_clean_final.csv")
gdp.head(n=5)

In [3]:
# Read the cleaned country list from the processed-data folder, then show its first five rows.
country_list = pd.read_csv(
    filepath_or_buffer="../data/processed/country_list_final_clean.csv"
)
country_list.head(n=5)

Unnamed: 0,country,code
0,Afghanistan,AFG
1,Åland Islands,ALA
2,Albania,ALB
3,Algeria,DZA
4,American Samoa,ASM


In [4]:
# Read the cleaned list of oil-producing countries, then show its first five rows.
oil_states = pd.read_csv(
    filepath_or_buffer="../data/processed/oil_states_clean_final.csv"
)
oil_states.head(n=5)

Unnamed: 0,country,code
0,United States,USA
1,Saudi Arabia,SAU
2,Russia,RUS
3,Canada,CAN
4,Iraq,IRQ


Data merging and filtering

In [None]:
# Inner-join the country list with the GDP table on the shared "code" column;
# rows whose code appears in only one of the two tables are not kept.
interim_df = country_list.merge(gdp, how="inner", on="code")

# Show the last 15 rows of the joined table.
display(interim_df.tail(n=15))

In [None]:
# Rows that have a missing value in at least one column. These are exactly the
# rows the next step removes, so display them for review instead of only
# computing them.
null_df = interim_df[interim_df.isna().any(axis=1)]
display(null_df)

# Keep only the fully populated rows. dropna() returns a new frame, so
# interim_df itself is left unchanged and this cell can be re-run safely.
interim_df_v1 = interim_df.dropna()

# preview the dataframe
display(interim_df_v1)


In [None]:
# Print a summary of interim_df_v1 (columns, non-null counts, dtypes and memory
# usage) to confirm that no missing values remain after the dropna step.
interim_df_v1.info()

In [None]:
# The merge on "code" suffixed the two overlapping "country" columns:
# country_x comes from country_list and country_y from gdp. Drop the former and
# keep the latter under the plain name "country".
#
# A new frame is built instead of using inplace=True: the in-place drop raised
# KeyError when this cell was run a second time, because country_x was already
# gone. With errors="ignore" (and rename, which skips labels that are absent)
# re-running the cell is a no-op.
interim_df_v1 = interim_df_v1.drop(columns=["country_x"], errors="ignore").rename(
    columns={"country_y": "country"}
)
# preview the dataframe
display(interim_df_v1)

In [None]:
# column re-ordering: put "country" first and "code" second, keeping every
# other column in its current relative order. The columns are picked by name
# rather than by swapping positions 0 and 1, so the result does not depend on
# where the earlier merge/drop steps happened to leave them.
leading_cols = ["country", "code"]
cols = leading_cols + [c for c in interim_df_v1.columns if c not in leading_cols]
interim_df_v2 = interim_df_v1.reindex(columns=cols)
# preview the dataframe
display(interim_df_v2)

Data Grouping

In [None]:
# Flag each row whose country code also appears in the oil_states table,
# and store the flag as a new boolean "oil_producer" column.
is_oil_producer = interim_df_v2["code"].isin(oil_states["code"])
interim_df_v2["oil_producer"] = is_oil_producer
# Show the table with the new column.
display(interim_df_v2)

In [None]:
# Keep only the rows flagged as oil producers.
interim_oil = interim_df_v2.loc[interim_df_v2["oil_producer"]]
# Show the oil-producer subset.
display(interim_oil)

In [None]:
# Keep only the rows that are not flagged as oil producers.
interim_non_oil = interim_df_v2.loc[~interim_df_v2["oil_producer"]]
# Show the non-oil-producer subset.
display(interim_non_oil)

Data Exporting

In [45]:
# Write both subsets to the final-data folder without the index column:
# the non-oil producers first, then the oil producers.
for frame, csv_path in (
    (interim_non_oil, "../data/final/non_oil_producing_countries.csv"),
    (interim_oil, "../data/final/oil_producing_countries.csv"),
):
    frame.to_csv(csv_path, index=False)
