In [69]:
import pandas as pd
import warnings
import os

warnings.filterwarnings('ignore')

In [29]:
# Raw input files. The paths are relative, so they resolve against the
# kernel's current working directory.
HAPPINESS_PATH = '../../data/countries/raw/2018.csv'
ECONOMIC_PATH = '../../data/countries/raw/WorldBank.xlsx'

# Load both raw sources; every later cell works on these two frames.
happiness_df = pd.read_csv(HAPPINESS_PATH)
economic_df = pd.read_excel(ECONOMIC_PATH)

In [30]:
economic_df.head()

Unnamed: 0,Country Name,Country Code,Region,IncomeGroup,Year,"Birth rate, crude (per 1,000 people)","Death rate, crude (per 1,000 people)",Electric power consumption (kWh per capita),GDP (USD),GDP per capita (USD),Individuals using the Internet (% of population),"Infant mortality rate (per 1,000 live births)",Life expectancy at birth (years),Population density (people per sq. km of land area),Unemployment (% of total labor force) (modeled ILO estimate)
0,Afghanistan,AFG,South Asia,Low income,2018,,,,19363000000.0,520.897,,47.9,,56.9378,1.542
1,Afghanistan,AFG,South Asia,Low income,2017,33.211,6.575,,20191800000.0,556.302,13.5,49.5,64.13,55.596,1.559
2,Afghanistan,AFG,South Asia,Low income,2016,33.981,6.742,,19362600000.0,547.228,11.2,51.2,63.763,54.1971,1.634
3,Afghanistan,AFG,South Asia,Low income,2015,34.809,6.929,,19907100000.0,578.466,8.26,53.1,63.377,52.7121,1.679
4,Afghanistan,AFG,South Asia,Low income,2014,35.706,7.141,,20484900000.0,613.856,7.0,55.1,62.966,51.1148,1.735


In [31]:
happiness_df.head()

Unnamed: 0,Overall rank,Country or region,Score,GDP per capita,Social support,Healthy life expectancy,Freedom to make life choices,Generosity,Perceptions of corruption
0,1,Finland,7.632,1.305,1.592,0.874,0.681,0.202,0.393
1,2,Norway,7.594,1.456,1.582,0.861,0.686,0.286,0.34
2,3,Denmark,7.555,1.351,1.59,0.868,0.683,0.284,0.408
3,4,Iceland,7.495,1.343,1.644,0.914,0.677,0.353,0.138
4,5,Switzerland,7.487,1.42,1.549,0.927,0.66,0.256,0.357


### Things to do:
1. Remove unnecessary columns
2. Match shared column names
3. Keep only the 2018 rows of the economic df
4. Remove unmatched rows from both data sets
5. Round the Score column
6. Get a combined data set
7. Apply DS preprocessing to all 3 data sets

#### 1. Remove unnecessary columns

In [32]:
def remove_redundant_columns(happiness_df, economic_df):
    """Drop the columns that the rest of the notebook does not use.

    Args:
        happiness_df: Happiness frame; must contain 'Overall rank'.
        economic_df: Economic frame; must contain 'Country Code'.

    Returns:
        Tuple ``(happiness_df, economic_df)`` of new frames without those
        columns. The frames passed in are not modified.

    Raises:
        KeyError: If either column is missing from its frame.
    """
    return (
        happiness_df.drop(columns=['Overall rank']),
        economic_df.drop(columns=['Country Code']),
    )

In [33]:
# Rebind both frames to versions without 'Overall rank' / 'Country Code'.
# NOTE(review): the cell overwrites its own inputs, so re-running it without
# reloading the data raises KeyError (the columns are already gone).
happiness_df, economic_df = remove_redundant_columns(happiness_df = happiness_df, economic_df = economic_df)

#### 2. Match shared column names

In [34]:
def match_sahred_columns_names(happiness_df, economic_df):
    """Give the country column the same name in both frames.

    The happiness frame's 'Country or region' column is renamed to
    'Country Name', the name the economic frame already uses.

    Args:
        happiness_df: Happiness frame.
        economic_df: Economic frame; returned as-is (same object).

    Returns:
        Tuple ``(happiness_df, economic_df)`` where the first element is a
        renamed copy and the second is the economic frame that was passed in.
    """
    key_renames = {'Country or region': 'Country Name'}
    return happiness_df.rename(columns=key_renames), economic_df



In [35]:
# After this cell both frames identify countries through a 'Country Name' column.
happiness_df, economic_df = match_sahred_columns_names(happiness_df = happiness_df, economic_df = economic_df)

#### 3. Keep only the 2018 rows of the economic df


In [36]:
def filter_2018_data_from_economic_df(economic_df):
    """Return only the rows of the economic frame whose 'Year' equals 2018.

    Args:
        economic_df: Economic frame with a 'Year' column.

    Returns:
        The matching rows, with their original index labels preserved.

    Raises:
        KeyError: If the frame has no 'Year' column.
    """
    is_2018 = economic_df['Year'] == 2018
    return economic_df.loc[is_2018]

In [37]:
# Keep only the economic rows whose Year is 2018 (the happiness data is loaded from 2018.csv).
economic_df = filter_2018_data_from_economic_df(economic_df = economic_df)

In [38]:
economic_df

Unnamed: 0,Country Name,Region,IncomeGroup,Year,"Birth rate, crude (per 1,000 people)","Death rate, crude (per 1,000 people)",Electric power consumption (kWh per capita),GDP (USD),GDP per capita (USD),Individuals using the Internet (% of population),"Infant mortality rate (per 1,000 live births)",Life expectancy at birth (years),Population density (people per sq. km of land area),Unemployment (% of total labor force) (modeled ILO estimate)
0,Afghanistan,South Asia,Low income,2018,,,,1.936300e+10,520.897,,47.9,,56.9378,1.542
59,Albania,Europe & Central Asia,Upper middle income,2018,,,,1.505890e+10,5253.630,,7.8,,104.6120,13.898
118,Algeria,Middle East & North Africa,Upper middle income,2018,,,,1.810000e+11,4278.850,59.5797,20.1,,17.7301,12.145
177,American Samoa,East Asia & Pacific,Upper middle income,2018,,,,,,,,,277.3250,
236,Andorra,Europe & Central Asia,High income: nonOECD,2018,,,,3.236540e+09,42029.800,,2.7,,163.8430,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12154,Virgin Islands (U.S.),Latin America & Caribbean,High income: nonOECD,2018,,,,,,,,,305.6490,8.410
12213,West Bank and Gaza,Middle East & North Africa,Lower middle income,2018,,,,1.461590e+10,3198.870,64.4000,17.3,,758.9850,30.182
12272,"Yemen, Rep.",Middle East & North Africa,Lower middle income,2018,,,,2.691440e+10,944.408,,42.9,,53.9779,12.934
12331,Zambia,Sub-Saharan Africa,Lower middle income,2018,,,,2.672010e+10,1539.900,14.3000,40.4,,23.3415,7.209


#### 4. Remove unmatched rows from both data sets

In [48]:
def remove_unmatched_rows(happiness_df, economic_df):
    """Keep only the countries that appear in both frames.

    The frames are inner-joined on 'Country Name' and then split back into
    their own columns, so both results have the same rows in the same order
    (the order of the merge result, with a fresh 0..n-1 index).

    Args:
        happiness_df: Happiness frame with a 'Country Name' column.
        economic_df: Economic frame with a 'Country Name' column.

    Returns:
        Tuple ``(happiness_df, economic_df)`` restricted to the shared
        countries, each with exactly the columns of the matching input.

    Raises:
        KeyError: If 'Country Name' is missing from either frame.
    """
    key = 'Country Name'
    # Explicit suffixes: any non-key column name present in BOTH frames is
    # suffixed by the merge. With the default '_x'/'_y' suffixes the original
    # column look-ups raised KeyError for such frames.
    economic_suffix, happiness_suffix = '__economic', '__happiness'
    merged_df = pd.merge(
        economic_df,
        happiness_df,
        on=key,
        how='inner',
        suffixes=(economic_suffix, happiness_suffix),
    )
    shared_columns = (set(economic_df.columns) & set(happiness_df.columns)) - {key}

    def _extract(source_df, suffix):
        # Pull one side's columns out of the merged frame under their original names.
        merged_names = [
            f'{column}{suffix}' if column in shared_columns else column
            for column in source_df.columns
        ]
        side_df = merged_df[merged_names].copy()
        side_df.columns = source_df.columns
        return side_df

    updated_happiness_df = _extract(happiness_df, happiness_suffix)
    updated_economic_df = _extract(economic_df, economic_suffix)
    return updated_happiness_df, updated_economic_df

In [49]:
# Keep only the countries found in both frames (inner join on 'Country Name').
happiness_df, economic_df = remove_unmatched_rows(happiness_df=happiness_df, economic_df=economic_df)

#### 5. Round Score Column

In [53]:
def round_score_column(happiness_df):
    """Return a copy of the frame whose 'Score' column is rounded to integers.

    Scores are first coerced to numbers (values that cannot be parsed become
    missing), then rounded and stored as nullable ``Int64``.

    Args:
        happiness_df: Happiness frame with a 'Score' column.

    Returns:
        A new frame; the frame passed in is not modified.

    Raises:
        KeyError: If the frame has no 'Score' column.
    """
    numeric_score = pd.to_numeric(happiness_df['Score'], errors='coerce')
    rounded_df = happiness_df.copy()
    rounded_df['Score'] = numeric_score.round().astype('Int64')
    return rounded_df

In [54]:
# From here on 'Score' holds whole numbers (nullable Int64) instead of the raw decimal score.
happiness_df = round_score_column(happiness_df=happiness_df)

#### 6. Get a combined data set


In [57]:
def combine_dfs(happiness_df, economic_df):
    """Inner-join the happiness and economic frames on 'Country Name'.

    Args:
        happiness_df: Happiness frame (left side of the join).
        economic_df: Economic frame (right side of the join).

    Returns:
        One frame holding the happiness columns followed by the economic
        columns, for the countries present in both inputs.
    """
    return happiness_df.merge(economic_df, how="inner", on='Country Name')

In [65]:
# Join the two aligned frames on 'Country Name' into a single combined frame.
combined_df = combine_dfs(happiness_df=happiness_df, economic_df=economic_df)

#### 7. Apply DS preprocessing to all 3 data sets

In [61]:
happiness_df.head()

Unnamed: 0,Country Name,Score,GDP per capita,Social support,Healthy life expectancy,Freedom to make life choices,Generosity,Perceptions of corruption
0,Afghanistan,4,0.332,0.537,0.255,0.085,0.191,0.036
1,Albania,5,0.916,0.817,0.79,0.419,0.149,0.032
2,Algeria,5,0.979,1.154,0.687,0.077,0.055,0.135
3,Angola,4,0.73,1.125,0.269,0.0,0.079,0.061
4,Argentina,6,1.073,1.468,0.744,0.57,0.062,0.054


In [62]:
happiness_df.describe()

Unnamed: 0,Score,GDP per capita,Social support,Healthy life expectancy,Freedom to make life choices,Generosity,Perceptions of corruption
count,136.0,136.0,136.0,136.0,136.0,136.0,135.0
mean,5.338235,0.890794,1.220154,0.596912,0.463904,0.182096,0.115852
std,1.175179,0.39846,0.304178,0.249221,0.163187,0.100333,0.099723
min,3.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,4.0,0.60175,1.0855,0.41375,0.366,0.11075,0.0515
50%,5.0,0.95,1.2735,0.6675,0.503,0.175,0.082
75%,6.0,1.19025,1.4665,0.782,0.58425,0.239,0.14
max,8.0,2.096,1.644,1.008,0.724,0.598,0.457


In [63]:
economic_df.head()

Unnamed: 0,Country Name,Region,IncomeGroup,Year,"Birth rate, crude (per 1,000 people)","Death rate, crude (per 1,000 people)",Electric power consumption (kWh per capita),GDP (USD),GDP per capita (USD),Individuals using the Internet (% of population),"Infant mortality rate (per 1,000 live births)",Life expectancy at birth (years),Population density (people per sq. km of land area),Unemployment (% of total labor force) (modeled ILO estimate)
0,Afghanistan,South Asia,Low income,2018,,,,19363000000.0,520.897,,47.9,,56.9378,1.542
1,Albania,Europe & Central Asia,Upper middle income,2018,,,,15058900000.0,5253.63,,7.8,,104.612,13.898
2,Algeria,Middle East & North Africa,Upper middle income,2018,,,,181000000000.0,4278.85,59.5797,20.1,,17.7301,12.145
3,Angola,Sub-Saharan Africa,Upper middle income,2018,,,,106000000000.0,3432.39,,51.6,,24.7131,7.253
4,Argentina,Latin America & Caribbean,High income: nonOECD,2018,,,,518000000000.0,11652.6,,8.8,,16.2585,9.483


In [64]:
def apply_ds_preproccesing(happiness_df, economic_df, combined_df):
    """Build the model-ready ("learning") variant of each frame.

    'Country Name' is dropped from all three frames. 'Region' and
    'IncomeGroup' are then one-hot encoded in the economic and combined
    frames; the happiness frame gets no encoding.

    Args:
        happiness_df: Happiness frame.
        economic_df: Economic frame with 'Region' and 'IncomeGroup' columns.
        combined_df: Joined frame with 'Region' and 'IncomeGroup' columns.

    Returns:
        Tuple ``(l_happiness_df, l_economic_df, l_combined_df)`` of new
        frames; the frames passed in are not modified.
    """
    learning_happiness, learning_economic, learning_combined = (
        frame.drop(columns=['Country Name'])
        for frame in (happiness_df, economic_df, combined_df)
    )

    categorical_columns = ['Region', 'IncomeGroup']
    learning_economic, learning_combined = (
        pd.get_dummies(frame, columns=categorical_columns)
        for frame in (learning_economic, learning_combined)
    )

    # TODO: decide how missing values should be handled.

    return learning_happiness, learning_economic, learning_combined


In [66]:
# Build the "learning" variants: 'Country Name' is dropped from every frame, and
# 'Region' / 'IncomeGroup' are one-hot encoded in the economic and combined frames.
l_happiness_df, l_economic_df, l_combined_df = apply_ds_preproccesing(happiness_df=happiness_df, economic_df=economic_df, combined_df=combined_df)

#### Save As CSV

In [67]:
def write_dfs_as_csv(dfs, output_dir='../../data/countries/processed'):
    """Write each frame to ``<output_dir>/<name>.csv`` without the index.

    Args:
        dfs: Mapping of file name (without extension) to DataFrame.
        output_dir: Folder that receives the CSV files. Defaults to the
            notebook's processed-data folder; it is created when missing.

    Raises:
        OSError: If the folder cannot be created or a file cannot be written.
    """
    # Create the target folder up front so a fresh checkout (no 'processed'
    # folder yet) does not fail on the first to_csv call.
    os.makedirs(output_dir, exist_ok=True)
    for filename, df in dfs.items():
        file_path = os.path.join(output_dir, f"{filename}.csv")
        df.to_csv(file_path, index=False)

In [70]:
# Persist every frame; each dict key becomes the CSV file name (without extension).
# The 'eda_*' entries are the cleaned frames, the 'learning_*' entries are the
# outputs of apply_ds_preproccesing.
write_dfs_as_csv(dfs=
{
    'eda_happiness':happiness_df,
    'learning_happiness':l_happiness_df,
    'eda_economic':economic_df,
    'learning_economic':l_economic_df,
    'eda_combined_countries':combined_df,
    'learning_combined_countries':l_combined_df,
})