In [1]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from scipy import stats
import statsmodels.api as sm
from statsmodels.stats.outliers_influence import variance_inflation_factor

In [31]:
# Load the CO2 Emission Data 
df_EPI = pd.read_excel('data/EPI.xlsx')
df_EPI.head()

Unnamed: 0,Code,Country,EPI,Year
0,AGO,Angola,39.3,2006
1,ALB,Albania,68.9,2006
2,ARE,United Arab Em.,73.2,2006
3,ARG,Argentina,77.7,2006
4,ARM,Armenia,63.8,2006


In [32]:
special_value = df_EPI[df_EPI['EPI'] == '..' ].index

In [33]:
df_EPI.drop(special_value , inplace=True)

In [34]:
df_EPI

Unnamed: 0,Code,Country,EPI,Year
0,AGO,Angola,39.3,2006
1,ALB,Albania,68.9,2006
2,ARE,United Arab Em.,73.2,2006
3,ARG,Argentina,77.7,2006
4,ARM,Armenia,63.8,2006
...,...,...,...,...
2248,VUT,Vanuatu,44.55,2017
2249,VEN,Venezuela,63.89,2017
2250,VNM,Viet Nam,46.96,2017
2251,ZMB,Zambia,50.97,2017


In [36]:
# Drop Naan row 
df_EPI = df_EPI.dropna()

In [73]:
# Select only 2017
df_EPI = df_EPI[df_EPI['Year'].isin([2007,2008,2009,2010,2011,2012,2013,2014,2015,2016,2017])]

In [149]:
df_EPI.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1784 entries, 133 to 2252
Data columns (total 4 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   Code     1784 non-null   object
 1   Country  1784 non-null   object
 2   EPI      1784 non-null   object
 3   Year     1784 non-null   int64 
dtypes: int64(1), object(3)
memory usage: 69.7+ KB


In [206]:
# Load the CO2 Emission Data 
df_CO2_Emission = pd.read_csv('data/co-emissions-per-capita.csv')
df_CO2_Emission.head()

Unnamed: 0,Entity,Code,Year,Per capita CO₂ emissions (tonnes per capita)
0,Afghanistan,AFG,1800,0.0
1,Afghanistan,AFG,1801,0.0
2,Afghanistan,AFG,1802,0.0
3,Afghanistan,AFG,1803,0.0
4,Afghanistan,AFG,1804,0.0


In [207]:
df_CO2_Emission.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 42844 entries, 0 to 42843
Data columns (total 4 columns):
 #   Column                                        Non-Null Count  Dtype  
---  ------                                        --------------  -----  
 0   Entity                                        42844 non-null  object 
 1   Code                                          42626 non-null  object 
 2   Year                                          42844 non-null  int64  
 3   Per capita CO₂ emissions (tonnes per capita)  42844 non-null  float64
dtypes: float64(1), int64(1), object(2)
memory usage: 1.3+ MB


In [210]:
df_CO2_Emission = df_CO2_Emission.rename(columns={"Entity": "Country"})

In [212]:
df_CO2_Emission = df_CO2_Emission.drop(['Country'], axis=1)
df_CO2_Emission

Unnamed: 0,Code,Year,Per capita CO₂ emissions (tonnes per capita)
0,AFG,1800,0.000000
1,AFG,1801,0.000000
2,AFG,1802,0.000000
3,AFG,1803,0.000000
4,AFG,1804,0.000000
...,...,...,...
42839,ZWE,2013,0.766298
42840,ZWE,2014,0.769958
42841,ZWE,2015,0.691341
42842,ZWE,2016,0.615011


In [213]:
# Select only 2017
df_CO2_Emission = df_CO2_Emission[df_CO2_Emission['Year'].isin([2007,2008,2009,2010,2011,2012,2013,2014,2015,2016,2017])]
df_CO2_Emission

Unnamed: 0,Code,Year,Per capita CO₂ emissions (tonnes per capita)
207,AFG,2007,0.085222
208,AFG,2008,0.153819
209,AFG,2009,0.241402
210,AFG,2010,0.293448
211,AFG,2011,0.411531
...,...,...,...
42839,ZWE,2013,0.766298
42840,ZWE,2014,0.769958
42841,ZWE,2015,0.691341
42842,ZWE,2016,0.615011


In [203]:
df_CO2_Emission.to_csv('file_name3.csv', index=False)

In [214]:
# Merge the previous Dataframe and the Unemployment Table
result = pd.merge(df_EPI,
                 df_CO2_Emission,
                 on=['Code', 'Year'])
result.head(100)

Unnamed: 0,Code,Country,EPI,Year,Per capita CO₂ emissions (tonnes per capita)
0,AGO,Angola,39.4947,2007,1.189469
1,ALB,Albania,83.951,2007,1.297707
2,ARE,United Arab Emirates,63.9858,2007,22.060806
3,ARG,Argentina,81.7837,2007,4.359068
4,ARM,Armenia,77.7546,2007,1.741447
...,...,...,...,...,...
95,MWI,Malawi,59.9109,2007,0.062586
96,MYS,Malaysia,83.9784,2007,6.852580
97,NAM,Namibia,70.6238,2007,1.134477
98,NER,Niger,39.0544,2007,0.048874


In [215]:
result.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1777 entries, 0 to 1776
Data columns (total 5 columns):
 #   Column                                        Non-Null Count  Dtype  
---  ------                                        --------------  -----  
 0   Code                                          1777 non-null   object 
 1   Country                                       1777 non-null   object 
 2   EPI                                           1777 non-null   object 
 3   Year                                          1777 non-null   int64  
 4   Per capita CO₂ emissions (tonnes per capita)  1777 non-null   float64
dtypes: float64(1), int64(1), object(3)
memory usage: 83.3+ KB


In [148]:
result.to_csv('file_name.csv', index=False)

In [216]:
# Load Unemployment Data
df_Unemployment_Rate = pd.read_csv('data/Unemployment Rate.csv')
df_Unemployment_Rate.head()

Unnamed: 0,Country,Code,Unemployment Rate,Year
0,Aruba,ABW,,2007
1,Afghanistan,AFG,11.046,2007
2,Angola,AGO,3.637,2007
3,Albania,ALB,15.966,2007
4,Andorra,AND,,2007


In [217]:
# Drop Naan row 
df_Unemployment_Rate = df_Unemployment_Rate.dropna()

In [218]:
df_Unemployment_Rate.head()

Unnamed: 0,Country,Code,Unemployment Rate,Year
1,Afghanistan,AFG,11.046,2007
2,Angola,AGO,3.637,2007
3,Albania,ALB,15.966,2007
5,Arab World,ARB,9.788912,2007
6,United Arab Emirates,ARE,2.081,2007


In [219]:
# Drop Country Column
df_Unemployment_Rate = df_Unemployment_Rate.drop(['Country'], axis=1)
df_Unemployment_Rate

Unnamed: 0,Code,Unemployment Rate,Year
1,AFG,11.046000,2007
2,AGO,3.637000,2007
3,ALB,15.966000,2007
5,ARB,9.788912,2007
6,ARE,2.081000,2007
...,...,...,...
2898,WSM,8.443000,2017
2900,YEM,13.152000,2017
2901,ZAF,27.070999,2017
2902,ZMB,11.626000,2017


In [220]:
# Merge the previous Dataframe and the Unemployment Table
result1 = pd.merge(result,
                 df_Unemployment_Rate,
                 on=['Code', 'Year'])
result1.head()

Unnamed: 0,Code,Country,EPI,Year,Per capita CO₂ emissions (tonnes per capita),Unemployment Rate
0,AGO,Angola,39.4947,2007,1.189469,3.637
1,ALB,Albania,83.951,2007,1.297707,15.966
2,ARE,United Arab Emirates,63.9858,2007,22.060806,2.081
3,ARG,Argentina,81.7837,2007,4.359068,8.47
4,ARM,Armenia,77.7546,2007,1.741447,9.812


In [221]:
result1.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1739 entries, 0 to 1738
Data columns (total 6 columns):
 #   Column                                        Non-Null Count  Dtype  
---  ------                                        --------------  -----  
 0   Code                                          1739 non-null   object 
 1   Country                                       1739 non-null   object 
 2   EPI                                           1739 non-null   object 
 3   Year                                          1739 non-null   int64  
 4   Per capita CO₂ emissions (tonnes per capita)  1739 non-null   float64
 5   Unemployment Rate                             1739 non-null   float64
dtypes: float64(2), int64(1), object(3)
memory usage: 95.1+ KB


In [146]:
result1.to_csv('file_name9.csv', index=False)

In [None]:
# Load Renewable Energy Data
df_RE = pd.read_csv('data/Renewable Energy.csv')
df_RE.head()

In [None]:
# Drop the Naan Column
df_RE = df_RE.dropna()

In [None]:
# Select only 2017
df_RE = df_RE.loc[df_RE['TIME'] == 2017]
df_RE

In [None]:
# Drop the Time Column
df_RE = df_RE.drop(['TIME'], axis=1)
df_RE

In [None]:
# Merge 
result2 = pd.merge(result1,
                 df_RE,
                 on='Code')
result2.head()

In [None]:
# Load the Fossil Fuel Subsidies Data
df_FF_Sub = pd.read_csv('data/fossil-fuel-subsidies.csv')
df_FF_Sub.head()

In [None]:
# Load only 2015
df_FF_Sub = df_FF_Sub.loc[df_FF_Sub['Year'] == 2015]
df_FF_Sub

In [None]:
df_FF_Sub.info()

In [None]:
# Drop Entity and Year Column
df_FF_Sub = df_FF_Sub.drop(['Entity', 'Year'], axis=1)
df_FF_Sub.head()

In [None]:
# Drop the NaaN row
df_FF_Sub = df_FF_Sub.dropna()

In [None]:
df_FF_Sub.info()

In [None]:
result3 = pd.merge(result2,
                 df_FF_Sub,
                 on='Code')
result3.head()

In [None]:
df_RE_Consumption = pd.read_csv('data/RE consumption.csv')
df_RE_Consumption.head()

In [None]:
df_RE_Consumption = df_RE_Consumption.dropna()

In [None]:
df_RE_Consumption.head()

In [None]:
df_RE_Consumption = df_RE_Consumption.drop(['Country', 'Year'], axis=1)

In [None]:
df_RE_Consumption.info()

In [None]:
result4 = pd.merge(result3,
                 df_RE_Consumption,
                 on='Code')
result4.head()

In [None]:
result4.info()

In [None]:
df_happy = pd.read_csv('data/Happiness Index_1.csv')
df_happy.head()

In [None]:
result5 = pd.merge(result4,
                 df_happy,
                 on='Country')
result5.head()

### Modelling Part

In [None]:
result5 = result5[['Code', 'Country', 'Year', 'Happiness.Score', 'Unemployment Rate %', 'EPI Score', 'Annual CO₂ emissions (tonnes)', 'Renewable Supply %', 'Fossil-fuel pre-tax subsidies ($USD)', 'Renewable Energy Consumption (% of Total)']]
result5.head()

In [None]:
result5.info()

In [None]:
X = result5[result5.columns[4:]] # Independent Variables
y = result5['Unemployment Rate %'] # Dependent Variables

In [None]:
for col in X.columns: 
    plt.figure()
    plt.scatter(X[col],y)
    plt.ylabel('Unemployment Rate %')
    plt.xlabel(col)
    plt.show();

In [None]:
pd.Series([variance_inflation_factor(X.values, i) 
               for i in range(X.shape[1])], 
              index=X.columns)

In [None]:
result6 = result5[['Code', 'Country', 'Year', 'Unemployment Rate %', 'Annual CO₂ emissions (tonnes)','Fossil-fuel pre-tax subsidies ($USD)', 'EPI Score', 'Renewable Supply %']]
result6.head()

In [None]:
X1 = result6[result6.columns[4:]] # Independent Variables
y1 = result6['Unemployment Rate %'] # Dependent Variables

In [None]:
X_withconstant = sm.add_constant(X1)

In [None]:
result6 = sm.OLS(y1,X_withconstant)

result6_2017 = result6.fit()

result6_2017.summary()

In [None]:
X = result5[['Unemployment Rate %', 'EPI Score', 'Renewable Supply %']] # Independent Variables
y = result5['Happiness.Score'] # Dependent Variables

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=17)

In [None]:
from sklearn.linear_model import LinearRegression
linear_regression_model = LinearRegression()
# 2. Fit model 
linear_regression_model.fit(X_train, y_train)

# 3. Score on training data 
print(linear_regression_model.score(X_train, y_train))

# 4. Score on testing data
print(linear_regression_model.score(X_test, y_test))