# Random Forest

In [1]:
# Retrieve variables from other jupyter notebook
%store -r happy_postg
%store -r X_train_scaled
%store -r X_test_scaled
%store -r y_train
%store -r y_test
%store -r X_headings
%store -r X_scaler


In [2]:
# Test to ensure variables are retrived
y_train

3      8
127    5
249    6
1      8
245    6
      ..
70     6
132    5
289    5
109    5
176    7
Name: score, Length: 226, dtype: int64

In [3]:
# Dependencies
from sklearn.ensemble import RandomForestClassifier
import pandas as pd
from sklearn.preprocessing import MinMaxScaler

In [4]:
# Create a random forest classifier
rf = RandomForestClassifier(n_estimators=770)
rf = rf.fit(X_train_scaled, y_train)
rf.score(X_test_scaled, y_test)

0.7763157894736842

# Save the model

In [5]:
# Save the svc model 
import joblib
filename = 'Models/model2.sav'

# Print file name
joblib.dump(rf, filename)

['Models/model2.sav']

# Additional checks on feature importances (X variables)

In [6]:
# Random Forests in sklearn will automatically calculate feature importance
importances = rf.feature_importances_
importances

array([0.01220875, 0.19694821, 0.19713367, 0.1791057 , 0.1539526 ,
       0.11005169, 0.15059938])

In [7]:
X_headings

Index(['year', 'logged_GDP_per_capita', 'support', 'life_exp', 'freedom',
       'generosity', 'corruption'],
      dtype='object')

In [8]:
sorted(zip(rf.feature_importances_,X_headings), reverse=True)

[(0.19713367470150567, 'support'),
 (0.19694820536075952, 'logged_GDP_per_capita'),
 (0.17910569576875424, 'life_exp'),
 (0.15395260209331676, 'freedom'),
 (0.15059937884658764, 'corruption'),
 (0.11005169490020998, 'generosity'),
 (0.012208748328866278, 'year')]

In [14]:
# Test saved model loads smoothly
loaded_model = joblib.load('Models/model2.sav')

# Predict based on model2
predictions = loaded_model.predict(X_test_scaled)

# Put y_actual, y_predicted into dataframe
y_test_all_m2 = pd.DataFrame({"y_actual": y_test, "y_predicted": predictions})
y_test_all_m2

Unnamed: 0,y_actual,y_predicted
112,5,5
63,6,6
269,5,5
39,7,7
276,5,5
...,...,...
214,6,7
187,7,6
296,4,4
46,7,6


In [15]:
y_test_all_m2 = y_test_all_m2.reset_index()

In [16]:
# Merge 'happy_postg' on index with 'y_test_all_m2' on column name 'index'. (join=outer)
happy_postg_final_m2 = pd.merge(happy_postg,y_test_all_m2, how="outer", left_index=True, right_on="index")
happy_postg_final_m2 = happy_postg_final_m2.dropna()
happy_postg_final_m2 = happy_postg_final_m2.sort_index()
happy_postg_final_m2

Unnamed: 0,year,country,logged_GDP_per_capita,support,life_exp,freedom,generosity,corruption,score,index,y_actual,y_predicted
0.0,2020,Gambia,7.321815,0.693169,55.012016,0.733163,0.343199,0.690718,5.0,112,5.0,5.0
1.0,2020,Serbia,9.680981,0.881476,68.210205,0.726496,-0.073676,0.843509,6.0,63,6.0,6.0
2.0,2021,Mali,7.744000,0.724000,51.969000,0.697000,-0.036000,0.827000,5.0,269,5.0,5.0
3.0,2020,Bahrain,10.676380,0.876342,68.500000,0.905856,0.133729,0.739347,7.0,39,7.0,7.0
4.0,2021,Namibia,9.161000,0.818000,56.799000,0.719000,-0.149000,0.847000,5.0,276,5.0,5.0
...,...,...,...,...,...,...,...,...,...,...,...,...
71.0,2021,South Korea,10.651000,0.799000,73.900000,0.672000,-0.083000,0.727000,6.0,214,6.0,7.0
72.0,2021,Brazil,9.577000,0.882000,66.601000,0.804000,-0.071000,0.756000,7.0,187,7.0,6.0
73.0,2021,Malawi,6.958000,0.537000,57.948000,0.780000,0.038000,0.729000,4.0,296,4.0,4.0
74.0,2020,Romania,10.107584,0.825162,67.207237,0.842823,-0.197815,0.934300,7.0,46,7.0,6.0


In [17]:
# Rename column 'index' to 'id'
happy_postg_final_m2 = happy_postg_final_m2.rename({'index':'id'}, axis='columns')

# Reorder column names
happy_postg_final_m2 = happy_postg_final_m2[['id','year','country','logged_GDP_per_capita','support','life_exp','freedom','generosity','corruption', 'y_actual','y_predicted']]
happy_postg_final_m2

Unnamed: 0,id,year,country,logged_GDP_per_capita,support,life_exp,freedom,generosity,corruption,y_actual,y_predicted
0.0,112,2020,Gambia,7.321815,0.693169,55.012016,0.733163,0.343199,0.690718,5.0,5.0
1.0,63,2020,Serbia,9.680981,0.881476,68.210205,0.726496,-0.073676,0.843509,6.0,6.0
2.0,269,2021,Mali,7.744000,0.724000,51.969000,0.697000,-0.036000,0.827000,5.0,5.0
3.0,39,2020,Bahrain,10.676380,0.876342,68.500000,0.905856,0.133729,0.739347,7.0,7.0
4.0,276,2021,Namibia,9.161000,0.818000,56.799000,0.719000,-0.149000,0.847000,5.0,5.0
...,...,...,...,...,...,...,...,...,...,...,...
71.0,214,2021,South Korea,10.651000,0.799000,73.900000,0.672000,-0.083000,0.727000,6.0,7.0
72.0,187,2021,Brazil,9.577000,0.882000,66.601000,0.804000,-0.071000,0.756000,7.0,6.0
73.0,296,2021,Malawi,6.958000,0.537000,57.948000,0.780000,0.038000,0.729000,4.0,4.0
74.0,46,2020,Romania,10.107584,0.825162,67.207237,0.842823,-0.197815,0.934300,7.0,6.0


# Test for 1 set of X values


# Test for 1 X_variables entry to get 1 y prediction

# Index(['year', 'logged_GDP_per_capita', 'support', 'life_exp', 'freedom',
#        'generosity', 'corruption'],
#       dtype='object')

yr = input
GDP_per_cap = input
logged_GDP_per_cap = natural log of GDP_per_cap
support = input
life_exp = input
freedom = input
gen = input
corrupt = input

X_inp = [yr, logged_GDP_per_cap, support, life_exp, freedom, gen, corrupt]





X_input = [[ 0.        ,  0.16086818,  0.43462228,  0.24209131,  0.58697999,
         0.73436565,  0.71028917]]

predictions = loaded_model.predict(X_input)
predictions