# Random Forest (Model2)
    - Final Dataframe: happy_postg_final_m2

In [74]:
# Dependencies
from sklearn.ensemble import RandomForestClassifier
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
import joblib # library used to save files
import json # convert to json format (preparation test code for app.py)

In [75]:
# Retrieve variables from other jupyter notebook
# `%store -r NAME` restores a variable that was previously saved with `%store NAME`;
# each name below must already be in IPython's store or later cells will raise NameError.
%store -r happy_postg
%store -r X_train_scaled
%store -r X_test_scaled
%store -r y_train
%store -r y_test
%store -r X_headings
%store -r X_scaler

# Unscaled feature splits (X_train is later loaded into Postgres for app.py)
%store -r X_train
%store -r X_test

In [53]:
# Sanity check: display y_train to confirm the stored variables were retrieved
y_train

3      8
127    5
249    6
1      8
245    6
      ..
70     6
132    5
289    5
109    5
176    7
Name: score, Length: 226, dtype: int64

- ## Model: variable 'rf'
- ## Accuracy of model 
- ## Save Model: model2.sav

In [4]:
# Create a random forest classifier (model).
# random_state is pinned so that the fitted trees -- and therefore the accuracy,
# feature importances and saved model -- are the same on every Restart & Run All.
RF_RANDOM_STATE = 42
rf = RandomForestClassifier(n_estimators=770, random_state=RF_RANDOM_STATE)
rf = rf.fit(X_train_scaled, y_train)

# Accuracy of model: mean accuracy of the predictions on the held-out test split
rf.score(X_test_scaled, y_test)

0.7631578947368421

In [49]:
import os

# Save the Random Forest model
filename = 'Models/model2.sav'

# joblib.dump does not create missing folders, so make sure the target folder exists
os.makedirs(os.path.dirname(filename), exist_ok=True)

# Persist the fitted model; the cell output is the list of file names joblib wrote
joblib.dump(rf, filename)

NameError: name 'rf' is not defined

## Feature importances (X variables) via Random Forest

In [54]:
# Random Forests in sklearn will automatically calculate feature importance.
# The array has one value per feature, in the same order as the training columns.
importances = rf.feature_importances_
importances

NameError: name 'rf' is not defined

In [7]:
# Feature names to match against the importances (in that specific order)
X_headings

Index(['year', 'logged_GDP_per_capita', 'support', 'life_exp', 'freedom',
       'generosity', 'corruption'],
      dtype='object')

In [8]:
# Pair every importance with its feature name, then rank the (importance, name)
# tuples from most to least important
ranked_importances = list(zip(rf.feature_importances_, X_headings))
ranked_importances.sort(reverse=True)
ranked_importances

[(0.19987537671784644, 'support'),
 (0.19585595270283346, 'logged_GDP_per_capita'),
 (0.17834300178411563, 'life_exp'),
 (0.1533934899366418, 'freedom'),
 (0.14759642512932394, 'corruption'),
 (0.11252277651550291, 'generosity'),
 (0.01241297721373579, 'year')]

# **** ML_TABLE: X_VARIABLES, Y PREDICTED, Y ACTUAL SCORES ****

## Dataframe 'y_test_all_m2': contains y_actual and y_predicted scores

In [76]:
# Reload the persisted model from disk to confirm the saved file is usable
loaded_model = joblib.load('Models/model2.sav')

# Score the held-out (scaled) features with the reloaded model
predictions = loaded_model.predict(X_test_scaled)

# Actual vs. predicted scores side by side, keeping y_test's original row labels
y_test_all_m2 = y_test.to_frame(name="y_actual").assign(y_predicted=predictions)
y_test_all_m2

Unnamed: 0,y_actual,y_predicted
112,5,5
63,6,6
269,5,5
39,7,7
276,5,5
...,...,...
214,6,7
187,7,6
296,4,4
46,7,6


In [51]:
# Move the original row labels into an 'index' column so the next step can merge
# happy_postg with y_test_all_m2 on it.
# Guarded so that re-running this cell does not add a second index column.
if "index" not in y_test_all_m2.columns:
    y_test_all_m2 = y_test_all_m2.reset_index()

## Merge dataframes:
    - final dataframe: happy_postg_final_m2
- happy_postg (X_variable values) with 
- y_test_all_m2 (y_actual values, y_predicted values)



In [11]:
# Join happy_postg (by its row index) to y_test_all_m2 (by its 'index' column) with an
# outer join, drop every row that has a missing value (this removes the rows that have
# no prediction), and order the result by index.
happy_postg_final_m2 = (
    happy_postg
    .merge(y_test_all_m2, how="outer", left_index=True, right_on="index")
    .dropna()
    .sort_index()
)
happy_postg_final_m2

Unnamed: 0,year,country,logged_GDP_per_capita,support,life_exp,freedom,generosity,corruption,score,index,y_actual,y_predicted
0.0,2020,Gambia,7.321815,0.693169,55.012016,0.733163,0.343199,0.690718,5.0,112,5.0,5.0
1.0,2020,Serbia,9.680981,0.881476,68.210205,0.726496,-0.073676,0.843509,6.0,63,6.0,6.0
2.0,2021,Mali,7.744000,0.724000,51.969000,0.697000,-0.036000,0.827000,5.0,269,5.0,5.0
3.0,2020,Bahrain,10.676380,0.876342,68.500000,0.905856,0.133729,0.739347,7.0,39,7.0,7.0
4.0,2021,Namibia,9.161000,0.818000,56.799000,0.719000,-0.149000,0.847000,5.0,276,5.0,5.0
...,...,...,...,...,...,...,...,...,...,...,...,...
71.0,2021,South Korea,10.651000,0.799000,73.900000,0.672000,-0.083000,0.727000,6.0,214,6.0,7.0
72.0,2021,Brazil,9.577000,0.882000,66.601000,0.804000,-0.071000,0.756000,7.0,187,7.0,6.0
73.0,2021,Malawi,6.958000,0.537000,57.948000,0.780000,0.038000,0.729000,4.0,296,4.0,4.0
74.0,2020,Romania,10.107584,0.825162,67.207237,0.842823,-0.197815,0.934300,7.0,46,7.0,6.0


In [13]:
## Further clean up of data ##

# Column renames: 'index' becomes 'id', and the GDP column name is lower-cased
m2_column_renames = {'index':'id','logged_GDP_per_capita':'logged_gdp_per_capita'}

# Columns to keep, in the order they should appear ('score' is dropped here)
m2_column_order = ['id','year','country','logged_gdp_per_capita','support','life_exp','freedom','generosity','corruption', 'y_actual','y_predicted']

happy_postg_final_m2 = happy_postg_final_m2.rename(columns=m2_column_renames)[m2_column_order]
happy_postg_final_m2

Unnamed: 0,id,year,country,logged_gdp_per_capita,support,life_exp,freedom,generosity,corruption,y_actual,y_predicted
0.0,112,2020,Gambia,7.321815,0.693169,55.012016,0.733163,0.343199,0.690718,5.0,5.0
1.0,63,2020,Serbia,9.680981,0.881476,68.210205,0.726496,-0.073676,0.843509,6.0,6.0
2.0,269,2021,Mali,7.744000,0.724000,51.969000,0.697000,-0.036000,0.827000,5.0,5.0
3.0,39,2020,Bahrain,10.676380,0.876342,68.500000,0.905856,0.133729,0.739347,7.0,7.0
4.0,276,2021,Namibia,9.161000,0.818000,56.799000,0.719000,-0.149000,0.847000,5.0,5.0
...,...,...,...,...,...,...,...,...,...,...,...
71.0,214,2021,South Korea,10.651000,0.799000,73.900000,0.672000,-0.083000,0.727000,6.0,7.0
72.0,187,2021,Brazil,9.577000,0.882000,66.601000,0.804000,-0.071000,0.756000,7.0,6.0
73.0,296,2021,Malawi,6.958000,0.537000,57.948000,0.780000,0.038000,0.729000,4.0,4.0
74.0,46,2020,Romania,10.107584,0.825162,67.207237,0.842823,-0.197815,0.934300,7.0,6.0


# LOAD to PostGRES

### Ensure pgAdmin is set up and the table has been created in the database for the
    - Development environment (PostGRES Database)
    - Deployment environment (Heroku PostGRES Database)



In [37]:
import os

from sqlalchemy import create_engine

# Choose state = "develop" OR state = "deploy"
state = "develop"

# Connection strings have the form "user:password@host:port/database".
# They are read from environment variables so that credentials are never stored in
# the notebook. NOTE(review): the Heroku credentials that were previously hardcoded
# in this cell should be rotated.
if state == "develop":
    # Develop Environment - Connect to Local database
    # NOTE(review): the default below is the local string used elsewhere in this
    # notebook -- confirm it matches your local setup.
    rds_connection_string = os.environ.get(
        "LOCAL_DB_CONNECTION", "postgres:postgres@localhost:5432/the_flow_db"
    )

elif state == "deploy":
    # Deployment Environment - Connect to Heroku database
    rds_connection_string = os.environ["HEROKU_DB_CONNECTION"]

else:
    raise ValueError(f"state must be 'develop' or 'deploy', got {state!r}")

engine = create_engine(f'postgresql://{rds_connection_string}')

In [38]:
from sqlalchemy import inspect

# Add table headings via pgAdmin as per schema

# Run code to ensure connection is established and table has been created on postgres.
# Engine.table_names() is deprecated (removed in SQLAlchemy 2.0); the inspector
# returns the same list of table names.
inspect(engine).get_table_names()

['happy_table_m2']

In [35]:
# Load dataframe (variable 'happy_postg_final_m2') into the connected postgres database, in the table 'happy_table_m2'.
# if_exists='append' inserts into the existing table, so re-running this cell inserts the same rows again.
happy_postg_final_m2.to_sql(name='happy_table_m2', con=engine, if_exists='append', index=False)

In [36]:
##### CHECK LOAD WAS SUCCESSFUL #####
# Read the whole 'happy_table_m2' table back from postgres to confirm the connection
# works and the data that was just loaded can be read out
m2_table = pd.read_sql_table('happy_table_m2', engine) 
m2_table

Unnamed: 0,id,year,country,logged_gdp_per_capita,support,life_exp,freedom,generosity,corruption,y_actual,y_predicted
0,112,2020,Gambia,7.321815,0.693169,55.012016,0.733163,0.343199,0.690718,5.0,5.0
1,63,2020,Serbia,9.680981,0.881476,68.210205,0.726496,-0.073676,0.843509,6.0,6.0
2,269,2021,Mali,7.744000,0.724000,51.969000,0.697000,-0.036000,0.827000,5.0,5.0
3,39,2020,Bahrain,10.676380,0.876342,68.500000,0.905856,0.133729,0.739347,7.0,7.0
4,276,2021,Namibia,9.161000,0.818000,56.799000,0.719000,-0.149000,0.847000,5.0,5.0
...,...,...,...,...,...,...,...,...,...,...,...
71,214,2021,South Korea,10.651000,0.799000,73.900000,0.672000,-0.083000,0.727000,6.0,7.0
72,187,2021,Brazil,9.577000,0.882000,66.601000,0.804000,-0.071000,0.756000,7.0,6.0
73,296,2021,Malawi,6.958000,0.537000,57.948000,0.780000,0.038000,0.729000,4.0,4.0
74,46,2020,Romania,10.107584,0.825162,67.207237,0.842823,-0.197815,0.934300,7.0,6.0


# For app.py: Code for Flask endpoints
- ## PART 1: ml_table
    - To ensure return of dataset is in json format
- ## PART 2: predictScore
    - Take user's input (set of X values)
    - X-scale the X_train with user input
    - Extract user input X scaled values
    - Prediction of Happiness Score using model

## PART 1: ml_table

In [1]:
# Convert the table to JSON-compatible Python data: a list with one dict per row
# (orient="records"). Parsing the JSON text once is enough; the extra
# dumps/loads round trip produced the same object.
m2_table_json = json.loads(m2_table.to_json(orient="records"))
# m2_table_json

NameError: name 'json' is not defined

- ## PART 2: predictScore (Happiness Score Prediction) 
    - Take user's input (set of X values)
    - X-scale the X_train with user input
    - Extract user input X scaled values
    - Prediction of Happiness Score using model

- Test for 1 X_variables dataset entry to get 1 y prediction

yr = input
GDP_per_cap = input
logged_GDP_per_cap = natural log of GDP_per_cap
support = input
life_exp = input
freedom = input
gen = input
corrupt = input

X_inp = [yr, logged_GDP_per_cap, support, life_exp, freedom, gen, corrupt]





X_input = [[ 0.        ,  0.16086818,  0.43462228,  0.24209131,  0.58697999,
         0.73436565,  0.71028917]]

predictions = loaded_model.predict(X_input)
predictions

In [None]:
# Dependencies for app.py
import joblib

In [78]:
# Test saved model loads smoothly (same file as above, loaded the way app.py would;
# used for the user-input prediction at the end of the notebook)
loaded_model2 = joblib.load('Models/model2.sav')

## Prepare the X_train dataset to be loaded to PostGRES
Purpose: the training features are needed to fit the scaler, so that a user's input can be X-scaled the same way
* app.py can then read the X_train dataset via SQLAlchemy

In [59]:
# Restore the unscaled training features from IPython's store and display them
%store -r X_train
X_train #226 entries in X_train

Unnamed: 0,year,logged_GDP_per_capita,support,life_exp,freedom,generosity,corruption
3,2020,10.772559,0.974670,73.000000,0.948892,0.246944,0.711710
127,2020,9.314973,0.688719,66.897858,0.593362,-0.216414,0.867590
249,2021,9.629000,0.983000,62.409000,0.877000,0.273000,0.888000
1,2020,10.774001,0.955991,72.402504,0.951444,0.066202,0.168489
245,2021,9.520000,0.697000,68.999000,0.785000,-0.030000,0.901000
...,...,...,...,...,...,...,...
70,2020,8.024811,0.835297,64.104591,0.831383,-0.025089,0.592076
132,2020,8.680482,0.784407,58.961712,0.895075,0.560664,0.645124
289,2021,8.145000,0.708000,55.809000,0.782000,0.061000,0.823000
109,2020,9.661096,0.747695,59.903549,0.633183,-0.069565,0.822262


In [60]:
# Replace the original row labels with a fresh 0..n-1 index (the old labels are
# discarded) and lower-case the GDP column name
x_train_new_index = (
    X_train
    .reset_index(drop=True)
    .rename(columns={'logged_GDP_per_capita':'logged_gdp_per_capita'})
)
x_train_new_index

Unnamed: 0,year,logged_gdp_per_capita,support,life_exp,freedom,generosity,corruption
0,2020,10.772559,0.974670,73.000000,0.948892,0.246944,0.711710
1,2020,9.314973,0.688719,66.897858,0.593362,-0.216414,0.867590
2,2021,9.629000,0.983000,62.409000,0.877000,0.273000,0.888000
3,2020,10.774001,0.955991,72.402504,0.951444,0.066202,0.168489
4,2021,9.520000,0.697000,68.999000,0.785000,-0.030000,0.901000
...,...,...,...,...,...,...,...
221,2020,8.024811,0.835297,64.104591,0.831383,-0.025089,0.592076
222,2020,8.680482,0.784407,58.961712,0.895075,0.560664,0.645124
223,2021,8.145000,0.708000,55.809000,0.782000,0.061000,0.823000
224,2020,9.661096,0.747695,59.903549,0.633183,-0.069565,0.822262


## LOAD x_train_new_index to Postgres
- Loaded PostGRES X_train data to be used on app.py. 
- X scaling happens in app.py when user x_value set is retrieved.
- Use schema.sql 

In [50]:
import os

from sqlalchemy import create_engine

# Development Environment - Connect to database
# Format: "user:password@host:port/database". Set LOCAL_DB_CONNECTION to override
# the local default below.
rds_connection_string = os.environ.get(
    "LOCAL_DB_CONNECTION", "postgres:postgres@localhost:5432/the_flow_db"
)
engine = create_engine(f'postgresql://{rds_connection_string}')

In [57]:
import os

from sqlalchemy import create_engine

# Deployment Environment - Connect to database
# The connection string ("user:password@host:port/database") is read from the
# HEROKU_DB_CONNECTION environment variable so that production credentials are never
# stored in the notebook. A KeyError here means the variable is not set.
# NOTE(review): the credentials previously hardcoded in this cell should be rotated.
# Running this cell replaces any `engine` created by an earlier cell.
rds_connection_string = os.environ["HEROKU_DB_CONNECTION"]
engine = create_engine(f'postgresql://{rds_connection_string}')

In [58]:
from sqlalchemy import inspect

# Add table headings via pgAdmin as per schema

# Run code to ensure connection is established and table has been created on postgres.
# Engine.table_names() is deprecated (removed in SQLAlchemy 2.0); the inspector
# returns the same list of table names.
inspect(engine).get_table_names()

['happy_table_m2', 'x_test_new_index_table', 'x_train_df']

In [61]:
# Load dataframe (variable 'x_train_new_index') into postgres database 'the_flow_db' OR 'heroku_withtheflow', in the table 'x_train_df'
# if_exists='append' inserts into the existing table, so re-running this cell inserts the same rows again.
x_train_new_index.to_sql(name='x_train_df', con=engine, if_exists='append', index=False)

In [63]:
# Read the whole 'x_train_df' table back from postgres to confirm the connection works
# and the loaded training features can be read out
x_train_df = pd.read_sql_table('x_train_df', engine) 
x_train_df


Unnamed: 0,year,logged_gdp_per_capita,support,life_exp,freedom,generosity,corruption
0,2020.0,10.772559,0.974670,73.000000,0.948892,0.246944,0.711710
1,2020.0,9.314973,0.688719,66.897858,0.593362,-0.216414,0.867590
2,2021.0,9.629000,0.983000,62.409000,0.877000,0.273000,0.888000
3,2020.0,10.774001,0.955991,72.402504,0.951444,0.066202,0.168489
4,2021.0,9.520000,0.697000,68.999000,0.785000,-0.030000,0.901000
...,...,...,...,...,...,...,...
221,2020.0,8.024811,0.835297,64.104591,0.831383,-0.025089,0.592076
222,2020.0,8.680482,0.784407,58.961712,0.895075,0.560664,0.645124
223,2021.0,8.145000,0.708000,55.809000,0.782000,0.061000,0.823000
224,2020.0,9.661096,0.747695,59.903549,0.633183,-0.069565,0.822262


In [64]:
# Work on a copy so that x_train_df itself stays unchanged when the
# additional user input X row is added
x_train_user_df = x_train_df.copy()

## HAPPINESS PREDICTION 
### TEST with made up user input
- Use code in Flask

In [65]:
# Use example user input, in column order:
# year, logged_gdp_per_capita, support, life_exp, freedom, generosity, corruption
user_inputs = [2021, 11.32, 0.86, 73.9, 0.91, 0.16, 0.44]

# Place the user row at the first label after the training rows. Deriving the label
# from the training data (instead of hardcoding 226) keeps this correct if the number
# of training rows changes, and a new user's input overwrites the previous user row
# rather than growing the frame.
df_length = len(x_train_df)
x_train_user_df.loc[df_length] = user_inputs

#Test user input has been appended to dataframe
x_train_user_df.loc[df_length]

year                     2021.00
logged_gdp_per_capita      11.32
support                     0.86
life_exp                   73.90
freedom                     0.91
generosity                  0.16
corruption                  0.44
Name: 226, dtype: float64

In [66]:
# Shapes before and after adding the user row: the second should have one more row
print(x_train_df.shape, x_train_user_df.shape, sep="\n")

(226, 7)
(227, 7)


In [67]:
# Display the training features together with the added user row
x_train_user_df

Unnamed: 0,year,logged_gdp_per_capita,support,life_exp,freedom,generosity,corruption
0,2020.0,10.772559,0.974670,73.000000,0.948892,0.246944,0.711710
1,2020.0,9.314973,0.688719,66.897858,0.593362,-0.216414,0.867590
2,2021.0,9.629000,0.983000,62.409000,0.877000,0.273000,0.888000
3,2020.0,10.774001,0.955991,72.402504,0.951444,0.066202,0.168489
4,2021.0,9.520000,0.697000,68.999000,0.785000,-0.030000,0.901000
...,...,...,...,...,...,...,...
222,2020.0,8.680482,0.784407,58.961712,0.895075,0.560664,0.645124
223,2021.0,8.145000,0.708000,55.809000,0.782000,0.061000,0.823000
224,2020.0,9.661096,0.747695,59.903549,0.633183,-0.069565,0.822262
225,2021.0,10.871000,0.898000,69.600000,0.784000,-0.070000,0.721000


In [68]:
# Row count of the training features plus the user row
len(x_train_user_df)

227

In [69]:
import numpy as np
from sklearn.preprocessing import MinMaxScaler

# Fit the scaler on the training rows only, so the user's row does not influence the
# min/max used for scaling. (This replaces any X_scaler restored earlier via %store.)
X_scaler = MinMaxScaler().fit(x_train_df)

# Scale the training rows plus the user row with that scaler
X_scaled = X_scaler.transform(x_train_user_df)

# Pick out the scaled user row by locating the label it was stored under (df_length)
# instead of relying on a hardcoded row number
user_inputs = X_scaled[x_train_user_df.index.get_loc(df_length)]
user_inputs

array([1.        , 0.93655853, 0.76006229, 0.89454004, 0.89536593,
       0.51058798, 0.41773629])

In [70]:
# The model expects a 2-D array of shape (n_samples, n_features); reshape the single
# scaled row to (1, n_features). Unlike wrapping in another list, this gives the same
# result if the cell is run more than once.
user_inputs = np.asarray(user_inputs).reshape(1, -1)
user_inputs

array([[1.        , 0.93655853, 0.76006229, 0.89454004, 0.89536593,
        0.51058798, 0.41773629]])

In [79]:
# Predict the happiness score for the scaled user input with the reloaded model.
# Note: this reuses (overwrites) the name `predictions` from the test-set cell above.
predictions = loaded_model2.predict(user_inputs) 
print(f"Happiness Score Prediction: {predictions}")
predictions

Happiness Score Prediction: [8]


array([8])