In [170]:
import pandas as pd
import numpy as np
import pylab
import matplotlib.pyplot as plt
import plotly.graph_objects as go
import seaborn as sns
import statsmodels.api as sm
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score, GridSearchCV, train_test_split
from sklearn import linear_model
from scipy import stats

In [171]:
# Import the dataset
forestData = pd.read_csv('normalTotal.csv', index_col=[0]) # Use normalised values
forestData = forestData.drop(['Country', 'Region'], axis=1) # Drop country and region data
forest_x = forestData.drop(['Happiness Score'], axis=1) # X - all data apart from Happiness Score
forest_y = forestData['Happiness Score'].values.reshape(-1,1) # y - reshap happiness score to make compatible

# Check shapes
print(forest_x.shape)
print(forest_y.shape)

# Split the data sets into training and testing
x_train, x_test, y_train, y_test = train_test_split(forest_x, forest_y, test_size=0.25)

(287, 5)
(287, 1)


In [172]:
# Create function for mean absolute percentage error
def mean_absolute_percentage_error(y_true, y_pred): 
    y_true, y_pred = np.array(y_true), np.array(y_pred)
    return np.mean(np.abs((y_true - y_pred) / y_true)) * 100

#######forest Regression########
forest = RandomForestRegressor() # Instansiate new forest instance

# Tune the Hyperparameters using GridSearch
param_grid={'max_depth': range(2,8), 'n_estimators': (10, 50, 100, 1000)} # Regularisation parameters
grid = GridSearchCV(forest, param_grid, scoring='neg_mean_squared_error', cv=4) # Perform gridsearch
result = grid.fit(x_train, y_train)
print(grid.best_params_) # Display the beest paremeter value
print(grid.best_score_) # Display the best neg mean square score
# Create new model with optimal parameters
best_params = result.best_params_
forest_tuned = RandomForestRegressor(max_depth=best_params["max_depth"], n_estimators=best_params["n_estimators"])

# Test new model against orignal with cross validation on training set
scores_norm = cross_val_score(forest, x_train, y_train, cv=5, scoring='neg_mean_squared_error')
print(f'The average mean squared error for forest is: {scores_norm.mean()}')
scores_tune = cross_val_score(forest_tuned, x_train, y_train, cv=5, scoring='neg_mean_squared_error')
print(f'The average mean squared error for forest_tune is: {scores_tune.mean()}')

# Fit the models with the testing data
forest.fit(x_train, y_train)
forest_tuned.fit(x_train, y_train)

# Non-tuned predictions
train_predict_norm = forest.predict(x_train)
test_predict_norm = forest.predict(x_test)
print('The Non-tuned training score is', mean_absolute_percentage_error(train_predict_norm, y_train))
print('The Non-tuned test score is', mean_absolute_percentage_error(test_predict_norm, y_test))

# Tuned predictions
train_predict_tune = forest_tuned.predict(x_train)
test_predict_tune = forest_tuned.predict(x_test)
print('The Tuned training score is', mean_absolute_percentage_error(train_predict_tune, y_train))
print('The Tuned test score is', mean_absolute_percentage_error(test_predict_tune, y_test))



{'max_depth': 3, 'n_estimators': 50}
-0.015316739870419106
The average mean squared error for forest is: -0.014501548398236835
The average mean squared error for forest_tune is: -0.01521064933705433
The Non-tuned training score is 68.74338322522364
The Non-tuned test score is 56.57585228243385
The Tuned training score is 61.18588219548589
The Tuned test score is 56.8333348630748
