In [166]:
import pandas as pd
import numpy as np
import pylab
import matplotlib.pyplot as plt
import plotly.graph_objects as go
import seaborn as sns
import statsmodels.api as sm
from sklearn.model_selection import GridSearchCV, train_test_split, cross_val_score
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import Ridge
from sklearn import linear_model
from scipy import stats

In [167]:
# Import the dataset
ridgeData = pd.read_csv('normalTotal.csv', index_col=[0]) # Use normalised values
ridgeData = ridgeData.drop(['Country', 'Region'], axis=1) # Drop country and region data
ridge_x = ridgeData.drop(['Happiness Score'], axis=1) # X - all data apart from Happiness Score
ridge_y = ridgeData['Happiness Score'].values.reshape(-1,1) # y - reshap happiness score to make compatible

# Check shapes
print(ridge_x.shape)
print(ridge_y.shape)

# Split the data sets into training and testing
x_train, x_test, y_train, y_test = train_test_split(ridge_x, ridge_y, test_size=0.3)


(287, 5)
(287, 1)


In [169]:
# Create function for mean absolute percentage error
def mean_absolute_percentage_error(y_true, y_pred): 
    y_true, y_pred = np.array(y_true), np.array(y_pred)
    return np.mean(np.abs((y_true - y_pred) / y_true)) * 100

#######Ridge Regression########
ridge = Ridge() # Instansiate new ridge instance

# Tune the Hyperparameters using GridSearch
parameters = {'alpha': [1e-15, 1e-10, 1e-8, 1e-4, 1e-3, 1e-2, 1, 5, 10, 100, 1000]} # Regularisation parameters
grid = GridSearchCV(ridge, parameters, scoring='neg_mean_squared_error', cv=5) # Perform gridsearch
result = grid.fit(x_train, y_train)
print(grid.best_params_) # Display the beest paremeter value
print(grid.best_score_) # Display the best neg mean square score
# Create new model with optimal parameters
best_params = result.best_params_ # Copy best parameters
ridge_tuned = Ridge(alpha=best_params['alpha']) # Instantiate new object with best parameters

# Test new model against orignal with cross validation on training set
scores_norm = cross_val_score(ridge, x_train, y_train, cv=5, scoring='neg_mean_squared_error')
print(f'The average mean squared error for ridge is: {scores_norm.mean()}')
scores_tune = cross_val_score(ridge_tuned, x_train, y_train, cv=5, scoring='neg_mean_squared_error')
print(f'The average mean squared error for ridge_tune is: {scores_tune.mean()}')

# Fit the models with the testing data
ridge.fit(x_train, y_train)
ridge_tuned.fit(x_train, y_train)

# Non-tuned predictions
train_predict_norm = ridge.predict(x_train)
test_predict_norm = ridge.predict(x_test)
print('The Non-tuned training score is', mean_absolute_percentage_error(train_predict_norm, y_train))
print('The Non-tuned test score is', mean_absolute_percentage_error(test_predict_norm, y_test))

# Tuned predictions
train_predict_tune = ridge_tuned.predict(x_train)
test_predict_tune = ridge_tuned.predict(x_test)
print('The Tuned training score is', mean_absolute_percentage_error(train_predict_tune, y_train))
print('The Tuned test score is', mean_absolute_percentage_error(test_predict_tune, y_test))


{'alpha': 1}
-0.022491653302179563
The average mean squared error for ridge is: -0.022491653302179563
The average mean squared error for ridge_tune is: -0.022491653302179563
The Non-tuned training score is 26.012558376295686
The Non-tuned test score is 22.3147003559226
The Tuned training score is 26.012558376295686
The Tuned test score is 22.3147003559226
