# IMPORTS

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import math
import os
from IPython.display import Markdown as md
import scipy.stats as stats

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import BayesianRidge

from sklearn.linear_model import ElasticNet
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso

import warnings
warnings.filterwarnings('ignore')

* * *

# LOAD CLEANED DATASETS
Let's read our cleaned files from `Datasets/Cleaned_Datasets` directory

To learn more about how we cleaned the data, you may want to visit `data_cleaning.py` and `exploratory_data_analysis.ipynb`

In [2]:
# Use Pandas to read CSV files and store each dataframe into a variable
world_happiness_df = pd.read_csv("./Datasets/Cleaned_Datasets/cleaned_world_happiness.csv", header=0)
covid_df = pd.read_csv("./Datasets/Cleaned_Datasets/cleaned_covid.csv", header=0)
clean_drinking_water_df = pd.read_csv("./Datasets/Cleaned_Datasets/cleaned_drinking_water_services.csv", header=0)
crude_suicide_rates_df = pd.read_csv("./Datasets/Cleaned_Datasets/cleaned_crude_suicide_rates.csv", header=0)
medical_doctors_df = pd.read_csv("./Datasets/Cleaned_Datasets/cleaned_medical_doctors.csv", header=0)

#takea  look at world happiness data
world_happiness_df.head(5)

Unnamed: 0,country,happiness_score,gdp_per_capita,social_support,life_expectancy,freedom,corruption
0,Afghanistan,3.594628,7.650843,0.508245,52.266667,0.518012,0.843283
1,Albania,5.019427,9.384397,0.716316,67.546154,0.66283,0.86936
2,Algeria,5.389717,9.328897,0.803582,65.29,0.519009,0.690871
3,Angola,4.420299,8.989725,0.737973,53.55,0.455957,0.867018
4,Argentina,6.310166,10.033868,0.904423,67.9,0.768254,0.841997


* * *

# Prepare a new dataset

Make a new dataset that will combine `clean_drinking_water`, `crude_suicide_rates`, and `medical_doctors` as features into the `world_happiness_data`

In [3]:
# merge 3 datasets with world_happiness_df, the merging performed here occurs similar to SQL inner join
main_dataset = pd.merge(world_happiness_df, clean_drinking_water_df, how='inner', on="country")
main_dataset = pd.merge(main_dataset, crude_suicide_rates_df, how='inner', on="country")
main_dataset = pd.merge(main_dataset, medical_doctors_df, how='inner', on="country")

#Also export this dataframe as csv into Clean_Dataset directory after preparing
main_dataset.to_csv('./Datasets/Cleaned_Datasets/full_dataset.csv', index = False)
main_dataset.head(15)

Unnamed: 0,country,happiness_score,gdp_per_capita,social_support,life_expectancy,freedom,corruption,clean_water_per_100_people,suicide_rate_per_100000_people,doctors_per_10000_people
0,Afghanistan,3.594628,7.650843,0.508245,52.266667,0.518012,0.843283,37.755,4.3,2.3225
1,Albania,5.019427,9.384397,0.716316,67.546154,0.66283,0.86936,84.061667,5.193333,13.092727
2,Algeria,5.389717,9.328897,0.803582,65.29,0.519009,0.690871,86.305,2.88,14.45
3,Angola,4.420299,8.989725,0.737973,53.55,0.455957,0.867018,24.313889,5.2,1.1675
4,Argentina,6.310166,10.033868,0.904423,67.9,0.768254,0.841997,87.365556,7.28,34.327143
5,Armenia,4.513624,9.270409,0.71862,65.742857,0.563791,0.846484,94.861667,4.24,20.320741
6,Australia,7.282024,10.755507,0.947253,72.692857,0.921648,0.415422,99.566111,10.36,31.455333
7,Austria,7.242227,10.886958,0.9296,72.103077,0.906196,0.570189,100.0,13.993333,41.785357
8,Azerbaijan,4.940989,9.519592,0.770649,63.942857,0.662107,0.69833,69.972778,2.246667,36.266
9,Bahrain,6.001723,10.730848,0.880093,67.594546,0.861467,0.553173,99.969444,4.633333,9.936875


In [4]:
# Summary of the new dataset
display(main_dataset.describe())
display(main_dataset.info())
display(main_dataset.isnull().sum(axis = 0))

Unnamed: 0,happiness_score,gdp_per_capita,social_support,life_expectancy,freedom,corruption,clean_water_per_100_people,suicide_rate_per_100000_people,doctors_per_10000_people
count,140.0,140.0,140.0,140.0,140.0,140.0,140.0,140.0,140.0
mean,5.424848,9.301535,0.805777,62.80544,0.741846,0.733355,77.711177,8.415738,16.01667
std,1.103782,1.212501,0.118027,7.636191,0.125386,0.183885,23.426713,5.965113,13.259219
min,3.514954,6.72251,0.402559,43.356001,0.451014,0.097752,21.435,1.34,0.226
25%,4.46538,8.297707,0.745606,56.967857,0.662081,0.690242,59.039722,4.233333,2.963958
50%,5.309331,9.396615,0.826084,65.065667,0.745535,0.794771,85.709444,6.813333,14.540263
75%,6.263767,10.289785,0.902506,67.945536,0.83222,0.845422,98.410417,10.375,27.5188
max,7.680305,11.607032,0.977578,75.358461,0.954373,0.953186,100.0,34.5,43.590345


<class 'pandas.core.frame.DataFrame'>
Int64Index: 140 entries, 0 to 139
Data columns (total 10 columns):
country                           140 non-null object
happiness_score                   140 non-null float64
gdp_per_capita                    140 non-null float64
social_support                    140 non-null float64
life_expectancy                   140 non-null float64
freedom                           140 non-null float64
corruption                        140 non-null float64
clean_water_per_100_people        140 non-null float64
suicide_rate_per_100000_people    140 non-null float64
doctors_per_10000_people          140 non-null float64
dtypes: float64(9), object(1)
memory usage: 12.0+ KB


None

country                           0
happiness_score                   0
gdp_per_capita                    0
social_support                    0
life_expectancy                   0
freedom                           0
corruption                        0
clean_water_per_100_people        0
suicide_rate_per_100000_people    0
doctors_per_10000_people          0
dtype: int64

* * *

# MACHINE LEARNING MODELS

Here is the list of all the different types of ML Regression models that we will build in this notebook:

#### * Linear Regression
#### * ElasticNet Regression
#### * Ridge Regression
#### * Lasso Regression  

*** 

## Splitting the dataset

First we need to split the datasets into features and labels

* The label (y) is  `happiness_score`
* The features (X) include 6 different variables/columns:
     * `gdp_per_capita`
     * `social_support`
     * `life_expectancy`
     * `freedom`
     * `clean_water_per_100_people`
     * `doctors_per_10000_people`

Our data contains more columns/features that we will skip for now and not include as features for our ML models. We do not want to overfit our model as we know that more features will lead to high complexity of our model

In [5]:
# Take the pandas dataset and split it into our features (X) and label (y)

#features (X)
X = main_dataset[["gdp_per_capita", "social_support", "life_expectancy", "freedom", 
                  "clean_water_per_100_people", "doctors_per_10000_people" ]]

#label (y)
y = main_dataset["happiness_score"]

# Use sklearn to split the features and labels into a training/test set. (75% train, 25% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25)
print("S shape : ",X.shape)
print("y shape : ",y.shape)

S shape :  (140, 6)
y shape :  (140,)


***

# Linear Regression

#### Make an object of the class LinearRegression followed by fit method which fits the regressor to the training data

In [6]:
# Use sklearn to train a model on the training set
LinearR_model = LinearRegression()
LinearR_model.fit(X_train, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
         normalize=False)

#### Call predict method on the actual dataset

In [7]:
# Create a sample datapoint and predict the output of that sample with the trained model
LinearR_y_pred = LinearR_model.predict(X_test)

#### Make a dataframe to compare the actual vs predicted

In [8]:
LinearR_compare_actual_predicted = pd.DataFrame({"Actual" : y_test, 'Predicted' : LinearR_y_pred})
LinearR_compare_actual_predicted.head()

Unnamed: 0,Actual,Predicted
82,5.863883,6.088346
86,5.019815,4.843354
67,6.273742,6.367381
110,4.112052,3.882882
139,3.882689,4.201202


#### Score of our Linear Regression Model

In [9]:
# Report on the score for that model, in your own words (markdown, not code) explain what the score means
LinearR_score = LinearR_model.score(X_test, y_test)
LinearR_score_percentage = LinearR_score*100
print("Score = ", LinearR_score)

Score =  0.8790313446351949


In [10]:
md("What does the score here mean?\n\nThe scoring method for this linear regression is R squared. This metric quantifies the amount of variance in the target variable that is predicted from the feature variables. Here `%0.1f` means that the model has `%0.1f percent`  accuracy predicting the `World Happiness` based upon the following 6 variables: \n * gdp_per_capita\n * social_support\n * life_expectancy\n * freedom\n * clean_water_per_100_people\n * doctors_per_10000_people "%(LinearR_score_percentage,LinearR_score_percentage))

What does the score here mean?

The scoring method for this linear regression is R squared. This metric quantifies the amount of variance in the target variable that is predicted from the feature variables. Here `87.9` means that the model has `87.9 percent`  accuracy predicting the `World Happiness` based upon the following 6 variables: 
 * gdp_per_capita
 * social_support
 * life_expectancy
 * freedom
 * clean_water_per_100_people
 * doctors_per_10000_people 

#### Coefficients and Intercept

In [11]:
print("Coefficients : ", LinearR_model.coef_)
print("Intercept = ", LinearR_model.intercept_)

Coefficients :  [0.32586453 1.93571859 0.0251739  1.97330806 0.00243444 0.00295645]
Intercept =  -2.4669897850485754


### Use Cross Validation

Use the cross_val_score function to repeat the experiment across many shuffles of the data

In [12]:
# Use the cross_val_score function to repeat the experiment across many shuffles of the data

In [13]:
#We will choose to use cv = 10 that means 10 folds of data

cv_results = cross_val_score(LinearR_model, X_train, y_train,cv=10)
cv_results_mean, cv_results_std = cv_results.mean(), cv_results.std()
print("Cross Validation results : ",cv_results)

Cross Validation results :  [0.79671275 0.88005688 0.76956663 0.54303794 0.70066289 0.67977733
 0.86890101 0.73026655 0.77091672 0.7182542 ]


In [14]:
print("%0.2f accuracy with a standard deviation of %0.2f" % (cv_results_mean, cv_results_std))

0.75 accuracy with a standard deviation of 0.09


#### Significance of using cross validation

When computing R squared on the test set, the R squared is dependent on the way you split up the data. The data points in the test set may have anomalies that which implies that the R-squared computed is not representative of the model's ability to generalize the unseen data. Cross validation here split the dataset into 10 folds/groups. This essentially took the first fold as a test set and fit the model on the remaining 9 folds. Then it predicted on the test set. This repeats for total 10 times. Eventually giving us an array of cross-validation scores. Therefore giving us a more accurate score value


***

# ElasticNet Regression

#### Make an object of the class ElasticNet Regression followed by fit method which fits the regressor to the training data

In [15]:
# Use sklearn to train a model on the training set
ER_model = ElasticNet(alpha=0, l1_ratio=0,random_state=42)
ER_model.fit(X_train, y_train)

ElasticNet(alpha=0, copy_X=True, fit_intercept=True, l1_ratio=0,
      max_iter=1000, normalize=False, positive=False, precompute=False,
      random_state=42, selection='cyclic', tol=0.0001, warm_start=False)

#### Call predict method on the actual dataset

In [16]:
# Create a sample datapoint and predict the output of that sample with the trained model
ER_y_pred = ER_model.predict(X_test)

#### Make a dataframe to compare the actual vs predicted

In [17]:
ER_compare_actual_predicted = pd.DataFrame({"Actual" : y_test, 'Predicted' : ER_y_pred})
ER_compare_actual_predicted.head()

Unnamed: 0,Actual,Predicted
82,5.863883,6.088346
86,5.019815,4.843354
67,6.273742,6.367381
110,4.112052,3.882882
139,3.882689,4.201202


#### Score of our ElasticNet Regression Model

In [18]:
# Report on the score for that model, in your own words (markdown, not code) explain what the score means
ER_score = ER_model.score(X_test, y_test)
ER_score_percentage = ER_score*100
print("Score = ", ER_score)

Score =  0.8790313446351953


In [19]:
md("What does the score here mean?\n\nThe scoring method for this linear regression is R squared. This metric quantifies the amount of variance in the target variable that is predicted from the feature variables. Here `%0.1f` means that the model has `%0.1f percent`  accuracy predicting the `World Happiness` based upon the following 6 variables: \n * gdp_per_capita\n * social_support\n * life_expectancy\n * freedom\n * clean_water_per_100_people\n * doctors_per_10000_people "%(ER_score_percentage,ER_score_percentage))

What does the score here mean?

The scoring method for this linear regression is R squared. This metric quantifies the amount of variance in the target variable that is predicted from the feature variables. Here `87.9` means that the model has `87.9 percent`  accuracy predicting the `World Happiness` based upon the following 6 variables: 
 * gdp_per_capita
 * social_support
 * life_expectancy
 * freedom
 * clean_water_per_100_people
 * doctors_per_10000_people 

#### Coefficients and Intercept

In [20]:
print("Coefficients : ", ER_model.coef_)
print("Intercept = ", ER_model.intercept_)

Coefficients :  [0.32586453 1.93571859 0.0251739  1.97330806 0.00243444 0.00295645]
Intercept =  -2.4669897850485327


# Ridge Regression
#### Make an object of the class RidgeRegression followed by fit method which fits the regressor to the training data

In [21]:
# Use sklearn to train a model on the training set
RR_model = Ridge(alpha=100)
RR_model.fit(X_train, y_train)

Ridge(alpha=100, copy_X=True, fit_intercept=True, max_iter=None,
   normalize=False, random_state=None, solver='auto', tol=0.001)

#### Call predict method on the actual dataset

In [22]:
# Create a sample datapoint and predict the output of that sample with the trained model
RR_y_pred = RR_model.predict(X_test)

#### Make a dataframe to compare the actual vs predicted

In [23]:
RR_compare_actual_predicted = pd.DataFrame({"Actual" : y_test, 'Predicted' : RR_y_pred})
RR_compare_actual_predicted.head()

Unnamed: 0,Actual,Predicted
82,5.863883,5.776551
86,5.019815,5.380029
67,6.273742,5.920714
110,4.112052,3.565069
139,3.882689,4.029957


#### Score of our Ridge Regression Model

In [24]:
# Report on the score for that model, in your own words (markdown, not code) explain what the score means
RR_score = RR_model.score(X_test, y_test)
RR_score_percentage = RR_score*100
print("Score = ", RR_score)

Score =  0.7655558844286632


In [25]:
md("What does the score here mean?\n\nThe scoring method for this linear regression is R squared. This metric quantifies the amount of variance in the target variable that is predicted from the feature variables. Here `%0.1f` means that the model has `%0.1f percent`  accuracy predicting the `World Happiness` based upon the following 6 variables: \n * gdp_per_capita\n * social_support\n * life_expectancy\n * freedom\n * clean_water_per_100_people\n * doctors_per_10000_people "%(RR_score_percentage,RR_score_percentage))

What does the score here mean?

The scoring method for this linear regression is R squared. This metric quantifies the amount of variance in the target variable that is predicted from the feature variables. Here `76.6` means that the model has `76.6 percent`  accuracy predicting the `World Happiness` based upon the following 6 variables: 
 * gdp_per_capita
 * social_support
 * life_expectancy
 * freedom
 * clean_water_per_100_people
 * doctors_per_10000_people 

#### Coefficients and Intercept

In [26]:
print("Coefficients : ", RR_model.coef_)
print("Intercept = ", RR_model.intercept_)

Coefficients :  [0.1314858  0.02507267 0.07981742 0.03638172 0.00169572 0.00843819]
Intercept =  -1.1574320168484507


# Lasso Regression

#### Make an object of the class LassoRegression followed by fit method which fits the regressor to the training data

In [27]:
# Use sklearn to train a model on the training set
LR_model = Lasso(alpha=1.0)
LR_model.fit(X_train, y_train)

Lasso(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=1000,
   normalize=False, positive=False, precompute=False, random_state=None,
   selection='cyclic', tol=0.0001, warm_start=False)

#### Call predict method on the actual dataset

In [28]:
# Create a sample datapoint and predict the output of that sample with the trained model
LR_y_pred = LR_model.predict(X_test)

#### Make a dataframe to compare the actual vs predicted

In [29]:
LR_compare_actual_predicted = pd.DataFrame({"Actual" : y_test, 'Predicted' : LR_y_pred})
LR_compare_actual_predicted.head()

Unnamed: 0,Actual,Predicted
82,5.863883,5.830253
86,5.019815,5.017397
67,6.273742,5.878304
110,4.112052,3.809503
139,3.882689,4.290565


#### Score of our Lasso Regression Model

In [30]:
# Report on the score for that model, in your own words (markdown, not code) explain what the score means
LR_score = LR_model.score(X_test, y_test)
LR_score_percentage = LR_score*100
print("Score = ", LR_score)

Score =  0.7260747903039244


In [31]:
md("What does the score here mean?\n\nThe scoring method for this linear regression is R squared. This metric quantifies the amount of variance in the target variable that is predicted from the feature variables. Here `%0.1f` means that the model has `%0.1f percent`  accuracy predicting the `World Happiness` based upon the following 6 variables: \n * gdp_per_capita\n * social_support\n * life_expectancy\n * freedom\n * clean_water_per_100_people\n * doctors_per_10000_people "%(LR_score_percentage,LR_score_percentage))

What does the score here mean?

The scoring method for this linear regression is R squared. This metric quantifies the amount of variance in the target variable that is predicted from the feature variables. Here `72.6` means that the model has `72.6 percent`  accuracy predicting the `World Happiness` based upon the following 6 variables: 
 * gdp_per_capita
 * social_support
 * life_expectancy
 * freedom
 * clean_water_per_100_people
 * doctors_per_10000_people 

#### Coefficients and Intercept

In [32]:
print("Coefficients : ", LR_model.coef_)
print("Intercept = ", LR_model.intercept_)

Coefficients :  [0.         0.         0.04977819 0.         0.01316393 0.01132003]
Intercept =  1.0504513623749165
