# IMPORTS

In [7]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import math
import os


from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
# from sklearn.metrics import classification_report
# from sklearn.metrics import confusion_matrix
# from sklearn.datasets import load_iris
# from sklearn import svm
# from sklearn.neural_network import MLPClassifier
# from sklearn.neighbors import KNeighborsClassifier

import warnings
warnings.filterwarnings('ignore')

* * *

# LOAD CLEANED DATASETS
Let's read our cleaned files from `Datasets/Cleaned_Datasets` directory

To learn more about how we cleaned the data, you may want to visit `data_cleaning.py` and `exploratory_data_analysis.ipynb`

In [2]:
# Use Pandas to read CSV files and store each dataframe into a variable
world_happiness_df = pd.read_csv("./Datasets/Cleaned_Datasets/cleaned_world_happiness.csv", header=0)
covid_df = pd.read_csv("./Datasets/Cleaned_Datasets/cleaned_covid.csv", header=0)
clean_drinking_water_df = pd.read_csv("./Datasets/Cleaned_Datasets/cleaned_drinking_water_services.csv", header=0)
crude_suicide_rates_df = pd.read_csv("./Datasets/Cleaned_Datasets/cleaned_crude_suicide_rates.csv", header=0)
medical_doctors_df = pd.read_csv("./Datasets/Cleaned_Datasets/cleaned_medical_doctors.csv", header=0)

#takea  look at world happiness data
world_happiness_df.head(5)

Unnamed: 0,country,happiness_score,gdp_per_capita,social_support,life_expectancy,freedom,corruption
0,Afghanistan,3.594628,7.650843,0.508245,52.266667,0.518012,0.843283
1,Albania,5.019427,9.384397,0.716316,67.546154,0.66283,0.86936
2,Algeria,5.389717,9.328897,0.803582,65.29,0.519009,0.690871
3,Angola,4.420299,8.989725,0.737973,53.55,0.455957,0.867018
4,Argentina,6.310166,10.033868,0.904423,67.9,0.768254,0.841997


* * *

# Prepare a new dataset

Make a new dataset that will combine `clean_drinking_water`, `crude_suicide_rates`, and `medical_doctors` as features into the `world_happiness_data`

In [3]:
# merge 3 datasets with world_happiness_df, the merging performed here occurs similar to SQL inner join
main_dataset = pd.merge(world_happiness_df, clean_drinking_water_df, how='inner', on="country")
main_dataset = pd.merge(main_dataset, crude_suicide_rates_df, how='inner', on="country")
main_dataset = pd.merge(main_dataset, medical_doctors_df, how='inner', on="country")
main_dataset.head(15)

Unnamed: 0,country,happiness_score,gdp_per_capita,social_support,life_expectancy,freedom,corruption,clean_water_per_100_people,suicide_rate_per_100000_people,doctors_per_10000_people
0,Afghanistan,3.594628,7.650843,0.508245,52.266667,0.518012,0.843283,37.755,4.3,2.3225
1,Albania,5.019427,9.384397,0.716316,67.546154,0.66283,0.86936,84.061667,5.193333,13.092727
2,Algeria,5.389717,9.328897,0.803582,65.29,0.519009,0.690871,86.305,2.88,14.45
3,Angola,4.420299,8.989725,0.737973,53.55,0.455957,0.867018,24.313889,5.2,1.1675
4,Argentina,6.310166,10.033868,0.904423,67.9,0.768254,0.841997,87.365556,7.28,34.327143
5,Armenia,4.513624,9.270409,0.71862,65.742857,0.563791,0.846484,94.861667,4.24,20.320741
6,Australia,7.282024,10.755507,0.947253,72.692857,0.921648,0.415422,99.566111,10.36,31.455333
7,Austria,7.242227,10.886958,0.9296,72.103077,0.906196,0.570189,100.0,13.993333,41.785357
8,Azerbaijan,4.940989,9.519592,0.770649,63.942857,0.662107,0.69833,69.972778,2.246667,36.266
9,Bahrain,6.001723,10.730848,0.880093,67.594546,0.861467,0.553173,99.969444,4.633333,9.936875


In [4]:
# Summary of the new dataset
display(main_dataset.describe())
display(main_dataset.info())
display(main_dataset.isnull().sum(axis = 0))

Unnamed: 0,happiness_score,gdp_per_capita,social_support,life_expectancy,freedom,corruption,clean_water_per_100_people,suicide_rate_per_100000_people,doctors_per_10000_people
count,140.0,140.0,140.0,140.0,140.0,140.0,140.0,140.0,140.0
mean,5.424848,9.301535,0.805777,62.80544,0.741846,0.733355,77.711177,8.415738,16.01667
std,1.103782,1.212501,0.118027,7.636191,0.125386,0.183885,23.426713,5.965113,13.259219
min,3.514954,6.72251,0.402559,43.356001,0.451014,0.097752,21.435,1.34,0.226
25%,4.46538,8.297707,0.745606,56.967857,0.662081,0.690242,59.039722,4.233333,2.963958
50%,5.309331,9.396615,0.826084,65.065667,0.745535,0.794771,85.709444,6.813333,14.540263
75%,6.263767,10.289785,0.902506,67.945536,0.83222,0.845422,98.410417,10.375,27.5188
max,7.680305,11.607032,0.977578,75.358461,0.954373,0.953186,100.0,34.5,43.590345


<class 'pandas.core.frame.DataFrame'>
Int64Index: 140 entries, 0 to 139
Data columns (total 10 columns):
country                           140 non-null object
happiness_score                   140 non-null float64
gdp_per_capita                    140 non-null float64
social_support                    140 non-null float64
life_expectancy                   140 non-null float64
freedom                           140 non-null float64
corruption                        140 non-null float64
clean_water_per_100_people        140 non-null float64
suicide_rate_per_100000_people    140 non-null float64
doctors_per_10000_people          140 non-null float64
dtypes: float64(9), object(1)
memory usage: 12.0+ KB


None

country                           0
happiness_score                   0
gdp_per_capita                    0
social_support                    0
life_expectancy                   0
freedom                           0
corruption                        0
clean_water_per_100_people        0
suicide_rate_per_100000_people    0
doctors_per_10000_people          0
dtype: int64

* * *

# IMPLEMENT MACHINE LEARNING MODELS

## Splitting the dataset

First we need to split the datasets into features and labels

* The label (y) is  `happiness_score`
* The features (X) include 6 different variables/columns:
     * `gdp_per_capita`
     * `social_support`
     * `life_expectancy`
     * `freedom`
     * `clean_water_per_100_people`
     * `doctors_per_10000_people`

Our data more columns/features that we will skip for now and not include as features for our ML models. We do not want to overfit our model as we know that more features will lead to high complexity of our model

In [9]:
# Take the pandas dataset and split it into our features (X) and label (y)

#features (X)
X = main_dataset[["gdp_per_capita", "social_support", "life_expectancy", "freedom", 
                  "clean_water_per_100_people", "doctors_per_10000_people" ]]

#label (y)
y = main_dataset["happiness_score"]

# Use sklearn to split the features and labels into a training/test set. (75% train, 25% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25)

# Perform a Linear Regression

In [10]:
# Use sklearn to train a model on the training set
LR_model = LinearRegression()
LR_model.fit(X_train, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
         normalize=False)

In [12]:
# Create a sample datapoint and predict the output of that sample with the trained model
y_pred = LR_model.predict(X_test)

In [13]:
compare_actual_predicted = pd.DataFrame({"Actual" : y_test, 'Predicted' : y_pred})
compare_actual_predicted.head()

Unnamed: 0,Actual,Predicted
41,7.597216,6.894449
34,4.822565,4.512292
1,5.019427,5.321928
16,5.190686,5.055295
119,6.269287,5.875563


In [14]:
# Report on the score for that model, in your own words (markdown, not code) explain what the score means
score = LR_model.score(X_test, y_test)
print("Score = ", score)


Score =  0.8172107652031606
