# MACHINE LEARNING SECTION
## Data Cleanup: 
### - Save X and y variables
### - Train, Test, Split
### - X-scaler

In [1]:
# Dependencies
import pandas as pd
import numpy as np

In [2]:
# Retrieve csv file and convert to Pandas dataframe
happy_postg = pd.read_csv("Resources/happy_2020_2021.csv")
happy_postg

Unnamed: 0.1,Unnamed: 0,year,country,logged_GDP_per_capita,support,life_exp,freedom,generosity,corruption,score
0,0,2020,Finland,10.639267,0.954330,71.900825,0.949172,-0.059482,0.195445,7.8087
1,1,2020,Denmark,10.774001,0.955991,72.402504,0.951444,0.066202,0.168489,7.6456
2,2,2020,Switzerland,10.979933,0.942847,74.102448,0.921337,0.105911,0.303728,7.5599
3,3,2020,Iceland,10.772559,0.974670,73.000000,0.948892,0.246944,0.711710,7.5045
4,4,2020,Norway,11.087804,0.952487,73.200783,0.955750,0.134533,0.263218,7.4880
...,...,...,...,...,...,...,...,...,...,...
297,297,2021,Lesotho,7.926000,0.787000,48.700000,0.715000,-0.131000,0.915000,3.5120
298,298,2021,Botswana,9.782000,0.784000,59.269000,0.824000,-0.246000,0.801000,3.4670
299,299,2021,Rwanda,7.676000,0.552000,61.400000,0.897000,0.061000,0.167000,3.4150
300,300,2021,Zimbabwe,7.943000,0.750000,56.201000,0.677000,-0.047000,0.821000,3.1450


## Save X and y variables

In [3]:
# Assign X (data) and y (target)
X = happy_postg.drop(["Unnamed: 0","score","country"], axis=1)
print(X)
print('------------------------')

# Categorise y to have 10 categories, 
# 6 difference score categories (3: 2-3, 4: 3-4, 5: 4-5, 6: 5-6, 7: 6-7, 8: 7-8)

happy_postg["score"] = happy_postg["score"].apply(np.ceil)
happy_postg

# 
y = happy_postg["score"]
y=y.astype('int')
print(y)

# Unique y categories (whole values)
print(y.unique())

     year  logged_GDP_per_capita   support   life_exp   freedom  generosity  \
0    2020              10.639267  0.954330  71.900825  0.949172   -0.059482   
1    2020              10.774001  0.955991  72.402504  0.951444    0.066202   
2    2020              10.979933  0.942847  74.102448  0.921337    0.105911   
3    2020              10.772559  0.974670  73.000000  0.948892    0.246944   
4    2020              11.087804  0.952487  73.200783  0.955750    0.134533   
..    ...                    ...       ...        ...       ...         ...   
297  2021               7.926000  0.787000  48.700000  0.715000   -0.131000   
298  2021               9.782000  0.784000  59.269000  0.824000   -0.246000   
299  2021               7.676000  0.552000  61.400000  0.897000    0.061000   
300  2021               7.943000  0.750000  56.201000  0.677000   -0.047000   
301  2021               7.695000  0.463000  52.493000  0.382000   -0.102000   

     corruption  
0      0.195445  
1      0.168489

In [4]:
# Check column names which remain
X_headings = X.columns
X_headings

Index(['year', 'logged_GDP_per_capita', 'support', 'life_exp', 'freedom',
       'generosity', 'corruption'],
      dtype='object')

In [5]:
# Check X and y dataframes' shapes
print(X.shape)
print(y.shape)

(302, 7)
(302,)


## Train, Test, Split

In [6]:
# Dependencies (to split data)
from sklearn.model_selection import train_test_split

# Split the data to attain variables X_train, X_test, y_train, y_test
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=50)

In [7]:
y_train

3      8
127    5
249    6
1      8
245    6
      ..
70     6
132    5
289    5
109    5
176    7
Name: score, Length: 226, dtype: int64

In [8]:
y_test

112    5
63     6
269    5
39     7
276    5
      ..
214    6
187    7
296    4
46     7
114    5
Name: score, Length: 76, dtype: int64

 ## X-Scaler
 ## Use MinMaxScaler to scale the numerical data

In [23]:
# Perform MinMaxScaler to X_train and X_test
# Categorical columns do not need to be scaled. i.e. y_train and y_test
from sklearn.preprocessing import MinMaxScaler

# Attain X_scaler using x_train data
X_scaler = MinMaxScaler().fit(X_train)

# Scale X variables (X_train and X_test) using X_scaler
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

## Save variables for use in Machine Learning models 
### (model1.ipynb, model2.ipynb, model3.ipynb)

In [11]:
%store happy_postg

%store X_train
%store X_test
%store y_train
%store y_test
%store X_train_scaled
%store X_test_scaled
%store X_headings
%store X_scaler

Stored 'happy_postg' (DataFrame)
Stored 'X_train_scaled' (ndarray)
Stored 'X_test_scaled' (ndarray)
Stored 'y_train' (Series)
Stored 'y_test' (Series)
Stored 'X_headings' (Index)
Stored 'X_scaler' (MinMaxScaler)


## PART 3: TRANSFORMATION OF DATA to produce dataframe containing: original features, score and predicted score
    - Clean data to produce dataframe 'y_test': 'index' (original index), 'y_actual', 'y_predicted'
    - Clean data to produce dataframe 'y_train': 'index' (original index), 'y_actual', 'y_predicted'
    - Merge dataframe 'happy_postg' and 'y_test_all' as new dataframe 'happy_predicted'
        - There should be blanks because y_predicted values only exist for 20% test data
        - * happy_postg = happy_all : This will ensure happy_all variable is left as is from Part1.

### Save dataframe 'y_test_all' which contains original_index, y_test, y_test_predicted

In [16]:
# Print y_test dataframe
y_test_m1 = y_test
print(y_test_m1)
print('-----------------------------------')
# PRint y_predicted dataframe
y_test_predicted_m1 = pd.DataFrame(predictions)
print(y_test_predicted_m1)

112    5
63     6
269    5
39     7
276    5
      ..
214    6
187    7
296    4
46     7
114    5
Name: score, Length: 76, dtype: int64
-----------------------------------
    0
0   6
1   6
2   6
3   6
4   6
.. ..
71  6
72  6
73  6
74  6
75  6

[76 rows x 1 columns]


In [17]:
# Reset y_test index 
# (to prepare to merge y_test (actual scores) and y_predicted (predicted scores) dataframes)
y_test_m1 = y_test.reset_index()
y_test_m1

Unnamed: 0,index,score
0,112,5
1,63,6
2,269,5
3,39,7
4,276,5
...,...,...
71,214,6
72,187,7
73,296,4
74,46,7


In [18]:
# Merge y_test and y_test_predicted
y_test_all_m1 = pd.merge(y_test_m1, y_test_predicted_m1, left_index=True, right_index=True, how="inner")
y_test_all_m1


Unnamed: 0,index,score,0
0,112,5,6
1,63,6,6
2,269,5,6
3,39,7,6
4,276,5,6
...,...,...,...
71,214,6,6
72,187,7,6
73,296,4,6
74,46,7,6


In [19]:
# This dataframe is from the test entries
# Rename column
y_test_all_m1 = y_test_all.rename({'score':'y_actual',0:'y_predicted'}, axis='columns')
y_test_all_m1

NameError: name 'y_test_all' is not defined

- Merge dataframe 'happy_postg' with 'y_test_all' on index and save new dataframe as 'happy_postg_final'

In [26]:
# Merge 'happy_postg' on index with 'y_test_all' on column name 'index'. (join=outer)
happy_postg_final_m1 = pd.merge(happy_postg,y_test_all_m1, how="outer", left_index=True, right_on="index")
happy_postg_final_m1 = happy_postg_final_m1.dropna()
happy_postg_final_m1= happy_postg_final_m1.sort_index()
happy_postg_final_m1

Unnamed: 0,year,country,logged_GDP_per_capita,support,life_exp,freedom,generosity,corruption,score,index,y_actual,y_predicted
0.0,2020,Gambia,7.321815,0.693169,55.012016,0.733163,0.343199,0.690718,5.0,112,5.0,6.0
1.0,2020,Serbia,9.680981,0.881476,68.210205,0.726496,-0.073676,0.843509,6.0,63,6.0,6.0
2.0,2021,Mali,7.744000,0.724000,51.969000,0.697000,-0.036000,0.827000,5.0,269,5.0,6.0
3.0,2020,Bahrain,10.676380,0.876342,68.500000,0.905856,0.133729,0.739347,7.0,39,7.0,6.0
4.0,2021,Namibia,9.161000,0.818000,56.799000,0.719000,-0.149000,0.847000,5.0,276,5.0,6.0
...,...,...,...,...,...,...,...,...,...,...,...,...
71.0,2021,South Korea,10.651000,0.799000,73.900000,0.672000,-0.083000,0.727000,6.0,214,6.0,6.0
72.0,2021,Brazil,9.577000,0.882000,66.601000,0.804000,-0.071000,0.756000,7.0,187,7.0,6.0
73.0,2021,Malawi,6.958000,0.537000,57.948000,0.780000,0.038000,0.729000,4.0,296,4.0,6.0
74.0,2020,Romania,10.107584,0.825162,67.207237,0.842823,-0.197815,0.934300,7.0,46,7.0,6.0


In [27]:
# Rename column 'index' to 'id'
happy_postg_final_m1 = happy_postg_final_m1.rename({'index':'id'}, axis='columns')

# Reorder column names
happy_postg_final_m1 = happy_postg_final_m1[['id','year','country','logged_GDP_per_capita','support','life_exp','freedom','generosity','corruption', 'y_actual','y_predicted']]
happy_postg_final_m1

Unnamed: 0,id,year,country,logged_GDP_per_capita,support,life_exp,freedom,generosity,corruption,y_actual,y_predicted
0.0,112,2020,Gambia,7.321815,0.693169,55.012016,0.733163,0.343199,0.690718,5.0,6.0
1.0,63,2020,Serbia,9.680981,0.881476,68.210205,0.726496,-0.073676,0.843509,6.0,6.0
2.0,269,2021,Mali,7.744000,0.724000,51.969000,0.697000,-0.036000,0.827000,5.0,6.0
3.0,39,2020,Bahrain,10.676380,0.876342,68.500000,0.905856,0.133729,0.739347,7.0,6.0
4.0,276,2021,Namibia,9.161000,0.818000,56.799000,0.719000,-0.149000,0.847000,5.0,6.0
...,...,...,...,...,...,...,...,...,...,...,...
71.0,214,2021,South Korea,10.651000,0.799000,73.900000,0.672000,-0.083000,0.727000,6.0,6.0
72.0,187,2021,Brazil,9.577000,0.882000,66.601000,0.804000,-0.071000,0.756000,7.0,6.0
73.0,296,2021,Malawi,6.958000,0.537000,57.948000,0.780000,0.038000,0.729000,4.0,6.0
74.0,46,2020,Romania,10.107584,0.825162,67.207237,0.842823,-0.197815,0.934300,7.0,6.0


In [28]:
happy_postg_final_m1[happy_postg_final_m1['country']=="Romania"]

Unnamed: 0,id,year,country,logged_GDP_per_capita,support,life_exp,freedom,generosity,corruption,y_actual,y_predicted
74.0,46,2020,Romania,10.107584,0.825162,67.207237,0.842823,-0.197815,0.9343,7.0,6.0
