In [16]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, RandomizedSearchCV
from sklearn.tree import DecisionTreeRegressor
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.preprocessing import StandardScaler, MinMaxScaler, MaxAbsScaler, PowerTransformer
from sklearn.pipeline import Pipeline
from scipy.stats import iqr
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.ensemble import RandomForestRegressor

In [7]:
df = pd.read_csv('../Dataset/Life Expectancy Data.csv')
df.columns = df.columns.str.strip()

In [8]:
df.head()

Unnamed: 0,Country,Year,Status,Life expectancy,Adult Mortality,infant deaths,Alcohol,percentage expenditure,Hepatitis B,Measles,...,Polio,Total expenditure,Diphtheria,HIV/AIDS,GDP,Population,thinness 1-19 years,thinness 5-9 years,Income composition of resources,Schooling
0,Afghanistan,2015,Developing,65.0,263.0,62,0.01,71.279624,65.0,1154,...,6.0,8.16,65.0,0.1,584.25921,33736494.0,17.2,17.3,0.479,10.1
1,Afghanistan,2014,Developing,59.9,271.0,64,0.01,73.523582,62.0,492,...,58.0,8.18,62.0,0.1,612.696514,327582.0,17.5,17.5,0.476,10.0
2,Afghanistan,2013,Developing,59.9,268.0,66,0.01,73.219243,64.0,430,...,62.0,8.13,64.0,0.1,631.744976,31731688.0,17.7,17.7,0.47,9.9
3,Afghanistan,2012,Developing,59.5,272.0,69,0.01,78.184215,67.0,2787,...,67.0,8.52,67.0,0.1,669.959,3696958.0,17.9,18.0,0.463,9.8
4,Afghanistan,2011,Developing,59.2,275.0,71,0.01,7.097109,68.0,3013,...,68.0,7.87,68.0,0.1,63.537231,2978599.0,18.2,18.2,0.454,9.5


In [9]:
df.isna().sum()

Country                              0
Year                                 0
Status                               0
Life expectancy                     10
Adult Mortality                     10
infant deaths                        0
Alcohol                            194
percentage expenditure               0
Hepatitis B                        553
Measles                              0
BMI                                 34
under-five deaths                    0
Polio                               19
Total expenditure                  226
Diphtheria                          19
HIV/AIDS                             0
GDP                                448
Population                         652
thinness  1-19 years                34
thinness 5-9 years                  34
Income composition of resources    167
Schooling                          163
dtype: int64

In [10]:
data = df.select_dtypes("number")
simple_imputer = SimpleImputer()
simple_imputed_df = simple_imputer.set_output(transform="pandas").fit_transform(data)

In [11]:
data_x = simple_imputed_df.drop(columns="Life expectancy")
data_y = simple_imputed_df["Life expectancy"]

In [12]:
decision_model = DecisionTreeRegressor()
decision_model.fit(data_x, data_y)

In [13]:
cross_val_score(decision_model, data_x, data_y, cv=5, verbose=3, scoring="r2")

[CV] END ................................ score: (test=0.842) total time=   0.0s
[CV] END ................................ score: (test=0.802) total time=   0.0s
[CV] END ................................ score: (test=0.848) total time=   0.0s
[CV] END ................................ score: (test=0.813) total time=   0.0s
[CV] END ................................ score: (test=0.810) total time=   0.0s


array([0.84243478, 0.80157975, 0.84810937, 0.81269242, 0.81029204])

In [14]:
scalers = {
    "StandardScaler": StandardScaler,
    "MinMaxScaler":MinMaxScaler,
    "MaxAbsScaler":MaxAbsScaler,
    "PowerTransformer":PowerTransformer
}

results = []

for name, scaler in scalers.items():
    scaled_data = scaler().fit_transform(data_x)
    decision_model = DecisionTreeRegressor()
    score = cross_val_score(decision_model, scaled_data, data_y, cv=5, verbose=3, scoring="r2")
    print(f"Scaler: {name} \t score_mean: {score.mean()} \t score_std:{score.std()}", end="\n\n")
    results.append({"Scaler":name, "score_mean":score.mean(), "score_std":score.std()})

[CV] END ................................ score: (test=0.855) total time=   0.0s
[CV] END ................................ score: (test=0.792) total time=   0.0s
[CV] END ................................ score: (test=0.847) total time=   0.0s
[CV] END ................................ score: (test=0.837) total time=   0.0s
[CV] END ................................ score: (test=0.822) total time=   0.0s
Scaler: StandardScaler 	 score_mean: 0.830692335983106 	 score_std:0.02213762567944005

[CV] END ................................ score: (test=0.847) total time=   0.0s
[CV] END ................................ score: (test=0.797) total time=   0.0s
[CV] END ................................ score: (test=0.845) total time=   0.0s
[CV] END ................................ score: (test=0.841) total time=   0.0s
[CV] END ................................ score: (test=0.794) total time=   0.0s
Scaler: MinMaxScaler 	 score_mean: 0.8248263279636475 	 score_std:0.024011753616374452

[CV] END .....

In [15]:
pd.DataFrame(results)

Unnamed: 0,Scaler,score_mean,score_std
0,StandardScaler,0.830692,0.022138
1,MinMaxScaler,0.824826,0.024012
2,MaxAbsScaler,0.814921,0.017098
3,PowerTransformer,0.819372,0.026518


In [40]:
pipe = Pipeline([
    ("model", DecisionTreeRegressor())
])

grid = RandomizedSearchCV(pipe, {
    "model__max_depth": range(5,30,2),
    "model__criterion" : ["squared_error", "friedman_mse", "absolute_error", "poisson"],
    "model__splitter": ["best", "random"]
    # "model__max_leaf_nodes": range(1,10,2)
}, scoring="r2", cv=5, n_jobs=6, verbose=3)

In [41]:
grid.fit(data_x, data_y)

Fitting 5 folds for each of 10 candidates, totalling 50 fits


In [36]:
grid.best_score_

0.8433706273832922

In [37]:
grid.best_params_

{'model__splitter': 'best',
 'model__max_depth': 9,
 'model__criterion': 'absolute_error'}

In [24]:
random_forest_model = RandomForestRegressor()
random_forest_model.fit(data_x, data_y)

In [38]:
pipe = Pipeline([
    ("model", RandomForestRegressor())
])

grid = RandomizedSearchCV(pipe, {
    "model__max_depth": range(5,30,2),
    "model__criterion" : ["squared_error", "friedman_mse", "absolute_error", "poisson"],
    "model__splitter": ["best", "random"]
    # "model__max_leaf_nodes": range(1,10,2)
}, scoring="r2", cv=5, n_jobs=6, verbose=3)

In [42]:
grid.fit(data_x, data_y)

Fitting 5 folds for each of 10 candidates, totalling 50 fits


In [43]:
grid.best_score_

0.8433116390946127

In [44]:
grid.best_params_

{'model__splitter': 'best',
 'model__max_depth': 9,
 'model__criterion': 'poisson'}

In [18]:
cross_val_score(random_forest_model, data_x, data_y, cv=5, verbose=3, scoring="r2")

[CV] END ................................ score: (test=0.940) total time=   9.0s
[CV] END ................................ score: (test=0.882) total time=   6.4s
[CV] END ................................ score: (test=0.917) total time=   6.1s
[CV] END ................................ score: (test=0.910) total time=   6.1s
[CV] END ................................ score: (test=0.917) total time=   6.4s


array([0.93965439, 0.88243586, 0.9168268 , 0.91032768, 0.91674724])

In [21]:
scalers = {
    "StandardScaler": StandardScaler,
    "MinMaxScaler":MinMaxScaler,
    "MaxAbsScaler":MaxAbsScaler,
    "PowerTransformer":PowerTransformer
}

results = []

for name, scaler in scalers.items():
    scaled_data = scaler().fit_transform(data_x)
    random_forest_model = RandomForestRegressor()
    score = cross_val_score(random_forest_model, scaled_data, data_y, cv=5, verbose=3, scoring="r2")
    print(f"Scaler: {name} \t score_mean: {score.mean()} \t score_std:{score.std()}", end="\n\n")
    results.append({"Scaler":name, "score_mean":score.mean(), "score_std":score.std()})

[CV] END ................................ score: (test=0.941) total time=   8.8s
[CV] END ................................ score: (test=0.882) total time=   6.2s
[CV] END ................................ score: (test=0.917) total time=   5.9s
[CV] END ................................ score: (test=0.913) total time=   5.8s
[CV] END ................................ score: (test=0.916) total time=   6.1s
Scaler: StandardScaler 	 score_mean: 0.9135953712322256 	 score_std:0.018919832876012576

[CV] END ................................ score: (test=0.941) total time=   6.0s
[CV] END ................................ score: (test=0.880) total time=   6.1s
[CV] END ................................ score: (test=0.917) total time=   5.8s
[CV] END ................................ score: (test=0.912) total time=   5.9s
[CV] END ................................ score: (test=0.914) total time=   5.9s
Scaler: MinMaxScaler 	 score_mean: 0.9128388254759399 	 score_std:0.019327398268064108

[CV] END ...

In [22]:
pd.DataFrame(results)

Unnamed: 0,Scaler,score_mean,score_std
0,StandardScaler,0.913595,0.01892
1,MinMaxScaler,0.912839,0.019327
2,MaxAbsScaler,0.913007,0.018965
3,PowerTransformer,0.913624,0.019087
