In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, RandomizedSearchCV
from sklearn.neighbors import KNeighborsRegressor
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.preprocessing import StandardScaler, MinMaxScaler, MaxAbsScaler, PowerTransformer
from sklearn.pipeline import Pipeline

In [13]:
df = pd.read_csv('../Dataset/Life Expectancy Data.csv')
df.columns = df.columns.str.strip()

In [14]:
df.head()

Unnamed: 0,Country,Year,Status,Life expectancy,Adult Mortality,infant deaths,Alcohol,percentage expenditure,Hepatitis B,Measles,...,Polio,Total expenditure,Diphtheria,HIV/AIDS,GDP,Population,thinness 1-19 years,thinness 5-9 years,Income composition of resources,Schooling
0,Afghanistan,2015,Developing,65.0,263.0,62,0.01,71.279624,65.0,1154,...,6.0,8.16,65.0,0.1,584.25921,33736494.0,17.2,17.3,0.479,10.1
1,Afghanistan,2014,Developing,59.9,271.0,64,0.01,73.523582,62.0,492,...,58.0,8.18,62.0,0.1,612.696514,327582.0,17.5,17.5,0.476,10.0
2,Afghanistan,2013,Developing,59.9,268.0,66,0.01,73.219243,64.0,430,...,62.0,8.13,64.0,0.1,631.744976,31731688.0,17.7,17.7,0.47,9.9
3,Afghanistan,2012,Developing,59.5,272.0,69,0.01,78.184215,67.0,2787,...,67.0,8.52,67.0,0.1,669.959,3696958.0,17.9,18.0,0.463,9.8
4,Afghanistan,2011,Developing,59.2,275.0,71,0.01,7.097109,68.0,3013,...,68.0,7.87,68.0,0.1,63.537231,2978599.0,18.2,18.2,0.454,9.5


**Handle Null Values**

In [15]:
df.isna().sum()

Country                              0
Year                                 0
Status                               0
Life expectancy                     10
Adult Mortality                     10
infant deaths                        0
Alcohol                            194
percentage expenditure               0
Hepatitis B                        553
Measles                              0
BMI                                 34
under-five deaths                    0
Polio                               19
Total expenditure                  226
Diphtheria                          19
HIV/AIDS                             0
GDP                                448
Population                         652
thinness  1-19 years                34
thinness 5-9 years                  34
Income composition of resources    167
Schooling                          163
dtype: int64

**Simple Imputer**

In [16]:
data = df.select_dtypes("number")
simple_imputer = SimpleImputer()
simple_imputed_df = simple_imputer.set_output(transform="pandas").fit_transform(data)

In [17]:
simple_imputed_df.isna().sum()

Year                               0
Life expectancy                    0
Adult Mortality                    0
infant deaths                      0
Alcohol                            0
percentage expenditure             0
Hepatitis B                        0
Measles                            0
BMI                                0
under-five deaths                  0
Polio                              0
Total expenditure                  0
Diphtheria                         0
HIV/AIDS                           0
GDP                                0
Population                         0
thinness  1-19 years               0
thinness 5-9 years                 0
Income composition of resources    0
Schooling                          0
dtype: int64

**data spliting**

In [19]:
data_x = simple_imputed_df.drop(columns="Life expectancy")
data_y = simple_imputed_df["Life expectancy"]

In [20]:
model_knn = KNeighborsRegressor()
model_knn.fit(data_x, data_y)

In [22]:
cross_val_score(model_knn, data_x, data_y, cv=5, verbose=3, scoring="r2")

[CV] END ................................ score: (test=0.119) total time=   0.2s
[CV] END ............................... score: (test=-0.168) total time=   0.0s
[CV] END ............................... score: (test=-0.048) total time=   0.0s
[CV] END ................................ score: (test=0.010) total time=   0.0s
[CV] END ............................... score: (test=-0.018) total time=   0.0s


array([ 0.11933193, -0.16797199, -0.0482762 ,  0.01005066, -0.01779058])

**finding best scaller**

In [23]:
scalers = {
    "StandardScaler": StandardScaler,
    "MinMaxScaler":MinMaxScaler,
    "MaxAbsScaler":MaxAbsScaler,
    "PowerTransformer":PowerTransformer
}

results = []

for name, scaler in scalers.items():
    scaled_data = scaler().fit_transform(data_x)
    knn = KNeighborsRegressor()
    score = cross_val_score(knn, scaled_data, data_y, cv=5, verbose=3, scoring="r2")
    print(f"Scaler: {name} \t score_mean: {score.mean()} \t score_std:{score.std()}", end="\n\n")
    results.append({"Scaler":name, "score_mean":score.mean(), "score_std":score.std()})

[CV] END ................................ score: (test=0.781) total time=   0.0s
[CV] END ................................ score: (test=0.852) total time=   0.0s
[CV] END ................................ score: (test=0.753) total time=   0.0s
[CV] END ................................ score: (test=0.771) total time=   0.0s
[CV] END ................................ score: (test=0.836) total time=   0.0s
Scaler: StandardScaler 	 score_mean: 0.7982895343997172 	 score_std:0.038410923518381594

[CV] END ................................ score: (test=0.810) total time=   0.0s
[CV] END ................................ score: (test=0.839) total time=   0.0s
[CV] END ................................ score: (test=0.788) total time=   0.0s
[CV] END ................................ score: (test=0.787) total time=   0.0s
[CV] END ................................ score: (test=0.810) total time=   0.0s
Scaler: MinMaxScaler 	 score_mean: 0.8066885169315796 	 score_std:0.018868425115837105

[CV] END ...

In [24]:
pd.DataFrame(results)

Unnamed: 0,Scaler,score_mean,score_std
0,StandardScaler,0.79829,0.038411
1,MinMaxScaler,0.806689,0.018868
2,MaxAbsScaler,0.819788,0.022115
3,PowerTransformer,0.823613,0.028324


**HyperParameter Tuning**

In [25]:
pipe = Pipeline([
    ("Scaler", PowerTransformer()),
    ("model", KNeighborsRegressor())
])

grid = RandomizedSearchCV(pipe, {
    "model__n_neighbors": range(5,30,2),
    "model__weights": ["uniform","distance"],
    "model__leaf_size": range(5,50,5),
    "model__p": [1,2,3,4,5]
}, scoring="r2", cv=5, n_jobs=6, verbose=3, n_iter=int(1000/5))

In [26]:
grid.fit(data_x, data_y)

Fitting 5 folds for each of 200 candidates, totalling 1000 fits


**best_estimator, best_score, best_params**

In [27]:
grid.best_estimator_

In [28]:
grid.best_score_

0.8638604312260394

In [29]:
grid.best_params_

{'model__weights': 'distance',
 'model__p': 1,
 'model__n_neighbors': 23,
 'model__leaf_size': 45}

In [55]:
data_xx = data_x.drop(columns=["GDP","Population","Year","Alcohol", "Total expenditure","thinness 5-9 years",
                              "under-five deaths"])

pipe = Pipeline([
    ("Scaler", PowerTransformer()),
    ("model", KNeighborsRegressor(leaf_size=45, n_neighbors=23, p=1, weights='distance'))
])

score = cross_val_score(pipe, data_xx, data_y, cv=5, scoring='r2')
print("score_mean: ", score.mean())
print("score_std: ", score.std())

score_mean:  0.8711533626308128
score_std:  0.019916372583514464
