In [20]:
import numpy as np
import pandas as pd
from sklearn.svm import SVR
from sklearn.preprocessing import StandardScaler, PowerTransformer, MaxAbsScaler, MinMaxScaler
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectKBest, f_regression, SequentialFeatureSelector
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, RandomizedSearchCV
from scipy.stats import iqr
from scipy.stats import randint

In [2]:
df = pd.read_csv("../Dataset/Life Expectancy Data.csv")
df.columns = df.columns.str.strip()
df.head()

Unnamed: 0,Country,Year,Status,Life expectancy,Adult Mortality,infant deaths,Alcohol,percentage expenditure,Hepatitis B,Measles,...,Polio,Total expenditure,Diphtheria,HIV/AIDS,GDP,Population,thinness 1-19 years,thinness 5-9 years,Income composition of resources,Schooling
0,Afghanistan,2015,Developing,65.0,263.0,62,0.01,71.279624,65.0,1154,...,6.0,8.16,65.0,0.1,584.25921,33736494.0,17.2,17.3,0.479,10.1
1,Afghanistan,2014,Developing,59.9,271.0,64,0.01,73.523582,62.0,492,...,58.0,8.18,62.0,0.1,612.696514,327582.0,17.5,17.5,0.476,10.0
2,Afghanistan,2013,Developing,59.9,268.0,66,0.01,73.219243,64.0,430,...,62.0,8.13,64.0,0.1,631.744976,31731688.0,17.7,17.7,0.47,9.9
3,Afghanistan,2012,Developing,59.5,272.0,69,0.01,78.184215,67.0,2787,...,67.0,8.52,67.0,0.1,669.959,3696958.0,17.9,18.0,0.463,9.8
4,Afghanistan,2011,Developing,59.2,275.0,71,0.01,7.097109,68.0,3013,...,68.0,7.87,68.0,0.1,63.537231,2978599.0,18.2,18.2,0.454,9.5


In [3]:
df.isna().sum()

Country                              0
Year                                 0
Status                               0
Life expectancy                     10
Adult Mortality                     10
infant deaths                        0
Alcohol                            194
percentage expenditure               0
Hepatitis B                        553
Measles                              0
BMI                                 34
under-five deaths                    0
Polio                               19
Total expenditure                  226
Diphtheria                          19
HIV/AIDS                             0
GDP                                448
Population                         652
thinness  1-19 years                34
thinness 5-9 years                  34
Income composition of resources    167
Schooling                          163
dtype: int64

In [4]:
data = df.select_dtypes("number")
simple_imputer = SimpleImputer()
simple_imputed_df = simple_imputer.set_output(transform="pandas").fit_transform(data)

In [5]:
data_x = simple_imputed_df.drop(columns="Life expectancy")
data_y = simple_imputed_df["Life expectancy"]

In [6]:
model_svr = SVR()
model_svr.fit(data_x, data_y)

In [7]:
cross_val_score(model_svr, data_x, data_y, cv=5, verbose=3, scoring="r2")

[CV] END ............................... score: (test=-0.129) total time=   0.5s
[CV] END ............................... score: (test=-0.099) total time=   0.5s
[CV] END ............................... score: (test=-0.071) total time=   0.5s
[CV] END ............................... score: (test=-0.032) total time=   0.5s
[CV] END ............................... score: (test=-0.195) total time=   0.5s


array([-0.12900907, -0.09858467, -0.0713814 , -0.0319007 , -0.19532563])

In [8]:
scalers = {
    "StandardScaler": StandardScaler,
    "MinMaxScaler":MinMaxScaler,
    "MaxAbsScaler":MaxAbsScaler,
    "PowerTransformer":PowerTransformer
}

results = []

for name, scaler in scalers.items():
    scaled_data = scaler().fit_transform(data_x)
    model_svr = SVR()
    score = cross_val_score(model_svr, scaled_data, data_y, cv=5, verbose=3, scoring="r2")
    print(f"Scaler: {name} \t score_mean: {score.mean()} \t score_std:{score.std()}", end="\n\n")
    results.append({"Scaler":name, "score_mean":score.mean(), "score_std":score.std()})

[CV] END ................................ score: (test=0.854) total time=   0.5s
[CV] END ................................ score: (test=0.850) total time=   0.5s
[CV] END ................................ score: (test=0.807) total time=   0.4s
[CV] END ................................ score: (test=0.761) total time=   0.3s
[CV] END ................................ score: (test=0.785) total time=   0.3s
Scaler: StandardScaler 	 score_mean: 0.8113922754236048 	 score_std:0.036360977598185566

[CV] END ................................ score: (test=0.844) total time=   0.3s
[CV] END ................................ score: (test=0.831) total time=   0.3s
[CV] END ................................ score: (test=0.830) total time=   0.3s
[CV] END ................................ score: (test=0.799) total time=   0.3s
[CV] END ................................ score: (test=0.822) total time=   0.3s
Scaler: MinMaxScaler 	 score_mean: 0.825285340060011 	 score_std:0.014663481203624736

[CV] END ....

In [9]:
pd.DataFrame(results)

Unnamed: 0,Scaler,score_mean,score_std
0,StandardScaler,0.811392,0.036361
1,MinMaxScaler,0.825285,0.014663
2,MaxAbsScaler,0.821372,0.014183
3,PowerTransformer,0.850926,0.020616


In [10]:
def cap_data(series):
    iqr_ = iqr(series) 
    q1 = series.quantile(0.25)
    q3 = series.quantile(0.75)

    lb = q1 - 0.8*iqr_
    ub = q1 + 0.8*iqr_
    return np.where(series< lb, lb, np.where(series > ub, ub, series))

def cap_df_data(df):
    return df.select_dtypes("number").apply(lambda x:cap_data(x))

def capping_country(df):
    countries_group = df.groupby("Country")
    return countries_group.apply(lambda cdf : cap_df_data(cdf))

In [47]:
new_df = capping_country(df)

In [48]:
new_df.reset_index().head()

Unnamed: 0,Country,level_1,Year,Life expectancy,Adult Mortality,infant deaths,Alcohol,percentage expenditure,Hepatitis B,Measles,...,Polio,Total expenditure,Diphtheria,HIV/AIDS,GDP,Population,thinness 1-19 years,thinness 5-9 years,Income composition of resources,Schooling
0,Afghanistan,0,2009.75,58.805,263.0,62.0,0.01,59.576883,65.0,1154.0,...,13.75,8.16,59.35,0.1,482.900948,18382856.35,17.2,17.3,0.4408,9.005
1,Afghanistan,1,2009.75,58.805,271.0,64.0,0.01,59.576883,62.0,492.0,...,57.75,8.18,59.35,0.1,482.900948,327582.0,17.5,17.5,0.4408,9.005
2,Afghanistan,2,2009.75,58.805,268.0,66.0,0.01,59.576883,64.0,430.0,...,57.75,8.13,59.35,0.1,482.900948,18382856.35,17.7,17.7,0.4408,9.005
3,Afghanistan,3,2009.75,58.805,272.0,69.0,0.01,59.576883,65.4,2455.45,...,57.75,8.52,59.35,0.1,482.900948,3696958.0,17.9,18.0,0.4408,9.005
4,Afghanistan,4,2009.75,58.805,275.0,71.0,0.01,7.097109,65.4,2455.45,...,57.75,7.87,59.35,0.1,63.537231,2978599.0,18.2,18.2,0.4408,9.005


In [49]:
df2 = new_df.dropna(axis=0)
df2.isna().sum()

Year                               0
Life expectancy                    0
Adult Mortality                    0
infant deaths                      0
Alcohol                            0
percentage expenditure             0
Hepatitis B                        0
Measles                            0
BMI                                0
under-five deaths                  0
Polio                              0
Total expenditure                  0
Diphtheria                         0
HIV/AIDS                           0
GDP                                0
Population                         0
thinness  1-19 years               0
thinness 5-9 years                 0
Income composition of resources    0
Schooling                          0
dtype: int64

In [50]:
df2.reset_index().head()

Unnamed: 0,Country,level_1,Year,Life expectancy,Adult Mortality,infant deaths,Alcohol,percentage expenditure,Hepatitis B,Measles,...,Polio,Total expenditure,Diphtheria,HIV/AIDS,GDP,Population,thinness 1-19 years,thinness 5-9 years,Income composition of resources,Schooling
0,Afghanistan,0,2009.75,58.805,263.0,62.0,0.01,59.576883,65.0,1154.0,...,13.75,8.16,59.35,0.1,482.900948,18382856.35,17.2,17.3,0.4408,9.005
1,Afghanistan,1,2009.75,58.805,271.0,64.0,0.01,59.576883,62.0,492.0,...,57.75,8.18,59.35,0.1,482.900948,327582.0,17.5,17.5,0.4408,9.005
2,Afghanistan,2,2009.75,58.805,268.0,66.0,0.01,59.576883,64.0,430.0,...,57.75,8.13,59.35,0.1,482.900948,18382856.35,17.7,17.7,0.4408,9.005
3,Afghanistan,3,2009.75,58.805,272.0,69.0,0.01,59.576883,65.4,2455.45,...,57.75,8.52,59.35,0.1,482.900948,3696958.0,17.9,18.0,0.4408,9.005
4,Afghanistan,4,2009.75,58.805,275.0,71.0,0.01,7.097109,65.4,2455.45,...,57.75,7.87,59.35,0.1,63.537231,2978599.0,18.2,18.2,0.4408,9.005


In [57]:
data_x = df2.drop(columns=["Life expectancy"])
data_y = df2["Life expectancy"]

pipe = Pipeline([
    ("Scaler", MaxAbsScaler()),
    ("k_best", SelectKBest(f_regression)),
    ("model", SVR()),
])

grid = RandomizedSearchCV(pipe, {
    "k_best__k": [4, 5, 6],
    "model__gamma": ["scale", "auto", 0.001, 0.01, 0.1, 1],
    "model__C": randint(40, 150),
    "model__epsilon": [0.1, 0.2, 0.5, 1.0],
    "model__kernel": ["linear", "poly", "rbf", "sigmoid"],
    "model__degree": [1, 2, 3],
    "model__max_iter": randint(60000, 100000),
}, scoring="r2", cv=2, n_jobs=6, verbose=3, n_iter=int(500))

In [58]:
grid.fit(data_x, data_y)

Fitting 2 folds for each of 500 candidates, totalling 1000 fits


In [59]:
grid.best_score_

0.8967797943028915

In [35]:
grid.best_params_

{'k_best__k': 5,
 'model__C': 79,
 'model__degree': 2,
 'model__epsilon': 1.0,
 'model__gamma': 'scale',
 'model__kernel': 'rbf',
 'model__max_iter': 93268}

In [79]:
new_dfx = new_df[~new_df["Life expectancy"].isna()]

data_x = new_dfx.drop(columns=["Life expectancy"])
data_y = new_dfx["Life expectancy"]

pipe = Pipeline([
    ("imputer", KNNImputer()),
    ("Scaler", MaxAbsScaler()),
    ("k_best", SelectKBest(f_regression)),
    ("model", SVR()),
])

grid = RandomizedSearchCV(pipe, {
    "imputer__n_neighbors": range(5,10,2),
    "k_best__k": [6, 7, 8],
    "model__gamma": ["scale", "auto", 0.001, 0.01, 0.1, 1],
    "model__C": randint(70,150),
    "model__epsilon": [0.1, 0.2, 0.5, 1.0],
    "model__kernel": ["linear", "poly", "rbf", "sigmoid"],
    "model__degree": [1],
    "model__max_iter": randint(60000, 100000),
}, scoring="r2", cv=2, n_jobs=6, verbose=3, n_iter=int(500/2))

In [80]:
grid.fit(data_x, data_y)

Fitting 2 folds for each of 250 candidates, totalling 500 fits


In [81]:
grid.best_score_

0.9159637727759349

In [82]:
grid.best_params_

{'imputer__n_neighbors': 5,
 'k_best__k': 7,
 'model__C': 97,
 'model__degree': 1,
 'model__epsilon': 0.1,
 'model__gamma': 1,
 'model__kernel': 'rbf',
 'model__max_iter': 73954}

In [112]:
grid = GridSearchCV(pipe, {
    "imputer__n_neighbors": range(5,10,2),
    "k_best__k": [7],
    "model__gamma": ["scale", 0.1, 1],
    "model__C": range(40,60,5),
    "model__epsilon": [0.1,0.2],
    "model__kernel": ["linear","rbf"],
    "model__degree": [1],
}, scoring="r2", cv=2, n_jobs=6, verbose=3)

In [113]:
grid.fit(data_x, data_y)

Fitting 2 folds for each of 144 candidates, totalling 288 fits


In [114]:
grid.best_params_

{'imputer__n_neighbors': 5,
 'k_best__k': 7,
 'model__C': 50,
 'model__degree': 1,
 'model__epsilon': 0.2,
 'model__gamma': 'scale',
 'model__kernel': 'rbf'}

In [115]:
grid.best_score_

0.9176448842107432