In [18]:
%matplotlib inline

import warnings
warnings.filterwarnings('ignore')
import scipy
import numpy as np
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import f1_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import matthews_corrcoef
from sklearn.preprocessing import PowerTransformer
from sklearn_pandas import DataFrameMapper

# Let pandas display up to 200 rows before truncating a printed frame.
pd.set_option('display.max_rows', 200)

In [19]:
# Load the raw table; every column except the country label is power-transformed.
df = pd.read_excel("../input/data2.xlsx")
x = list(df.columns.values)
x.remove('Country Name')

# Work on a copy so the untransformed frame stays available as `df`.
df2 = df.copy()

# Apply a single PowerTransformer (default settings) to all selected columns.
# NOTE(review): the transformer is fitted on the whole table before the
# train/test split below, so test rows take part in the fit -- confirm intended.
mapper = DataFrameMapper([(df2[x].columns, PowerTransformer())])
df2[x] = mapper.fit_transform(df2[x])

# Binary target: 1 when the (transformed) life expectancy is strictly above
# its median, else 0. The median is computed once up front rather than once
# per row inside a lambda, and the comparison is vectorized.
le_median = np.median(df2["Life expectancy"])
df2["LE > median"] = (df2["Life expectancy"] > le_median).astype("int64")

# Shuffled 75/25 split with a fixed seed; the label column, the target and the
# column the target was derived from are excluded from the features.
X_train, X_test, Y_train, Y_test = train_test_split(
    df2.drop(labels=['Country Name', 'Life expectancy', 'LE > median'], axis=1),
    df2['LE > median'],
    test_size=0.25, random_state=17, shuffle=True)

In [20]:
# Grid-search the tree depth (1..14) of a 500-estimator XGBoost classifier.
model = XGBClassifier(
    n_estimators=500,
    silent=True,
)
parameters = {
    'max_depth': range(1, 15),
}
XGB = GridSearchCV(estimator=model, param_grid=parameters, n_jobs=4, verbose=2)
XGB.fit(X_train, Y_train)

# Report the fitted search's score on the training and test data,
# followed by the selected depth.
train_acc = XGB.score(X_train, Y_train)
print("training acc " + str(train_acc))
test_acc = XGB.score(X_test, Y_test)
print("test acc " + str(test_acc))
print(XGB.best_params_)

Fitting 3 folds for each of 14 candidates, totalling 42 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  33 tasks      | elapsed:   13.6s
[Parallel(n_jobs=4)]: Done  42 out of  42 | elapsed:   16.4s finished


training acc 1.0
test acc 0.9773371104815864
{'max_depth': 11}


In [21]:
# Grid-search the tree depth (4..9) of a 50-iteration CatBoost classifier.
model = CatBoostClassifier(
    iterations=50,
    silent=True,
)
parameters = {
    'depth': range(4, 10),
}
CBC = GridSearchCV(estimator=model, param_grid=parameters, n_jobs=4, verbose=2)
CBC.fit(X_train, Y_train)

# Report the fitted search's score on the training and test data,
# then a blank line and the selected depth.
train_acc = CBC.score(X_train, Y_train)
print("training acc " + str(train_acc))
test_acc = CBC.score(X_test, Y_test)
print("test acc " + str(test_acc))
print('')
print(CBC.best_params_)

Fitting 3 folds for each of 6 candidates, totalling 18 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  18 out of  18 | elapsed:   34.3s finished


training acc 0.9985822306238186
test acc 0.9787535410764873

{'depth': 9}


In [26]:
# AdaBoost over decision trees (100 estimators); the grid varies the depth
# (1..9) of the wrapped tree through the nested `base_estimator__` prefix.
model = AdaBoostClassifier(
    DecisionTreeClassifier(),
    n_estimators=100,
)
parameters = {
    'base_estimator__max_depth': range(1, 10),
}
ABC = GridSearchCV(estimator=model, param_grid=parameters, n_jobs=4, verbose=2)
ABC.fit(X_train, Y_train)

# Report the fitted search's score on the training and test data,
# then a blank line and the selected depth.
train_acc = ABC.score(X_train, Y_train)
print("training acc " + str(train_acc))
test_acc = ABC.score(X_test, Y_test)
print("test acc " + str(test_acc))
print('')
print(ABC.best_params_)

Fitting 3 folds for each of 9 candidates, totalling 27 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  27 out of  27 | elapsed:    8.1s finished


training acc 1.0
test acc 0.9759206798866855

{'base_estimator__max_depth': 8}


In [32]:
# Grid-search max_depth (1..24) of a LightGBM classifier with otherwise
# default settings.
model = LGBMClassifier()
parameters = {
    'max_depth': range(1, 25),
}
LBoost = GridSearchCV(estimator=model, param_grid=parameters, n_jobs=4, verbose=2)
LBoost.fit(X_train, Y_train)

# Report the fitted search's score on the training and test data,
# then a blank line and the selected depth.
train_acc = LBoost.score(X_train, Y_train)
print("training acc " + str(train_acc))
test_acc = LBoost.score(X_test, Y_test)
print("test acc " + str(test_acc))
print('')
print(LBoost.best_params_)

Fitting 3 folds for each of 24 candidates, totalling 72 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  72 out of  72 | elapsed:    5.3s finished


training acc 1.0
test acc 0.9759206798866855

{'max_depth': 15}


LightGBM reached the same test accuracy as AdaBoost, even with a different max depth parameter.

All estimators generally have very high accuracy, probably due to having a large training sample (75%, randomized).

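When the models are instead trained only on the five earlier years (1996–2000) and tested on the remaining years, rather than on a random split, the accuracy drops to more realistic values (around 90%) across the board.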

In [36]:
from statsmodels.tools.tools import add_constant

# Years that form the training period; every other year goes to the test period.
train_years = [1996, 1997, 1998, 1999, 2000]

# Read the table once and derive both periods from it with boolean masks.
# The masks cover every row of the table. (The previous row-by-row loop for
# the test period was bounded by the length of the already-filtered training
# frame, so training-year rows beyond that position stayed in the test set.)
raw_df = pd.read_excel("../input/data2.xlsx")
in_train_period = raw_df['Year'].isin(train_years)

# ---- Training period ----
df = raw_df[in_train_period].drop(columns='Year')
x = list(df.columns.values)
x.remove('Country Name')
mapper = DataFrameMapper([(df[x].columns, PowerTransformer())])
df[x] = mapper.fit_transform(df[x])
df = add_constant(df)
# Binary target: 1 when life expectancy is strictly above this period's
# median. The median is computed once instead of once per row.
train_median = np.median(df["Life expectancy"])
df["LE > median"] = (df["Life expectancy"] > train_median).astype("int64")
X_train, Y_train = (
    df.drop(labels=['Country Name', 'Life expectancy', 'LE > median'], axis=1),
    df['LE > median'],
)

# ---- Test period ----
# NOTE(review): the test frame gets its own freshly fitted PowerTransformer and
# its own median threshold rather than reusing the ones from the training
# period -- confirm this is the intended evaluation protocol.
df2 = raw_df[~in_train_period].drop(columns='Year')
x = list(df2.columns.values)
x.remove('Country Name')
mapper = DataFrameMapper([(df2[x].columns, PowerTransformer())])
df2[x] = mapper.fit_transform(df2[x])
df2 = add_constant(df2)
test_median = np.median(df2["Life expectancy"])
df2["LE > median"] = (df2["Life expectancy"] > test_median).astype("int64")

X_test, Y_test = (
    df2.drop(labels=['Country Name', 'Life expectancy', 'LE > median'], axis=1),
    df2['LE > median'],
)

In [45]:
# Same XGBoost depth search (1..14, 500 estimators), run on whatever
# X_train / Y_train currently hold.
model = XGBClassifier(
    n_estimators=500,
    silent=True,
)
parameters = {
    'max_depth': range(1, 15),
}
XGB = GridSearchCV(estimator=model, param_grid=parameters, n_jobs=4, verbose=2)
XGB.fit(X_train, Y_train)

# Print the scores one after another, then the selected depth.
score_on_train = XGB.score(X_train, Y_train)
print("training acc " + str(score_on_train))
score_on_test = XGB.score(X_test, Y_test)
print("test acc " + str(score_on_test))
print(XGB.best_params_)

Fitting 3 folds for each of 14 candidates, totalling 42 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  33 tasks      | elapsed:    6.3s


training acc 0.9688473520249221
test acc 0.8878085265519821
{'max_depth': 1}


[Parallel(n_jobs=4)]: Done  42 out of  42 | elapsed:    6.9s finished


In [41]:
# Grid-search max_depth (1..24) of a LightGBM classifier.
model = LGBMClassifier(silent=True)
parameters = {'max_depth': range(1, 25)}
LBoost = GridSearchCV(model, parameters, n_jobs=4, verbose=2)

# Fit on the training data only. The test set is no longer passed as
# `eval_set`: no early stopping is configured, so it could not influence the
# fitted model, and it only made every CV fit evaluate against held-out data.
LBoost.fit(X_train, Y_train)

# The test set is used solely for the final score below.
print("training acc " + str(LBoost.score(X_train, Y_train)))
print("test acc " + str(LBoost.score(X_test, Y_test)))
print('')
print(LBoost.best_params_)

Fitting 3 folds for each of 24 candidates, totalling 72 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  58 tasks      | elapsed:    1.6s


training acc 0.9299065420560748
test acc 0.887060583395662

{'max_depth': 1}


[Parallel(n_jobs=4)]: Done  72 out of  72 | elapsed:    1.9s finished


In [48]:
# AdaBoost (100 estimators) over decision trees; the search varies the depth
# of the wrapped tree from 1 to 9.
model = AdaBoostClassifier(
    DecisionTreeClassifier(),
    n_estimators=100,
)
parameters = {
    'base_estimator__max_depth': range(1, 10),
}
ABC = GridSearchCV(estimator=model, param_grid=parameters, n_jobs=4, verbose=2)
ABC.fit(X_train, Y_train)

# Print the scores one after another, a blank line, then the selected depth.
score_on_train = ABC.score(X_train, Y_train)
print("training acc " + str(score_on_train))
score_on_test = ABC.score(X_test, Y_test)
print("test acc " + str(score_on_test))
print('')
print(ABC.best_params_)

Fitting 3 folds for each of 9 candidates, totalling 27 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  27 out of  27 | elapsed:    5.9s finished


training acc 1.0
test acc 0.9098728496634256

{'base_estimator__max_depth': 8}


In [49]:
# CatBoost with 50 iterations; the search varies the tree depth from 4 to 9.
model = CatBoostClassifier(
    iterations=50,
    silent=True,
)
parameters = {
    'depth': range(4, 10),
}
CBC = GridSearchCV(estimator=model, param_grid=parameters, n_jobs=4, verbose=2)
CBC.fit(X_train, Y_train)

# Print the scores one after another, a blank line, then the selected depth.
score_on_train = CBC.score(X_train, Y_train)
print("training acc " + str(score_on_train))
score_on_test = CBC.score(X_test, Y_test)
print("test acc " + str(score_on_test))
print('')
print(CBC.best_params_)

Fitting 3 folds for each of 6 candidates, totalling 18 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  18 out of  18 | elapsed:   32.8s finished


training acc 0.9953271028037384
test acc 0.9042632759910246

{'depth': 6}
