In [243]:
%matplotlib inline

import warnings
warnings.filterwarnings('ignore')
import scipy
import numpy as np
import pandas as pd
import seaborn as sns
import graphviz
from matplotlib import pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import export_graphviz
from sklearn.model_selection import train_test_split
from sklearn.model_selection import validation_curve
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import SGDClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import f1_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import matthews_corrcoef
from sklearn.preprocessing import RobustScaler
from sklearn.preprocessing import QuantileTransformer
from sklearn.preprocessing import PowerTransformer
from sklearn_pandas import DataFrameMapper
from sklearn.ensemble import VotingClassifier

pd.set_option('display.max_rows', 200)

In [244]:
df=pd.read_excel("../input/data2.xlsx")

Normalize features using Yeo-Johnson transformation, because df contains negative values. Other transformations (aside from quantile, which just makes anything normal, but is non-linear) had far less effect on outliers. Still the distribution is not normal, but it should be enough for classification algorithms.

In [245]:
x = list(df.columns.values)
x.remove('Country Name')
df2 = df.copy()
mapper = DataFrameMapper([(df2[x].columns, PowerTransformer())])
df2[x] = mapper.fit_transform(df2[x])

In [246]:
df2["LE > median"]=df2["Life expectancy"].apply(lambda x: 1 if (x>(np.median(df2["Life expectancy"]))) else 0)
X_train, X_test, Y_train, Y_test = train_test_split(df2.drop(labels=['Country Name', 'Life expectancy', 'LE > median'], axis = 1),
                                                    df2['LE > median'], test_size=0.25, random_state=17, shuffle=True)

Bagging (decision trees)

In [247]:
treeBag = BaggingClassifier(DecisionTreeClassifier(random_state=17),n_estimators=200, random_state=17)
treeBag.fit(X_train, Y_train)
print("training acc " + str(treeBag.score(X_train, Y_train)))
print("test acc " + str(treeBag.score(X_test, Y_test)))
print("F1-score " + str(f1_score(Y_test, treeBag.predict(X_test))))
print("ROC AUC " + str(roc_auc_score(Y_test, treeBag.predict(X_test))))
print("Matthews correlation coefficient " + str((matthews_corrcoef(Y_test, treeBag.predict(X_test))+1)/2))

training acc 1.0
test acc 0.9716713881019831
F1-score 0.9719887955182073
ROC AUC 0.9716383772721802
Matthews correlation coefficient 0.9716989500245858


The dataset is balanced and 0 and 1 have equal value, so all metrics give similar values. MCC is corrected because it has values [-1 ; +1]. The main accuracy metric for this dataset will be a simple accuracy score.

Random forest (with features importance)

In [248]:
rfc=RandomForestClassifier(n_estimators=200, random_state=17, n_jobs = -1)
rfc.fit(X_train, Y_train)
for name, importance in zip(X_train.columns, rfc.feature_importances_):
    print(name, importance)
print("")
print("training acc " + str(rfc.score(X_train, Y_train)))
print("test acc " + str(rfc.score(X_test, Y_test)))

Year 0.015958766493153923
Education exp. %GNI 0.027260969305166017
Adolescent fertility rate 0.11603164635956699
Age dependency ratio 0.17181650637266851
Agriculture v.add. %GDP 0.1181499905134848
Immunization, measles 0.0583507199596368
Mobile sub./population 0.03969628351164325
Rural population % 0.08819496964150982
Cereal tn/ha ln 0.08429634714432543
GDP US$ per capita ln 0.19144888282384825
GDP US$ ln 0.049793344949270395
Inflation 0.03900157292572559

training acc 1.0
test acc 0.9730878186968839


Linear classifier

In [249]:
log_c = SGDClassifier(loss='log', n_jobs=-1, random_state=17, tol = None, max_iter=1000)
log_c.fit(X_train, Y_train)
print("training acc " + str(log_c.score(X_train, Y_train)))
print("test acc " + str(log_c.score(X_test, Y_test)))

training acc 0.8761814744801513
test acc 0.886685552407932


In [250]:
knn = KNeighborsClassifier(n_jobs = -1)
knn.fit(X_train, Y_train)
print("training acc " + str(knn.score(X_train, Y_train)))
print("test acc " + str(knn.score(X_test, Y_test)))

training acc 0.9810964083175804
test acc 0.9546742209631728


In [251]:
knn_grid = GridSearchCV(KNeighborsClassifier(), param_grid = {'n_neighbors': range(1, 10)}, cv=5)
knn_grid.fit(X_train, Y_train)
print(knn_grid.best_params_)
print('training acc ' + str(knn_grid.best_score_))
print("test acc " + str(knn_grid.score(X_test, Y_test)))

{'n_neighbors': 1}
training acc 0.9725897920604915
test acc 0.9702549575070821


On 75% training 25% test:
Best result - random forest, worst - linear classifier, tried with different parameters, didnt manage to get above 0.87 acc.

KNN (k_neighbours = 1, all other parameters dont imporbe the result significantly) has shown the best result on 25% training 75% test dataset with the same parameters.

In [284]:
knn = KNeighborsClassifier(n_neighbors = 1, n_jobs = -1)
ens1 = VotingClassifier(estimators=[('RFor', rfc), ('KNN', knn), ('Log_c', log_c)], voting='hard', n_jobs = -1)
ens1.fit(X_train, Y_train)
print('training acc ' + str(ens1.score(X_train, Y_train)))
print("test acc " + str(ens1.score(X_test, Y_test)))

training acc 1.0
test acc 0.9759206798866855


The ensemble, despite being very simple, has managed to show better accuracy on the test set than each of the models individually.