## The Data: Pima Indian Classification

### References:

- Sebastian Perez - Machine Learning class
- http://lightgbm.readthedocs.io/en/latest/Parameters.html  
- http://scikit-learn.org/stable/index.html  
- https://xgboost.readthedocs.io/en/latest/parameter.html  

### Problem statement 
Using the UCI PIMA Indian Diabetes dataset to predict a person has diabetes or not using the medical attributes provided. (Target is column 8)

### Assumptions

This is enough data to split and reliably predict if the patient has diabetes, the dataset has only 786 data points
Just these attributes are enough to diagnose the ailment
Similar Problems 
This is very much like some common 2 class classification problems like classifying mail into spam and ham based on the contents of the email. Obviously the attributes there would be strings and not numbers like this dataset, therefore the way in which we process at least some of the features will be different.

In [159]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings(action='ignore', category=DeprecationWarning) #hides deprecation issue as a consecuence of an update to np

%pylab inline

Populating the interactive namespace from numpy and matplotlib


In [2]:
df = pd.read_csv("./pima-indians-diabetes.data",header=None)

In [3]:
print(df.head())
print("Shape: ", df.shape)
print("Nulls: ", df.isnull().any().sum())

   0    1   2   3    4     5      6   7  8
0  6  148  72  35    0  33.6  0.627  50  1
1  1   85  66  29    0  26.6  0.351  31  0
2  8  183  64   0    0  23.3  0.672  32  1
3  1   89  66  23   94  28.1  0.167  21  0
4  0  137  40  35  168  43.1  2.288  33  1
Shape:  (768, 9)
Nulls:  0


In [5]:
X_train = df[np.arange(8)].values
y_train = df[8].values

## Solution

In [6]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV

In [None]:
#from sklearn.model_selection import train_test_split
#X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [197]:
classifiers = [SVC(), KNeighborsClassifier(),DecisionTreeClassifier(),RandomForestClassifier(),
               LogisticRegression(), LGBMClassifier(), XGBClassifier()]

for clf in classifiers:
    print(clf.__class__.__name__, cross_val_score(clf, X_train, y_train,
                                                  cv=5, scoring="accuracy",verbose=False).mean().round(4))

SVC 0.651
KNeighborsClassifier 0.724
DecisionTreeClassifier 0.7227
RandomForestClassifier 0.7305
LogisticRegression 0.7683
LGBMClassifier 0.7487
XGBClassifier 0.7657


In [180]:
clf_svc = SVC(random_state = 0)
param_grid_svc = {
    "C":[2],
#     "gamma":[1,0.1,0.001,0.0001],
    "kernel":["linear"] #rbf
}

clf_knn = KNeighborsClassifier()
param_grid_knn = {
   "n_neighbors":np.arange(1,15),
}

clf_tree = DecisionTreeClassifier(random_state=0)
param_grid_tree = {
   "min_samples_leaf":np.arange(5,50),
    "max_depth":np.arange(1,10)
}

clf_forest = RandomForestClassifier(random_state=0)
param_grid_forest = {
    "n_estimators": [50, 100, 200, 300],
    "max_depth": np.arange(10,100,10),
    "min_samples_leaf": np.arange(3,8),
#     "min_samples_split": [8, 10, 12],
}

clf_logit = LogisticRegression(random_state=0)
param_grid_logit = {
    "C":[1,10,100,1000],
}

clf_lgbm = LGBMClassifier()
param_grid_lgbm = {
    "learning_rate": [0.0065],#np.arange(0.005,0.5,0.05),
    "n_estimators": [300],
    "num_leaves": [15], #np.arange(10,40,5),
    "min_data_in_leaf": [30],
    "boosting_type" : ["gbdt"],
    "objective" : ["binary"],
    "seed":[0],
}

clf_xgb = XGBClassifier()
param_grid_xgb = {
#    "n_estimators":np.arange(50,300,50),
#    "learning_rate":np.arange(0.005,0.5,0.05),
#    "max_depth": np.arange(4,10),
    "nthread":[-1],
    "seed":[0]
}

In [207]:
clfs = [clf_svc, clf_knn, clf_tree, clf_forest, clf_logit, clf_lgbm, clf_xgb]
param_grids = [param_grid_svc, param_grid_knn, param_grid_tree, param_grid_forest, 
               param_grid_logit, param_grid_lgbm, param_grid_xgb]

def grid_cv(clf,param_grid):

    grid_search = GridSearchCV(clf, param_grid=param_grid,scoring = "accuracy", 
                           cv=5, n_jobs=-1) #verbose=2
    grid_search.fit(X_train, y_train)
    print(clf.__class__.__name__, grid_search.best_params_)
    print("Accuracy {}".format(grid_search.best_score_.round(3)))
    return(clf.__class__.__name__, grid_search.best_score_.round(3),grid_search.best_params_)

In [208]:
%%time

clf_scores = []
for clf, param_grid in zip(clfs, param_grids):
    clf_scores.append(grid_cv(clf, param_grid))

SVC {'C': 2, 'kernel': 'linear'}
Accuracy 0.767
KNeighborsClassifier {'n_neighbors': 14}
Accuracy 0.758
DecisionTreeClassifier {'max_depth': 7, 'min_samples_leaf': 22}
Accuracy 0.755
RandomForestClassifier {'max_depth': 10, 'min_samples_leaf': 5, 'n_estimators': 300}
Accuracy 0.78
LogisticRegression {'C': 100}
Accuracy 0.772
LGBMClassifier {'boosting_type': 'gbdt', 'learning_rate': 0.0065, 'min_data_in_leaf': 30, 'n_estimators': 300, 'num_leaves': 15, 'objective': 'binary', 'seed': 0}
Accuracy 0.768
XGBClassifier {'nthread': -1, 'seed': 0}
Accuracy 0.766
CPU times: user 21.4 s, sys: 380 ms, total: 21.8 s
Wall time: 3min 35s


In [209]:
print(clf_scores)

[('SVC', 0.767, {'C': 2, 'kernel': 'linear'}), ('KNeighborsClassifier', 0.758, {'n_neighbors': 14}), ('DecisionTreeClassifier', 0.755, {'max_depth': 7, 'min_samples_leaf': 22}), ('RandomForestClassifier', 0.78, {'max_depth': 10, 'min_samples_leaf': 5, 'n_estimators': 300}), ('LogisticRegression', 0.772, {'C': 100}), ('LGBMClassifier', 0.768, {'boosting_type': 'gbdt', 'learning_rate': 0.0065, 'min_data_in_leaf': 30, 'n_estimators': 300, 'num_leaves': 15, 'objective': 'binary', 'seed': 0}), ('XGBClassifier', 0.766, {'nthread': -1, 'seed': 0})]


In [258]:
from sklearn.ensemble import VotingClassifier
from sklearn.model_selection import KFold

In [259]:
params = [clf[2] for clf in clf_scores]
clf1 = SVC(**params[0])
clf2 = KNeighborsClassifier(**params[1])
clf3 = DecisionTreeClassifier(**params[2])
clf4 = RandomForestClassifier(**params[3])
clf5 = LogisticRegression(**params[4])
clf6 = LGBMClassifier(**params[5])
clf7 = XGBClassifier(**params[6])

In [260]:
estimators = [
    (clf1.__class__.__name__, clf1),
    (clf2.__class__.__name__, clf2),
    (clf3.__class__.__name__, clf3),
    (clf4.__class__.__name__, clf4),
    (clf5.__class__.__name__, clf5),
    (clf6.__class__.__name__, clf6),
    (clf7.__class__.__name__, clf7)
]

In [273]:
kfold = KFold(n_splits=5, random_state=2018)
ensemble = VotingClassifier(estimators, n_jobs=-1)
results = cross_val_score(ensemble, X_train,y_train, cv=kfold)
print("Accuracy of ensembled voting classifier: ",results.mean().round(4))

Accuracy of ensembled voting classifier:  0.7749
