### Predict Type 2 diabetes with genomics data II
using SVM, random forest, and k-nearest-neighbors classifiers (`SVC`, `RandomForestClassifier`, `KNeighborsClassifier`)

In [37]:
import pandas as pd
import numpy as np
from sklearn.model_selection import learning_curve, train_test_split,GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier

import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import warnings
warnings.filterwarnings("ignore") 

In [38]:
### Load the gene table (one row per sample) from CSV
df = pd.read_csv(filepath_or_buffer='../data/top500pcaGene.csv')

In [4]:
# Preview the first three rows to check the column layout: a leftover
# 'Unnamed: 0' column, the `t2d` label, then the numeric per-gene columns.
df.head(3)

Unnamed: 0,t2d,5260,100873336,4555,1487,2641,4571,4519,11067,4577,...,93986,55089,26118,7072,837,286257,92255,1452,4832,10171
0,0,0.0,0.0,0.0,3.322937,10.815594,4.808866,7.037647,0.0,4.109481,...,0.0,5.075939,4.855219,1.78924,0.0,0.0,4.891742,5.58092,2.386135,0.0
1,0,0.0,3.400627,0.0,2.284452,11.641512,5.379671,7.540405,0.0,5.36514,...,0.0,2.40516,3.903509,3.27705,0.0,0.0,0.0,4.254837,0.0,5.214127
2,0,3.492573,2.642864,4.768249,2.348791,10.814974,5.679429,8.156457,0.0,4.753775,...,0.0,4.323108,4.587719,3.748767,0.0,0.0,0.0,5.637185,1.899896,1.480695


In [39]:
# Target vector: the `t2d` label column as a NumPy array.
y = df['t2d'].values
# Feature matrix: every column except the label and 'Unnamed: 0'.
# 'Unnamed: 0' is the name pandas gives to an unnamed CSV column; in the preview
# it just counts rows (0, 1, 2, ...). It is not a gene measurement, and keeping it
# would feed sample order to the models (a label leak if the file is sorted by
# class -- NOTE(review): confirm the file ordering).
# errors='ignore' keeps this cell working when that column is absent; a missing
# 't2d' column is still caught by the lookup on the line above.
X = df.drop(columns=['t2d', 'Unnamed: 0'], errors='ignore')

In [40]:
### Hold out 20% of the samples as a test set (fixed seed so the split is repeatable)
train_x, test_x, train_y, test_y = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

### pipeline
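Each model is wrapped in a scaler + classifier pipeline and tuned separately with GridSearchCV; the models are compared with each other, not combined into an ensemble.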

In [45]:
def GridSearchCV_run(pipeline, train_x, train_y, test_x, test_y, param_grid, score = 'accuracy'):
    """Tune `pipeline` by grid search on the training data, then score it on the test data.

    Parameters
    ----------
    pipeline : estimator handed to GridSearchCV as `estimator`.
    train_x, train_y : data the grid search is fitted on; GridSearchCV performs
        its own cross-validation splits of this data.
    test_x, test_y : held-out data, used only for the final accuracy figure.
    param_grid : parameter grid forwarded to GridSearchCV.
    score : forwarded to GridSearchCV's `scoring` argument. It only drives model
        selection; the test-set figure reported here is always accuracy.

    Returns
    -------
    dict with keys
        'predict_y'      -- predictions for `test_x`
        'accuracy_score' -- accuracy of those predictions against `test_y`
        'best_score'     -- `best_score_` of the fitted search
        'best_params'    -- `best_params_` of the fitted search
    """
    gridsearch = GridSearchCV(estimator = pipeline, param_grid = param_grid, scoring = score)
    # fit() cross-validates every grid point on the training data only.
    search = gridsearch.fit(train_x, train_y)
    print("GridSearchCV best score： %0.3lf" % search.best_score_)
    print("GridSearchCV best params：", search.best_params_)

    # Evaluate once on the held-out set and reuse the value for both the
    # printed line and the returned dict.
    predict_y = gridsearch.predict(test_x)
    test_accuracy = accuracy_score(test_y, predict_y)
    print("accuracy %0.3lf" % test_accuracy)

    return {
        'predict_y': predict_y,
        'accuracy_score': test_accuracy,
        'best_score': search.best_score_,
        'best_params': search.best_params_,
    }
 

In [46]:
# One entry per candidate model: (pipeline step name, unfitted estimator, parameter grid).
# Grid keys are written as '<step name>__<parameter>', so each key's prefix has to
# match the step name in the same entry.
model_specs = [
    (
        'random_forest',
        RandomForestClassifier(random_state = 42, criterion = 'gini'),
        {'random_forest__n_estimators':[1000], 'random_forest__max_features':[20]},
    ),
    (
        'kneighbor',
        KNeighborsClassifier(metric = 'minkowski'),
        {'kneighbor__n_neighbors':np.arange(1, 30)},
    ),
    (
        'svc',
        SVC(random_state = 42),
        {'svc__C':10.0 ** np.arange(-2, 2), 'svc__gamma':10.0 ** np.arange(-5, 1)},
    ),
]

# Parallel lists, all in the order of model_specs.
clf = [estimator for _, estimator, _ in model_specs]
clf_names = [step_name for step_name, _, _ in model_specs]
clf_param = [grid for _, _, grid in model_specs]

In [47]:
# Tune and evaluate every candidate model. Each outcome is stored under its model
# name so all runs stay available after the loop; `result` is still assigned on
# every pass and therefore ends up holding the last model's outcome.
# zip() would silently stop at the shortest list, so check they line up first.
assert len(clf) == len(clf_names) == len(clf_param), "model lists must have equal length"

results = {}
for model, model_name, model_param in zip(clf, clf_names, clf_param):
    # The classifier step is named model_name because the keys in model_param
    # address it as '<model_name>__<parameter>'.
    pipeline = Pipeline([
            ('scaler', StandardScaler()),
            (model_name, model)
    ])
    # Label the printed scores so each line can be attributed to a model.
    print("model: %s" % model_name)
    result = GridSearchCV_run(pipeline, train_x, train_y, test_x, test_y, model_param , score = 'accuracy')
    results[model_name] = result


GridSearchCV best score： 0.860
GridSearchCV best params： {'random_forest__n_estimators': 1000, 'random_forest__max_features': 20}
accuracy 0.905
GridSearchCV best score： 0.742
GridSearchCV best params： {'kneighbor__n_neighbors': 9}
accuracy 0.779
GridSearchCV best score： 0.817
GridSearchCV best params： {'svc__gamma': 0.001, 'svc__C': 1.0}
accuracy 0.911


#### Normalizing the features with StandardScaler improved the accuracy of the SVM model