In [1]:
%matplotlib inline
import numpy as np
import scipy as sp
import matplotlib as mpl
import matplotlib.cm as cm
import matplotlib.pyplot as plt
import pandas as pd
pd.set_option('display.width', 500)
pd.set_option('display.max_columns', 100)
pd.set_option('display.notebook_repr_html', True)
import seaborn as sns
sns.set_style("whitegrid")
sns.set_context("poster")
from PIL import Image

#importing specific functions
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import scale

In [6]:
#pre-define functions
def cv_optimize(clf, parameters, X, y, n_jobs=1, n_folds=5, score_func=None):
    if score_func:
        gs = GridSearchCV(clf, param_grid=parameters, cv=n_folds, n_jobs=n_jobs, scoring=score_func)
    else:
        gs = GridSearchCV(clf, param_grid=parameters, n_jobs=n_jobs, cv=n_folds)
    gs.fit(X, y)
    #print("BEST", gs.best_params_, gs.best_score_, gs.grid_scores_)
    best = gs.best_estimator_
    return best

def do_classify_aa(clf, parameters, indf, featurenames, targetname, target1val, score_func=None, n_folds=5, n_jobs=1):
    subdf=indf[featurenames]
    X=subdf.values
    X=scale(X)
    y=(indf[targetname].values==target1val)*1
    training_accuracy = np.zeros(10)
    test_accuracy=np.zeros(10)
    test_auc=np.zeros(10)
    
    for idx,train_test in enumerate(skf.split(X,y)):
        X_train, X_test = X[train_test[0]], X[train_test[1]]
        y_train, y_test = y[train_test[0]], y[train_test[1]]
        
        if parameters:
            clf = cv_optimize(clf, parameters, X_train, y_train, n_jobs=n_jobs, n_folds=n_folds, score_func=score_func)
        clf=clf.fit(X_train, y_train)
        pred = clf.predict(X_test)
        probs = clf.predict_proba(X_test)
        training_accuracy[idx] = clf.score(X_train, y_train)
        test_accuracy[idx] = accuracy_score(y_test, pred)
        test_auc[idx] = roc_auc_score(y_test, probs[:,1])
        #print(idx)
    print("############# based on k-fold cross-validation predictions ################")
    print("Training Accuracy %0.2f +/- %0.3f" % (training_accuracy.mean(), training_accuracy.std()))
    #print(")
    print("***** Target : GBM vs METS")
    print(clf)
    print("Accuracy on test data:     %0.2f +/- %0.3f" % (test_accuracy.mean(), test_accuracy.std()))
    print("AUC on test data:     %0.2f +/- %0.3f" % (test_auc.mean(), test_auc.std()))

    #print(confusion_matrix(y, pred))
    print("########################################################")
    return clf, test_accuracy, test_auc

In [3]:
#read in the pre-extracted features
dfglioma=pd.read_csv("../data/glioma_all_featarray.csv")
dfglioma.head()
colswewant_cont = list(dfglioma)
colswewant_cont.pop()
Targets=['Targets']

In [7]:
# cross validation methods
from sklearn.model_selection import LeaveOneOut
loo = LeaveOneOut()

from sklearn.model_selection import StratifiedKFold
skf = StratifiedKFold(n_splits=10, random_state=2652124)

from sklearn.model_selection import train_test_split
#tts = train_test_split(n_splits=10, random_state=2652124)

In [8]:
from sklearn import svm
clfsvc = svm.SVC(probability=True)
parameters = [{'kernel': ['linear'], 'C': [1]}]
clfsvc, test_accuracy, test_auc  = do_classify_aa(clfsvc, parameters, dfglioma, colswewant_cont, 'Targets', 1)

############# based on k-fold cross-validation predictions ################
Training Accuracy 1.00 +/- 0.000
***** Target : GBM vs METS
SVC(C=1, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='linear',
  max_iter=-1, probability=True, random_state=None, shrinking=True,
  tol=0.001, verbose=False)
Accuracy on test data:     0.71 +/- 0.132
AUC on test data:     0.85 +/- 0.161
########################################################
