In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb

from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler

from sklearn.model_selection import cross_val_score
from sklearn.model_selection import LeaveOneOut
from sklearn.model_selection import RepeatedStratifiedKFold

In [83]:
# --- Transcriptomics-only dataset -------------------------------------------
# Rows are indexed by the 'Gene' column of the CSV.
data1 = pd.read_csv("../TranscriptomicsTargets.csv").set_index('Gene')
X1 = data1.iloc[:, 2:10]  # feature matrix: column positions 2-9 of the indexed frame
# Target is taken as a 1-D Series (position 0). scikit-learn estimators expect
# y of shape (n_samples,); a single-column DataFrame triggers column-vector
# conversion warnings.
Y1 = data1.iloc[:, 0]

# --- Integrated dataset -----------------------------------------------------
data2 = pd.read_csv("../IntegratedTargets.csv").set_index('Gene')
X2 = data2.iloc[:, 2:12]  # feature matrix: column positions 2-11 of the indexed frame
Y2 = data2.iloc[:, 0]     # 1-D target, same convention as Y1

# Scale the integrated features: z-score each column, then squash to [0, 1].
# NOTE(review): both scalers are fitted on the full dataset here, i.e. before any
# cross-validation split, so statistics of held-out samples influence the scaling.
# Consider moving the scaling into a sklearn Pipeline that is passed to the CV call.
# NOTE(review): min-max scaling is invariant to a prior per-column affine transform,
# so the StandardScaler step should not change X2_normal beyond float rounding.
# NOTE(review): X1 is used unscaled downstream while X2 is scaled — confirm this
# asymmetry is intended.
normalizer = MinMaxScaler()
standardizer = StandardScaler()
X2_stand = standardizer.fit_transform(X2)
X2_normal = normalizer.fit_transform(X2_stand)

In [123]:
# Candidate classifiers compared in this notebook.
model_1 = LogisticRegression()
model_2 = SVC(kernel='linear')
model_3 = RandomForestClassifier(n_estimators=100, random_state=42)
# `random_state` is the scikit-learn-wrapper parameter for the XGBoost RNG seed;
# it replaces the older `seed` alias and keeps the same seed value, which matters
# here because row/column subsampling (subsample, colsample_bytree) is enabled.
model_4 = xgb.XGBClassifier(
    learning_rate=0.1,
    n_estimators=100,
    max_depth=5,
    min_child_weight=2,
    objective="binary:logistic",
    subsample=0.9,
    colsample_bytree=0.8,
    random_state=23333,
)

# Cross-validation schemes: leave-one-out, and 10-fold stratified CV repeated
# 5 times with a fixed seed for reproducible splits.
cv1 = LeaveOneOut()
cv2 = RepeatedStratifiedKFold(n_splits=10, n_repeats=5, random_state=1)

# Shared arguments for the cross_val_score calls.
scoring = 'accuracy'
n_jobs = -1            # use all available CPU cores
error_score = 'raise'  # surface fitting errors instead of recording a placeholder score

In [145]:
# Every model is evaluated under every CV scheme on both feature sets.
models = [model_1, model_2, model_3, model_4]
cvs = [cv1, cv2]

# Flat result lists filled in (model, cv) order: for each model the cv1 entry is
# appended first, then the cv2 entry. "tr_" = scores on X1, "i_" = scores on X2_normal.
tr_scores_mean = []
tr_scores_std = []
i_scores_mean = []
i_scores_std = []

# scikit-learn expects a 1-D target of shape (n_samples,). Flattening here works
# whether Y1/Y2 are single-column DataFrames or Series and avoids the
# column-vector conversion warnings raised by the estimators.
y_trans = np.ravel(Y1)
y_integ = np.ravel(Y2)

for model in models:
    for cv in cvs:
        trans_fold_scores = cross_val_score(model, X1, y_trans, scoring=scoring, cv=cv,
                                            n_jobs=n_jobs, error_score=error_score)
        integ_fold_scores = cross_val_score(model, X2_normal, y_integ, scoring=scoring, cv=cv,
                                            n_jobs=n_jobs, error_score=error_score)
        # Note: under LeaveOneOut each fold contains a single sample, so every
        # per-fold accuracy is 0 or 1 and the std reflects that, not model variance.
        tr_scores_mean.append(np.mean(trans_fold_scores))
        tr_scores_std.append(np.std(trans_fold_scores))
        i_scores_mean.append(np.mean(integ_fold_scores))
        i_scores_std.append(np.std(integ_fold_scores))

In [147]:
# Display names, in the same order as the `models` list that produced the scores.
methods = ['LogisticRegression', 'LinearSVM', 'RandomForest', 'XGBoost']

# The score lists interleave the two CV schemes per model: even slots hold the
# LeaveOneOut result, odd slots hold the repeated stratified k-fold result.
loo_slots = range(0, 8, 2)
kfold_slots = range(1, 8, 2)

# Each table entry is rendered as "mean (std)" with three decimals.
LOOCV1 = ['%.3f (%.3f)' % (tr_scores_mean[k], tr_scores_std[k]) for k in loo_slots]
LOOCV2 = ['%.3f (%.3f)' % (i_scores_mean[k], i_scores_std[k]) for k in loo_slots]
KFoldCV1 = ['%.3f (%.3f)' % (tr_scores_mean[k], tr_scores_std[k]) for k in kfold_slots]
KFoldCV2 = ['%.3f (%.3f)' % (i_scores_mean[k], i_scores_std[k]) for k in kfold_slots]

# One summary frame per feature set: `trans` for the X1 scores, `integ` for the
# X2_normal scores.
trans = pd.DataFrame({
    'Model': methods,
    'LeaveOneOut Accuracy': LOOCV1,
    'KFold Accuracy': KFoldCV1,
})
integ = pd.DataFrame({
    'Model': methods,
    'LeaveOneOut Accuracy': LOOCV2,
    'KFold Accuracy': KFoldCV2,
})

In [149]:
# Accuracy summary for the transcriptomics feature set (X1): "mean (std)" per model and CV scheme.
trans

Unnamed: 0,Model,LeaveOneOut Accuracy,KFold Accuracy
0,LogisticRegression,0.658 (0.475),0.646 (0.182)
1,LinearSVM,0.671 (0.470),0.669 (0.194)
2,RandomForest,0.753 (0.431),0.753 (0.171)
3,XGBoost,0.658 (0.475),0.673 (0.162)


In [151]:
# Accuracy summary for the scaled integrated feature set (X2_normal): "mean (std)" per model and CV scheme.
integ

Unnamed: 0,Model,LeaveOneOut Accuracy,KFold Accuracy
0,LogisticRegression,0.767 (0.423),0.769 (0.153)
1,LinearSVM,0.781 (0.414),0.777 (0.156)
2,RandomForest,0.808 (0.394),0.783 (0.146)
3,XGBoost,0.781 (0.414),0.775 (0.136)
