In [4]:
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_predict, train_test_split, StratifiedKFold, GridSearchCV
from sklearn.metrics import accuracy_score, precision_score,recall_score
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

# Data loading

Load the features (probes) that are used in the SVM_th10 classifier.

In [5]:
# Read the classifier's supplementary feature table and preview its first rows.
features_10 = pd.read_csv(
    filepath_or_buffer='Classifier_2014_CCR_Supplementary_Table4.csv',
)
features_10.head(n=5)

Unnamed: 0,Oligo-ID,Gene symbols,Entrez gene IDs,RefSeq IDs,Gene IDs,Transcript IDs
0,A_23_P3643,ABCA13,ABCA13,NM_152701.3,ENSG00000179869,ENST00000544596; ENST00000411975; ENST00000435803
1,A_32_P1701,ABCC5,ABCC5,NM_005688.2; NM_001023587.1,ENSG00000114770,ENST00000427120
2,A_23_P27688,ADM5,ADM5,NM_001101340.1,ENSG00000224420,ENST00000420022
3,A_23_P70359,AGPAT4,AGPAT4,NM_020133.2,ENSG00000026652,
4,A_32_P141768,AGPAT4,AGPAT4,NM_020133.2,ENSG00000026652,ENST00000366911; ENST00000320285


In [6]:
# Sorted, de-duplicated 'Oligo-ID' values with missing entries dropped.
features = np.unique(
    features_10['Oligo-ID'].dropna().to_numpy()
)

In [7]:
# Use the CSV's first column as the index, then transpose so that the file's
# columns become the rows of `data`.
data = pd.read_csv(
    'exprsNorm_NB_oneColor709.csv',
    index_col=0,
).transpose()

In [8]:
# Keep only the columns named in `features` (column order follows `features`).
data = data[features]

In [9]:
# Write the current `data` frame to disk as CSV.
data.to_csv(path_or_buf='Prepared_data.csv')

In [10]:
# The header row of target.csv is its third line (header=2); rows are indexed
# by 'Array Barcode' and only the SVM_th10 classification column is kept.
target = (
    pd.read_csv('target.csv', header=2, index_col='Array Barcode')
    [['Classification SVM_th10']]
)
target.head(n=5)

Unnamed: 0_level_0,Classification SVM_th10
Array Barcode,Unnamed: 1_level_1
252038210001_1_1,1.0
252038210001_1_2,1.0
252038210001_1_4,1.0
252038210002_1_1,0.0
252038210002_1_2,0.0


In [11]:
# Index labels of the rows whose (single) label column is not missing.
# Series.notna() is used instead of np.isnan so the selection also works when
# the column is not a float dtype (np.isnan raises TypeError on object arrays).
idx = target.index[target.iloc[:, 0].notna()]

In [12]:
# Select the same index labels, in the same order, from the feature matrix
# and from the label table.
X, Y = data.loc[idx], target.loc[idx]

In [13]:
# 1-D copy of the first (only selected) label column as a NumPy array.
y = Y.to_numpy()[:, 0].copy()

# Fitting of the model

In [14]:
# Two-step pipeline: StandardScaler followed by a linear-kernel SVC.
# The step names ('std', 'svc') are what grid-search parameter keys refer to.
std, clf = StandardScaler(), SVC(kernel='linear')
model = Pipeline(steps=[('std', std), ('svc', clf)])

In [15]:
def _take_rows(data, rows):
    """Return the rows of `data` at integer positions `rows`.

    pandas objects are indexed positionally through ``.iloc`` so that integer
    fold indices are never interpreted as labels; anything else is indexed
    directly with ``data[rows]``.
    """
    return data.iloc[rows] if hasattr(data, 'iloc') else data[rows]


def nested_cros_val(clf, X, y, param_grid, scoring, grid_cv, scores,
                    ext_n_folds=5, repeats=10, verbose=False):
    """Run repeated nested cross-validation and print the resulting scores.

    For each of `repeats` repetitions the data are split into `ext_n_folds`
    shuffled stratified outer folds. On every outer training split a
    ``GridSearchCV`` over `param_grid` is fitted (refit on the whole training
    split), and the refitted best estimator is evaluated on the outer test
    split with each metric in `scores`.

    Parameters
    ----------
    clf : estimator
        Estimator passed to ``GridSearchCV`` as ``estimator``.
    X, y : array-like
        Features and labels. NumPy arrays, or pandas objects (rows are then
        selected positionally).
    param_grid : dict or list of dict
        Passed to ``GridSearchCV`` as ``param_grid``.
    scoring : str or callable
        Passed to ``GridSearchCV`` as ``scoring`` (model-selection criterion).
    grid_cv : int or CV splitter
        Passed to ``GridSearchCV`` as ``cv`` (inner cross-validation).
    scores : sequence of (name, metric)
        Metrics for the outer test splits; each ``metric`` is called as
        ``metric(y_test, y_pred)``.
    ext_n_folds : int, default 5
        Number of outer folds.
    repeats : int, default 10
        Number of repetitions; repetition ``i`` uses ``random_state=i``.
    verbose : bool, default False
        If true, also print the best parameters and best inner score of every
        grid search.

    Returns
    -------
    None
        All results are printed: one list per outer fold, then
        ``name: mean +- std`` per metric over all folds of all repeats.
    """
    results = []

    for i in range(repeats):
        print('****** Repeat {} of {} ******'.format(i+1, repeats))
        # The repeat number is the seed: a different split per repeat, yet
        # the whole procedure stays reproducible.
        cv = StratifiedKFold(n_splits=ext_n_folds, shuffle=True, random_state=i)

        for train_idx, test_idx in cv.split(X, y):
            X_train, y_train = _take_rows(X, train_idx), _take_rows(y, train_idx)
            X_test, y_test = _take_rows(X, test_idx), _take_rows(y, test_idx)

            clf_gr = GridSearchCV(estimator=clf, param_grid=param_grid,
                                  scoring=scoring, cv=grid_cv, refit=True)
            clf_gr.fit(X_train, y_train)

            if verbose:
                print('Best params:')
                print(clf_gr.best_params_)
                print('Best score:')
                print(clf_gr.best_score_)

            # refit=True: predict() uses the best estimator refitted on the
            # full outer training split.
            y_pred = clf_gr.predict(X_test)
            result = [score[1](y_test, y_pred) for score in scores]

            print('Test results:')
            print(result)
            results.append(result)
            print('---------------------------------------------------')

    # zip(*results) turns the per-fold rows into one tuple of values per metric.
    for value, score in zip(zip(*results), scores):
        print('{}: {:.3f} +- {:.3f}'.format(score[0], np.mean(value),
                                            np.std(value)))

In [16]:
# Grid of candidate values for the parameter 'C' of the pipeline step 'svc'.
param_grid = {
    'svc__C': [0.3, 0.5, 1.0, 1.3, 1.8],
}

In [18]:
# Nested CV: the inner 3-fold stratified grid search selects C by accuracy;
# accuracy, precision and recall are reported on the outer test folds.
nested_cros_val(
    model,
    X.to_numpy(),
    y,
    param_grid=param_grid,
    scoring='accuracy',
    grid_cv=StratifiedKFold(n_splits=3),
    scores=[
        ('accuracy', accuracy_score),
        ('precision', precision_score),
        ('recall', recall_score),
    ],
    repeats=10,
)

****** Repeat 1 of 10 ******
Test results:
[0.97637795275590555, 0.97999999999999998, 0.96078431372549022]
---------------------------------------------------
Test results:
[0.96850393700787396, 0.92727272727272725, 1.0]
---------------------------------------------------
Test results:
[0.98425196850393704, 1.0, 0.96078431372549022]
---------------------------------------------------
Test results:
[0.99212598425196852, 1.0, 0.98039215686274506]
---------------------------------------------------
Test results:
[0.97619047619047616, 0.96153846153846156, 0.98039215686274506]
---------------------------------------------------
****** Repeat 2 of 10 ******
Test results:
[0.99212598425196852, 0.98076923076923073, 1.0]
---------------------------------------------------
Test results:
[0.98425196850393704, 0.98039215686274506, 0.98039215686274506]
---------------------------------------------------
Test results:
[0.96062992125984248, 0.97916666666666663, 0.92156862745098034]
------------------

Our results
- accuracy: 0.976 +- 0.013
- precision: 0.971 +- 0.022
- recall: 0.969 +- 0.022