In [36]:
import os
import numpy as np
import argparse
import pickle

from sklearn.model_selection import *
from sklearn.ensemble import RandomForestClassifier as RF
from sklearn.metrics import roc_curve, auc as auc_score, confusion_matrix, f1_score
from sklearn.multiclass import OneVsRestClassifier
from sklearn.utils import shuffle

In [37]:
import pandas as pd

In [38]:
def single_score(y_te, yhat):
    fpr, tpr, thresholds = roc_curve(y_te, yhat)
    roc_auc = auc_score(fpr, tpr)
    optimal_idx = np.argmax(tpr - fpr)
    optimal_threshold = thresholds[optimal_idx]
    yhat[yhat>=optimal_threshold]=1; yhat[yhat<optimal_threshold]=0
    yhat=[int(i) for i in yhat]
    #matrix = confusion_matrix(y_te, yhat)
    tn, fp, fn, tp = confusion_matrix(y_te, yhat).ravel()
    sen=1.0* (tp/(tp+fn))
    spec=1.0* (tn/(tn+fp))
    f1=f1_score(y_te,yhat)
    return roc_auc, f1, sen, spec

In [39]:
X=np.load("X48.npy")
X

array([[4.22535211e-02, 0.00000000e+00, 5.43478261e-02, ...,
        4.03017024e-01, 1.33952979e-01, 4.75067826e-01],
       [2.58215962e-01, 1.18421053e-01, 3.26086957e-01, ...,
        4.10958588e-14, 0.00000000e+00, 0.00000000e+00],
       [2.11267606e-01, 4.05553814e-01, 8.69565217e-02, ...,
        2.52207581e-01, 1.88907108e-01, 2.31845699e-01],
       ...,
       [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, ...,
        4.10958588e-14, 0.00000000e+00, 0.00000000e+00],
       [0.00000000e+00, 2.10526316e-01, 0.00000000e+00, ...,
        2.64952543e-01, 2.88610525e-01, 4.89848547e-02],
       [3.20522201e-01, 1.57894737e-01, 0.00000000e+00, ...,
        4.83805376e-02, 3.83338758e-02, 1.06946021e-01]])

In [40]:
with open('y', 'rb') as f:
    labels = pickle.load(f)
task = [yy[0] for yy in labels]
y= np.array(task)
y

array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

In [41]:
skf = StratifiedKFold(n_splits=5)
count=0
data=[None]*5
for train_index, test_index in skf.split(X, y):
    print("TRAIN:", train_index, "TEST:", test_index)
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    model = RF(n_estimators = 450, verbose = 1)
    xs, ys = shuffle(X_train, y_train)
    model = model.fit(xs, ys)
    yhat = model.predict(X_train)
    tr_auc, _, _, _= single_score(y_train, yhat)
    yhat2 = model.predict(X_test)
    te_auc, f1_scor, sen, spec= single_score(y_test, yhat2)
    data[count] = {'tr_auc': tr_auc, 'f1_score':f1_scor, 'te_auc': te_auc, 'sen':sen, 'spec': spec}
    count+=1

TRAIN: [ 4964  4971  4994 ... 27613 27614 27615] TEST: [   0    1    2 ... 5602 5603 5604]


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 450 out of 450 | elapsed:   51.8s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 450 out of 450 | elapsed:    1.5s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 450 out of 450 | elapsed:    0.3s finished


TRAIN: [    0     1     2 ... 27613 27614 27615] TEST: [ 4964  4971  4994 ... 11217 11218 11219]


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 450 out of 450 | elapsed:   54.5s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 450 out of 450 | elapsed:    1.6s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 450 out of 450 | elapsed:    0.4s finished


TRAIN: [    0     1     2 ... 27613 27614 27615] TEST: [ 9861  9876  9893 ... 16776 16777 16778]


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 450 out of 450 | elapsed:   53.3s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 450 out of 450 | elapsed:    1.5s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 450 out of 450 | elapsed:    0.4s finished


TRAIN: [    0     1     2 ... 27613 27614 27615] TEST: [14885 14892 14917 ... 22221 22222 22223]


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 450 out of 450 | elapsed:   53.1s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 450 out of 450 | elapsed:    1.5s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 450 out of 450 | elapsed:    0.3s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


TRAIN: [    0     1     2 ... 22221 22222 22223] TEST: [20978 20997 21005 ... 27613 27614 27615]


[Parallel(n_jobs=1)]: Done 450 out of 450 | elapsed:   54.4s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 450 out of 450 | elapsed:    1.6s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 450 out of 450 | elapsed:    0.4s finished


In [42]:

data


[{'tr_auc': 1.0,
  'f1_score': 0.2345523329129887,
  'te_auc': 0.5663476849129844,
  'sen': 0.1411229135053111,
  'spec': 0.9915724563206577},
 {'tr_auc': 1.0,
  'f1_score': 0.2612035851472471,
  'te_auc': 0.5753493254028996,
  'sen': 0.15501519756838905,
  'spec': 0.99568345323741},
 {'tr_auc': 1.0,
  'f1_score': 0.2506265664160401,
  'te_auc': 0.5718634880900089,
  'sen': 0.15174506828528073,
  'spec': 0.9919819078947368},
 {'tr_auc': 1.0,
  'f1_score': 0.2562814070351759,
  'te_auc': 0.5737921229833879,
  'sen': 0.15477996965098634,
  'spec': 0.9928042763157895},
 {'tr_auc': 1.0,
  'f1_score': 0.2389937106918239,
  'te_auc': 0.5678642692776136,
  'sen': 0.1441578148710167,
  'spec': 0.9915707236842105}]

In [46]:
with open('raw_stats', 'wb') as f:
        pickle.dump(data, f)

In [48]:
pd.DataFrame(data).describe()

Unnamed: 0,tr_auc,f1_score,te_auc,sen,spec
count,5.0,5.0,5.0,5.0,5.0
mean,1.0,0.248332,0.571043,0.149364,0.992723
std,0.0,0.011305,0.003838,0.006363,0.00173
min,1.0,0.234552,0.566348,0.141123,0.991571
25%,1.0,0.238994,0.567864,0.144158,0.991572
50%,1.0,0.250627,0.571863,0.151745,0.991982
75%,1.0,0.256281,0.573792,0.15478,0.992804
max,1.0,0.261204,0.575349,0.155015,0.995683
