In [53]:
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix, f1_score, roc_auc_score, accuracy_score, roc_curve
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.preprocessing import MinMaxScaler

from tqdm import tqdm

import pandas as pd
import numpy as np

In [54]:
data_full = pd.read_csv('../B3_working.csv', low_memory = False)
print(data_full.shape)

(7775, 1190)


In [55]:
X = np.array(data_full.drop(labels = 'y', axis = 1))
y = data_full['y']
y_binary = np.zeros(len(y))
y_binary[y == 'BBB+'] = 1
print(X.shape)
print(y.shape)

(7775, 1189)
(7775,)


In [56]:
mm = MinMaxScaler()
mm.fit(X)
Xs = mm.fit_transform(X)
print(Xs.shape)

(7775, 1189)


In [57]:
rs = 42
X_train, X_test, y_train, y_test = train_test_split(Xs, y_binary, stratify=y_binary, train_size=0.9, random_state=rs)
print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)

(6997, 1189)
(6997,)
(778, 1189)
(778,)


In [58]:
print(y_train.mean())
print(y_test.mean())

0.6344147491782193
0.6349614395886889


In [59]:
svm_lin = SVC(kernel='linear', probability=True)
svm_lin.fit(X_train, y_train)

SVC(kernel='linear', probability=True)

In [60]:
y_probs = svm_lin.predict_proba(X_test)
y_preds = svm_lin.predict(X_test)

In [61]:
print(y_preds)
print(y_probs)

[0. 0. 1. 1. 1. 1. 0. 1. 1. 1. 0. 1. 0. 0. 1. 1. 1. 1. 0. 1. 1. 1. 1. 1.
 0. 1. 0. 1. 0. 1. 1. 1. 1. 0. 1. 0. 1. 1. 1. 1. 1. 1. 1. 1. 0. 1. 1. 0.
 0. 1. 0. 1. 1. 1. 1. 0. 0. 1. 1. 0. 1. 1. 1. 0. 1. 1. 1. 1. 1. 1. 1. 1.
 1. 0. 1. 0. 1. 0. 1. 0. 0. 1. 1. 0. 1. 1. 1. 1. 0. 0. 0. 0. 1. 0. 1. 0.
 1. 1. 1. 1. 1. 0. 1. 1. 0. 0. 1. 1. 0. 1. 0. 1. 1. 1. 0. 1. 1. 1. 1. 1.
 0. 1. 0. 0. 0. 1. 0. 1. 1. 1. 0. 1. 1. 0. 1. 1. 1. 1. 1. 1. 0. 1. 0. 1.
 1. 0. 0. 1. 0. 0. 0. 1. 1. 1. 0. 0. 0. 1. 1. 0. 0. 1. 1. 0. 1. 0. 1. 1.
 1. 1. 0. 1. 1. 0. 1. 0. 1. 1. 0. 0. 1. 0. 0. 1. 1. 1. 1. 1. 1. 0. 0. 0.
 1. 1. 1. 1. 0. 0. 1. 1. 1. 1. 1. 0. 1. 0. 1. 1. 0. 0. 1. 0. 1. 1. 1. 0.
 0. 1. 0. 1. 1. 1. 1. 1. 1. 1. 0. 1. 1. 1. 1. 1. 1. 1. 1. 1. 0. 1. 1. 1.
 1. 0. 0. 1. 1. 1. 0. 1. 1. 1. 1. 0. 1. 1. 0. 1. 1. 1. 1. 1. 0. 0. 1. 0.
 1. 1. 1. 0. 1. 1. 1. 0. 1. 1. 1. 0. 1. 1. 1. 1. 1. 0. 1. 1. 1. 1. 1. 1.
 1. 1. 1. 0. 0. 0. 1. 0. 1. 0. 1. 1. 0. 0. 1. 1. 1. 0. 1. 1. 1. 1. 1. 1.
 1. 1. 1. 1. 1. 1. 0. 1. 1. 0. 1. 1. 1. 1. 1. 0. 0.

In [92]:
thresh_vals = np.arange(0.1, 0.91,0.01)

thresh = 0.3
skf = StratifiedKFold(n_splits = 5, shuffle = True, random_state=rs)
f = []
tpr= []
fpr = []
for train_id, test_id in tqdm(skf.split(X_train, y_train)):
    X_train_k, X_test_k = X_train[train_id], X_train[test_id]
    y_train_k, y_test_k = y_train[train_id], y_train[test_id]
    
    svm_lin.fit(X_train_k, y_train_k)
    probs = svm_lin.predict_proba(X_test_k)
    y_pred = np.where(probs[:,0] > thresh, 0, 1)
    f.append(f1_score(y_test_k, y_pred))
    tn,fp,fn,tp = confusion_matrix(y_test_k, y_pred).ravel()
    tpr.append(tp/(tp+fn))
    fpr.append(fp/(fp+tn))
f = np.array(f)

5it [03:29, 41.96s/it]


In [93]:
f.mean()

0.8736689415966719

In [94]:
print(tpr)

[0.8457207207207207, 0.8502252252252253, 0.8344594594594594, 0.8614864864864865, 0.8410372040586246]


In [95]:
print(fpr)

[0.169921875, 0.158203125, 0.18590998043052837, 0.1643835616438356, 0.115234375]


In [106]:
res = np.array([f.mean()]+fpr+tpr)
print(len(res))

11


In [107]:
def f_kcv(thresh):
    skf = StratifiedKFold(n_splits = 5, shuffle = True, random_state=rs)
    f = []
    tpr = []
    fpr = []
    print('Computing f_score for threshold:{}'.format(thresh))
    for train_id, test_id in tqdm(skf.split(X_train, y_train)):
        X_train_k, X_test_k = X_train[train_id], X_train[test_id]
        y_train_k, y_test_k = y_train[train_id], y_train[test_id]

        svm_lin.fit(X_train_k, y_train_k)
        probs = svm_lin.predict_proba(X_test_k)
        y_pred = np.where(probs[:,0] > thresh, 0, 1)
        f.append(f1_score(y_test_k, y_pred))
        tn,fp,fn,tp = confusion_matrix(y_test_k, y_pred).ravel()
        tpr.append(tp/(tp+fn))
        fpr.append(fp/(fp+tn))
    f = np.array(f)
    return np.array([f.mean()] + fpr + tpr)

In [108]:
f_kcv(0.3)

Computing f_score for threshold:0.3


5it [03:19, 39.97s/it]


array([0.87349503, 0.16992188, 0.1640625 , 0.18590998, 0.16438356,
       0.11523438, 0.84346847, 0.85247748, 0.83445946, 0.86148649,
       0.8421646 ])

In [75]:
res = [f_kcv(t) for t in [0.3,0.4]]

Computing f_score for threshold:0.3


5it [04:27, 53.48s/it]


Computing f_score for threshold:0.4


5it [03:33, 42.72s/it]


In [76]:
res

[0.8739133365245163, 0.8870044322942391]

In [110]:
thresh_vals = np.arange(0.1,0.91,0.1)
len(thresh_vals)

9