In [30]:
import pandas as pd
import numpy as np
from sklearn.feature_selection import VarianceThreshold
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.metrics import roc_auc_score
from sklearn.metrics import recall_score, precision_score
from scipy.sparse import save_npz, load_npz
from sklearn.feature_selection import SelectKBest,f_classif
from sklearn.svm import LinearSVC

In [31]:
# Read the diabetic_data archive (path is relative to this notebook) into a DataFrame.
df = pd.read_csv(
    filepath_or_buffer="../../data/diabetic_data.zip",
)

In [32]:
# Load the sparse matrix stored in bow.npz
# (presumably bag-of-words features row-aligned with `df` -- TODO confirm).
bow_path = "../../data/processed/bow.npz"
bow = load_npz(bow_path)

In [33]:
# Binarise the target: 1 when `readmitted` is "<30", 0 for every other value.
# The column is overwritten in place, so the membership test also accepts an
# already-encoded 1. That keeps the cell idempotent: re-running it no longer
# collapses every label to 0 (the previous `x == "<30"` lambda did, because the
# column holds integers after the first run).
is_early_readmit = df['readmitted'].isin(["<30", 1])
df['readmitted'] = is_early_readmit.astype("int64")
y = df['readmitted'].values
X = bow

In [34]:
# Split features and labels with train_test_split's default proportions;
# the fixed random_state makes the partition repeatable across runs.
splits = train_test_split(X, y, random_state=33634)
X_train, X_test, y_train, y_test = splits

In [35]:
# Sweep the number of features kept by the univariate F-test filter
# (k = 50, 100, ..., 550) and print held-out ROC AUC, recall and precision for
# a class-weighted linear SVM at each setting.
# NOTE(review): the metrics are computed on the test split, so choosing k from
# this table would leak test information -- use cross-validation on the
# training split if k is going to be tuned from these numbers.
for k in np.arange(50,600,50):
    pipeline = Pipeline([
        # Remove constant columns before the F-test scoring step.
        ("varthres", VarianceThreshold()),
        ("select", SelectKBest(f_classif, k=k)),
        # Seed the solver (same seed as the train/test split) so repeated runs
        # print the same metrics; they may differ marginally from unseeded runs.
        ("clf", LinearSVC(class_weight="balanced", random_state=33634))
    ])
    pipeline.fit(X_train, y_train)
    # ROC AUC is ranked on the continuous margins; recall and precision use the
    # hard labels, which are predicted once and shared between both metrics.
    scores = pipeline.decision_function(X_test)
    y_pred = pipeline.predict(X_test)
    roc = roc_auc_score(y_test, scores)
    recall = recall_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    print("k = {}, ROC: {:.3f}, recall: {:.3f}, precision: {:.3f}".format(k, roc, recall, precision))

k = 50, ROC: 0.582, recall: 0.499, precision: 0.135
k = 100, ROC: 0.589, recall: 0.535, precision: 0.136
k = 150, ROC: 0.589, recall: 0.544, precision: 0.135
k = 200, ROC: 0.589, recall: 0.545, precision: 0.136
k = 250, ROC: 0.589, recall: 0.559, precision: 0.135
k = 300, ROC: 0.589, recall: 0.555, precision: 0.135
k = 350, ROC: 0.591, recall: 0.566, precision: 0.136
k = 400, ROC: 0.591, recall: 0.566, precision: 0.135
k = 450, ROC: 0.589, recall: 0.563, precision: 0.134
k = 500, ROC: 0.589, recall: 0.564, precision: 0.134
k = 550, ROC: 0.589, recall: 0.563, precision: 0.134
