In [105]:
import pandas as pd
from sklearn.model_selection import KFold
from sklearn.metrics import roc_curve, auc, roc_auc_score

# Models
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier

In [106]:
# Load the Cleveland heart-disease CSV.
# `DataFrame.from_csv` was removed in pandas 1.0; `read_csv` with
# `index_col=0, parse_dates=True` reproduces the defaults it used.
data = pd.read_csv(
    "data/uci_heartdisease_dataset/cleveland_data_14.csv",
    index_col=0,
    parse_dates=True,
)
# Move the first CSV column back out of the index so it is an ordinary column.
data = data.reset_index()

In [107]:
def convert_thallium_scan(num):
    """Encode a thallium-scan reading as a small numeric code.

    Args:
        num: The raw reading, either as text (e.g. ``'3.0'``) or as a number
            (e.g. ``3.0``).

    Returns:
        ``0.0`` if the reading equals 3, ``1.0`` if it equals 6, and ``2.0``
        for anything else, including values that cannot be read as a number.
    """
    try:
        # Compare numerically so '3.0', '3' and 3.0 are all treated alike.
        reading = float(num)
    except (TypeError, ValueError):
        # Unparseable entries (e.g. '?') fall into the catch-all bucket.
        return 2.0
    if reading == 3.0:
        return 0.0
    if reading == 6.0:
        return 1.0
    return 2.0
def convert_disease_status(num):
    """Collapse a disease-status value into a binary label.

    Returns 1 when ``num`` is greater than zero and 0 otherwise.
    """
    return 1 if num > 0 else 0

In [108]:
# Keep only rows where neither vessels_colored nor thallium_scan is the string '?'.
# `.copy()` makes the result an independent frame, so assigning to its columns
# afterwards does not raise pandas' SettingWithCopyWarning.
data = data[(data.vessels_colored != '?') & (data.thallium_scan != '?')].copy()

In [110]:
# Parse the whole column in one vectorised call rather than invoking
# pd.to_numeric once per element.
data['vessels_colored'] = pd.to_numeric(data['vessels_colored'])
# NOTE: the columns below are overwritten in place; re-running this cell would
# feed already-recoded thallium_scan values back through convert_thallium_scan.
data['thallium_scan'] = data['thallium_scan'].apply(convert_thallium_scan)
data["disease_status"] = data["disease_status"].apply(convert_disease_status)

In [111]:
# Split the label column off the frame: `pop` returns the column and removes
# it from `data` in one step, leaving the remaining columns in place.
outcomes = data.pop('disease_status').values.flatten()

In [112]:
# Show the row count of the frame and of the label array, one per line.
print(len(data), len(outcomes), sep="\n")

297
297


In [117]:
# `DataFrame.as_matrix()` was removed in pandas 1.0; `.values` returns the same
# NumPy array and works on both old and new pandas versions.
features = data.values

In [114]:
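# Vanilla (default-constructed) model results for KFold(4).
# Figures appear to be mean accuracy (the SVC value matches the run below).
# SVC - 0.53
# Logistic Regression - 0.83
# GP - 0.60
# DecisionTreeClassifier - 0.70
# RandomForestClassifier - 0.82
# MLPClassifier - 0.5784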

In [167]:
# 4-fold cross-validation. KFold does not shuffle by default, so the folds are
# contiguous slices of the rows in their current order.
kf = KFold(4)

scores = []  # per-fold value of clf.score (accuracy)
aucs = []    # per-fold ROC AUC

for train_index, test_index in kf.split(features):
    feature_train, feature_test = features[train_index], features[test_index]
    outcome_train, outcome_test = outcomes[train_index], outcomes[test_index]

    # To compare models, swap in any of the other imported estimators here:
    # LogisticRegression(), GaussianProcessClassifier(),
    # DecisionTreeClassifier(), RandomForestClassifier() or MLPClassifier().
    clf = SVC()
    clf.fit(feature_train, outcome_train)

    # roc_auc_score needs a continuous score per sample. Use the decision
    # function when the estimator has one; otherwise fall back to the
    # predicted probability of class 1.
    if hasattr(clf, "decision_function"):
        outcome_confidence_scores = clf.decision_function(feature_test)
    else:
        outcome_confidence_scores = clf.predict_proba(feature_test)[:, 1]

    # Named `fold_auc` so the `auc` function imported from sklearn.metrics
    # is not overwritten by a float.
    fold_auc = roc_auc_score(outcome_test, outcome_confidence_scores)

    scores.append(clf.score(feature_test, outcome_test))
    aucs.append(fold_auc)

print("Accuracy Average:", sum(scores) / len(scores))
print("AUC Average:", sum(aucs) / len(aucs))

Accuracy Average: 0.538783783784
AUC Average: 0.620082763224
