<a href="https://colab.research.google.com/github/azhao20/Week1_Public/blob/master/support_vector_machine_tester_AZhao.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
"""Creates and evaluates SVM models that predict whether a patient has hypothyroid disease."""
from itertools import combinations
import numpy as np
import pandas as pd

from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC 

import matplotlib as plt 

In [None]:
# Load the hypothyroid training data from the course's public GitHub
# repository into a Pandas dataframe.
url = (
    "https://raw.githubusercontent.com/BeaverWorksMedlytics2020/Data_Public/"
    "master/ChallengeProjects/Week1/allhypo.train.data.csv"
)
dataset = pd.read_csv(url)


In [None]:
# Tidy the raw table before any encoding happens.
# The "TBG" column is empty, so remove it from the dataframe outright.
dataset.drop(columns=["TBG"], inplace=True)

# Missing values are marked with "?" in the file; turn them into NaNs
missing_marker = "?"
dataset = dataset.replace(to_replace=missing_marker, value=np.nan)


In [None]:
# Modify the data to enable model creation through sklearn's SVC.
# Every step assigns its result back to `dataset` explicitly rather than
# calling `inplace=True` on a column selection: under pandas Copy-on-Write
# a column selection is a temporary object, so in-place edits made to it
# never reach the dataframe.

# Replace female entries with a zero and male entries with a one
dataset["Sex"] = dataset["Sex"].replace({"F": 0, "M": 1})

# Replace false entries with a zero and true entries with a one
dataset = dataset.replace({"f": 0, "t": 1})

# Remove patient numbers from the data: keep only the text that precedes
# the first "." in each class label (vectorised, no per-row assignment)
diagnosis = dataset["class"].str.split(".", n=1).str[0]

# Condense the target's unique values into binary data:
# 0 = negative, 1 = any of the three hypothyroid diagnoses.
# A label that is not listed here becomes NaN instead of silently staying
# a string.
class_to_target = {
    "negative": 0,
    "compensated hypothyroid": 1,
    "primary hypothyroid": 1,
    "secondary hypothyroid": 1,
}
dataset["class"] = diagnosis.map(class_to_target)


AttributeError: ignored

In [None]:
# Cast numerics to floats (enable modeling)
numerics = ["Age", "TSH", "T3", "TT4", "T4u", "FTI"]
dataset = dataset.astype({column: float for column in numerics})

# Replace NaNs with their respective column's median.
# `fillna` is given a Series of per-column medians, so only the numeric
# columns named in it are filled; the result is assigned back because an
# in-place fill on `dataset[column]` would only modify a temporary under
# pandas Copy-on-Write.
dataset = dataset.fillna(dataset[numerics].median())

# Drop remaining NaN entries from the dataset
dataset = dataset.dropna()


In [None]:
# Candidate model inputs: every column except the trailing two
# (the referral source and the target "class")
features = dataset.columns[:-2].tolist()
print(features)

# Hold out 20% of the rows for testing; the remaining 80% train the model
test_size = 0.2

# Seed for the shuffle that is applied to the dataframe before the split.
# The stated plan is to use a different seed for each model.
seed = 69


['Age', 'Sex', 'On thyroxine', 'query on thyroxine', 'on antithyroid medication', 'sick', 'pregnant', 'thyroid surgery', 'I131 treatment', 'query hypothyroid', 'query hyperthyroid', 'lithium', 'goitre', 'tumor', 'psych', 'TSH measured', 'TSH', 'T3 measured', 'T3', 'TT4 measured', 'TT4', 'T4U measured', 'T4u', 'FTI measured', 'FTI', 'TBG measured']


In [None]:
# The instructors asked for a "low amount of features", so the plan is to
# consider only combinations of three attributes or fewer.
# For now a single hand-picked combination is recorded and evaluated.
combos = [['On thyroxine', 'TSH']]

# Separate the chosen inputs and the target, then split them into
# training data (80%) and testing data (20%)
model_inputs = dataset[combos[0]]
target = dataset["class"]
X_train, X_test, y_train, y_test = train_test_split(
    model_inputs,
    target,
    test_size=test_size,
    random_state=seed,
)

# Build the classifier with sklearn's SVC and fit it to the training data
svm = SVC(C=.5, gamma=.5)
svm.fit(X_train, y_train)

# Run the fitted model on the held-out test rows
y_test_pred = svm.predict(X_test)

In [None]:
from sklearn import metrics

# The file-level `plt` alias is bound to the top-level `matplotlib` package,
# which is not the plotting interface, so pyplot is imported here under its
# own name.
from matplotlib import pyplot

labels = [0, 1]

# Score the held-out test rows. `decision_function` is used rather than
# `predict_proba` because the SVC above was built without probability=True,
# so probability estimates are not available. A larger score means the model
# leans more strongly towards the positive class (label 1).
y_test_score = svm.decision_function(X_test)

# Output the score and the true value for the first test point
print('Decision score:\n', y_test_score[0])
print('\nTrue Value:\n', y_test.values[0])

# Output every predicted label next to every true label
print(y_test_pred)
print(list(y_test.array))

# Calculate the FPR and TPR at varying thresholds (label 1 is the "positive" class)
fpr, tpr, thresholds = metrics.roc_curve(y_test, y_test_score, pos_label=labels[1])

# Calculate the area under the ROC curve
roc_auc = metrics.auc(fpr, tpr)

# Plot the ROC curve against the diagonal of a random classifier
lw = 2
pyplot.figure(figsize=(6, 6))
pyplot.plot(fpr, tpr, color='darkorange',
            lw=lw, label='ROC curve (area = %0.2f)' % roc_auc)
pyplot.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')
pyplot.xlim([0.0, 1.0])
pyplot.ylim([0.0, 1.0])
pyplot.xlabel('False Positive Rate')
pyplot.ylabel('True Positive Rate')
pyplot.title('Receiver operating characteristic')
pyplot.legend(loc="lower right")
pyplot.show()

AttributeError: ignored