<a href="https://colab.research.google.com/github/azhao20/Week1_Public/blob/master/support_vector_machine_tester_AZhao.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
"""Creates and evaluates SVM models that predict whether a patient has hypothyroid disease."""
from itertools import combinations
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn import tree
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
import graphviz
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler

from sklearn.metrics import accuracy_score, roc_auc_score


In [None]:
# Read the hypothyroid data into a Pandas dataframe 
url = "https://raw.githubusercontent.com/BeaverWorksMedlytics2020/Data_Public/master/ChallengeProjects/Week1/allhypo.train.data.csv"
dataset = pd.read_csv(url) 


In [None]:
# Clean up the dataset
# Delete an empty column
del dataset["TBG"]

# Replace "?"s with NaNs
dataset = dataset.replace("?", np.nan)


In [None]:
# Modify the data to enable model creation through sklearn's SVC
# Replace female entries with a zero and male entries with a one
dataset['Sex'].replace("F", 0, inplace=True)
dataset['Sex'].replace("M", 1, inplace=True)

# Replace false entries with a zero and true entries with a one
dataset.replace("f", 0, inplace=True)
dataset.replace("t", 1, inplace=True)

# Remove patient numbers from the data  
for patient in range(dataset['class'].shape[0]):
  dataset['class'][patient] = dataset['class'][patient].split(".")[0]

# Condense the target's unique values into binary data
dataset['class'].replace('negative', 0, inplace=True)
dataset['class'].replace(["compensated hypothyroid", "primary hypothyroid",  "secondary hypothyroid"], 1, inplace=True)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if sys.path[0] == '':


In [None]:
# Cast numerics to floats (enable modeling)
numerics = ["Age", "TSH", "T3", "TT4", "T4u", "FTI"]

for numeric in numerics:
  dataset[numeric] = dataset[numeric].astype(float)

  # Replace NaNs with their respective column's median
  dataset[numeric].fillna(dataset[numeric].median(), inplace=True)
  
# Drop remaining NaN entries from the dataset
dataset = dataset.dropna()


In [None]:
# Assemble model features (exclude 	referral source	and target "class")
features = list(dataset.columns)[:-2]
print(features)

# We'll create an 80/20 split of training and testing data
test_size = .2

# Controls the shuffling applied to the dataframe before applying a split
# We'll use a different seed for each model to avoid overfitting
seed = 69

dataset.head(20)


['Age', 'Sex', 'On thyroxine', 'query on thyroxine', 'on antithyroid medication', 'sick', 'pregnant', 'thyroid surgery', 'I131 treatment', 'query hypothyroid', 'query hyperthyroid', 'lithium', 'goitre', 'tumor', 'psych', 'TSH measured', 'TSH', 'T3 measured', 'T3', 'TT4 measured', 'TT4', 'T4U measured', 'T4u', 'FTI measured', 'FTI', 'TBG measured']


Unnamed: 0,Age,Sex,On thyroxine,query on thyroxine,on antithyroid medication,sick,pregnant,thyroid surgery,I131 treatment,query hypothyroid,query hyperthyroid,lithium,goitre,tumor,psych,TSH measured,TSH,T3 measured,T3,TT4 measured,TT4,T4U measured,T4u,FTI measured,FTI,TBG measured,referral source,class
0,41.0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1.3,1,2.5,1,125.0,1,1.14,1,109.0,0,SVHC,0
1,23.0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,4.1,1,2.0,1,102.0,0,0.98,0,107.0,0,other,0
2,46.0,1.0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0.98,0,2.0,1,109.0,1,0.91,1,120.0,0,other,0
3,70.0,0.0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0.16,1,1.9,1,175.0,0,0.98,0,107.0,0,other,0
4,70.0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0.72,1,1.2,1,61.0,1,0.87,1,70.0,0,SVI,0
5,18.0,0.0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0.03,0,2.0,1,183.0,1,1.3,1,141.0,0,other,0
6,59.0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1.4,0,2.0,1,72.0,1,0.92,1,78.0,0,other,0
7,80.0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,2.2,1,0.6,1,80.0,1,0.7,1,115.0,0,SVI,0
8,66.0,0.0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0.6,1,2.2,1,123.0,1,0.93,1,132.0,0,SVI,0
9,68.0,1.0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,2.4,1,1.6,1,83.0,1,0.89,1,93.0,0,SVI,0


In [21]:
# The instructors asked for a "low amount of features" so...
# We'll only consider combinations of three atributes or less
# for num_features in range(1, 4):
  # Record every combination of features 
scaler=StandardScaler()
combos = [['On thyroxine', 'TSH'], ['On thyroxine', 'thyroid surgery', 'TSH'], ['On thyroxine', 'query hyperthyroid', 'TSH']]

#paras = [["TSH", "On thyroxine_f", "On thyroxine_t"], ["TSH", "On thyroxine_f", "On thyroxine_t", "thyroid surgery_f", "thyroid surgery_t"]]
#combo = ['TSH', 'FTI', 'TT4']
depths = [5, 10, 15]
leafs = [5, 10, 15]
estimators = [100, 200, 300]
estimator = 300
auc = []
#for para in paras:

numeric = list(dataset.columns)
numeric = numeric[:-2]

#norm_data = dataset
#for i in numerics:
    #feature_data = norm_data[i].values.reshape(-1, 1)
    #scaler.fit(feature_data)
    #norm_data[i] = scaler.transform(feature_data)

#for k, v in enumerate(numeric):
  #index = k
  #if index > 1 and index < 4:
val = list(combinations(numeric, 2))



for i in val:
  exploreFeature = []
  for ind in range(2):
    exploreFeature.append(i[ind])
  print(exploreFeature)
  highTest = 0
  highAuc = 0
  indTestL = None
  indAUCL = None
  indTestD = None
  indAUCD = None
  for depth in depths:
    #total_acc = 0
    #total_auc = 0
    for leaf in leafs:
      #for estimator in estimators:
        
      X_train_t, X_val_t, y_train_t, y_val_t = train_test_split(dataset[exploreFeature], dataset['class'], test_size=0.2, random_state=1)
      
      random_forest = RandomForestClassifier(criterion = "entropy", max_depth = depth, min_samples_leaf= leaf, n_estimators=estimator)
      
      random_forest.fit(X_train_t, y_train_t)
      
      y_train_forest_pred=random_forest.predict(X_train_t)
      
      y_val_forest_pred=random_forest.predict(X_val_t)

      total_acc = accuracy_score(y_val_t, y_val_forest_pred)
      total_auc = roc_auc_score(y_val_t, y_val_forest_pred)

      if total_acc > highTest:
        highTest = total_acc
        indTestL = leaf
        indTestD = depth
        #print("Depth: ", depth, " Highest acc:", highTest, " at:", indTest, " Highest AUC:", highAuc, " at:", indAUC)
      if total_auc > highAuc:
        highAuc = total_auc
        indAUCL = leaf
        indAUCD = depth

        #print("Depth: ", depth, " Highest acc:", highTest, " at:", indTest, " Highest AUC:", highAuc, " at:", indAUC)
      #auc.append(roc_auc_score(y_val_t,y_val_forest_pred))
      #seed+=1 

    #print("Depth: ", depth, " Highest acc:", highTest, " at:", indTest, " Highest AUC:", highAuc, " at:", indAUC)
    #print("Depth switching!")
     
      #print("Combo:", exploreFeature, "Acc:", total_acc, "AUC:", total_auc)
      #print("Depth:", depth, " Leaf:", leaf)
  #print("For:", para, "depth:", depth, "leaf:", leaf, "estimator:", estimator)
  #print("Train accuracy:", accuracy_score(y_train_t, y_train_forest_pred)*100, 
            #"val:", accuracy_score(y_val_t,y_val_forest_pred)*100)
  #print("AUC is", roc_auc_score(y_val_t,y_val_forest_pred))
  print("Highest acc:", highTest, " at leaf:", indTestL, "and depth:", indTestD, " Highest AUC:", highAuc, " at leaf:", indAUCL, " and depth:", indAUCD)


['Age', 'Sex']


KeyboardInterrupt: ignored

In [None]:
# The instructors asked for a "low amount of features" so...
# We'll only consider combinations of three atributes or less
# for num_features in range(1, 4):
  # Record every combination of features 
combos = [['On thyroxine', 'TSH'], ['On thyroxine', 'thyroid surgery', 'TSH'], ['On thyroxine', 'query hyperthyroid', 'TSH']]

goodOnes = [['Age', 'TSH']] #Anything with TSH is good


#Leaf 5, depth 5

#paras = [["TSH", "On thyroxine_f", "On thyroxine_t"], ["TSH", "On thyroxine_f", "On thyroxine_t", "thyroid surgery_f", "thyroid surgery_t"]]
#combo = ['TSH', 'FTI', 'TT4']
depths = [3]
leafs = [2]
#estimators = [100, 200, 300]
estimator = 300
auc = []
#for para in paras:

numeric = list(dataset.columns)
numeric = numeric[:-2]

del numeric[16]
print(numeric)

norm_data = dataset
for i in numeric:
    feature_data = norm_data[i].values.reshape(-1, 1)
    scaler.fit(feature_data)
    norm_data[i] = scaler.transform(feature_data)

#for k, v in enumerate(numeric):
  #index = k
  #if index > 1 and index < 4:
val = list(combinations(numeric, 2))

category = ['TSH', 'On thyroxine', 'FTI']

highestTest = 0
highestAuc = 0
indTestL2 = None
indAUCL2 = None
indTestD2 = None
indAUCD2 = None

val = list(combinations(numeric, 2))
exploreFeature = ["TSH"]
for ind in range(2):
    exploreFeature.append(i[ind])

for i in val:
  exploreFeature = ["TSH"]
  for ind in range(2):
    exploreFeature.append(i[ind])
  print(exploreFeature)
  highTest = 0
  highAuc = 0
  indTestL = None
  indAUCL = None
  indTestD = None
  indAUCD = None
  for depth in depths:
    #total_acc = 0
    #total_auc = 0
    for leaf in leafs:
      #for j in range(50):
        
      X_train_t, X_val_t, y_train_t, y_val_t = train_test_split(dataset[exploreFeature], dataset['class'], test_size=0.2, random_state=1)
      
      random_forest = RandomForestClassifier(criterion = "entropy", max_depth = depth, min_samples_leaf= leaf, n_estimators=estimator)
      
      #Runs the model
      random_forest.fit(X_train_t, y_train_t)
      
      y_train_forest_pred=random_forest.predict(X_train_t)
      
      y_val_forest_pred=random_forest.predict(X_val_t)

      #Calculates acc and auc
      total_acc = accuracy_score(y_val_t, y_val_forest_pred)
      total_auc = roc_auc_score(y_val_t, y_val_forest_pred)

      #Finds the highest per each leaf
      if total_acc > highTest:
        highTest = total_acc
        indTestL = leaf
        indTestD = depth
        if total_acc > highestTest:
          highestTest = total_acc
          indTestL2 = leaf
          indTestD2 = depth
      if total_auc > highAuc:
        highAuc = total_auc
        indAUCL = leaf
        indAUCD = depth
        if total_auc > highestAuc:
          highestAuc = total_auc
          indAUCL2 = leaf
          indAUCD2 = depth
      #step+=1

  #print("Highest acc:", highTest, " at leaf:", indTestL, "and depth:", indTestD, " Highest AUC:", highAuc, " at leaf:", indAUCL, " and depth:", indAUCD)
  print("Highest acc:", highTest," Highest AUC:", highAuc)

#print("End program acc:", highestTest, "at leaf:", indTestL2, "and depth:", indTestD2, " Highest AUC:", highestAuc, "at leaf:", indAUCL2, " and depth:", indAUCD2)
print("End program acc:", highestTest, " Highest AUC:", highestAuc)

['Age', 'Sex', 'On thyroxine', 'query on thyroxine', 'on antithyroid medication', 'sick', 'pregnant', 'thyroid surgery', 'I131 treatment', 'query hypothyroid', 'query hyperthyroid', 'lithium', 'goitre', 'tumor', 'psych', 'TSH measured', 'T3 measured', 'T3', 'TT4 measured', 'TT4', 'T4U measured', 'T4u', 'FTI measured', 'FTI', 'TBG measured']
['TSH', 'Age', 'Sex']
Highest acc: 0.9758364312267658  at leaf: 2 and depth: 3  Highest AUC: 0.9870259481037924  at leaf: 2  and depth: 3
['TSH', 'Age', 'On thyroxine']
Highest acc: 0.9869888475836431  at leaf: 2 and depth: 3  Highest AUC: 0.9679829530128932  at leaf: 2  and depth: 3
['TSH', 'Age', 'query on thyroxine']
Highest acc: 0.9776951672862454  at leaf: 2 and depth: 3  Highest AUC: 0.9880239520958084  at leaf: 2  and depth: 3
['TSH', 'Age', 'on antithyroid medication']
Highest acc: 0.9758364312267658  at leaf: 2 and depth: 3  Highest AUC: 0.9870259481037924  at leaf: 2  and depth: 3
['TSH', 'Age', 'sick']
Highest acc: 0.9758364312267658  at 

In [22]:
# The instructors asked for a "low amount of features" so...
# We'll only consider combinations of three atributes or less
# for num_features in range(1, 4):
  # Record every combination of features 
combos = [['On thyroxine', 'TSH'], ['On thyroxine', 'thyroid surgery', 'TSH'], ['On thyroxine', 'query hyperthyroid', 'TSH']]

goodOnes = [['Age', 'TSH']] #Anything with TSH is good


#Leaf 5, depth 5

#paras = [["TSH", "On thyroxine_f", "On thyroxine_t"], ["TSH", "On thyroxine_f", "On thyroxine_t", "thyroid surgery_f", "thyroid surgery_t"]]
#combo = ['TSH', 'FTI', 'TT4']
depths = [3]
leafs = [2]
#estimators = [100, 200, 300]
estimator = 300
auc = []
#for para in paras:

numeric = list(dataset.columns)
numeric = numeric[:-2]

del numeric[16]
print(numeric)

#norm_data = dataset
#for i in numeric:
    #feature_data = norm_data[i].values.reshape(-1, 1)
    #scaler.fit(feature_data)
    #norm_data[i] = scaler.transform(feature_data)

#for k, v in enumerate(numeric):
  #index = k
  #if index > 1 and index < 4:
val = list(combinations(numeric, 2))

exploreFeature = ['TSH', 'On thyroxine', 'FTI']

highestTest = 0
highestAuc = 0
indTestL2 = None
indAUCL2 = None
indTestD2 = None
indAUCD2 = None


exploreFeature = ['TSH', 'On thyroxine', 'FTI']
highTest = 0
highAuc = 0
indTestL = None
indAUCL = None
indTestD = None
indAUCD = None
step = 69

    

leaf = 2
depth = 3
total_acc = 0
total_auc = 0

for test in range(100):
  X_train_t, X_val_t, y_train_t, y_val_t = train_test_split(dataset[exploreFeature], dataset['class'], test_size=0.2, random_state=step)

  random_forest = RandomForestClassifier(criterion = "entropy", max_depth = depth, min_samples_leaf= leaf, n_estimators=estimator)

  #Runs the model
  random_forest.fit(X_train_t, y_train_t)

  y_train_forest_pred=random_forest.predict(X_train_t)

  y_val_forest_pred=random_forest.predict(X_val_t)

  #Calculates acc and auc
  total_acc += accuracy_score(y_val_t, y_val_forest_pred)
  total_auc += roc_auc_score(y_val_t, y_val_forest_pred)

  step+=1
  print(total_acc, " ", total_auc)

#print("Highest acc:", highTest, " at leaf:", indTestL, "and depth:", indTestD, " Highest AUC:", highAuc, " at leaf:", indAUCL, " and depth:", indAUCD)
print("Highest acc:", total_acc/100.0," Highest AUC:", total_auc/100.0)


['Age', 'Sex', 'On thyroxine', 'query on thyroxine', 'on antithyroid medication', 'sick', 'pregnant', 'thyroid surgery', 'I131 treatment', 'query hypothyroid', 'query hyperthyroid', 'lithium', 'goitre', 'tumor', 'psych', 'TSH measured', 'T3 measured', 'T3', 'TT4 measured', 'TT4', 'T4U measured', 'T4u', 'FTI measured', 'FTI', 'TBG measured']
0.9944237918215614   0.9969574036511156
1.9851301115241635   1.9817894974081587
2.983271375464684   2.980783461190855
3.983271375464684   3.980783461190855
4.973977695167286   4.975628822015597
5.9646840148698885   5.970431316818092
6.962825278810409   6.9694353008818375
7.951672862453532   7.955465865859771
8.951672862453531   8.955465865859772
9.944237918215613   9.95131648826641
10.933085501858736   10.94529239188087
11.927509293680297   11.942310284525005
12.925650557620816   12.925643617858338
13.920074349442377   13.912968660108197
14.910780669144978   14.897300349091049
15.90148698884758   15.872035530519943
16.89776951672862   16.85912270409