<a href="https://colab.research.google.com/github/adichiara/cs548/blob/master/cs548_p2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
import pandas as pd
import numpy as np
import scipy as stats
import scipy.stats as norm
from statistics import mean

import matplotlib.pyplot as plt
import seaborn as sns
from pandas.plotting import scatter_matrix

from sklearn.preprocessing import MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split 
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_val_predict
from sklearn.tree import DecisionTreeClassifier 
from sklearn.tree.export import export_text
from sklearn import metrics


#Data Preprocessing

In [0]:
# import dataset
df = pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/heart-disease/processed.cleveland.data', sep=",", header=None) 

In [0]:
# set column headers
column_names = ["age", "sex", "cp", "trestbps", "chol", 
              "fbs", "restecg", "thalach", "exang", 
              "oldpeak", "slope", "ca", "thal", "num"]
df.columns = column_names


In [0]:
# convert "?" values to NaN
df = df.replace("?", np.nan)

# convert numeric string values to float
df.thal = pd.to_numeric(df.thal, errors='coerce')
df.ca = pd.to_numeric(df.ca, errors='coerce')

# recode target variable - multiple true values to single true value
df.num.replace(to_replace=[1,2,3,4], value=1, inplace=True)


In [5]:
pd.isna(df).sum()
  

age         0
sex         0
cp          0
trestbps    0
chol        0
fbs         0
restecg     0
thalach     0
exang       0
oldpeak     0
slope       0
ca          4
thal        2
num         0
dtype: int64

In [0]:
#drop all observations rows with NaN data
df = df.dropna()
#re-index the dataframe so that there are no missing row index numbers.  
df = df.reset_index(drop=True)

In [0]:
# recode categorical variables with actual attribute descriptors

df_names = df.copy()

sex_dict = {1:"male",0:"female"}
cp_dict = {1:"typical angina",2:"atypical angina",3:"non-anginal pain",4:"asymptomatic"}
fbs_dict = {1:"true",0:"false"}
restecg_dict = {0:"normal",1:"ST-T wave abnormality",2:"left ventricular hypertrophy"}
exang_dict = {1:"yes",0:"no"}
slope_dict = {1:"upsloping",2:"flat",3:"downsloping"}
ca_dict = {0:"0 vessels",1:"1 vessels",2:"2 vessels",3:"3 vessels"}
thal_dict = {3:"normal",6:"fixed defect",7:"reversable defect"}
num_dict = {0:"<50% diameter narrowing",1:">50% diameter narrowing"}

df_names.sex.replace(to_replace=sex_dict, inplace=True)
df_names.cp.replace(to_replace=cp_dict, inplace=True) 
df_names.fbs.replace(to_replace=fbs_dict, inplace=True) 
df_names.restecg.replace(to_replace=restecg_dict, inplace=True) 
df_names.exang.replace(to_replace=exang_dict, inplace=True) 
df_names.slope.replace(to_replace=slope_dict, inplace=True) 
df_names.ca.replace(to_replace=ca_dict, inplace=True) 
df_names.thal.replace(to_replace=thal_dict, inplace=True) 
df_names.num.replace(to_replace=num_dict, inplace=True) 


# Dataset Exploration

In [0]:
vars_discrete = df_names.select_dtypes(include=['object']).columns

vars_continuous = ["age","trestbps","chol","thalach","oldpeak"]


In [0]:
df.shape

In [0]:
df.describe()

In [0]:
# plot value frequencies for discrete variables

plt.rcParams['figure.figsize'] = [4,2]


for i, col in enumerate(vars_discrete):
  plt.figure(i) 
  ax = sns.countplot(y=col, data=df_names)


In [0]:
n=len(vars_discrete)

fig,ax = plt.subplots(n,1, figsize=(5,n*2), sharex=False)
for i in range(n):
    plt.sca(ax[i])
    col = vars_discrete[i]
    sns.countplot(y=df_names[col].values, hue=df_names.num)
    plt.xlabel("")
    plt.title(col)
    plt.subplots_adjust(top=1.5)
    plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.0)
 

In [0]:

sns.pairplot(df_names, vars=vars_continuous, hue="num", height=1.5, plot_kws={"s":25, "alpha":.3} )



In [0]:
df[vars_discrete].corr()


In [0]:
plt.rcParams['figure.figsize'] = [10, 5]

df_true=df.loc[df.num==1,:]
df_false=df.loc[df.num==0,:]

# correlation heatmap
sns.heatmap(df_true[vars_discrete].corr(), 
            vmin=-1, vmax=1, center=0, 
            annot=True, fmt=".2f", linewidth=0.5,
            cmap="coolwarm_r")



In [0]:
# correlation heatmap
sns.heatmap(df_false[vars_discrete].corr(), 
            vmin=-1, vmax=1, center=0, 
            annot=True, fmt=".2f", linewidth=0.5,
            cmap="coolwarm_r")

# Classification Experiments

In [0]:
df_data = df.iloc[:,0:13]
target = df["num"]


In [0]:

def get_gini(x, a, target):
  partitions = x[a].unique()
  target_classes = x[target].unique()
  n = x.loc[:,target].count()
  
  gini={}
  gini_node={}

  for i,b in enumerate(partitions):
    print("i ",i)
    print("b ",b)
    for j,c in enumerate(target_classes):
      print("j ",j)
      print("c ",c)

      gini_node[i,0]= (len(x[(x[a]==i) & (x[target]==c)])/n)**2
  
  #gini[a] = (1-gini_node[partitions[0],0]-gini_node[partitions[0],1])
  gini = (1-gini_node[partitions[0],0]-gini_node[partitions[0],1])

  print(gini)
  return(gini)

get_gini(df_names, "sex", "num")


In [0]:
class_functions={"zeroR":"zeroR"}

In [0]:
# zeroR benchmarking function

def zeroR_class(x_train, x_test, y_train, y_test):
    
  pred = y_train.value_counts().index[0]  # find most common class 
  not_pred = y_train.unique()[~pred]  

  n = len(y_test)

  # confusion matrix
  a = sum(y_test==pred)      # true positive
  b = sum(y_test==not_pred)  # false negative
  c = sum(y_test!=pred)      # false positive
  d = sum(y_test!=not_pred)  # true negative
  
  # performance stats
  accuracy = a/n
  error = 1-accuracy
  precision = a/(a+c)
  recall = a/(a+b)
  
  perf = {"accuracy": accuracy, "error":error, "precision":precision, "recall":recall}
                              
  return(perf)  
  
 

In [0]:
# oneR benchmarking function

def oneR_class(x_train, x_test, y_train, y_test):
  
  perf = pd.DataFrame()
  
  for i,col in enumerate(x_train):
    
    
  pred = y_train.value_counts().index[0]  # find most common class 
  not_pred = y_train.unique()[~pred]  

  n = len(y_test)

  a = sum(y_test==pred)
  b = sum(y_test==not_pred)
  c = sum(y_test!=pred)
  d = sum(y_test!=not_pred)
  
  accuracy = a/n
  error = 1-accuracy
  precision = a/(a+c)
  recall = a/(a+b)
  
  perf = {"accuracy": accuracy, "error":error, "precision":precision, "recall":recall}
               
  return(perf)  
  

In [0]:
# 10 fold CV splitting function
def run_10fold_cv(alg, x,y):

  accuracy_list=[]
  error_list=[]
  precision_list=[]
  recall_list=[]
  
  # set kfold split generator
  kf = KFold(n_splits=10, shuffle=True)

  # iterate through folds
  for train, test in kf.split(x):
    
    # run specified classification alg on specified data
    scores = globals()[alg](x.iloc[train,:], # training data
                            x.iloc[test,:],  # test data
                            y[train],        # training class
                            y[test])         # test class
    
    # store performance metrics for each iteration
    accuracy_list.append(scores["accuracy"])
    error_list.append(scores["error"])
    precision_list.append(scores["precision"])
    recall_list.append(scores["recall"])
    
  #print(accuracy_list)
  #print(error_list)
  #print(precision_list)
  #print(recall_list)

  # return dictionary of mean scores
  return({"accuracy":mean(accuracy_list),
          "error":mean(error_list),
          "precision":mean(precision_list),
          "recall":mean(recall_list)}
        )
          
         


In [387]:
zeroR = run_10fold_cv("zeroR_class", df_data, target)
zeroR


[0.5483870967741935, 0.6451612903225806, 0.3548387096774194, 0.6333333333333333, 0.6, 0.6333333333333333, 0.4666666666666667, 0.5666666666666667, 0.5333333333333333, 0.43333333333333335]
[0.4516129032258065, 0.3548387096774194, 0.6451612903225806, 0.3666666666666667, 0.4, 0.3666666666666667, 0.5333333333333333, 0.43333333333333335, 0.4666666666666667, 0.5666666666666667]
[0.5483870967741935, 0.6451612903225806, 0.3548387096774194, 0.6333333333333333, 0.6, 0.6333333333333333, 0.4666666666666667, 0.5666666666666667, 0.5333333333333333, 0.43333333333333335]
[0.5483870967741935, 0.6451612903225806, 0.3548387096774194, 0.6333333333333333, 0.6, 0.5, 0.4666666666666667, 0.5666666666666667, 0.5333333333333333, 0.43333333333333335]


{'accuracy': 0.541505376344086,
 'error': 0.458494623655914,
 'precision': 0.541505376344086,
 'recall': 0.5281720430107527}

In [388]:
zeroR = run_10fold_cv("zeroR_class", df_data**100, target)
zeroR


[0.6129032258064516, 0.6451612903225806, 0.3548387096774194, 0.5666666666666667, 0.5, 0.7333333333333333, 0.43333333333333335, 0.6, 0.5, 0.4666666666666667]
[0.3870967741935484, 0.3548387096774194, 0.6451612903225806, 0.43333333333333335, 0.5, 0.2666666666666667, 0.5666666666666667, 0.4, 0.5, 0.5333333333333333]
[0.6129032258064516, 0.6451612903225806, 0.3548387096774194, 0.5666666666666667, 0.5, 0.7333333333333333, 0.43333333333333335, 0.6, 0.5, 0.4666666666666667]
[0.6129032258064516, 0.6451612903225806, 0.3548387096774194, 0.5666666666666667, 0.5, 0.7333333333333333, 0.43333333333333335, 0.6, 0.5, 0.4666666666666667]


{'accuracy': 0.5412903225806451,
 'error': 0.45870967741935487,
 'precision': 0.5412903225806451,
 'recall': 0.5412903225806451}

In [0]:
zeroR

In [0]:
# oneR 

oneR = pd.DataFrame(columns=["variable","frequency","class","error"])

discrete_stats=pd.DataFrame()

for i,val in enumerate(vars_discrete):
  discrete_stats.loc[i,"variable"] = val
  discrete_stats.loc[i,"min"] = min(df[vars_discrete[i]])
  discrete_stats.loc[i,"max"] = max(df[vars_discrete[i]])
  discrete_stats.loc[i,"unique"] = len(df[vars_discrete[i]].unique())
  discrete_stats.loc[i,"class"]= 
  

discrete_stats

#for i, col in enumerate(df_names.columns):
#  oneR.variable[i]=col
  


In [0]:
X_train, X_test, y_train, y_test = train_test_split(iris.data, iris.target, test_size=0.4, random_state=0)

In [0]:
clf = tree.DecisionTreeClassifier()
clf=clf.fit(df.iloc[:,0:13],df.num)
tree_plot = tree.plot_tree(clf)


In [0]:
export_text(clf)

In [0]:
df_data = df.loc[:,]

In [0]:

def tree_experiment(x,y): 
  clf=DecisionTreeClassifier()
  clf = clf.fit(x_train, y_train)

  y_pred = clf.predict(x_test)

  return(metrics.accuracy_score(y_test, y_pred))



In [0]:
print("accuracy: ", round(metrics.accuracy_score(y_test, y_pred),4))
print("tree depth: ",clf.get_depth())
print("n leaves: ",clf.get_n_leaves())
print("n features: ", clf.n_features_)

In [0]:



scores = []

scores.append(cross_val_score(clf, df_data, target, cv=10).mean)

scores

In [0]:
df.shape[1]

In [0]:
performance_df = pd.DataFrame(columns=["technique","parameter","accuracy"])






  

In [0]:
score=[]

#for n_features in range(1,df.shape[1]):
for n_features in range(1,100):
  clf=DecisionTreeClassifier(max_depth=n_features, criterion="entropy")
  score.append(cross_val_score(clf, df_data, target, cv=10).mean())

plt.hist(score)


# process 

In [0]:
# measure model accuracy function

# classification:
  # accuracy
  # precision
  # recall
  # ROC Area under the Curve
  # confusion matrix
  
# regression
  # correlation coefficient
  # residual standard error

# misc
  # size of tree
  # readability of tree
  # time

In [0]:
# zeroR

In [0]:
# oneR

In [0]:
# k folds cross validation

In [0]:
# fit model

In [0]:
# plot results