In [67]:
# include necessary libraries
import numpy as np
import pandas as pd

In [68]:
# Read CSV
heart_df = pd.read_csv('Heart_s.csv')
heart_df[0::10]

Unnamed: 0,Age,Gender,ChestPain,RestBP,Chol,RestECG,MaxHR,Oldpeak,Thal,AHD
0,63,f,typical,145,233,2,150,2.3,fixed,No
10,57,f,asymptomatic,140,192,0,148,0.4,fixed,No
20,64,f,typical,110,211,2,144,1.8,normal,No
30,69,m,typical,140,239,0,151,1.8,normal,No
40,65,m,asymptomatic,150,225,2,114,1.0,reversable,Yes
50,41,m,nontypical,105,198,0,168,0.0,normal,No
60,51,m,asymptomatic,130,305,0,142,1.2,reversable,Yes
70,65,m,nonanginal,155,269,0,148,0.8,normal,No
80,45,f,asymptomatic,104,208,2,148,3.0,normal,No
90,62,m,asymptomatic,160,164,2,145,6.2,reversable,Yes


In [70]:
# Only use columns that use numerical data
numerical_fields = ['Age', 'RestBP', 'Chol', 'RestECG', 'MaxHR', 'Oldpeak']

X = heart_df[numerical_fields]
y = heart_df['AHD']

In [72]:
# Import the 3 prediction algorithms to be used
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier

k = 3
my_knn = KNeighborsClassifier(n_neighbors=k) 
my_logreg = LogisticRegression()
my_decisiontree = DecisionTreeClassifier()

In [73]:
# import splitting algorithm
from sklearn.model_selection import train_test_split

# Create the training and testing sets
(X_train, X_test, y_train, y_test) = train_test_split(X, y, test_size=0.25, random_state=5)

In [74]:
# fit the models with the training sets
my_logreg.fit(X_train, y_train)
my_knn.fit(X_train, y_train)
my_decisiontree.fit(X_train, y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

In [75]:
# Make predictions for the testing set of data
y_predict_lr  = my_logreg.predict(X_test)
y_predict_knn = my_knn.predict(X_test)
y_predict_dt  = my_decisiontree.predict(X_test)

# print("y_predict_lr: ", y_predict_lr)
# print("y_predict_knn: ", y_predict_knn)
# print("y_predict_dt: ", y_predict_dt)

In [76]:
# Compare the predictions with the actual values of the test set
from sklearn.metrics import accuracy_score

score_lr  = accuracy_score(y_test, y_predict_lr)
score_knn = accuracy_score(y_test, y_predict_knn)
score_dt  = accuracy_score(y_test, y_predict_dt)

print("Logistic Regression: ", score_lr) 
print("KNeighbors Classifier: ", score_knn)
print("Decision Tree: ", score_dt)

Logistic Regression:  0.6973684210526315
KNeighbors Classifier:  0.7236842105263158
Decision Tree:  0.75


In [77]:
# Logistic Regression is the most accurate classifier
# KNN and DT are tied for worst.

In [78]:
#Function to seperate categorical features into binary columns
def seperate_categorical_features(dataset, field_column):
    unique_features = {} # define a dictionary that holds the names of the unique features 
                         # with a list filled with 1s/0s to represent if the row has that feature
    
    for i in field_column: # column of data for the categorical feature
        if i not in unique_features: # find all unique values for the category
            unique_features.setdefault(i, []) 
    
    # Print unique values
    print("Unique values of", field_column.name, ":", [key for key in unique_features.keys()])

    for i, row in dataset.iterrows():  # Create a n lists of 0 or 1 to represent the new columns
        for k,v in unique_features.items():
            v.append(1 if row[field_column.name] == k else 0)

    for k, v in unique_features.items():   # Assign the new lists to their appropriate columns
        dataset[k] = v
    
    return dataset

In [79]:
# Create new columns for Gender, ChestPain, and Thal distinction
heart_df = seperate_categorical_features(heart_df, heart_df.Gender)
heart_df = seperate_categorical_features(heart_df, heart_df.ChestPain)
heart_df = seperate_categorical_features(heart_df, heart_df.Thal)

heart_df[0::60]  # Print 5 values of the dataset to test new columns

Unique values of Gender : ['f', 'm']
Unique values of ChestPain : ['typical', 'asymptomatic', 'nonanginal', 'nontypical']
Unique values of Thal : ['fixed', 'normal', 'reversable']


Unnamed: 0,Age,Gender,ChestPain,RestBP,Chol,RestECG,MaxHR,Oldpeak,Thal,AHD,f,m,typical,asymptomatic,nonanginal,nontypical,fixed,normal,reversable
0,63,f,typical,145,233,2,150,2.3,fixed,No,1,0,1,0,0,0,1,0,0
60,51,m,asymptomatic,130,305,0,142,1.2,reversable,Yes,0,1,0,1,0,0,0,0,1
120,63,m,asymptomatic,150,407,2,154,4.0,reversable,Yes,0,1,0,1,0,0,0,0,1
180,56,m,asymptomatic,134,409,2,150,1.9,reversable,Yes,0,1,0,1,0,0,0,0,1
240,41,m,nontypical,126,306,0,163,0.0,normal,No,0,1,0,0,0,1,0,1,0
300,38,f,nonanginal,138,175,0,173,0.0,normal,No,1,0,0,0,1,0,0,1,0


In [85]:
feature_names = (set(heart_df))  # get feature names

# Remove categorical features
feature_names.remove('ChestPain')
feature_names.remove('Gender')
feature_names.remove('Thal')

X = heart_df[list(feature_names)]
y = heart_df['AHD']

(X_train, X_test, y_train, y_test) = train_test_split(X, y, test_size=0.25, random_state=5)