In [337]:
#imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os

In [338]:
#allows plots to appear directly in the notebook
%matplotlib inline

In [339]:
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

In [340]:
#upload data
df = pd.read_csv(r"C:\Users\Sierra\Documents\heart-disease.csv")

In [341]:
#examine variables
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 303 entries, 0 to 302
Data columns (total 14 columns):
age         303 non-null int64
sex         303 non-null int64
cp          303 non-null int64
trestbps    303 non-null int64
chol        303 non-null int64
fbs         303 non-null int64
restecg     303 non-null int64
thalach     303 non-null int64
exang       303 non-null int64
oldpeak     303 non-null float64
slope       303 non-null int64
ca          303 non-null int64
thal        303 non-null int64
target      303 non-null int64
dtypes: float64(1), int64(13)
memory usage: 33.3 KB


In [342]:
#examine first few rows
df.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1


In [343]:
#examine descriptive statistics
df.describe()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
count,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0
mean,54.366337,0.683168,0.966997,131.623762,246.264026,0.148515,0.528053,149.646865,0.326733,1.039604,1.39934,0.729373,2.313531,0.544554
std,9.082101,0.466011,1.032052,17.538143,51.830751,0.356198,0.52586,22.905161,0.469794,1.161075,0.616226,1.022606,0.612277,0.498835
min,29.0,0.0,0.0,94.0,126.0,0.0,0.0,71.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,47.5,0.0,0.0,120.0,211.0,0.0,0.0,133.5,0.0,0.0,1.0,0.0,2.0,0.0
50%,55.0,1.0,1.0,130.0,240.0,0.0,1.0,153.0,0.0,0.8,1.0,0.0,2.0,1.0
75%,61.0,1.0,2.0,140.0,274.5,0.0,1.0,166.0,1.0,1.6,2.0,1.0,3.0,1.0
max,77.0,1.0,3.0,200.0,564.0,1.0,2.0,202.0,1.0,6.2,2.0,4.0,3.0,1.0


In [344]:
#percent count of null values
df.isnull().sum()/len(df)*100

age         0.0
sex         0.0
cp          0.0
trestbps    0.0
chol        0.0
fbs         0.0
restecg     0.0
thalach     0.0
exang       0.0
oldpeak     0.0
slope       0.0
ca          0.0
thal        0.0
target      0.0
dtype: float64

In [345]:
#create dummy variables for cp
cp = pd.get_dummies(df['cp'],drop_first=True)

In [346]:
#print first few rows of cp
cp.head()

Unnamed: 0,1,2,3
0,0,0,1
1,0,1,0
2,1,0,0
3,1,0,0
4,0,0,0


In [347]:
#rename cp columns
cp.rename(columns={1:'cp1',2:'cp2',3:'cp3'}, inplace = True)

In [348]:
#check 
cp.head()

Unnamed: 0,cp1,cp2,cp3
0,0,0,1
1,0,1,0
2,1,0,0
3,1,0,0
4,0,0,0


In [349]:
#create dummy variables for restecg
restecg = pd.get_dummies(df['restecg'],drop_first=True)

In [350]:
#print first few rows of restecg
restecg.head()

Unnamed: 0,1,2
0,0,0
1,1,0
2,0,0
3,1,0
4,1,0


In [351]:
#rename restecg columns
restecg.rename(columns={1:'restecg1',2:'restecg2'}, inplace = True)

In [352]:
#check
restecg.head()

Unnamed: 0,restecg1,restecg2
0,0,0
1,1,0
2,0,0
3,1,0
4,1,0


In [353]:
#create dummy variables for slope
slope = pd.get_dummies(df['slope'],drop_first=True)

In [354]:
#print first few rows of slope
slope.head()

Unnamed: 0,1,2
0,0,0
1,0,0
2,0,1
3,0,1
4,0,1


In [355]:
#rename slope columns
slope.rename(columns={1:'slope1',2:'slope2'}, inplace = True)

In [356]:
#check
slope.head()

Unnamed: 0,slope1,slope2
0,0,0
1,0,0
2,0,1
3,0,1
4,0,1


In [357]:
#create dummy variables for ca
ca = pd.get_dummies(df['ca'],drop_first=True)

In [358]:
#print first few rows of ca
ca.head()

Unnamed: 0,1,2,3,4
0,0,0,0,0
1,0,0,0,0
2,0,0,0,0
3,0,0,0,0
4,0,0,0,0


In [359]:
#rename ca columns
ca.rename(columns={1:'ca1',2:'ca2',3:'ca3',4:'ca4'}, inplace = True)

In [360]:
#check
ca.head()

Unnamed: 0,ca1,ca2,ca3,ca4
0,0,0,0,0
1,0,0,0,0
2,0,0,0,0
3,0,0,0,0
4,0,0,0,0


In [361]:
#create dummy variables for thal
thal = pd.get_dummies(df['thal'],drop_first=True)

In [362]:
#print first few rows of thal
thal.head()

Unnamed: 0,1,2,3
0,1,0,0
1,0,1,0
2,0,1,0
3,0,1,0
4,0,1,0


In [363]:
#rename thal columns
thal.rename(columns={1:'thal1',2:'thal2',3:'thal3'}, inplace = True)

In [364]:
#check
thal.head()

Unnamed: 0,thal1,thal2,thal3
0,1,0,0
1,0,1,0
2,0,1,0
3,0,1,0
4,0,1,0


In [365]:
#drop original variables for the dataset
df.drop(['cp','restecg','slope','ca','thal'], inplace = True, axis = 1)

In [366]:
#check for correlative relationships among numeric variables
df.corr()

Unnamed: 0,age,sex,trestbps,chol,fbs,thalach,exang,oldpeak,target
age,1.0,-0.098447,0.279351,0.213678,0.121308,-0.398522,0.096801,0.210013,-0.225439
sex,-0.098447,1.0,-0.056769,-0.197912,0.045032,-0.04402,0.141664,0.096093,-0.280937
trestbps,0.279351,-0.056769,1.0,0.123174,0.177531,-0.046698,0.067616,0.193216,-0.144931
chol,0.213678,-0.197912,0.123174,1.0,0.013294,-0.00994,0.067023,0.053952,-0.085239
fbs,0.121308,0.045032,0.177531,0.013294,1.0,-0.008567,0.025665,0.005747,-0.028046
thalach,-0.398522,-0.04402,-0.046698,-0.00994,-0.008567,1.0,-0.378812,-0.344187,0.421741
exang,0.096801,0.141664,0.067616,0.067023,0.025665,-0.378812,1.0,0.288223,-0.436757
oldpeak,0.210013,0.096093,0.193216,0.053952,0.005747,-0.344187,0.288223,1.0,-0.430696
target,-0.225439,-0.280937,-0.144931,-0.085239,-0.028046,0.421741,-0.436757,-0.430696,1.0


In [367]:
#concatenate dummies to dataframe
df = pd.concat([df,cp,restecg,slope,ca,thal],axis=1)

In [368]:
#import
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

In [369]:
#scale data
StSc = StandardScaler()

In [370]:
#variables to be scaled
scaled_num_col = ['age','trestbps','chol','thalach','oldpeak']

In [371]:
#scale selected variables
df[scaled_num_col] = StSc.fit_transform(df[scaled_num_col])

In [372]:
#examine scaled data
df.describe()

Unnamed: 0,age,sex,trestbps,chol,fbs,thalach,exang,oldpeak,target,cp1,cp2,cp3,restecg1,restecg2,slope1,slope2,ca1,ca2,ca3,ca4,thal1,thal2,thal3
count,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0
mean,5.825923e-17,0.683168,-7.146832e-16,-9.828955000000001e-17,0.148515,-5.203025e-16,0.326733,-3.140136e-16,0.544554,0.165017,0.287129,0.075908,0.50165,0.013201,0.462046,0.468647,0.214521,0.125413,0.066007,0.016502,0.059406,0.547855,0.386139
std,1.001654,0.466011,1.001654,1.001654,0.356198,1.001654,0.469794,1.001654,0.498835,0.371809,0.453171,0.265288,0.500824,0.114325,0.499382,0.499842,0.411169,0.331734,0.248704,0.127605,0.236774,0.498528,0.487668
min,-2.797624,0.0,-2.148802,-2.32416,0.0,-3.439267,0.0,-0.8968617,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,-0.7572802,0.0,-0.6638668,-0.6814943,0.0,-0.7061105,0.0,-0.8968617,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.06988599,1.0,-0.09273778,-0.1210553,0.0,0.1466343,0.0,-0.2067053,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
75%,0.7316189,1.0,0.4783913,0.5456738,0.0,0.7151309,1.0,0.4834512,1.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0
max,2.49624,1.0,3.905165,6.140401,1.0,2.289429,1.0,4.451851,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [373]:
#logistic regression model

In [374]:
#independent variables
X = df.drop('target',axis=1)

In [375]:
#dependent variable
y = df['target']

In [376]:
#train/test split of the data
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.3,random_state=0)

In [377]:
#import
from sklearn.linear_model import LogisticRegression

In [378]:
#logit model
logmodel = LogisticRegression()

In [379]:
#model fit
logmodel.fit(X_train,y_train)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [380]:
#call predictions
predictions=logmodel.predict(X_test)

In [381]:
#classification report
print (classification_report(y_test,predictions))

              precision    recall  f1-score   support

           0       0.85      0.80      0.82        44
           1       0.82      0.87      0.85        47

    accuracy                           0.84        91
   macro avg       0.84      0.83      0.83        91
weighted avg       0.84      0.84      0.83        91



In [382]:
#compare distribution in data set to accuracy - looks good
df['target'].value_counts(normalize=True) * 100

1    54.455446
0    45.544554
Name: target, dtype: float64

In [383]:
#print confusio matrix - looks good
confusion_matrix (y_test,predictions)

array([[35,  9],
       [ 6, 41]], dtype=int64)

In [384]:
#decision tree model

In [385]:
#import
from sklearn.tree import DecisionTreeClassifier

In [386]:
#decision tree
dtree = DecisionTreeClassifier()

In [387]:
#model fit
dtree.fit(X_train,y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=None, splitter='best')

In [388]:
#call predictions
predictions = dtree.predict(X_test)

In [389]:
#print classification report
print(classification_report(y_test,predictions))

              precision    recall  f1-score   support

           0       0.64      0.64      0.64        44
           1       0.66      0.66      0.66        47

    accuracy                           0.65        91
   macro avg       0.65      0.65      0.65        91
weighted avg       0.65      0.65      0.65        91



In [390]:
#print confusion matrix
print(confusion_matrix(y_test,predictions))

[[28 16]
 [16 31]]


In [391]:
#random forest model

In [392]:
#import
from sklearn.ensemble import RandomForestClassifier

In [393]:
#random forest model
rfc = RandomForestClassifier(n_estimators=300)

In [394]:
#model fit
rfc.fit(X_train,y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=300,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [395]:
#call predictions
predictions = rfc.predict(X_test)

In [396]:
#print classification report
#notice an improvement compared to decesion tree model
print(classification_report(y_test,predictions))

              precision    recall  f1-score   support

           0       0.82      0.75      0.79        44
           1       0.78      0.85      0.82        47

    accuracy                           0.80        91
   macro avg       0.80      0.80      0.80        91
weighted avg       0.80      0.80      0.80        91



In [397]:
#print confusion matrix
print(confusion_matrix(y_test,predictions))

[[33 11]
 [ 7 40]]


In [None]:
#conclusion : logit model outperforms both models
#random forest improves decision tree model
#in terms of precision, recall, f1-score, and accuracy