In [200]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [152]:
df = pd.read_csv("Heart.csv")
df.head()

Unnamed: 0.1,Unnamed: 0,Age,Sex,ChestPain,RestBP,Chol,Fbs,RestECG,MaxHR,ExAng,Oldpeak,Slope,Ca,Thal,AHD
0,1,63,1,typical,145,233,1,2,150,0,2.3,3,0.0,fixed,No
1,2,67,1,asymptomatic,160,286,0,2,108,1,1.5,2,3.0,normal,Yes
2,3,67,1,asymptomatic,120,229,0,2,129,1,2.6,2,2.0,reversable,Yes
3,4,37,1,nonanginal,130,250,0,0,187,0,3.5,3,0.0,normal,No
4,5,41,0,nontypical,130,204,0,2,172,0,1.4,1,0.0,normal,No


In [153]:
df.columns

Index(['Unnamed: 0', 'Age', 'Sex', 'ChestPain', 'RestBP', 'Chol', 'Fbs',
       'RestECG', 'MaxHR', 'ExAng', 'Oldpeak', 'Slope', 'Ca', 'Thal', 'AHD'],
      dtype='object')

Clarifications about Dataset
 - Sex: (1 = male; 0 = female) 
 - fbs: (fasting blood sugar > 120 mg/dl) (1 = true; 0 = false) 
 - restecg: (2 =  showing probable or definite left ventricular hypertrophy by Estes' criteria ; 1 = having ST-T wave abnormality; 0 = Normal)
 - oldpeak = ST depression induced by exercise relative to rest 
 - exang: exercise induced angina (1 = yes; 0 = no) 
 - slope: the slope of the peak exercise ST segment (1 = upsloping; 2 = flat; 3 = downsloping)
 - ca: number of major vessels (0-3) colored by flourosopy 

In [154]:
X = df.iloc[:,1:-1]
X

Unnamed: 0,Age,Sex,ChestPain,RestBP,Chol,Fbs,RestECG,MaxHR,ExAng,Oldpeak,Slope,Ca,Thal
0,63,1,typical,145,233,1,2,150,0,2.3,3,0.0,fixed
1,67,1,asymptomatic,160,286,0,2,108,1,1.5,2,3.0,normal
2,67,1,asymptomatic,120,229,0,2,129,1,2.6,2,2.0,reversable
3,37,1,nonanginal,130,250,0,0,187,0,3.5,3,0.0,normal
4,41,0,nontypical,130,204,0,2,172,0,1.4,1,0.0,normal
...,...,...,...,...,...,...,...,...,...,...,...,...,...
298,45,1,typical,110,264,0,0,132,0,1.2,2,0.0,reversable
299,68,1,asymptomatic,144,193,1,0,141,0,3.4,2,2.0,reversable
300,57,1,asymptomatic,130,131,0,0,115,1,1.2,2,1.0,reversable
301,57,0,nontypical,130,236,0,2,174,0,0.0,2,1.0,normal


In [155]:
X.describe()

Unnamed: 0,Age,Sex,RestBP,Chol,Fbs,RestECG,MaxHR,ExAng,Oldpeak,Slope,Ca
count,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,299.0
mean,54.438944,0.679868,131.689769,246.693069,0.148515,0.990099,149.607261,0.326733,1.039604,1.60066,0.672241
std,9.038662,0.467299,17.599748,51.776918,0.356198,0.994971,22.875003,0.469794,1.161075,0.616226,0.937438
min,29.0,0.0,94.0,126.0,0.0,0.0,71.0,0.0,0.0,1.0,0.0
25%,48.0,0.0,120.0,211.0,0.0,0.0,133.5,0.0,0.0,1.0,0.0
50%,56.0,1.0,130.0,241.0,0.0,1.0,153.0,0.0,0.8,2.0,0.0
75%,61.0,1.0,140.0,275.0,0.0,2.0,166.0,1.0,1.6,2.0,1.0
max,77.0,1.0,200.0,564.0,1.0,2.0,202.0,1.0,6.2,3.0,3.0


Pre-processing that needs to be done
- Standardize: Age, RestBP, MaxHR, Chol
- Normalize: Oldpeak
- OneHotEncode: ChestPain, RestECG, Slope, Ca, Thal

That is Done: Sex, Fbs, ExAng

In [156]:
cp_enc = pd.get_dummies(X['ChestPain'], drop_first=True, prefix='ChestPain')
rest_enc = pd.get_dummies(X['RestECG'], drop_first=True, prefix="RestECG")
slope_enc = pd.get_dummies(X['Slope'], drop_first=True, prefix='Slope')
ca_enc = pd.get_dummies(X['Ca'], drop_first=True, prefix='Ca')
thal_enc = pd.get_dummies(X['Thal'], drop_first=True, prefix='Thal')
X = X.join(cp_enc)
X = X.join(rest_enc)
X = X.join(slope_enc)
X = X.join(ca_enc)
X = X.join(thal_enc)

In [157]:
X

Unnamed: 0,Age,Sex,ChestPain,RestBP,Chol,Fbs,RestECG,MaxHR,ExAng,Oldpeak,...,ChestPain_typical,RestECG_1,RestECG_2,Slope_2,Slope_3,Ca_1.0,Ca_2.0,Ca_3.0,Thal_normal,Thal_reversable
0,63,1,typical,145,233,1,2,150,0,2.3,...,1,0,1,0,1,0,0,0,0,0
1,67,1,asymptomatic,160,286,0,2,108,1,1.5,...,0,0,1,1,0,0,0,1,1,0
2,67,1,asymptomatic,120,229,0,2,129,1,2.6,...,0,0,1,1,0,0,1,0,0,1
3,37,1,nonanginal,130,250,0,0,187,0,3.5,...,0,0,0,0,1,0,0,0,1,0
4,41,0,nontypical,130,204,0,2,172,0,1.4,...,0,0,1,0,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
298,45,1,typical,110,264,0,0,132,0,1.2,...,1,0,0,1,0,0,0,0,0,1
299,68,1,asymptomatic,144,193,1,0,141,0,3.4,...,0,0,0,1,0,0,1,0,0,1
300,57,1,asymptomatic,130,131,0,0,115,1,1.2,...,0,0,0,1,0,1,0,0,0,1
301,57,0,nontypical,130,236,0,2,174,0,0.0,...,0,0,1,1,0,1,0,0,1,0


In [158]:
X.drop(columns = ['ChestPain', 'RestECG', 'Slope', 'Ca', 'Thal'], inplace = True)

In [159]:
y = df.iloc[:,-1]
y.replace(('Yes', 'No'),(1, 0), inplace=True)
y

0      0
1      1
2      1
3      0
4      0
      ..
298    1
299    1
300    1
301    1
302    0
Name: AHD, Length: 303, dtype: int64

In [160]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [161]:
X_train

Unnamed: 0,Age,Sex,RestBP,Chol,Fbs,MaxHR,ExAng,Oldpeak,ChestPain_nonanginal,ChestPain_nontypical,ChestPain_typical,RestECG_1,RestECG_2,Slope_2,Slope_3,Ca_1.0,Ca_2.0,Ca_3.0,Thal_normal,Thal_reversable
155,70,1,130,322,0,109,0,2.4,0,0,0,0,1,1,0,0,0,1,1,0
177,56,1,132,184,0,105,1,2.1,0,0,0,0,1,1,0,1,0,0,0,0
39,61,1,150,243,1,137,1,1.0,1,0,0,0,0,1,0,0,0,0,1,0
131,51,1,94,227,0,154,1,0.0,1,0,0,0,0,0,0,1,0,0,0,1
150,52,1,152,298,1,178,0,1.2,0,0,1,0,0,1,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
89,51,0,130,256,0,149,0,0.5,1,0,0,0,1,0,0,0,0,0,1,0
151,42,0,102,265,0,122,0,0.6,0,0,0,0,1,1,0,0,0,0,1,0
233,74,0,120,269,0,121,1,0.2,0,1,0,0,1,0,0,1,0,0,1,0
58,54,1,125,273,0,152,0,0.5,1,0,0,0,1,0,1,1,0,0,1,0


In [162]:
from sklearn.preprocessing import StandardScaler
sc1 = StandardScaler()
X_train.loc[:, ['Age', 'RestBP', 'MaxHR', 'Chol']] = sc1.fit_transform(X_train.loc[:,['Age', 'RestBP', 'MaxHR', 'Chol']])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item] = s


In [163]:
from sklearn.preprocessing import MinMaxScaler
normalizer = MinMaxScaler()
X_train.loc[:,['Oldpeak']] = normalizer.fit_transform(X_train.loc[:, ['Oldpeak']])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item] = s


In [164]:
X_train

Unnamed: 0,Age,Sex,RestBP,Chol,Fbs,MaxHR,ExAng,Oldpeak,ChestPain_nonanginal,ChestPain_nontypical,ChestPain_typical,RestECG_1,RestECG_2,Slope_2,Slope_3,Ca_1.0,Ca_2.0,Ca_3.0,Thal_normal,Thal_reversable
155,1.723728,1,-0.075833,1.431461,0,-1.722568,0,0.387097,0,0,0,0,1,1,0,0,0,1,1,0
177,0.179574,1,0.036067,-1.241453,0,-1.891973,1,0.338710,0,0,0,0,1,1,0,1,0,0,0,0
39,0.731058,1,1.043160,-0.098686,1,-0.536739,1,0.161290,1,0,0,0,0,1,0,0,0,0,1,0
131,-0.371910,1,-2.090018,-0.408589,0,0.183230,1,0.000000,1,0,0,0,0,0,0,1,0,0,0,1
150,-0.261613,1,1.155059,0.966606,1,1.199655,0,0.193548,0,0,1,0,0,1,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
89,-0.371910,0,-0.075833,0.153111,0,-0.028526,0,0.080645,1,0,0,0,1,0,0,0,0,0,1,0
151,-1.364580,0,-1.642422,0.327431,0,-1.172005,0,0.096774,0,0,0,0,1,1,0,0,0,0,1,0
233,2.164915,0,-0.635329,0.404907,0,-1.214356,1,0.032258,0,1,0,0,1,0,0,1,0,0,1,0
58,-0.041019,1,-0.355581,0.482383,0,0.098527,0,0.080645,1,0,0,0,1,0,1,1,0,0,1,0


Data Pre-processing complete

In [165]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
classifier = LogisticRegression()
classifier.fit(X_train, y_train)
yhat = classifier.predict(X_test)
cm = confusion_matrix(y_test, yhat)
cm

array([[ 1, 36],
       [ 0, 24]], dtype=int64)

In [179]:
from sklearn.neighbors import KNeighborsClassifier
classifier = KNeighborsClassifier(n_neighbors = 6)
classifier.fit(X_train, y_train)
yhat = classifier.predict(X_test)
cm = confusion_matrix(y_test, yhat)
cm

array([[37,  0],
       [24,  0]], dtype=int64)

In [180]:
from sklearn.svm import SVC
classifier = SVC(kernel = 'linear')
classifier.fit(X_train, y_train)
yhat = classifier.predict(X_test)
cm = confusion_matrix(y_test, yhat)
cm

array([[ 1, 36],
       [ 0, 24]], dtype=int64)

In [181]:
from sklearn.svm import SVC
classifier = SVC(kernel = 'rbf')
classifier.fit(X_train, y_train)
yhat = classifier.predict(X_test)
cm = confusion_matrix(y_test, yhat)
cm

array([[ 0, 37],
       [ 0, 24]], dtype=int64)

In [182]:
from sklearn.svm import SVC
classifier = SVC(kernel = 'poly')
classifier.fit(X_train, y_train)
yhat = classifier.predict(X_test)
cm = confusion_matrix(y_test, yhat)
cm

array([[36,  1],
       [24,  0]], dtype=int64)

In [185]:
from sklearn.naive_bayes import GaussianNB
classifier = GaussianNB()
classifier.fit(X_train, y_train)
yhat = classifier.predict(X_test)
cm = confusion_matrix(y_test, yhat)
cm

array([[ 2, 35],
       [ 3, 21]], dtype=int64)

In [198]:
from sklearn.tree import DecisionTreeClassifier
classifier = DecisionTreeClassifier(criterion = 'entropy')
classifier.fit(X_train, y_train)
yhat = classifier.predict(X_test)
cm = confusion_matrix(y_test, yhat)
cm

array([[29,  8],
       [ 6, 18]], dtype=int64)

In [204]:
from sklearn.ensemble import RandomForestClassifier
classifier = RandomForestClassifier(n_estimators = 10, criterion = 'entropy')
classifier.fit(X_train, y_train)
yhat = classifier.predict(X_test)
cm = confusion_matrix(y_test, yhat)
cm

array([[26, 11],
       [ 4, 20]], dtype=int64)