# IMPORTS

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder 
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler 
from sklearn.linear_model import LogisticRegression 
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn import svm
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score


# LOADING THE DATA

In [2]:
df= pd.read_csv("heart1.csv")
df= df.drop(['oldpeak','slp','thall'],axis=1)
df.head()

Unnamed: 0,age,sex,cp,trtbps,chol,fbs,restecg,thalachh,exng,caa,output
0,63,1,3,145,233,1,0,150,0,0,1
1,37,1,2,130,250,0,1,187,0,0,1
2,41,0,1,130,204,0,0,172,0,0,1
3,56,1,1,120,236,0,1,178,0,0,1
4,57,0,0,120,354,0,1,163,1,0,1


# STANDARDIZING THE DATA


In [3]:
scale=StandardScaler()
scale.fit(df)
df= scale.transform(df)
df=pd.DataFrame(df,columns=['age', 'sex', 'cp', 'trtbps', 'chol', 'fbs', 'restecg', 'thalachh',
       'exng', 'caa', 'output'])

# SPLITTING DATA INTO TRAIN DATA AND TEST DATA

In [4]:
x= df.iloc[:,:-1]
print(x)
y= df.iloc[:,-1:]
print(y)
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=101)
lbl= LabelEncoder()
encoded_y= lbl.fit_transform(y_train)

          age       sex        cp    trtbps      chol       fbs   restecg  \
0    0.952197  0.681005  1.973123  0.763956 -0.256334  2.394438 -1.005832   
1   -1.915313  0.681005  1.002577 -0.092738  0.072199 -0.417635  0.898962   
2   -1.474158 -1.468418  0.032031 -0.092738 -0.816773 -0.417635 -1.005832   
3    0.180175  0.681005  0.032031 -0.663867 -0.198357 -0.417635  0.898962   
4    0.290464 -1.468418 -0.938515 -0.663867  2.082050 -0.417635  0.898962   
..        ...       ...       ...       ...       ...       ...       ...   
298  0.290464 -1.468418 -0.938515  0.478391 -0.101730 -0.417635  0.898962   
299 -1.033002  0.681005  1.973123 -1.234996  0.342756 -0.417635  0.898962   
300  1.503641  0.681005 -0.938515  0.706843 -1.029353  2.394438  0.898962   
301  0.290464  0.681005 -0.938515 -0.092738 -2.227533 -0.417635  0.898962   
302  0.290464 -1.468418  0.032031 -0.092738 -0.198357 -0.417635 -1.005832   

     thalachh      exng       caa  
0    0.015443 -0.696631 -0.714429  
1  

  y = column_or_1d(y, warn=True)


# LOGISTIC REGRESSION

In [5]:
logreg= LogisticRegression()
logreg = LogisticRegression()
logreg.fit(x_train, encoded_y)
from sklearn.metrics import confusion_matrix
encoded_ytest= lbl.fit_transform(y_test)
Y_pred1 = logreg.predict(x_test)
lr_conf_matrix = confusion_matrix(encoded_ytest,Y_pred1 )
lr_acc_score = accuracy_score(encoded_ytest, Y_pred1)
print("Logistic regression accuracy =",lr_acc_score*100,"%") # Printing the accuracy
print(lr_conf_matrix) # Printing the confusion matrix

Logistic regression accuracy = 85.71428571428571 %
[[35  9]
 [ 4 43]]


  y = column_or_1d(y, warn=True)



# DECISION TREE CLASSIFICATION ALGO

In [6]:
tree= DecisionTreeClassifier()
tree.fit(x_train,encoded_y)
ypred2=tree.predict(x_test)
encoded_ytest= lbl.fit_transform(y_test)
tree_conf_matrix = confusion_matrix(encoded_ytest,ypred2 )
tree_acc_score = accuracy_score(encoded_ytest, ypred2)
print("Decision tree accuracy =",tree_acc_score*100,"%") # Printing the accuracy
print(tree_conf_matrix) # Printing the confusion matrix

Decision tree accuracy = 68.13186813186813 %
[[26 18]
 [11 36]]


  y = column_or_1d(y, warn=True)


# RANDOM FOREST ALGO

In [7]:
rf= RandomForestClassifier()
rf.fit(x_train,encoded_y)
ypred3 = rf.predict(x_test)
rf_conf_matrix = confusion_matrix(encoded_ytest,ypred3 )
rf_acc_score = accuracy_score(encoded_ytest, ypred3)
rf_conf_matrix = confusion_matrix(encoded_ytest,ypred3 )
rf_acc_score = accuracy_score(encoded_ytest, ypred3)
print("Random forest accuracy =",rf_acc_score*100,"%") # Printing the accuracy
print(rf_conf_matrix) # Printing the confusion matrix

Random forest accuracy = 80.21978021978022 %
[[33 11]
 [ 7 40]]


# K-NEAREST NEIGHBOUR ALGO

In [8]:
error_rate= []
for i in range(1,40):
    knn= KNeighborsClassifier(n_neighbors=i)
    knn.fit(x_train,encoded_y)
    pred= knn.predict(x_test)
    error_rate.append(np.mean(pred != encoded_ytest))
knn= KNeighborsClassifier(n_neighbors=12)
knn.fit(x_train,encoded_y)
ypred4= knn.predict(x_test)
knn_conf_matrix = confusion_matrix(encoded_ytest,ypred4 )
knn_acc_score = accuracy_score(encoded_ytest, ypred4) 
print("K-nearest neighbour accuracy =",knn_acc_score*100,"%") # Printing the accuracy
print(knn_conf_matrix) # Printing the confusion matrix

K-nearest neighbour accuracy = 84.61538461538461 %
[[35  9]
 [ 5 42]]


# SUPPORT VECTOR MACHINE ALGO

In [9]:
svm= svm.SVC()
svm.fit(x_train,encoded_y)
ypred5= svm.predict(x_test)
svm_conf_matrix = confusion_matrix(encoded_ytest,ypred5)
svm_acc_score = accuracy_score(encoded_ytest, ypred5)
print("SVM accuracy =",svm_acc_score*100,"%") # Printing the accuracy
print(svm_conf_matrix) # Printing the confusion matrix

SVM accuracy = 80.21978021978022 %
[[34 10]
 [ 8 39]]


# Accuracy in Table form 

In [10]:
model_acc= pd.DataFrame({'Model' : ['Logistic Regression','Decision Tree','Random Forest','K Nearest Neighbor','SVM'],'Accuracy' : [lr_acc_score*100,tree_acc_score*100,rf_acc_score*100,knn_acc_score*100,svm_acc_score*100]})
model_acc = model_acc.sort_values(by=['Accuracy'],ascending=False)
print(model_acc)
print("Most accurate model is Logistic Regression")

                 Model   Accuracy
0  Logistic Regression  85.714286
3   K Nearest Neighbor  84.615385
2        Random Forest  80.219780
4                  SVM  80.219780
1        Decision Tree  68.131868
Most accurate model is Logistic Regression
