In [None]:
import os

# Print the full path of every file found under the Kaggle input directory.
for root, _, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

In [None]:
#import libraries and dependencies
import pandas as pd
import numpy as np
import warnings 
import seaborn as sns
from matplotlib import pyplot as plt

### Import the dataset

In [None]:
# Read the heart-disease CSV into a dataframe and preview its first rows
heart_csv_path = '../input/heart-disease/heart.csv'
heart = pd.read_csv(heart_csv_path)
heart.head()

In [None]:
# Dimensions of the dataset as (number of rows, number of columns)
heart.shape

#### Data Exploration

In [None]:
# The dataset has 13 feature columns plus one binary target
# (1 = has heart disease, 0 = no heart disease) that the models will predict.
# Show how many patients fall into each target class.
heart['target'].value_counts()

In [None]:
# Share of patients in each target class, as a percentage of all rows
total_patients = len(heart.target)
patients_without_disease = len(heart[heart.target == 0])
patients_with_disease = len(heart[heart.target == 1])

print("Percentage of Patients Not Having Heart Disease: {:.2f}% ".format(
    patients_without_disease / total_patients * 100))
print("Percentage of Patients Having Heart Disease: {:.2f}%".format(
    patients_with_disease / total_patients * 100))

In [None]:
# Count the missing (null) values in each column of the dataset
heart.isnull().sum()

In [None]:
# Count the distinct values in every column of the dataframe
heart.nunique()

In [None]:
 # Data type of each column in the dataframe
heart.dtypes

##### Visualization for Target Variable

In [None]:
# Bar chart of how many patients belong to each target class
target_ax = sns.countplot(x='target', data=heart, palette="mako_r")
target_ax.set_xlabel("target(0 = no disease, 1= have disease)")
target_ax.set_ylabel("Count of People ")
plt.show()

#### Creating summary for feature columns

In [None]:
#UDF for summary

def summary_features(x):
    """Return descriptive statistics for one numeric column.

    Parameters
    ----------
    x : pandas.Series
        The column to summarise.

    Returns
    -------
    pandas.Series
        Indexed by statistic name, in this order: N (non-null count),
        NMISS (null count), SUM, MEAN, MEDIAN, STD, VAR, MIN, the
        percentiles P1, P5, P10, P25, P50, P75, P90, P95, P99, and MAX.
    """
    # (label, quantile level) pairs, listed in the order they are reported
    percentile_levels = [
        ('P1', 0.01), ('P5', 0.05), ('P10', 0.10),
        ('P25', 0.25), ('P50', 0.50), ('P75', 0.75),
        ('P90', 0.90), ('P95', 0.95), ('P99', 0.99),
    ]

    labels = ['N', 'NMISS', 'SUM', 'MEAN', 'MEDIAN', 'STD', 'VAR', 'MIN']
    values = [x.count(), x.isnull().sum(), x.sum(), x.mean(),
              x.median(), x.std(), x.var(), x.min()]

    for label, level in percentile_levels:
        labels.append(label)
        values.append(x.quantile(level))

    labels.append('MAX')
    values.append(x.max())

    return pd.Series(values, index=labels)

In [None]:
# Features summary: apply the summary helper to each of the 13 feature columns
cols = [
    'age', 'sex', 'cp', 'trestbps', 'chol', 'fbs', 'restecg',
    'thalach', 'exang', 'oldpeak', 'slope', 'ca', 'thal',
]
features = heart[cols]
features.apply(summary_features)

### Building the Model

##### data split for training and testing

In [None]:
# import the package
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for testing; the fixed seed keeps the split reproducible
train_X, test_X, train_Y, test_Y = train_test_split(
    heart[cols],
    heart['target'],
    test_size=0.3,
    random_state=123,
)

### 1) Building the Logistic Regression model

In [None]:
#import the package
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score,classification_report,confusion_matrix,accuracy_score,roc_curve
# Maps a model name to its test-set accuracy in percent; filled in as each model is evaluated
accuracies = dict()

In [None]:
# Define the logistic regression model (up to 1000 solver iterations)
# and fit it on the training data
logreg = (
    LogisticRegression(max_iter=1000)
    .fit(train_X, train_Y)
)

In [None]:
# Show the fitted coefficient of each feature, one row per feature name
pd.DataFrame({'coefficient': logreg.coef_[0]}, index=pd.Series(cols))

In [None]:
# Logistic Regression accuracy (in percent) on the train and test splits
logreg_train_acc = accuracy_score(train_Y, logreg.predict(train_X)) * 100
logreg_test_acc = accuracy_score(test_Y, logreg.predict(test_X)) * 100

print("Logistic Regression Accuracy on Train Data: ", logreg_train_acc)
print("Logistic Regression Accuracy on Test Data: ", logreg_test_acc)

# Keep the rounded test accuracy for the model comparison
accuracies['LogReg'] = round(logreg_test_acc, 2)

#### Logistic Regression model works with 87% accuracy on Train data and with 81% accuracy on Test data

### 2) Building a KNN Model

In [None]:
#import the packages
from sklearn.neighbors import KNeighborsClassifier
from sklearn import metrics
from sklearn.model_selection import GridSearchCV

In [None]:
# Candidate neighbour counts (k = 2..9) for Grid Search CV
tuned_param = {'n_neighbors': range(2, 10, 1)}

# Build the KNN model: 5-fold cross-validated grid search over k, scored by ROC AUC
KNN_clf = GridSearchCV(
    KNeighborsClassifier(),
    tuned_param,
    cv=5,
    scoring='roc_auc',
).fit(train_X, train_Y)


In [None]:
# Best n_neighbors value found by the cross-validated grid search,
# used to build the final KNN model
KNN_clf.best_params_

In [None]:
# Build the final KNN model with n_neighbors=5
# (the value reported as best by the grid search)
KNN_clf = KNeighborsClassifier(n_neighbors=5)
KNN_clf.fit(train_X, train_Y)

# Model accuracy (in percent) on the train and test splits
knn_train_acc = accuracy_score(train_Y, KNN_clf.predict(train_X)) * 100
knn_test_acc = accuracy_score(test_Y, KNN_clf.predict(test_X)) * 100

print("KNN model Accuracy on train data : ", knn_train_acc)
print("KNN model Accuracy on test data : ", knn_test_acc)

# Keep the rounded test accuracy for the model comparison
accuracies['KNN'] = round(knn_test_acc, 2)

#### KNN model works with 76% accuracy on Train data and with 66% accuracy on Test data

#### 3) Building Support Vector Machine model

In [None]:
#import the package
from sklearn.svm import SVC

In [None]:
# Build the SVM model (SVC with a fixed seed) and fit it on the training data
svm_clf = (
    SVC(random_state=123)
    .fit(train_X, train_Y)
)

In [None]:
# SVM accuracy (in percent) on the train and test splits
svm_train_acc = accuracy_score(train_Y, svm_clf.predict(train_X)) * 100
svm_test_acc = accuracy_score(test_Y, svm_clf.predict(test_X)) * 100

print("SVM model Accuracy on Train Data : ", svm_train_acc)
print("SVM model Accuracy on Test Data : ", svm_test_acc)

# Keep the rounded test accuracy for the model comparison
accuracies['SVM'] = round(svm_test_acc, 2)

#### SVM model works with 65% accuracy on Train data and test data

#### 4) Building Naive Bayes Model

In [None]:
# import the package
from sklearn.naive_bayes import GaussianNB

# Build the Gaussian Naive Bayes model and fit it on the training data
nb_clf = GaussianNB()
nb_clf.fit(train_X, train_Y)

# Naive Bayes accuracy (in percent) on the train and test splits
nb_train_acc = accuracy_score(train_Y, nb_clf.predict(train_X)) * 100
nb_test_acc = accuracy_score(test_Y, nb_clf.predict(test_X)) * 100

print("NB model Accuracy on Train Data : ", nb_train_acc)
print("NB model Accuracy on Test Data : ", nb_test_acc)

# Keep the rounded test accuracy for the model comparison
accuracies['NaiveBayes'] = round(nb_test_acc, 2)

#### Naive Bayes model works with 85% accuracy on Train data and with 81% accuracy on Test data

#### 5) Building a Decision Tree Model

In [None]:
#import the package
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV

In [None]:
# Candidate maximum tree depths (2..9) for Grid Search CV
params = {"max_depth": range(2, 10)}

# Build the decision tree model: 5-fold cross-validated grid search
# over max_depth, scored by ROC AUC
Dt_clf = GridSearchCV(
    DecisionTreeClassifier(random_state=123),
    params,
    cv=5,
    scoring='roc_auc',
).fit(train_X, train_Y)

In [None]:
# Best max_depth found by the grid search, used to build the final Decision Tree model
Dt_clf.best_params_

In [None]:
# Build the final Decision Tree model with max_depth=3
# (the value reported as best by the grid search)
DT_clf = DecisionTreeClassifier(max_depth=3, random_state=123)
DT_clf.fit(train_X, train_Y)

# Model accuracy (in percent) on the train and test splits
dt_train_acc = accuracy_score(train_Y, DT_clf.predict(train_X)) * 100
dt_test_acc = accuracy_score(test_Y, DT_clf.predict(test_X)) * 100

print("Decision Tree model Accuracy on train data : ", dt_train_acc)
print("Decision Tree model Accuracy on test data : ", dt_test_acc)

# Keep the rounded test accuracy for the model comparison
accuracies['DecisionTree'] = round(dt_test_acc, 2)

#### Decision Tree model works with 87% accuracy on Train data and with 80% accuracy on Test data

#### 6) Building the Random Forest Model

In [None]:
#Import the package
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Using Grid Search CV to tune the number of trees (n_estimators: 100, 200, ..., 1000)
pargrid_rf = {'n_estimators': range(100, 1100, 100)}

# Build the Random Forest model: 5-fold cross-validated grid search over
# n_estimators, scored by ROC AUC and run on all available cores.
# NOTE(review): this search seeds the forest with random_state=12 while the
# other models in this notebook use 123 - confirm whether that is intended.
Rf_clf = GridSearchCV(
    RandomForestClassifier(random_state=12),
    param_grid=pargrid_rf,
    cv=5,
    scoring='roc_auc',
    n_jobs=-1,
).fit(train_X, train_Y)

In [None]:
# Best n_estimators found by the grid search, used to build the final Random Forest model
Rf_clf.best_params_

In [None]:
# Build the final Random Forest model with n_estimators=200
# (the value reported as best by the grid search)
RF_clf = RandomForestClassifier(n_estimators=200, random_state=123)
RF_clf.fit(train_X, train_Y)

# Model accuracy (in percent) on the train and test splits
rf_train_acc = accuracy_score(train_Y, RF_clf.predict(train_X)) * 100
rf_test_acc = accuracy_score(test_Y, RF_clf.predict(test_X)) * 100

print("Random Forest model Accuracy on train data : ", rf_train_acc)
print("Random Forest model Accuracy on test data : ", rf_test_acc)

# Keep the rounded test accuracy for the model comparison
accuracies['RandomForest'] = round(rf_test_acc, 2)

#### Random Forest model works with 100% accuracy on Train data and with 82% accuracy on Test data

In [None]:
# Turn the collected test accuracies into a one-column table, one row per model
Accuracy_model = pd.DataFrame([accuracies])
Accuracies_model = Accuracy_model.transpose()
Accuracies_model.columns = ['Accuracy']
Accuracies_model

In [None]:
# Bar chart comparing the test accuracy of every model
colors = ["red", "green", "orange", "pink", "yellow", "blue"]

sns.set_style("whitegrid")
fig, ax = plt.subplots(figsize=(20, 10))
sns.barplot(x=Accuracies_model.index, y=Accuracies_model.Accuracy, palette=colors, ax=ax)

# Label and tick the axes only AFTER plotting: some seaborn releases set the
# axis labels from the plotted variable names, which would replace labels
# assigned beforehand ("Accuracy %" would become "Accuracy").
ax.set_yticks(np.arange(0, 100, 10))
ax.set_ylabel("Accuracy %")
ax.set_xlabel("Algorithms")
plt.show()

#### Our models work reasonably well; the best of them are Logistic Regression, Naive Bayes and Random Forest, with roughly 81–82% accuracy on the test data.

### Confusion Matrix for Various models

In [None]:
# Test-set confusion matrix for every fitted model, in the order the models were built
Log_cm, KNN_cm, SVM_cm, NB_cm, DT_cm, RF_cm = (
    metrics.confusion_matrix(test_Y, fitted_model.predict(test_X))
    for fitted_model in (logreg, KNN_clf, svm_clf, nb_clf, DT_clf, RF_clf)
)

In [None]:
# Plotting Confusion Matrices: one heatmap per model on a 2x3 grid
confusion_panels = [
    ("Logistic Regression Confusion Matrix", Log_cm),
    ("K Nearest Neighbour Confusion Matrix", KNN_cm),
    ("Support Vector Machine Confusion Matrix", SVM_cm),
    ("Naive Bayes Confusion Matrix", NB_cm),
    ("Decision Tree Confusion Matrix", DT_cm),
    ("Random Forest Confusion Matrix", RF_cm),
]
class_labels = ["No", "Yes"]

fig, axes = plt.subplots(2, 3, figsize=(24, 12))
fig.suptitle("Confusion Matrices", fontsize=24)
fig.subplots_adjust(wspace=0.4, hspace=0.4)

for panel_ax, (panel_title, panel_cm) in zip(axes.flat, confusion_panels):
    # The cells hold integer counts, so annotate them as integers
    # ('d') rather than as two-decimal floats such as "25.00".
    sns.heatmap(panel_cm, annot=True, fmt='d',
                xticklabels=class_labels, yticklabels=class_labels, ax=panel_ax)
    panel_ax.set_title(panel_title)
    panel_ax.set_ylabel('True label')
    panel_ax.set_xlabel('Predicted label')

plt.show()
