# Heart Disease Detection
### About the Data set

This dataset contains information related to heart disease. It has 14 columns: `target` is the class variable, and the other 13 columns are the features used to predict it. The aim is to classify the target variable (disease / no disease) using different machine learning algorithms and find out which algorithm is most suitable for this dataset.

### Features Information:

* Age : age in years
* Sex : 1 = male; 0 = female
* CP  : chest pain type
* TRESTBPS : resting blood pressure (in mm Hg on admission to the hospital)
* CHOL : serum cholesterol in mg/dl
* FBS : fasting blood sugar > 120 mg/dl (1 = true; 0 = false)
* RESTECG : resting electrocardiographic results
* THALACH : maximum heart rate achieved
* EXANG : exercise induced angina (1 = yes; 0 = no)
* OLDPEAK : ST depression induced by exercise relative to rest
* SLOPE : the slope of the peak exercise ST segment
* CA : number of major vessels (0-3) colored by fluoroscopy
* THAL : 3 = normal; 6 = fixed defect; 7 = reversible defect
* TARGET : 1 or 0



In [None]:
# loading dataset

import pandas as pd
import numpy as np

# visualisation
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

# EDA
from collections import Counter

# data preprocessing
from sklearn.preprocessing import StandardScaler

# data splitting
from sklearn.model_selection import train_test_split

# data modeling
from sklearn.metrics import confusion_matrix,accuracy_score,roc_curve,classification_report

from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC

# tuning
from sklearn.model_selection import GridSearchCV

# ensembling
from sklearn.ensemble import StackingClassifier

In [None]:
# Location of the heart-disease CSV inside the notebook's input directory.
csv_path = '../input/health-care-data-set-on-heart-attack-possibility/heart.csv'

# Read the whole file into a DataFrame and preview its first rows.
df = pd.read_csv(csv_path)
df.head()

In [None]:
# Dimensions of the loaded frame as (rows, columns).
df.shape

In [None]:
# Count missing values per column to see whether any imputation is needed.
df.isna().sum()

In [None]:
# Summary statistics (count, mean, std, quartiles, min/max) for the numeric columns.
df.describe()

In [None]:
# Separate the class label from the predictor columns.
y = df["target"]
X = df.drop(columns='target')

In [None]:
# Hold out 20% of the rows for testing.
# The split is seeded so the accuracies reported further down are reproducible
# across runs, and stratified on the label so both partitions keep the same
# class ratio as the full data set.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, random_state=42, stratify=y
)

In [None]:
# Learn the per-feature scaling statistics from the training rows only, then
# apply that same fitted transform to both partitions so that nothing from
# the test rows influences the scaling.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [None]:
# Sanity check on the labels: the distinct classes present in the test split,
# followed by the number of training rows per class.
print(y_test.unique())
Counter(y_train)

### Algorithms used: 
An Ensemble stack of: 
* Logistic Regression
* Random Forest Classifier
* Extreme Gradient Boosting (XGBoost)
* K-Nearest Neighbour
* Support Vector Machine

# Logistic Regression

In [None]:
# Candidate settings for logistic regression: regularisation type and
# inverse regularisation strength C.
parameters = {'penalty': ['l1', 'l2'],
              'C': [0.1, 0.4, 0.8, 1, 2, 5, 10, 20, 30]}

# The default 'lbfgs' solver does not support the 'l1' penalty, so with it
# every 'l1' candidate in the grid fails to fit. 'liblinear' supports both
# 'l1' and 'l2', which makes the whole grid valid.
grid_search = GridSearchCV(estimator=LogisticRegression(solver='liblinear'),
                           param_grid=parameters,
                           cv=10,
                           n_jobs=-1,
                           verbose=2)
grid_search.fit(X_train, y_train)

# Keep the refitted best model for evaluation and for the stacking ensemble.
log_reg = grid_search.best_estimator_

# Mean cross-validated score of the best candidate (displayed as cell output).
grid_search.best_score_

In [None]:
# Evaluate the tuned logistic-regression model on the held-out rows.
y_pred = log_reg.predict(X_test)

print("\n", confusion_matrix(y_test, y_pred))

# Stored under its own name because the comparison table reads it later.
log_reg_acc = accuracy_score(y_test, y_pred)
print(f"\nAccuracy Score {log_reg_acc}")

print(f"Classification report: \n{classification_report(y_test, y_pred)}")

# Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Seed the forest: without a fixed random_state each fit draws different
# bootstrap samples and feature subsets, so the grid-search winner and the
# reported accuracy would change from run to run.
rf = RandomForestClassifier(random_state=42)

# Candidate settings: split criterion, maximum tree depth and number of trees.
parameters = {'criterion': ['gini', 'entropy'],
              'max_depth': [2, 4, 5, 7, 9, 10],
              'n_estimators': [10, 20, 50, 100]}

grid_search = GridSearchCV(estimator=rf,
                           param_grid=parameters,
                           cv=10,
                           n_jobs=-1,
                           verbose=2)
grid_search.fit(X_train, y_train)

# Keep the refitted best model for evaluation and for the stacking ensemble.
rf = grid_search.best_estimator_

In [None]:
# Evaluate the tuned random forest on the held-out rows.
y_pred = rf.predict(X_test)

print("\n", confusion_matrix(y_test, y_pred))

# Stored under its own name because the comparison table reads it later.
rf_acc = accuracy_score(y_test, y_pred)
print(f"\nAccuracy Score {rf_acc}")

print(f"Classification report: \n{classification_report(y_test, y_pred)}")

# K Neighbors Classifier

In [None]:
from sklearn.neighbors import KNeighborsClassifier

kn = KNeighborsClassifier()

# Search only over settings that change the predictions.
# 'algorithm' and 'leaf_size' choose how the neighbours are looked up (speed
# and memory), not which neighbours are found, so sweeping them multiplied the
# search by 4 * 39 without any possible gain in accuracy. They are left at
# their defaults and the grid covers the neighbour count, the vote weighting
# and the Minkowski power (1 = Manhattan, 2 = Euclidean) instead.
parameters = {
    'n_neighbors': np.arange(1, 40),
    'weights': ['uniform', 'distance'],
    'p': [1, 2],
}

grid_search = GridSearchCV(estimator=kn,
                           param_grid=parameters,
                           cv=10,
                           n_jobs=-1,
                           verbose=2)
grid_search.fit(X_train, y_train)

# Keep the refitted best model for evaluation and for the stacking ensemble.
kn = grid_search.best_estimator_

In [None]:
# Evaluate the tuned k-nearest-neighbours model on the held-out rows.
y_pred = kn.predict(X_test)

print("\n", confusion_matrix(y_test, y_pred))

# Stored under its own name because the comparison table reads it later.
kn_acc = accuracy_score(y_test, y_pred)
print(f"\nAccuracy Score {kn_acc}")

print(f"Classification report: \n{classification_report(y_test, y_pred)}")

# SVC

In [None]:
from sklearn.svm import LinearSVC

# dual=False: the 'l1' penalty combined with the default squared-hinge loss is
# only supported by the primal formulation, so without it the 'l1' half of the
# grid cannot be fitted on scikit-learn versions where dual defaults to True.
# max_iter is an optimisation budget, not a model hyper-parameter: very small
# caps (10, 20, ...) just yield unconverged models, so it is fixed at a
# generous value instead of being searched over.
svc = LinearSVC(dual=False, max_iter=10000)

# Candidate settings: regularisation type and inverse regularisation strength C.
parameters = {
    'penalty': ['l1', 'l2'],
    'C': [0.1, 0.4, 0.8, 1, 2, 5, 10, 20, 30],
}

grid_search = GridSearchCV(estimator=svc,
                           param_grid=parameters,
                           cv=10,
                           n_jobs=-1,
                           verbose=2)
grid_search.fit(X_train, y_train)

# Keep the refitted best model for evaluation and for the stacking ensemble.
svc = grid_search.best_estimator_


In [None]:
# Evaluate the tuned linear support-vector classifier on the held-out rows.
y_pred = svc.predict(X_test)

print("\n", confusion_matrix(y_test, y_pred))

# Stored under its own name because the comparison table reads it later.
svc_acc = accuracy_score(y_test, y_pred)
print(f"\nAccuracy Score {svc_acc}")

print(f"Classification report: \n{classification_report(y_test, y_pred)}")

# Extreme Gradient Boosting

In [None]:
from xgboost import XGBClassifier

# Candidate settings for the boosted trees: minimum child weight 0..19 and
# a handful of maximum tree depths.
parameters = dict(
    min_child_weight=np.arange(20),
    max_depth=[2, 4, 5, 7, 9, 10],
)

# Exhaustive search over the grid with 10-fold cross-validation.
grid_search = GridSearchCV(
    estimator=XGBClassifier(),
    param_grid=parameters,
    cv=10,
    n_jobs=-1,
    verbose=2,
).fit(X_train, y_train)

# Keep the best model under the name the evaluation and stacking cells use.
xgb = grid_search.best_estimator_


In [None]:
# Evaluate the tuned XGBoost model on the held-out rows.
y_pred = xgb.predict(X_test)

print("\n", confusion_matrix(y_test, y_pred))

# Stored under its own name because the comparison table reads it later.
xgb_acc = accuracy_score(y_test, y_pred)
print(f"\nAccuracy Score {xgb_acc}")

print(f"Classification report: \n{classification_report(y_test, y_pred)}")

# Model Eval

In [None]:
# Collect the test accuracy of every individual model, as a percentage,
# into one table; the order of names and scores must stay aligned.
model_names = ['Logistic Regression', 'Random Forest', 'K Neighbors Classifier',
               'Support Vector Clasifier', 'XGBoost']
accuracy_pct = [score * 100
                for score in (log_reg_acc, rf_acc, kn_acc, svc_acc, xgb_acc)]

model_ev = pd.DataFrame({'Model': model_names, 'Accuracy': accuracy_pct})
model_ev

In [None]:
# One bar colour per model, in the same order as the rows of model_ev.
colors = ['red', 'green', 'blue', 'yellow', 'orange']

plt.figure(figsize=(12, 5))
plt.title("Accuracy Graph")
# plt.bar draws vertical bars: the model names lie along the x axis and the
# accuracy values along the y axis, so the labels are assigned accordingly.
plt.xlabel("Algorithms")
plt.ylabel("Accuracy %")
plt.bar(model_ev['Model'], model_ev['Accuracy'], color=colors)
plt.show()

# Stacking 

In [None]:
# Base learners for the stack: the five tuned models from the earlier cells,
# each paired with the name the stacking classifier uses to refer to it.
estimators = [('xgb', xgb),
              ('kn', kn),
              ('svc', svc),
              ('log_Reg', log_reg),
              ('rf', rf)]

# The tuned linear SVC doubles as the final estimator that combines the
# outputs of the base learners.
scv = StackingClassifier(estimators=estimators, final_estimator=svc)

scv.fit(X_train, y_train)
scv_predicted = scv.predict(X_test)

scv_conf_matrix = confusion_matrix(y_test, scv_predicted)
scv_acc_score = accuracy_score(y_test, scv_predicted)

print("confussion matrix")
print(scv_conf_matrix)
print("\n")
# The model built above is scikit-learn's StackingClassifier; the label used
# to say StackingCVClassifier, which is a different class from another library.
print("Accuracy of StackingClassifier:", scv_acc_score*100, '\n')
print(classification_report(y_test, scv_predicted))