In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier,AdaBoostClassifier,GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier,plot_tree
#from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB,MultinomialNB
from sklearn.metrics import accuracy_score,recall_score,precision_score,confusion_matrix,classification_report,roc_auc_score,roc_curve
from sklearn.preprocessing import StandardScaler,LabelEncoder
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

ModuleNotFoundError: No module named 'xgboost'

In [None]:
data = pd.read_csv('heart_2020_cleaned.csv')
data

# EDA

## 1. Shape of the data set

In [None]:
data.shape

## 2. Data types

In [None]:
data.dtypes

## 3. Check for null enteries

In [None]:
data.isna().sum()

## 4. Check for duplicates

In [None]:
data[data.duplicated()]

Since we have duplicate observations we need to drop them.

In [None]:
data = data.drop_duplicates()
data

In [None]:
data.to_csv('Heart_Disease_data_with_outliers.csv')

## 5. Summary of data 

In [None]:
data.describe()

In [None]:
data.describe(include='object')

In [None]:
data.info()

In [None]:
data.max()

## 6. About the features

We have total 18 features in that one is dependent variable 'Heart Disease' that is out output and other 17 are input features

In [None]:
columns = data.columns
columns

### For Smoking

In [None]:
pd.value_counts(data['Smoking'])

In [None]:
pd.crosstab(data['Smoking'],data['HeartDisease'],margins=True)

In [None]:
plt.figure(figsize=(10,6))
sns.countplot(x='Smoking',hue='HeartDisease',data=data,order=['Yes','No'])
plt.show()

### For AlcoholDrinking

In [None]:
pd.value_counts(data['AlcoholDrinking'])

In [None]:
pd.crosstab(data['AlcoholDrinking'],data['HeartDisease'],margins=True)

In [None]:
plt.figure(figsize=(10,6))
sns.countplot(x='AlcoholDrinking',hue='HeartDisease',data=data,order=['Yes','No'])
plt.show()

###  For Stroke

In [None]:
pd.value_counts(data['Stroke'])

In [None]:
pd.crosstab(data['Stroke'],data['HeartDisease'],margins=True)

In [None]:
plt.figure(figsize=(10,6))
sns.countplot(x='Stroke',hue='HeartDisease',data=data,order=['Yes','No'])
plt.show()

### For DiffWalking

In [None]:
pd.value_counts(data['DiffWalking'])

In [None]:
pd.crosstab(data['DiffWalking'],data['HeartDisease'],margins=True)

In [None]:
plt.figure(figsize=(10,6))
sns.countplot(x='DiffWalking',hue='HeartDisease',data=data,order=['Yes','No'])
plt.show()

###  For Sex

In [None]:
pd.value_counts(data['Sex'])

In [None]:
pd.crosstab(data['Sex'],data['HeartDisease'],margins=True)

In [None]:
plt.figure(figsize=(10,6))
sns.countplot(x='Sex',hue='HeartDisease',data=data,order=['Male','Female'])
plt.show()

### For AgeCategory

In [None]:
pd.value_counts(data['AgeCategory'],ascending=True)

In [None]:
pd.crosstab(data['AgeCategory'],data['HeartDisease'],margins=True)

In [None]:
plt.figure(figsize=(15,6))
sns.countplot(x='AgeCategory',hue='HeartDisease',data=data,order=['18-24','25-29','30-34','35-39','40-44','45-49','50-54','55-59','60-64','65-69','70-74','75-79','80 or older'])
plt.show()

### For Race

In [None]:
pd.value_counts(data['Race'])

In [None]:
pd.crosstab(data['Race'],data['HeartDisease'],margins=True)

In [None]:
plt.figure(figsize=(12,6))
sns.countplot(x='Race',hue='HeartDisease',data=data,order=['White','Hispanic','Black','Other','Asian','American Indian/Alaskan Native'])
plt.show()

### For Diabetic

In [None]:
pd.value_counts(data['Diabetic'])

In [None]:
pd.crosstab(data['Diabetic'],data['HeartDisease'],margins=True)

In [None]:
plt.figure(figsize=(10,6))
sns.countplot(x='Diabetic',hue='HeartDisease',data=data,order=['Yes','No'])
plt.show()

### For PhysicalActivity

In [None]:
pd.value_counts(data['PhysicalActivity'])

In [None]:
pd.crosstab(data['PhysicalActivity'],data['HeartDisease'],margins=True)

In [None]:
plt.figure(figsize=(10,6))
sns.countplot(x='PhysicalActivity',hue='HeartDisease',data=data,order=['Yes','No'])
plt.show()

### For GenHealth

In [None]:
pd.value_counts(data['GenHealth'])

In [None]:
pd.crosstab(data['GenHealth'],data['HeartDisease'],margins=True)

In [None]:
plt.figure(figsize=(12,6))
sns.countplot(x='GenHealth',hue='HeartDisease',data=data,order=['Very good','Good','Excellent','Fair','Poor'])
plt.show()

### For Asthma

In [None]:
pd.value_counts(data['Asthma'])

In [None]:
pd.crosstab(data['Asthma'],data['HeartDisease'],margins=True)

In [None]:
plt.figure(figsize=(10,6))
sns.countplot(x='Asthma',hue='HeartDisease',data=data,order=['Yes','No'])
plt.show()

### For KidneyDisease

In [None]:
pd.value_counts(data['KidneyDisease'])

In [None]:
pd.crosstab(data['KidneyDisease'],data['HeartDisease'],margins=True)

In [None]:
plt.figure(figsize=(10,6))
sns.countplot(x='DiffWalking',hue='KidneyDisease',data=data,order=['Yes','No'])
plt.show()

### For SkinCancer

In [None]:
pd.value_counts(data['SkinCancer'])

In [None]:
pd.crosstab(data['SkinCancer'],data['HeartDisease'],margins=True)

In [None]:
plt.figure(figsize=(10,6))
sns.countplot(x='SkinCancer',hue='HeartDisease',data=data,order=['Yes','No'])
plt.show()

## 7. Distribution of input variables

In [None]:
data.dtypes

In [None]:
sns.distplot(a=data['BMI'],hist=False)
plt.show()

In [None]:
sns.distplot(a=data['PhysicalHealth'],hist=False)
plt.show()

In [None]:
sns.distplot(a=data['MentalHealth'],hist=False)
plt.show()

In [None]:
sns.distplot(a=data['SleepTime'],hist=False)
plt.show()

## 8. Finding Outliers

In [None]:
data.describe()

In [None]:
sns.boxplot(data=data['BMI'])
plt.show()

In [None]:
sns.boxplot(data=data['PhysicalHealth'])
plt.show()

In [None]:
sns.boxplot(data=data['MentalHealth'])
plt.show()

In [None]:
sns.boxplot(data=data['SleepTime'])
plt.show()

In [None]:
def finding_outliers(data):
    Q1 = data.quantile(0.25)
    Q3 = data.quantile(0.75)
    IQR = Q3 - Q1
    outliers = data[((data<(Q1-1.5*IQR)) | (data>(Q3+1.5*IQR)))]
    return outliers


In [None]:
outliers = finding_outliers(data)
outliers

In [None]:
def drop_outliers(data):
    Q1 = data.quantile(0.25)
    Q3 = data.quantile(0.75)
    IQR = Q3 - Q1
    not_outliers = data[~((data<(Q1-1.5*IQR)) | (data>(Q3+1.5*IQR)))]
    outliers_droped = not_outliers.dropna().reset_index()
    return outliers_droped
    

In [None]:
without_outlier_data = drop_outliers(data)
without_outlier_data

In [None]:
del without_outlier_data['index']

In [None]:
without_outlier_data.to_csv('Heart_Disease_data_without_outliers.csv')

In [None]:
301717 -222531

If we drop **outliers** then we will miss 79186 observations that means we are droping 25% of our data so we cant drop those many observations also if we drop outliers we will miss the major information about person **BMI** so we will work with data with outliers.

## 8. Cheaking for imbalence 

In [None]:
data.shape

In [None]:
pd.value_counts(data['HeartDisease'])

In [None]:
le = LabelEncoder()
data['HeartDisease'] = le.fit_transform(data['HeartDisease'])

In [None]:
plt.figure(figsize=(10,8))
plt.pie(x=data['HeartDisease'].value_counts(),explode=(0.05,0),autopct='%1.0f%%',labels=['No','Yes'])
plt.show()

Here we have 274456 observations who dont have Heart disease and only 27261 observations have Heart disease so ita a highly imbalenced data with a proportion of 10:1

##### Using over_sampling

In [None]:
class_count_yes,class_count_no = data['HeartDisease'].value_counts()
class_yes  = data[data['HeartDisease']==1]
class_no   = data[data['HeartDisease']==0]
print('Class Yes :',class_yes.shape)
print('Class No  :',class_no.shape)

In [None]:
class_yes_over = class_yes.sample(n=274456,random_state=15,replace=True)
final_data = pd.concat([class_yes_over,class_no],axis=0)

In [None]:
final_data.shape

In [None]:
final_data.value_counts(final_data['HeartDisease'])

In [None]:
final_data

In [None]:
final_data.to_csv('cleaned_heart_disease.csv')

In [None]:
plt.figure(figsize=(10,8))
plt.pie(x=final_data['HeartDisease'].value_counts(),explode=(0.05,0),autopct='%1.0f%%',labels=['Yes','No'])
plt.show()

# 9. Corelation between the features and Feature importance

### Encoding the object features

In [None]:
non_numeric_data = final_data.select_dtypes('object')
non_numeric_data = non_numeric_data.columns.values
for col in non_numeric_data:
    final_data[col] = LabelEncoder().fit_transform(final_data[col])

In [None]:
final_data.dtypes

### Seperating Input and Output features

In [None]:
X = final_data.drop(columns='HeartDisease')
y = final_data[['HeartDisease']]

In [None]:
X.shape,y.shape

### Scaling Input features

In [None]:
X_scaled = StandardScaler()
X_scaled = X_scaled.fit_transform(X)
X_scaled = pd.DataFrame(data=X_scaled,columns=X.columns)
X_scaled

### Check Corelation between input features

In [None]:
corr_matrix = X_scaled.corr()
corr_matrix

In [None]:
plt.figure(figsize=(15,8))
sns.heatmap(corr_matrix,annot=True)
plt.show()

There is no multicoliniarity between the input features

### Dividing the data set for training and testing

In [None]:
X_train,X_test,y_train,y_test = train_test_split(X_scaled,y,random_state=12,stratify=y)

In [None]:
X_train.shape,X_test.shape,y_train.shape,y_test.shape

### Checking for feature importance using DecisionTreeClassifier

In [None]:
dt_model = DecisionTreeClassifier()
dt_model.fit(X_train,y_train)

In [None]:
imp_features = dt_model.feature_importances_
imp_features = pd.DataFrame(data={'Features':X_scaled.columns,'DT IMP Features':imp_features})
imp_features

In [None]:
imp_features = imp_features.sort_values(by='DT IMP Features',ascending=False)
imp_features

In [None]:
plt.figure(figsize=(14,8))
sns.barplot(x='DT IMP Features',y='Features',data=imp_features)
plt.title('Decision Tree Feature Importance',size=25)
plt.show()

# Model Selection

### Checking for best params using grid_search_cv by using DecisionTreeClassifier

In [None]:
grid_search = GridSearchCV(estimator=dt_model,param_grid={'criterion':['gini','entropy'],
                                                          'max_depth':[1 , 2 , 3 , 4 , 5 , 6 , 7 , 8 , 9 , 10 , 11 , 12 , 13 , 14 , 15 , 16 , 17 , 18 , 19 , 20 , 21 , 22 , 23 , 24 , 25 , 26 , 27 , 28 , 29 , 30 , 31 , 32 , 33 , 34 , 35 , 36 , 37 , 38 , 39 , 40 , 41 , 42 , 43 , 44 , 45 , 46 , 47 , 48 , 49 , 50 , 51 , 52 , 53 , 54 , 55 , 56 , 57 , 58 , 59 , 60 ]})

In [None]:
%%time
grid_search= grid_search.fit(X_scaled,y)

In [None]:
grid_search.best_params_

### These are the models we are testing for our prediction

##### 1. LogisticRegression
##### 2.DecisionTreeClassifier
##### 3.RandomForestClassifier
##### 4.AdaBoostClassifier
##### 5.GradientBoostingClassifier
##### 6.XGBClassifier
##### 7.LGBMClassifier
##### 8.KNeighborsClassifier
##### 9.GaussianNB

### 1. LogisticRegression

In [None]:
%%time
logistic_model = LogisticRegression()
logistic_model.fit(X_train,y_train)

In [None]:
logistic_pred_train = logistic_model.predict(X_train)
print('Accuracy Score                 :',round(accuracy_score(y_train,logistic_pred_train),4))
print('Precision Score                :',round(precision_score(y_train,logistic_pred_train),4))
print('Recall Score                   :',round(recall_score(y_train,logistic_pred_train),4))
print('Confusion Matrix               :\n',confusion_matrix(y_train,logistic_pred_train))
print('Classification Report          :\n',classification_report(y_train,logistic_pred_train))

In [None]:
logistic_pred_test = logistic_model.predict(X_test)
print('Accuracy Score                 :',round(accuracy_score(y_test,logistic_pred_test),4))
print('Precision Score                :',round(precision_score(y_test,logistic_pred_test),4))
print('Recall Score                   :',round(recall_score(y_test,logistic_pred_test),4))
print('Confusion Matrix               :\n',confusion_matrix(y_test,logistic_pred_test))
print('Classification Report          :\n',classification_report(y_test,logistic_pred_test))

In [None]:
fpr, tpr, thresholds = roc_curve(y_test,logistic_model.predict_proba (X_test)[:,1])
auc_score = roc_auc_score(y_test, logistic_pred_test)
print('Area Under Curve     :',auc_score)
plt.plot(fpr, tpr, color='red', label='logit model ( area  = %0.2f)'%auc_score)
plt.plot([0, 1], [0, 1], 'k--')
plt.xlabel('False Positive Rate or [1 - True Negative Rate]')
plt.ylabel('True Positive Rate')

### 2.DecisionTreeClassifier

In [None]:
%%time
dt_model = DecisionTreeClassifier(random_state=15,max_depth=57)
dt_model.fit(X_train,y_train)

In [None]:
dt_pred_train = dt_model.predict(X_train)
print('Accuracy Score                 :',round(accuracy_score(y_train,dt_pred_train),4))
print('Precision Score                :',round(precision_score(y_train,dt_pred_train),4))
print('Recall Score                   :',round(recall_score(y_train,dt_pred_train),4))
print('Confusion Matrix               :\n',confusion_matrix(y_train,dt_pred_train))
print('Classification Report          :\n',classification_report(y_train,dt_pred_train))

In [None]:
dt_pred_test = dt_model.predict(X_test)
print('Accuracy Score                 :',round(accuracy_score(y_test,dt_pred_test),4))
print('Precision Score                :',round(precision_score(y_test,dt_pred_test),4))
print('Recall Score                   :',round(recall_score(y_test,dt_pred_test),4))
print('Confusion Matrix               :\n',confusion_matrix(y_test,dt_pred_test))
print('Classification Report          :\n',classification_report(y_test,dt_pred_test))

In [None]:
fpr, tpr, thresholds = roc_curve(y_test,dt_model.predict_proba (X_test)[:,1])
auc_score = roc_auc_score(y_test, dt_pred_test)
print('Area Under Curve     :',auc_score)
plt.plot(fpr, tpr, color='red', label='logit model ( area  = %0.2f)'%auc_score)
plt.plot([0, 1], [0, 1], 'k--')
plt.xlabel('False Positive Rate or [1 - True Negative Rate]')
plt.ylabel('True Positive Rate')

### 3. RandomForestClassifier

In [None]:
%%time
rf_model = RandomForestClassifier(random_state=15,max_depth=57)
rf_model.fit(X_train,y_train)

In [None]:
rf_pred_train = rf_model.predict(X_train)
print('Accuracy Score                 :',round(accuracy_score(y_train,rf_pred_train),4))
print('Precision Score                :',round(precision_score(y_train,rf_pred_train),4))
print('Recall Score                   :',round(recall_score(y_train,rf_pred_train),4))
print('Confusion Matrix               :\n',confusion_matrix(y_train,rf_pred_train))
print('Classification Report          :\n',classification_report(y_train,rf_pred_train))

In [None]:
rf_pred_test = rf_model.predict(X_test)
print('Accuracy Score                 :',round(accuracy_score(y_test,rf_pred_test),4))
print('Precision Score                :',round(precision_score(y_test,rf_pred_test),4))
print('Recall Score                   :',round(recall_score(y_test,rf_pred_test),4))
print('Confusion Matrix               :\n',confusion_matrix(y_test,rf_pred_test))
print('Classification Report          :\n',classification_report(y_test,rf_pred_test))

In [None]:
fpr, tpr, thresholds = roc_curve(y_test,rf_model.predict_proba (X_test)[:,1])
auc_score = roc_auc_score(y_test, rf_pred_test)
print('Area Under Curve     :',auc_score)
plt.plot(fpr, tpr, color='red', label='logit model ( area  = %0.2f)'%auc_score)
plt.plot([0, 1], [0, 1], 'k--')
plt.xlabel('False Positive Rate or [1 - True Negative Rate]')
plt.ylabel('True Positive Rate')

### 4. AdaBoostClassifier

In [None]:
%%time
adaboost_model = AdaBoostClassifier(random_state=15)
adaboost_model.fit(X_train,y_train)

In [None]:
adaboost_pred_train = adaboost_model.predict(X_train)
print('Accuracy Score                 :',round(accuracy_score(y_train,adaboost_pred_train),4))
print('Precision Score                :',round(precision_score(y_train,adaboost_pred_train),4))
print('Recall Score                   :',round(recall_score(y_train,adaboost_pred_train),4))
print('Confusion Matrix               :\n',confusion_matrix(y_train,adaboost_pred_train))
print('Classification Report          :\n',classification_report(y_train,adaboost_pred_train))

In [None]:
adaboost_pred_test = adaboost_model.predict(X_test)
print('Accuracy Score                 :',round(accuracy_score(y_test,adaboost_pred_test),4))
print('Precision Score                :',round(precision_score(y_test,adaboost_pred_test),4))
print('Recall Score                   :',round(recall_score(y_test,adaboost_pred_test),4))
print('Confusion Matrix               :\n',confusion_matrix(y_test,adaboost_pred_test))
print('Classification Report          :\n',classification_report(y_test,adaboost_pred_test))

In [None]:
fpr, tpr, thresholds = roc_curve(y_test,adaboost_model.predict_proba (X_test)[:,1])
auc_score = roc_auc_score(y_test, adaboost_pred_test)
print('Area Under Curve     :',auc_score)
plt.plot(fpr, tpr, color='red', label='logit model ( area  = %0.2f)'%auc_score)
plt.plot([0, 1], [0, 1], 'k--')
plt.xlabel('False Positive Rate or [1 - True Negative Rate]')
plt.ylabel('True Positive Rate')

### 5. GradientBoostingClassifier

In [None]:
%%time
gradient_model = GradientBoostingClassifier(random_state=15)
gradient_model.fit(X_train,y_train)

In [None]:
gradient_pred_train = gradient_model.predict(X_train)
print('Accuracy Score                 :',round(accuracy_score(y_train,gradient_pred_train),4))
print('Precision Score                :',round(precision_score(y_train,gradient_pred_train),4))
print('Recall Score                   :',round(recall_score(y_train,gradient_pred_train),4))
print('Confusion Matrix               :\n',confusion_matrix(y_train,gradient_pred_train))
print('Classification Report          :\n',classification_report(y_train,gradient_pred_train))

In [None]:
gradient_pred_test = gradient_model.predict(X_test)
print('Accuracy Score                 :',round(accuracy_score(y_test,gradient_pred_test),4))
print('Precision Score                :',round(precision_score(y_test,gradient_pred_test),4))
print('Recall Score                   :',round(recall_score(y_test,gradient_pred_test),4))
print('Confusion Matrix               :\n',confusion_matrix(y_test,gradient_pred_test))
print('Classification Report          :\n',classification_report(y_test,gradient_pred_test))

In [None]:
fpr, tpr, thresholds = roc_curve(y_test,gradient_model.predict_proba (X_test)[:,1])
auc_score = roc_auc_score(y_test, gradient_pred_test)
print('Area Under Curve     :',auc_score)
plt.plot(fpr, tpr, color='red', label='logit model ( area  = %0.2f)'%auc_score)
plt.plot([0, 1], [0, 1], 'k--')
plt.xlabel('False Positive Rate or [1 - True Negative Rate]')
plt.ylabel('True Positive Rate')

### 6. XGBClassifier

In [None]:
%%time
xgb_model = XGBClassifier(max_depth=57,random_state=15)
xgb_model.fit(X_train,y_train)

In [None]:
xgb_pred_train = xgb_model.predict(X_train)
print('Accuracy Score                 :',round(accuracy_score(y_train,xgb_pred_train),4))
print('Precision Score                :',round(precision_score(y_train,xgb_pred_train),4))
print('Recall Score                   :',round(recall_score(y_train,xgb_pred_train),4))
print('Confusion Matrix               :\n',confusion_matrix(y_train,xgb_pred_train))
print('Classification Report          :\n',classification_report(y_train,xgb_pred_train))

In [None]:
xgb_pred_test = xgb_model.predict(X_test)
print('Accuracy Score                 :',round(accuracy_score(y_test,xgb_pred_test),4))
print('Precision Score                :',round(precision_score(y_test,xgb_pred_test),4))
print('Recall Score                   :',round(recall_score(y_test,xgb_pred_test),4))
print('Confusion Matrix               :\n',confusion_matrix(y_test,xgb_pred_test))
print('Classification Report          :\n',classification_report(y_test,xgb_pred_test))

In [None]:
fpr, tpr, thresholds = roc_curve(y_test,xgb_model.predict_proba (X_test)[:,1])
auc_score = roc_auc_score(y_test, xgb_pred_test)
print('Area Under Curve     :',auc_score)
plt.plot(fpr, tpr, color='red', label='logit model ( area  = %0.2f)'%auc_score)
plt.plot([0, 1], [0, 1], 'k--')
plt.xlabel('False Positive Rate or [1 - True Negative Rate]')
plt.ylabel('True Positive Rate')

### 7. LGBMClassifier

In [None]:
%%time
lgb_model      = LGBMClassifier(max_depth=57,random_state=15)
lgb_model.fit(X_train,y_train)

In [None]:
lgb_pred_train = lgb_model.predict(X_train)
print('Accuracy Score                 :',round(accuracy_score(y_train,lgb_pred_train),4))
print('Precision Score                :',round(precision_score(y_train,lgb_pred_train),4))
print('Recall Score                   :',round(recall_score(y_train,lgb_pred_train),4))
print('Confusion Matrix               :\n',confusion_matrix(y_train,lgb_pred_train))
print('Classification Report          :\n',classification_report(y_train,lgb_pred_train))

In [None]:
lgb_pred_test = lgb_model.predict(X_test)
print('Accuracy Score                 :',round(accuracy_score(y_test,lgb_pred_test),4))
print('Precision Score                :',round(precision_score(y_test,lgb_pred_test),4))
print('Recall Score                   :',round(recall_score(y_test,lgb_pred_test),4))
print('Confusion Matrix               :\n',confusion_matrix(y_test,lgb_pred_test))
print('Classification Report          :\n',classification_report(y_test,lgb_pred_test))

In [None]:
fpr, tpr, thresholds = roc_curve(y_test,lgb_model.predict_proba (X_test)[:,1])
auc_score = roc_auc_score(y_test, lgb_pred_test)
print('Area Under Curve     :',auc_score)
plt.plot(fpr, tpr, color='red', label='logit model ( area  = %0.2f)'%auc_score)
plt.plot([0, 1], [0, 1], 'k--')
plt.xlabel('False Positive Rate or [1 - True Negative Rate]')
plt.ylabel('True Positive Rate')

### 8. KNeighborsClassifier

In [None]:
%%time
knn_model = KNeighborsClassifier()
knn_model.fit(X_train,y_train)

In [None]:
knn_pred_train = knn_model.predict(X_train)
print('Accuracy Score                 :',round(accuracy_score(y_train,knn_pred_train),4))
print('Precision Score                :',round(precision_score(y_train,knn_pred_train),4))
print('Recall Score                   :',round(recall_score(y_train,knn_pred_train),4))
print('Confusion Matrix               :\n',confusion_matrix(y_train,knn_pred_train))
print('Classification Report          :\n',classification_report(y_train,knn_pred_train))

In [None]:
knn_pred_test = knn_model.predict(X_test)
print('Accuracy Score                 :',round(accuracy_score(y_test,knn_pred_test),4))
print('Precision Score                :',round(precision_score(y_test,knn_pred_test),4))
print('Recall Score                   :',round(recall_score(y_test,knn_pred_test),4))
print('Confusion Matrix               :\n',confusion_matrix(y_test,knn_pred_test))
print('Classification Report          :\n',classification_report(y_test,knn_pred_test))

In [None]:
fpr, tpr, thresholds = roc_curve(y_test,knn_model.predict_proba (X_test)[:,1])
auc_score = roc_auc_score(y_test, knn_pred_test)
print('Area Under Curve     :',auc_score)
plt.plot(fpr, tpr, color='red', label='logit model ( area  = %0.2f)'%auc_score)
plt.plot([0, 1], [0, 1], 'k--')
plt.xlabel('False Positive Rate or [1 - True Negative Rate]')
plt.ylabel('True Positive Rate')

### 9. GaussianNB

In [None]:
%%time
gauss_nb_model = GaussianNB()
gauss_nb_model.fit(X_train,y_train)

In [None]:
gauss_nb_pred_train = gauss_nb_model.predict(X_train)
print('Accuracy Score                 :',round(accuracy_score(y_train,gauss_nb_pred_train),4))
print('Precision Score                :',round(precision_score(y_train,gauss_nb_pred_train),4))
print('Recall Score                   :',round(recall_score(y_train,gauss_nb_pred_train),4))
print('Confusion Matrix               :\n',confusion_matrix(y_train,gauss_nb_pred_train))
print('Classification Report          :\n',classification_report(y_train,gauss_nb_pred_train))

In [None]:
gauss_nb_pred_test = gauss_nb_model.predict(X_test)
print('Accuracy Score                 :',round(accuracy_score(y_test,gauss_nb_pred_test),4))
print('Precision Score                :',round(precision_score(y_test,gauss_nb_pred_test),4))
print('Recall Score                   :',round(recall_score(y_test,gauss_nb_pred_test),4))
print('Confusion Matrix               :\n',confusion_matrix(y_test,gauss_nb_pred_test))
print('Classification Report          :\n',classification_report(y_test,gauss_nb_pred_test))

In [None]:
fpr, tpr, thresholds = roc_curve(y_test,gauss_nb_model.predict_proba (X_test)[:,1])
auc_score = roc_auc_score(y_test, gauss_nb_pred_test)
print('Area Under Curve     :',auc_score)
plt.plot(fpr, tpr, color='red', label='logit model ( area  = %0.2f)'%auc_score)
plt.plot([0, 1], [0, 1], 'k--')
plt.xlabel('False Positive Rate or [1 - True Negative Rate]')
plt.ylabel('True Positive Rate')

### Results Comparison

In [None]:
result_df = pd.DataFrame({'Models':['Logistic Regression','Decision Tree','Random Forest','Addaboost','Gradient Boosting','Extreme Gradient Boosting','Light Gradient Boosting','K-Nearest Neighbor','GaussianNB'],
                          'Training Accuracy':[0.7483,0.9979,0.9979,0.7587,0.7631,0.9979,0.7695,0.9156,0.7027],
                          'Test Accuracy':[0.7472,0.9500,0.9656,0.7565,0.7611,0.9634,0.7668,0.8832,0.7031],
                          'Precision Score':[0.7383,0.9098,0.9363,0.7480,0.7422,0.9326,0.7399,0.8146,0.7557],
                          'Recall Score':[0.7659,0.9990,0.9991,0.7738,0.8002,0.9991,0.8230,0.9932,0.6002],
                          'AUC Score':[0.7472,0.9500,0.9655,0.7565,0.7611,0.9634,0.7668,0.8832,0.7030]})
result_df

In [None]:
result_df = result_df.sort_values(by='Test Accuracy',ascending=False)
result_df

From above dataframe its clear that for random forest we got better accuracy as well as precision and recall score.

# Conclusion
**Input Features                =** 'BMI', 'Smoking', 'AlcoholDrinking', 'Stroke','PhysicalHealth', 'MentalHealth',                                             'DiffWalking', 'Sex', 'AgeCategory','Race', 'Diabetic', 'PhysicalActivity',                                                 'GenHealth', 'SleepTime','Asthma', 'KidneyDisease', 'SkinCancer'

**Output Feature                =** HeartDisease

**Data Set Type                 =**  Balenced data set using Oversampling

**Best Model                    =** Random Forest

**Train Accuracy Score         :** 0.9979	

**Test Accuracy Score          :** 0.9656

**Classification Report          :**

               precision    recall  f1-score   support
             
           0       1.00      0.93      0.96     68614
           1       0.94      1.00      0.97     68614


### Dumping the best Model

In [None]:
from pickle import dump
dump(rf_model,open('rf_model.pkl','wb'))