# Welcome to My Notebook
# We are going to predict the chance of a heart attack using the Heart dataset

![](https://img.freepik.com/premium-photo/pretty-realistic-heart-illustration-with-isolated-background_742252-4113.jpg)


# Import all the Necessary Libraries

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
from sklearn.svm import SVC
from xgboost import XGBClassifier

from sklearn.naive_bayes import GaussianNB
from sklearn.preprocessing import RobustScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score,roc_auc_score,precision_score, recall_score, f1_score

import warnings 
warnings.filterwarnings("ignore")

# Let's read the dataset using Pandas

In [None]:
# Load the heart dataset from the Kaggle input directory (the path only exists inside a Kaggle session)
dataframe=pd.read_csv("/kaggle/input/heart-attack-analysis-prediction-dataset/heart.csv")
dataframe.head()

# Exploratory Data Analysis

In [None]:
# Let's see the datatype of each column
dataframe.info()

In [None]:
# Check the shape of the data as (rows, columns)
dataframe.shape

In [None]:
# Let's see the summary statistics of the data
dataframe.describe()

In [None]:
# Count the duplicated rows in the dataset
dataframe.duplicated().sum()

In [None]:
# Remove the duplicated rows from the dataset, modifying the dataframe in place
dataframe.drop_duplicates(inplace=True)

In [None]:
# Count the null values in each column of the dataset
dataframe.isnull().sum()

In [None]:
# Rename the column at position 13 to "HeartAttack"; every other column keeps its name.
# NOTE(review): the position is hard-coded - presumably this is the target column; confirm against the CSV header.
columns=dataframe.columns.tolist()
columns[13]="HeartAttack"
dataframe.columns=columns
dataframe.head()

# Let's check whether the dataset is balanced

In [None]:
# Count how many rows fall into each HeartAttack class
dataframe["HeartAttack"].value_counts()

In [None]:
# Compute the pairwise correlation matrix of the dataframe's columns
correlation_matrix= dataframe.corr()
correlation_matrix

# Let's visualise the correlation matrix

In [None]:
# Draw the correlation matrix as a heatmap, annotating every cell with its value to two decimals
plt.figure(figsize=(9,9))
sns.heatmap(correlation_matrix, annot=True, fmt=".2f", cmap="Blues")
plt.show()

# Univariate Analysis

In [None]:
# Histogram with a KDE curve for four columns, laid out on a 2x2 grid.
fig = plt.figure(figsize=(12, 12))

# (column, bar colour, subplot title) for each panel, in grid order
histogram_specs = [
    ("age", "red", 'Age'),
    ("thalachh", "green", 'Maximum Heart Rate Achieved'),
    ("chol", "blue", 'Cholestoral Level'),
    ("trtbps", "orange", 'Resting Blood Pressure'),
]

for position, (column, color, title) in enumerate(histogram_specs, start=1):
    ax = fig.add_subplot(2, 2, position)
    # Label each series with its own column name so that a legend, if one
    # is ever added, names the right feature.
    sns.histplot(dataframe[column], color=color, label=column, kde=True, ax=ax)
    ax.set_title(title, fontsize=16)

plt.show()

In [None]:
# Bar chart of the number of rows for each value of the "cp" (chest pain type) column
dataframe["cp"].value_counts().plot(kind="bar", figsize=(6,4), rot=0, color="green")
plt.title("Chest Pain Type", fontsize=14)
plt.xlabel("ChestPain")
plt.ylabel("Count")
plt.show()

In [None]:
# Bar chart of the number of rows for each value of the "sex" column
dataframe["sex"].value_counts().plot(kind="bar", figsize=(6,4), rot=0, color="red")
plt.title("Gender", fontsize=14)
plt.xlabel("sex")
plt.ylabel("Count")
plt.show()

In [None]:
# Bar chart of the number of rows for each value of the "restecg" column
dataframe["restecg"].value_counts().plot(kind="bar", figsize=(6,4), rot=0, color="blue")
plt.title("Resting Electrocardiographic Results", fontsize=14)
plt.xlabel("restecg")
plt.ylabel("Count")
plt.show()

In [None]:
# Bar chart of the number of rows for each value of the "exng" column
dataframe["exng"].value_counts().plot(kind="bar", figsize=(6,4), rot=0, color="orange")
plt.title("Exercise Induced Angina ", fontsize=14)
plt.xlabel("exng")
plt.ylabel("Count")
plt.show()

In [None]:
# Bar chart of the number of rows for each value of the "slp" column
dataframe["slp"].value_counts().plot(kind="bar", figsize=(6,4), rot=0, color="grey")
plt.title("SLP", fontsize=14)
plt.xlabel("slp")
plt.ylabel("Count")
plt.show()

In [None]:
# Bar chart of the number of rows for each value of the "caa" column
dataframe["caa"].value_counts().plot(kind="bar", figsize=(6,4), rot=0, color="violet")
plt.title("Number of Major Vessels", fontsize=14)
plt.xlabel("caa")
plt.ylabel("Count")
plt.show()

In [None]:
# Bar chart of the number of rows for each value of the "thall" column
dataframe["thall"].value_counts().plot(kind="bar", figsize=(6,4), rot=0, color="pink")
plt.title("Thall", fontsize=14)
plt.xlabel("thall")
plt.ylabel("Count")
plt.show()

# Bivariate Analysis

# Detecting Outliers

In [None]:
# Box plot of "age" for each HeartAttack class
fig = px.box(
    dataframe,
    x="HeartAttack",
    y="age",
    title="Distrubution of Age",
)
fig.show()

In [None]:
# Box plot of "trtbps" for each HeartAttack class, coloured by "sex"
fig = px.box(
    dataframe,
    x="HeartAttack",
    y="trtbps",
    title="Distrubution of Resting Blood Pressure",
    color="sex",
)
fig.show()

In [None]:
# Box plot of "chol" for each HeartAttack class, coloured by "sex"
fig = px.box(
    dataframe,
    x="HeartAttack",
    y="chol",
    title="Distrubution of Cholesterol",
    color="sex",
)
fig.show()

In [None]:
# Box plot of "thalachh" for each HeartAttack class
fig = px.box(
    dataframe,
    x="HeartAttack",
    y="thalachh",
    title="Distrubution of Maximum Heart Rate",
)
fig.show()

In [None]:
# Box plot of "oldpeak" for each HeartAttack class
fig = px.box(
    dataframe,
    x="HeartAttack",
    y="oldpeak",
    title="Distrubution of Previous Peak",
)
fig.show()

# Divide the Dataset into Train and Test Set

In [None]:
def train_test_split_data(dataframe, target, test_size, random_state):
    """Split ``dataframe`` into train and test sets, stratified on ``target``.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Data containing both the feature columns and the ``target`` column.
    target : str
        Name of the column to predict; it is removed from the features.
    test_size
        Forwarded unchanged to ``train_test_split``.
    random_state
        Forwarded unchanged to ``train_test_split``.

    Returns
    -------
    tuple
        ``(x_train, x_test, y_train, y_test)``.
    """
    features = dataframe.drop([target], axis=1)
    labels = dataframe[target]

    # Stratify on the labels themselves when splitting.
    x_train, x_test, y_train, y_test = train_test_split(
        features,
        labels,
        test_size=test_size,
        random_state=random_state,
        stratify=labels,
    )
    return x_train, x_test, y_train, y_test

In [None]:
# Hold out 30% of the rows as the test set, stratified on the target, with a fixed seed
x_train,x_test, y_train, y_test= train_test_split_data(dataframe,target="HeartAttack",test_size=0.3, random_state=42)

In [None]:
# Shapes of the train/test features and targets produced by the split
x_train.shape,x_test.shape, y_train.shape, y_test.shape

# Get the Numerical and Categorical Columns list

In [None]:
def get_numerical_and_categorical_columns(dataframe):
    """Partition the column names of ``dataframe`` by dtype.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame whose columns are inspected.

    Returns
    -------
    tuple of (list, list)
        ``(numerical_cols, categorical_cols)``: names of the columns with a
        numeric dtype, then names of all remaining columns. Both lists keep
        the original column order.
    """
    # One bucket per outcome of the numeric-dtype check.
    buckets = {True: [], False: []}
    for name in dataframe.columns:
        is_numeric = bool(pd.api.types.is_numeric_dtype(dataframe[name]))
        buckets[is_numeric].append(name)

    return buckets[True], buckets[False]

# Let's detect the outliers in the training data and remove them

In [None]:
def Winsorization_Method(columns, x_train, y_train, a, b):
    """Drop the rows of the training data that are percentile outliers.

    A row is an outlier if, for at least one column in ``columns``, its value
    lies strictly below the ``a``-th percentile or strictly above the ``b``-th
    percentile of that column. Despite the function's name, outlying rows are
    removed rather than clipped to the percentile bounds.

    Parameters
    ----------
    columns : iterable of str
        Names of the columns of ``x_train`` to check.
    x_train : pandas.DataFrame
        Training features. Modified in place: outlier rows are dropped.
    y_train : pandas.Series
        Training targets, positionally aligned with ``x_train``. Modified in
        place: the rows at the same positions are dropped.
    a : float
        Lower percentile, in the range accepted by ``np.percentile``.
    b : float
        Upper percentile, in the range accepted by ``np.percentile``.

    Returns
    -------
    tuple
        ``(ratio, x_train, y_train)`` where ``ratio`` is the percentage of
        rows removed, rounded to two decimals, and the other two items are
        the same (now filtered) objects that were passed in.
    """
    n_rows = len(x_train)
    if n_rows == 0:
        # Nothing to inspect; also avoids dividing by zero below.
        return 0.0, x_train, y_train

    # outlier_mask[i] is True when row i is out of range in any checked column.
    outlier_mask = np.zeros(n_rows, dtype=bool)
    for col in columns:
        values = x_train[col].to_numpy()
        lower = np.percentile(values, a)
        upper = np.percentile(values, b)
        outlier_mask |= np.asarray((values > upper) | (values < lower), dtype=bool)

    ratio = round(int(outlier_mask.sum()) / n_rows * 100, 2)

    # Resolve both sets of labels from positions before dropping anything.
    x_labels = x_train.index[outlier_mask]
    y_labels = y_train.index[outlier_mask]
    x_train.drop(x_labels, inplace=True)
    y_train.drop(y_labels, inplace=True)

    return ratio, x_train, y_train

In [None]:
# Drop the training rows that fall below the 0.5th or above the 99th percentile in any of these five columns.
# NOTE(review): the bounds are asymmetric (0.5 vs 99) - confirm this is intended rather than e.g. 1/99 or 0.5/99.5.
ratio_of_outliers,x_train,y_train= Winsorization_Method(['age','trtbps','chol', 'thalachh','oldpeak'], x_train, y_train, a=0.5, b=99)

In [None]:
# Percentage of training rows that were removed as outliers
ratio_of_outliers

In [None]:
# Shapes of the training features and targets after outlier removal
x_train.shape, y_train.shape

# Data Preprocessing
1. All the features are in numerical form so there is no need for categorical encoding
2. Let's scale the numerical features with a robust scaler

In [None]:
# Fit the scaler on the training features only, then apply that same fitted scaler to the
# test features, so no statistics computed from the test set are used for scaling.
robust_scaler= RobustScaler()
x_train=robust_scaler.fit_transform(x_train)
x_test=robust_scaler.transform(x_test)

# Let's do the modelling

In [None]:

def _positive_class_scores(model, x_test, y_pred):
    """Return continuous positive-class scores from a fitted binary classifier.

    Prefers class probabilities, then the decision function, and only falls
    back to the hard predictions when the model exposes neither.
    """
    if hasattr(model, "predict_proba"):
        # Column 1 holds the probability of the second entry in classes_.
        return model.predict_proba(x_test)[:, 1]
    if hasattr(model, "decision_function"):
        return model.decision_function(x_test)
    return y_pred


def modelling(x_train, x_test, y_train, y_test, random_state=42):
    """Fit a fixed set of classifiers and compare them on the test set.

    Parameters
    ----------
    x_train, y_train
        Training features and binary targets used to fit every model.
    x_test, y_test
        Held-out features and binary targets used to score every model.
    random_state : int, default 42
        Seed passed to the decision tree and the random forest so that
        repeated runs give the same table.

    Returns
    -------
    pandas.DataFrame
        One row per model, indexed by the model's class name, with columns
        ``Accuracy``, ``Recall``, ``Precision``, ``F1_Score`` and
        ``AUC_ROC_score``, sorted by ``Recall`` in descending order.
    """
    models = [
        LogisticRegression(),
        GaussianNB(),
        SVC(kernel="linear"),
        KNeighborsClassifier(n_neighbors=32),
        DecisionTreeClassifier(criterion="gini", random_state=random_state),
        RandomForestClassifier(n_estimators=200, criterion="gini", random_state=random_state),
        XGBClassifier(),
    ]

    model_names = []
    records = []
    for model in models:
        model.fit(x_train, y_train)
        y_pred = model.predict(x_test)

        # ROC AUC is a ranking metric, so it is computed from continuous
        # scores; the remaining metrics use the predicted class labels.
        y_score = _positive_class_scores(model, x_test, y_pred)

        # Take the row label from the model itself so names and results
        # cannot drift apart when the model list is edited.
        model_names.append(type(model).__name__)
        records.append({
            'Accuracy': accuracy_score(y_test, y_pred),
            'Recall': recall_score(y_test, y_pred),
            'Precision': precision_score(y_test, y_pred),
            'F1_Score': f1_score(y_test, y_pred),
            'AUC_ROC_score': roc_auc_score(y_test, y_score),
        })

    result_df = pd.DataFrame(records, index=model_names)
    result_df = result_df.sort_values(by="Recall", ascending=False)
    return result_df

In [None]:
# Train and evaluate every model, then display the comparison table (sorted by recall)
result_df= modelling(x_train, x_test, y_train, y_test)
result_df

# Let's visualise the results

In [None]:
# Horizontal grouped bar chart of every metric per model; the legend is anchored outside the
# axes, and the trailing semicolon suppresses the notebook's text output for the last expression.
result_df.plot(kind="barh", figsize=(10, 7), grid=True).legend(bbox_to_anchor=(1.3,1));

🚀 
Hi Kagglers,

I hope you enjoyed exploring my notebook! If you found the work insightful or helpful, I kindly invite you to show your support by giving it an upvote. Your appreciation fuels my motivation to continue sharing valuable insights with the community.

Moreover, I believe in continuous improvement, and your feedback plays a crucial role in making my work even better. If you have any suggestions, comments, or thoughts, please don't hesitate to leave them in the comments section. Let's learn and grow together!

Thank you for being a part of this amazing journey. Here's to more exciting collaborations and knowledge sharing ahead. 🌟

![](https://media4.giphy.com/media/esCPJvFlCmYsF5tkau/giphy.gif?cid=ecf05e47k5px5yhc4gq1c5lidhrd0wnwbfnl7x8zn43ts9y9&ep=v1_gifs_search&rid=giphy.gif&ct=g)