In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree and print the full path of every file found
for folder, _, file_names in os.walk('/kaggle/input'):
    for full_path in (os.path.join(folder, name) for name in file_names):
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the heart-attack dataset from the Kaggle input directory.
# `data` is kept separately from the working copy `df` created next.
data = pd.read_csv("/kaggle/input/heart-attack-prediction-dataset/heart_attack_prediction_dataset.csv")

In [None]:
# Work on a copy so that manipulation of `df` does not touch `data`
df = data.copy()

In [None]:
# Preview the first rows
df.head()

In [None]:
# (rows, columns) of the working frame
df.shape

<h1>Initial Observation</h1>
<p>We have 26 columns (25 features & 1 class) to work with.<br>As we are predicting heart attack risk, we select Heart Attack Risk as our class column.<br>Because Heart Attack Risk only takes the values 0 & 1, this is a binary classification problem.<br>Our dataset consists of both categorical & quantitative features.</p>

In [None]:
# Class distribution of the target column.
# The labels are hard-coded as class 0 then class 1, so the counts are
# reindexed to that exact order. value_counts() alone orders by frequency,
# which would silently attach the wrong label if class 1 were the larger one.
labels = ["No = 0", "Yes = 1"]
values = (
    df["Heart Attack Risk"]
    .value_counts()
    .reindex([0, 1], fill_value=0)
    .to_numpy()
)

# Pie chart: share of records in each risk class
plt.pie(values, labels=labels, autopct="%1.1f%%")
plt.title("Class Imbalance(Risk of Heart Attack)")
plt.show()

# Bar chart: absolute number of records in each risk class
plt.bar(x = labels, height = values)
plt.title("Class Imbalance(Risk of Heart Attack)")
plt.show()

<p>As we can see, class imbalance is present in the dataset</p>

<h1>Data Pre-processing</h1>

<h2> Handling Null Values </h2>

In [None]:
# Count missing values per column (a count of 0 means nothing to impute)
df.isna().sum()

<p>No Null value present in the current dataset</p>

<h2>Splitting Blood Pressure</h2>

In [None]:

# "Blood Pressure" is a string such as "180/120"; split it on "/" into two
# new columns, e.g. "180/120" -> BP_Systolic="180", BP_Diastolic="120".
df[['BP_Systolic', 'BP_Diastolic']] = df['Blood Pressure'].str.split('/', expand=True)

In [None]:
# The split pieces are still strings; convert both columns to numbers
df['BP_Systolic'] = pd.to_numeric(df['BP_Systolic'])
df['BP_Diastolic'] = pd.to_numeric(df['BP_Diastolic'])

In [None]:
# The combined column is now redundant, so drop it
df = df.drop("Blood Pressure", axis = 1)

<h2>Encoding</h2>

In [None]:
# Inspect dtypes to find the non-numeric (object) columns that need encoding

df.dtypes

In [None]:
# Columns we need to encode (columns whose dtype is still object):
# - Sex
# - Diet
# - Country
# - Continent
# - Hemisphere
# "Blood Pressure" is not in this list: it was split into the numeric
# BP_Systolic / BP_Diastolic columns and dropped earlier.
# NOTE(review): the encoding loop selects columns by dtype, so any other
# object column (for example an identifier column) is encoded too - verify
# against the df.dtypes output.

In [None]:
# Import LabelEncoder
from sklearn.preprocessing import LabelEncoder

In [None]:
# One LabelEncoder instance; it is re-fitted for each categorical column in turn
encoder = LabelEncoder()

In [None]:
# Label-encode every object-typed column in place.
#
# LabelEncoder expects 1-D input, so the Series df[col_name] is passed.
# A one-column DataFrame (df[[col_name]]) only works through an implicit
# ravel and emits a DataConversionWarning, which the global
# warnings.filterwarnings('ignore') hides.
# The same `encoder` is re-fitted for each column, so the learned classes are
# stored per column; label_classes[col][code] maps a code back to its label.
# NOTE(review): LabelEncoder assigns arbitrary integer codes; for nominal
# columns this imposes an order that distance-based models will use.
label_classes = {}

for col_name in df.columns:
    if df[col_name].dtype == "object":
        df[col_name] = encoder.fit_transform(df[col_name])
        label_classes[col_name] = encoder.classes_

In [None]:
# Check dtypes again: no object columns should remain after encoding

df.dtypes

<h2>Correlation Matrix</h2>

In [None]:
# (rows, columns) after the blood-pressure split and encoding
df.shape

In [None]:
# Pairwise correlation of every column in df (target column included)
corr = df.corr()

# Draw the matrix as an annotated heatmap on an explicit 20x20 figure
fig, ax = plt.subplots(figsize=(20, 20))
sns.heatmap(corr, annot=True, cmap="coolwarm", fmt=".3f", linewidths=0.5, ax=ax)
ax.set_title("Correlation Matrix")
plt.show()

<h5>Key Findings</h5>
<ul>
<li>Heart Attack Risk has its highest correlations with Diabetes, Cholesterol and Exercise Hours Per Week</li>
<li>Heart Attack Risk shows little dependence on Sedentary Hours Per Day</li>
<li>Alcohol Consumption shows no strong link with Heart Attack Risk</li>
<li>Smoking shows little correlation with Heart Attack Risk in this dataset (correlation alone neither establishes nor rules out causation)</li>
</ul>

<h1>Train Test Split(Before Scaling)</h1>

In [None]:
# Workflow: fit the model on (X_train, y_train), then predict on X_test and compare with y_test

In [None]:
X = df.drop("Heart Attack Risk",axis = 1) # Feature matrix: every column except the target

In [None]:
y = df["Heart Attack Risk"] # Target column (the class to predict)

In [None]:
#import train test split
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 30% of the rows for testing; random_state fixes the shuffle so the split is repeatable
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=42)

<h1>Model Training</h1>

<h2>Decision Tree Classifier(Before Scaling)</h2>

In [None]:
from sklearn.tree import DecisionTreeClassifier

In [None]:
# Decision tree trained on the unscaled features.
# random_state is fixed (same seed as the train/test split) so the fitted
# tree and its scores are reproducible; without it, run-to-run randomness in
# the tree can be mistaken for an effect of scaling.
dtc_before_scaling = DecisionTreeClassifier(random_state=42)

In [None]:
# Fit the decision tree on the unscaled training split
dtc_before_scaling.fit(X_train,y_train)

In [None]:
from sklearn.metrics import accuracy_score

In [None]:
# Predictions on the rows the tree was trained on
y_predicted_train = dtc_before_scaling.predict(X_train)

In [None]:
# Fraction of training rows predicted correctly
dtc_train_score_before_scaling = accuracy_score(y_train,y_predicted_train)

In [None]:
print("Train Score:",dtc_train_score_before_scaling)

In [None]:
# Predictions on the held-out test rows
y_predicted_test =  dtc_before_scaling.predict(X_test)

In [None]:
# Fraction of test rows predicted correctly
dtc_test_score_before_scaling =accuracy_score(y_test,y_predicted_test)

In [None]:
print("Test Score:",dtc_test_score_before_scaling)

<h2>Scaling</h2>

In [None]:
# Column dtypes before scaling
df.dtypes

In [None]:
# Number of distinct values per column; used to decide which columns to scale
df.nunique()

In [None]:
# Rule used here: scale only the columns with more than 10 distinct values;
# columns with 10 or fewer distinct values are left unchanged.

In [None]:
from sklearn.preprocessing import MinMaxScaler

In [None]:
# MinMaxScaler constructed with its default arguments
scaler = MinMaxScaler()

In [None]:
# Min-max scale every column with more than 10 distinct values.
#
# The scaler is fitted on the training rows only (the index of X_train from
# the earlier train/test split) and then applied to all rows, so the test
# rows do not influence the learned min/max (no test-set leakage). The split
# performed after scaling uses the same test_size and random_state on the
# same number of rows, so it selects the same training rows.
# Fitting all selected columns in one call also leaves `scaler` usable for
# transforming new data, instead of holding only the last column's range.
cols_to_scale = [col_name for col_name in df.columns if df[col_name].nunique() > 10]

if cols_to_scale:
    scaler.fit(df.loc[X_train.index, cols_to_scale])
    df[cols_to_scale] = pd.DataFrame(
        scaler.transform(df[cols_to_scale]),
        columns=cols_to_scale,
        index=df.index,
    )

In [None]:
# Preview the first five rows after scaling
df.head(5)

<h2>Train Test Split(After Scaling)</h2>

In [None]:
# Rebuild the feature matrix from the scaled frame
X = df.drop("Heart Attack Risk",axis = 1)

In [None]:
# Target column
y = df["Heart Attack Risk"]

In [None]:
# Same test_size and random_state as the split made before scaling
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=42)

<h2>Decision Tree Classifier(After Scaling)</h2>

In [None]:
# Decision tree trained on the scaled features.
# random_state is fixed so the fitted tree and its scores are reproducible
# and any difference from the unscaled run is not just seed noise.
dtc_after_scaling = DecisionTreeClassifier(random_state=42)

In [None]:
# Fit the decision tree on the scaled training split
dtc_after_scaling.fit(X_train,y_train)

In [None]:
# Predictions on the training rows
y_predicted_train_dtc = dtc_after_scaling.predict(X_train)

In [None]:
# Training accuracy
dtc_train_score = accuracy_score(y_train,y_predicted_train_dtc)

In [None]:
print("Train Score:",dtc_train_score)

In [None]:
# Predictions on the held-out test rows
y_predicted_test_dtc =  dtc_after_scaling.predict(X_test)

In [None]:
# Test accuracy
dtc_test_score = accuracy_score(y_test,y_predicted_test_dtc)

In [None]:
print("Test Score:",dtc_test_score)

<h3>Comparison Before and After Scaling</h3>

In [None]:
# Decision-tree train accuracy before vs. after scaling.
# Title and axis label make the figure readable on its own; plt.show()
# keeps the BarContainer repr out of the cell output.
plt.bar(x = ["train_accuracy_before_scaling", "train_accuracy_after_scaling"],
        height =[dtc_train_score_before_scaling, dtc_train_score] )
plt.title("Decision Tree Train Accuracy (Before vs. After Scaling)")
plt.ylabel("Accuracy")
plt.show()

In [None]:
# Decision-tree test accuracy before vs. after scaling.
# Title and axis label make the figure readable on its own; plt.show()
# keeps the BarContainer repr out of the cell output.
plt.bar(x = ["test_accuracy_before_scaling", "test_accuracy_after_scaling"],
        height =[dtc_test_score_before_scaling, dtc_test_score] )
plt.title("Decision Tree Test Accuracy (Before vs. After Scaling)")
plt.ylabel("Accuracy")
plt.show()

<p>As we can see, the difference in the train and test accuracy of the Decision Tree Classifier before and after scaling is marginal.
We are going to use the scaled train-test set from now on so that our ML models can process the data
more efficiently.</p>

<h2>Random Forest Classifier</h2>

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Random forest with default hyper-parameters.
# random_state is fixed so the bootstrap samples and feature subsets, and
# therefore the reported scores, are reproducible across runs.
rfc = RandomForestClassifier(random_state=42)

In [None]:
# Fit the random forest on the scaled training split
rfc.fit(X_train,y_train)

In [None]:
# Predictions on the training rows
y_predicted_train_rfc = rfc.predict(X_train)

In [None]:
# Training accuracy
rfc_train_score = accuracy_score(y_train,y_predicted_train_rfc)

In [None]:
rfc_train_score

In [None]:
# Predictions on the held-out test rows
y_predicted_test_rfc = rfc.predict(X_test)

In [None]:
# Test accuracy
rfc_test_score = accuracy_score(y_test,y_predicted_test_rfc)

In [None]:
rfc_test_score

<h2>KNN(K-Nearest Neighbor)</h2>

In [None]:
#Import KNNClassifier
from sklearn.neighbors import KNeighborsClassifier

In [None]:
# K-nearest-neighbours classifier that votes over the 3 nearest training rows
knn = KNeighborsClassifier(n_neighbors=3)

In [None]:
# Fit on the scaled training split
knn.fit(X_train,y_train)

In [None]:
# Predictions on the training rows
y_predicted_train_knn =  knn.predict(X_train)

In [None]:
# Training accuracy
knn_train_score = accuracy_score(y_train,y_predicted_train_knn)

In [None]:
knn_train_score

In [None]:
# Predictions on the held-out test rows
y_predicted_test_knn = knn.predict(X_test)

In [None]:
# Test accuracy
knn_test_score = accuracy_score(y_test,y_predicted_test_knn)

In [None]:
knn_test_score

<h1>Comparison Analysis</h1>

<h2>Prediction Accuracy</h2>

In [None]:
# Train-set accuracy of the three models, side by side
labels = ["Decision Tree Classifier", "Random Forest Classifier", "K-Nearest Neighbor"]
values = [dtc_train_score, rfc_train_score, knn_train_score]

fig, ax = plt.subplots(figsize=(8, 5))
ax.bar(labels, values)
ax.set_title("Prediction Accuracy(Train Data)")
plt.show()

In [None]:
# Test-set accuracy of the three models, side by side
labels = ["Decision Tree Classifier", "Random Forest Classifier", "K-Nearest Neighbor"]
values = [dtc_test_score, rfc_test_score, knn_test_score]

fig, ax = plt.subplots(figsize=(8, 5))
ax.bar(labels, values)
ax.set_title("Prediction Accuracy(Test Data)")
plt.show()


<h2>Precision & Recall Score Analysis</h2>

In [None]:
import sklearn.metrics as mt

In [None]:
# Predictions of each model, always in the order:
# decision tree, random forest, k-nearest neighbours
model_train_data = [
    y_predicted_train_dtc,
    y_predicted_train_rfc,
    y_predicted_train_knn,
]
model_test_data = [
    y_predicted_test_dtc,
    y_predicted_test_rfc,
    y_predicted_test_knn,
]

In [None]:
# Precision and recall on the training set, one value per model
# (same order as model_train_data).
# scikit-learn metrics take (y_true, y_pred) in that order. Passing the
# predictions first swaps the roles of the two arrays, which makes
# precision_score return the recall and recall_score return the precision.
model_train_precision_scores = [
    mt.precision_score(y_train, model_data) for model_data in model_train_data
]
model_train_recall_scores = [
    mt.recall_score(y_train, model_data) for model_data in model_train_data
]

In [None]:
# Grouped bar chart of recall and precision per model (training set).
labels = ["Decision Tree Classifier", "Random Forest Classifier", "K-Nearest Neighbor"]
# Named `train_scores` rather than `data` so that the DataFrame loaded into
# `data` at the top of the notebook is not overwritten by a dict.
train_scores = {
    'Recall': model_train_recall_scores,
    'Precision': model_train_precision_scores,
}

x = np.arange(len(labels))  # the label locations
width = 0.3  # the width of the bars

fig, ax = plt.subplots(layout="constrained")

# One bar series per metric, each shifted right by one bar width
for multiplier, (attribute, measurement) in enumerate(train_scores.items()):
    offset = width * multiplier
    rects = ax.bar(x + offset, measurement, width, label=attribute)
    # Two decimals keep the bar labels short enough not to overlap
    ax.bar_label(rects, fmt="%.2f", label_type="edge", padding=5)

ax.set_title('Recall & Precision Analysis(Train)')
# Centre each tick under its group: with n bars per group the midpoint of
# the group lies (n - 1) / 2 bar widths to the right of x.
ax.set_xticks(x + width * (len(train_scores) - 1) / 2, labels)
ax.legend(loc='upper left', ncols=3)
ax.set_ylim(0, 1.5)  # headroom above 1.0 for the legend and bar labels

plt.show()

In [None]:
# Precision and recall on the test set, one value per model
# (same order as model_test_data).
# scikit-learn metrics take (y_true, y_pred) in that order. Passing the
# predictions first swaps the roles of the two arrays, which makes
# precision_score return the recall and recall_score return the precision.
model_test_precision_scores = [
    mt.precision_score(y_test, model_data) for model_data in model_test_data
]
model_test_recall_scores = [
    mt.recall_score(y_test, model_data) for model_data in model_test_data
]


In [None]:
# Print the raw test-set recall/precision values for reference.
# Stored in `test_scores` rather than `data` so that the DataFrame loaded
# into `data` at the top of the notebook is not overwritten by a dict.
test_scores = {
    'Recall': model_test_recall_scores,
    'Precision': model_test_precision_scores,
}
print(test_scores)

In [None]:
# Grouped bar chart of recall and precision per model (test set).
labels = ["Decision Tree Classifier", "Random Forest Classifier", "K-Nearest Neighbor"]
# Named `test_scores` rather than `data` so that the DataFrame loaded into
# `data` at the top of the notebook is not overwritten by a dict.
test_scores = {
    'Recall': model_test_recall_scores,
    'Precision': model_test_precision_scores,
}

x = np.arange(len(labels))  # the label locations
width = 0.3  # the width of the bars

fig, ax = plt.subplots(layout="constrained")

# One bar series per metric, each shifted right by one bar width
for multiplier, (attribute, measurement) in enumerate(test_scores.items()):
    offset = width * multiplier
    rects = ax.bar(x + offset, measurement, width, label=attribute)
    # Two decimals keep the bar labels short enough not to overlap
    ax.bar_label(rects, fmt="%.2f", label_type="edge", padding=5)

ax.set_title('Recall & Precision Analysis(Test)')
# Centre each tick under its group: with n bars per group the midpoint of
# the group lies (n - 1) / 2 bar widths to the right of x.
ax.set_xticks(x + width * (len(test_scores) - 1) / 2, labels)
ax.legend(loc='upper left', ncols=3)
ax.set_ylim(0, 1.5)  # headroom above 1.0 for the legend and bar labels

plt.show()

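<p>
Note: scikit-learn's <code>precision_score</code> and <code>recall_score</code> expect their arguments in the order <code>(y_true, y_pred)</code>. The values below are labelled under that convention (positive class = 1, test set).

Decision Tree Classifier:

Recall: 0.34
Precision: 0.35

Random Forest Classifier:

Recall: 0.028
Precision: 0.45

K-Nearest Neighbors (KNN):

Recall: 0.27
Precision: 0.34

These scores represent the performance of each respective classifier in terms of recall (sensitivity) and precision. The Decision Tree and KNN classifiers are roughly balanced but weak on both metrics: they find only about a third of the actual positive cases, and only about a third of their positive predictions are correct. In contrast, the Random Forest classifier has the highest precision but a recall close to zero: it almost never predicts the positive class, so it produces few false positives but misses nearly all actual positives (a high rate of false negatives). The choice between these classifiers depends on the specific goals and priorities of the classification task, weighing the trade-off between sensitivity and precision.</p>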

<h2>Confusion Matrix</h2>

In [None]:
# Confusion matrix of the decision tree on the test set.
# confusion_matrix expects (y_true, y_pred): rows are then the actual
# classes and columns the predicted classes. Passing the predictions first
# transposes the matrix and swaps false positives with false negatives.
sns.heatmap(mt.confusion_matrix(y_test,y_predicted_test_dtc),annot = True,fmt = "d")
plt.xlabel("Predicted label")
plt.ylabel("True label")
plt.title("Decision Tree Classifier(Test)")
plt.show()

In [None]:
# Raw confusion matrix of the random forest on the test set.
# Arguments are (y_true, y_pred): rows = actual class, columns = predicted class.
mt.confusion_matrix(y_test,y_predicted_test_rfc)

In [None]:
# Confusion matrix of the random forest on the test set.
# confusion_matrix expects (y_true, y_pred): rows are then the actual
# classes and columns the predicted classes. Passing the predictions first
# transposes the matrix and swaps false positives with false negatives.
sns.heatmap(mt.confusion_matrix(y_test,y_predicted_test_rfc),annot = True,fmt = "d")
plt.xlabel("Predicted label")
plt.ylabel("True label")
plt.title("Random Forest Classifier(Test)")
plt.show()

In [None]:
# Raw confusion matrix of the KNN model on the test set.
# Arguments are (y_true, y_pred): rows = actual class, columns = predicted class.
mt.confusion_matrix(y_test,y_predicted_test_knn)

In [None]:
# Confusion matrix of the KNN model on the test set.
# confusion_matrix expects (y_true, y_pred): rows are then the actual
# classes and columns the predicted classes. Passing the predictions first
# transposes the matrix and swaps false positives with false negatives.
sns.heatmap(mt.confusion_matrix(y_test,y_predicted_test_knn),annot = True,fmt = "d")
plt.xlabel("Predicted label")
plt.ylabel("True label")
plt.title("K-Nearest Neighbor(Test)")
plt.show()

<p>
The counts below take the actual class as the reference: a False Positive is an actual 0 predicted as 1, and a False Negative is an actual 1 predicted as 0. As a consistency check, True Positives + False Negatives equals the same 942 actual positive cases for every model.

Decision Tree Classifier:

The Decision Tree model shows a relatively balanced performance with a notable number of True Positives (317) and True Negatives (1064). However, there are significant False Positives (623) and False Negatives (625), suggesting that while the model correctly identifies a substantial number of instances, it also makes a considerable number of errors in both positive and negative predictions.

Random Forest Classifier:

The Random Forest model exhibits a high number of True Negatives (1639), but the True Positives (34) are notably low. Moreover, there is a considerable number of False Negatives (908) and a relatively small number of False Positives (48). This indicates that the model tends to be conservative in making positive predictions, resulting in a high number of actual positive instances being falsely classified as negatives.

K-Nearest Neighbors (KNN):

The KNN model demonstrates a relatively balanced performance, with a moderate number of True Positives (275) and True Negatives (1225). However, there are also significant False Positives (462) and False Negatives (667), indicating that the model, like the Decision Tree, has room for improvement in reducing both types of errors.

In summary, each model has its strengths and weaknesses. The Decision Tree and KNN models show a more balanced trade-off between True Positives and True Negatives, while the Random Forest model is much more conservative in predicting positive instances and therefore misses most of them.</p>