In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from sklearn.naive_bayes import GaussianNB

from sklearn.metrics import accuracy_score

import torch

In [None]:
# Load the heart-disease CSV from the Kaggle input directory into a DataFrame.
df = pd.read_csv("/kaggle/input/heart-disease-dataset/heart.csv")

In [None]:
# Display the (rows, columns) dimensions of the loaded frame.
df.shape

In [None]:
# Count missing values per column.
df.isna().sum()

In [None]:
# Columns treated as categorical in the plots below. Note that 'target'
# (used as the hue/label in later cells) is part of this list.
cats = ['sex', 'cp', 'fbs', 'restecg', 'exang', 'slope', 'ca', 'thal', 'target']
# Columns treated as numeric measurements (pair plots, histograms, box plots,
# outlier capping).
nums = ['age', 'trestbps', 'chol', 'thalach']

<h2 id="pies" style='font-size:48px;'>Categorical variable counts and ratios shown in pie charts 🥧</h2>

In [None]:
# Draw one pie chart per categorical column on a 3x3 grid. Each title shows the
# column name followed by one line per category with its absolute count.
fig, axes = plt.subplots(nrows=3, ncols=3, figsize=(15, 15))
for ax, col in zip(axes.flat, cats):
    freqs = df[col].value_counts()
    ax.pie(freqs, labels=freqs.index, autopct='%0.2f%%')
    ax.legend(freqs.index, loc='best')
    # Build the title by joining lines. The previous approach appended "\n"
    # after every line and then sliced with [:-2], which removed the trailing
    # newline *and* the final "s" of the last "vals".
    title_lines = [str(col)]
    title_lines += ["{} - {} vals".format(p, l) for p, l in freqs.items()]
    ax.set_title("\n".join(title_lines))

plt.tight_layout()
plt.show()

In [None]:
def pairplot_hue_combined(df, hue1, hue2, _vars):
    """Pair-plot `_vars` coloured by the combination of two columns.

    A helper column named "<hue1>_<hue2>" is added to a copy of `df`; its
    values are the string forms of both columns joined by a single space.
    After the pair plot, a pie chart with the size of every combined group
    is shown.

    Parameters
    ----------
    df : pandas.DataFrame
        Source data; it is copied, not modified.
    hue1, hue2 : str
        Names of the two columns to combine into one hue.
    _vars : list of str
        Columns passed to `sns.pairplot` as `vars`.
    """
    combined_col = '{}_{}'.format(hue1, hue2)
    combined_labels = df[hue1].astype(str) + ' ' + df[hue2].astype(str)

    plot_df = df.copy()
    plot_df[combined_col] = combined_labels

    sns.pairplot(plot_df, vars=_vars, hue=combined_col)
    plt.show()

    group_sizes = plot_df[combined_col].value_counts()
    plt.pie(group_sizes, labels=group_sizes.index, autopct='%0.2f%%')
    plt.show()

<h2 style='font-size:48px;'>Relationship of test results between patients with and without heart disease</h2>

In [None]:
# Pairwise relationships between the numeric columns, coloured by `target`.
sns.pairplot(data=df, hue='target', vars=nums)
plt.show()

<h2 style='font-size:36px;'>Relationship between age, gender and test results, and how they influence whether a patient has a heart problem</h2>

<p style='font-size:24px;'>Each legend entry combines two class labels (sex first, then target):</p>
<div style='font-size:24px;'>
    <ul style='display:inline-block;'>
        <li>
            1 - Male; 0 - does not have heart disease
        </li>
        <li>
            0 - Female; 0 - does not have heart disease
        </li>
        <li>
            0 - Female; 1 - has heart disease
        </li>
        <li>
            1 - Male; 1 - has heart disease
        </li>
    </ul>
</div>

In [None]:
# Numeric columns coloured by the combined (sex, target) label.
pairplot_hue_combined(df=df, hue1='sex', hue2='target', _vars=nums)

<h2 style='font-size:32px;'>Relationship between age, chest pain type and test results, and how they influence whether a patient has a heart problem</h2>

<p style='font-size:24px;'>Each legend entry combines two class labels (chest pain type first, then target):</p>
<div style='font-size:24px;'>
    <ul style='display:inline-block;'>
        <li>
            0 - Chest Pain of type 1; 0 - no heart disease
        </li>
        <li>
            0 - Chest Pain of type 1; 1 - heart disease
        </li>
        <li>
            1 - Chest Pain of type 2; 1 - heart disease
        </li>
        <li>
            2 - Chest Pain of type 3; 1 - heart disease
        </li>
        <li>
            2 - Chest Pain of type 3; 0 - no heart disease
        </li>
        <li>
            1 - Chest Pain of type 2; 0 - no heart disease
        </li>
        <li>
            3 - Chest Pain of type 4; 1 - heart disease
        </li>
        <li>
            3 - Chest Pain of type 4; 0 - no heart disease
        </li>
    </ul>
</div>

In [None]:
# Numeric columns coloured by the combined (cp, target) label.
pairplot_hue_combined(df=df, hue1='cp', hue2='target', _vars=nums)

<h2 style='font-size:48px;'>Relationship between numerical variables and data distribution shown in histograms, bar plots and box plots 📊</h2>

In [None]:
def plots_num_cat(df, x, y):
    """Plot the distribution of each numeric column in `x`, split by category `y`.

    The figure has three rows and one column per entry of `x`:
    row 0 is a histogram with KDE coloured by `y`, row 1 is a bar plot of the
    per-group mean (with value labels), and row 2 is a box plot per group.

    Parameters
    ----------
    df : pandas.DataFrame
        Data containing the columns named in `x` and `y`.
    x : sequence of str
        Numeric column names, one figure column each.
    y : str
        Categorical column used for grouping / hue.
    """
    # squeeze=False keeps `axes` two-dimensional even when `x` holds a single
    # column; with the default squeezing, axes[0][i] fails for len(x) == 1.
    fig, axes = plt.subplots(nrows=3, ncols=len(x), figsize=(15, 11), squeeze=False)
    grouped = df.groupby(y)
    for i, col in enumerate(x):
        hist_ax, bar_ax, box_ax = axes[0][i], axes[1][i], axes[2][i]

        sns.histplot(df, x=col, hue=y, ax=hist_ax, kde=True)

        means = grouped[col].mean()
        sns.barplot(x=means.index, y=means, ax=bar_ax)
        for container in bar_ax.containers:
            bar_ax.bar_label(container, size=12)

        sns.boxplot(x=df[y], y=df[col], ax=box_ax)

    plt.suptitle("Values distributed by {}".format(y), size=20)

    plt.tight_layout()
    plt.show()

In [None]:
# One three-row figure per categorical column, each covering all numeric columns.
for cat_col in cats:
    plots_num_cat(df, nums, cat_col)

<h2 style='font-size:48px;'>Correlation map 🗺</h2>

In [None]:
# Annotated heatmap of the pairwise correlations between all columns.
fig, ax = plt.subplots(figsize=(15, 15))
corr = df.corr()
sns.heatmap(corr, annot=True, ax=ax)
plt.show()

<h2 style='font-size:48px;'>Inspecting the numerical data for outliers 👽</h2>

In [None]:
# One horizontal box plot per numeric column on a 2x2 grid (row-major order).
fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(15, 15))
for ax, col in zip(axes.ravel(), nums):
    sns.boxplot(df, x=col, ax=ax)

plt.show()

In [None]:
def remove_outliers(df, x):
    """Cap the outliers of column `x` at the 1.5*IQR fences.

    Despite the name, no rows are dropped: values below Q1 - 1.5*IQR are
    replaced by that lower fence and values above Q3 + 1.5*IQR by the upper
    fence.

    Parameters
    ----------
    df : pandas.DataFrame
        Input data. It is NOT modified; a capped copy is returned.
    x : str
        Name of the numeric column to cap.

    Returns
    -------
    pandas.DataFrame
        Copy of `df` whose column `x` has been clipped to the fences. The
        column is converted to float because the fences are generally
        fractional and cannot be stored losslessly in an integer column.
    """
    q1, q3 = np.percentile(df[x], [25, 75])
    iqr = q3 - q1
    lower_fence = q1 - iqr * 1.5
    upper_fence = q3 + iqr * 1.5

    # Work on a copy so that re-running the calling cell, or keeping a
    # reference to the raw frame, never sees silently altered data.
    capped = df.copy()
    capped[x] = capped[x].astype(float).clip(lower=lower_fence, upper=upper_fence)

    return capped

In [None]:
# Cap outliers in every numeric column except the first entry of `nums`.
for num_col in nums[1:]:
    df = remove_outliers(df, num_col)

In [None]:
# All columns but the last are features; the last column is the label.
x, y = df.iloc[:, :-1].to_numpy(), df.iloc[:, -1].to_numpy()

# Hold out 20% of the rows for testing, with a fixed seed.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=42
)

<h2 style='font-size:48px;'>Training pipeline and evaluation 🏋️‍♀️</h2>

In [None]:
def training(model, name):
    """Fit `model` on the notebook's training split and print its test accuracy.

    Reads the module-level `x_train`, `y_train`, `x_test` and `y_test`.

    Parameters
    ----------
    model : estimator
        Object exposing `fit(X, y)` and `predict(X)`.
    name : str
        Label printed in front of the accuracy.

    Returns
    -------
    The same `model` instance, now fitted.
    """
    model.fit(x_train, y_train)
    pred = model.predict(x_test)
    # accuracy_score is documented as (y_true, y_pred). Accuracy happens to be
    # symmetric, but the documented order keeps this safe if another metric is
    # swapped in later.
    acc = accuracy_score(y_test, pred) * 100
    print("{}: {}%".format(name, round(acc, 2)))
    print()
    return model

In [None]:
# Candidate classifiers. The randomised ensembles get a fixed random_state so
# that a fresh "Restart & Run All" reports the same accuracies every time.
abc = AdaBoostClassifier(learning_rate=0.1, random_state=42)
rfc = RandomForestClassifier(max_depth=50, random_state=42)
svc = SVC(C=0.8)
lgr = LogisticRegression()
gnb = GaussianNB()
knn = KNeighborsClassifier(n_neighbors=3)
xgb = XGBClassifier()

# `models` and `names` are parallel lists: names[k] labels models[k].
models = [abc, rfc, svc, lgr, gnb, knn, xgb]

names = ['Ada Boost', 'Random Forest', 'SVC',
        'Logistic Regression', 'Naive Bayes', 'KNN',
        'XGB']

<h2 style='font-size:48px;'>ML algorithms performance 📈</h2>

In [None]:
# Fit every candidate in order and keep the fitted estimators.
trained = [training(clf, label) for clf, label in zip(models, names)]

In [None]:
# Scale the features for the neural network. The scaler is fitted on the
# training rows only, so the min/max of the held-out rows never leak into
# preprocessing (held-out values may therefore fall slightly outside [0, 1]).
# `x_train` comes from the earlier train_test_split of `x`; the split applied
# to `x_n` afterwards uses the same random_state and test_size, so it selects
# the same rows. Keep those two splits in sync.
mms = MinMaxScaler()
mms.fit(x_train)
x_n = mms.transform(x)
x_n = torch.tensor(x_n).type(torch.FloatTensor)
y_n = torch.tensor(y).type(torch.FloatTensor)

In [None]:
# Split the scaled tensors 80/20 with the same seed as the raw-array split.
xn_train, xn_test, yn_train, yn_test = train_test_split(
    x_n, y_n, test_size=0.2, random_state=42
)

<h2 style='font-size:48px;'>Deep Learning model built using PyTorch 🔥</h2>

In [None]:
class HeartClassifier(torch.nn.Module):
    """Fully connected classifier: in_channels -> 64 -> 128 -> num_classes.

    Parameters
    ----------
    in_channels : int
        Number of input features per sample.
    num_classes : int
        Number of output classes.

    `forward` returns raw, unnormalised logits of shape (batch, num_classes).
    `torch.nn.CrossEntropyLoss` applies log-softmax itself, so feeding it
    softmax probabilities would normalise twice and flatten the gradients.
    Use `out.argmax(1)` for class predictions, or `torch.softmax(out, dim=1)`
    when probabilities are needed.
    """

    def __init__(self, in_channels, num_classes):
        super(HeartClassifier, self).__init__()
        self.layer = torch.nn.Linear(in_channels, 64)
        self.layer1 = torch.nn.Linear(64, 128)
        self.layer2 = torch.nn.Linear(128, num_classes)

    def forward(self, x):
        # ReLU between the linear layers: without a non-linearity the three
        # layers collapse into a single affine map.
        x = torch.relu(self.layer(x))
        x = torch.relu(self.layer1(x))
        # No softmax here; see the class docstring.
        return self.layer2(x)

In [None]:
# Seed torch so the random weight initialisation, and therefore the whole
# training curve, is reproducible across kernel restarts.
torch.manual_seed(42)

# Take the input width from the training tensor instead of hard-coding it, so
# the model keeps working if the feature set changes. Two output classes.
model = HeartClassifier(xn_train.shape[1], 2)
criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)

<h2 style='font-size:48px;'> Deep Learning benchmark and performance evaluation 🚀</h2>

In [None]:
EPOCHS = 3000

# Per-epoch metrics, one entry per epoch.
train_history = []
val_history = []
acc_history = []

for i in range(1, EPOCHS+1):
    # Training step on the full training set (one batch per epoch).
    model.train()
    optimizer.zero_grad()
    out = model(xn_train)
    loss = criterion(out, yn_train.long())
    loss.backward()
    optimizer.step()

    # `criterion` is CrossEntropyLoss with its default reduction='mean', so
    # loss.item() is already the per-sample average. Dividing by the batch size
    # again would put the train and validation curves on different scales,
    # because the two sets have different sizes.
    train_loss = loss.item()
    train_history.append(train_loss)

    # Evaluation on the held-out set.
    model.eval()
    with torch.no_grad():
        out = model(xn_test)
        val_loss = criterion(out, yn_test.long()).item()
        acc = (out.argmax(1) == yn_test).sum().item()/out.size(0)

    val_history.append(val_loss)
    acc_history.append(acc)

    # Progress report every 500 epochs.
    if i%500 == 0:
        print("Epoch {} || train loss: {}".format(i, train_loss), end=' ')
        print("|| val loss: {} || acc: {}".format(val_loss, acc))
        print()

In [None]:
# Plot the per-epoch training loss, validation loss and validation accuracy.
epochs = list(range(1, EPOCHS+1))
fig, axes = plt.subplots(nrows=1, ncols=3, figsize=(15, 5))

curves = [
    (train_history, "Train loss history", "loss"),
    (val_history, "Val loss history", "loss"),
    (acc_history, "Accuracy history", "accuracy"),
]
for ax, (history, title, ylabel) in zip(axes, curves):
    ax.plot(epochs, history)
    ax.set_title(title)
    ax.set_xlabel("epoch")
    ax.set_ylabel(ylabel)

# Finish like the other plotting cells, so the cell renders the figure instead
# of echoing the Text object returned by the last set_title call.
plt.tight_layout()
plt.show()