# Importation des modules

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
import plotly.express as px
from plotly.subplots import make_subplots
import plotly.graph_objects as go
import plotly.figure_factory as ff

from sklearn.preprocessing import MinMaxScaler

from sklearn.metrics import (
    precision_score, recall_score,
    f1_score, accuracy_score, roc_auc_score, balanced_accuracy_score)

from sklearn.svm import SVC
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier

## Téléchargement du dataset

In [None]:
# Download the dataset from Google Drive and save it as 'heart.csv' in the
# current working directory (-O sets the output file name).
!wget -O 'heart.csv' 'https://drive.google.com/uc?export=download&id=1If7zRUhRgQl9vs_FGdd8hb41cDqZZ6iM'

--2024-10-28 13:19:17--  https://drive.google.com/uc?export=download&id=1If7zRUhRgQl9vs_FGdd8hb41cDqZZ6iM
Resolving drive.google.com (drive.google.com)... 173.194.217.113, 173.194.217.101, 173.194.217.100, ...
Connecting to drive.google.com (drive.google.com)|173.194.217.113|:443... connected.
HTTP request sent, awaiting response... 303 See Other
Location: https://drive.usercontent.google.com/download?id=1If7zRUhRgQl9vs_FGdd8hb41cDqZZ6iM&export=download [following]
--2024-10-28 13:19:17--  https://drive.usercontent.google.com/download?id=1If7zRUhRgQl9vs_FGdd8hb41cDqZZ6iM&export=download
Resolving drive.usercontent.google.com (drive.usercontent.google.com)... 108.177.12.132, 2607:f8b0:400c:c0c::84
Connecting to drive.usercontent.google.com (drive.usercontent.google.com)|108.177.12.132|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 11279 (11K) [application/octet-stream]
Saving to: ‘heart.csv’


2024-10-28 13:19:19 (61.1 MB/s) - ‘heart.csv’ saved [11279/11279]



## 1. Visualisation et prétraitement du dataset

In [None]:
# Load 'heart.csv' into a pandas DataFrame; df_full is the frame used by the
# rest of the notebook.
df_full = pd.read_csv('heart.csv')

In [None]:
# Preview the first 3 rows of the dataset.
df_full.head(3)

Unnamed: 0,age,sex,cp,trtbps,chol,fbs,restecg,thalachh,exng,oldpeak,slp,caa,thall,output
0,63,F,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,F,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,M,1,130,204,0,0,172,0,1.4,2,0,2,1


In [None]:
# Dataset dimensions as (number of rows, number of columns).
df_full.shape

(303, 14)

In [None]:
# -> the dataset contains 303 records and 14 columns (13 features + the target)

In [None]:
# List the names of all columns.
df_full.columns

Index(['age', 'sex', 'cp', 'trtbps', 'chol', 'fbs', 'restecg', 'thalachh',
       'exng', 'oldpeak', 'slp', 'caa', 'thall', 'output'],
      dtype='object')

In [None]:
# describe columns
'''
| Variable     | Description                                                        |
|--------------|--------------------------------------------------------------------|
| age          | Âge de la personne                                                 |
| sex          | Sexe de la personne                                                |
| cp           | Type de douleur thoracique                                         |
| trtbps       | Pression artérielle au repos (en mm Hg)                            |
| chol         | Cholestérol en mg/dl (mesuré via un capteur d'IMC)                 |
| fbs          | Glycémie à jeun (> 120 mg/dl) (1 = vrai; 0 = faux)                 |
| restecg      | Résultats de l'électrocardiogramme au repos                        |
| thalachh     | Fréquence cardiaque maximale atteinte                              |
| exng         | Angine induite par l'exercice (1 = oui; 0 = non)                   |
| oldpeak      | Ancien pic                                                         |
| slp          | Inclinaison du segment ST                                          |
| caa          | Nombre de vaisseaux colorés par fluoroscopie                       |
| thall        | Thalassémie (0 = normale; 1 = défaut corrigé; 2 = défaut réversible)|
| output       | Target (1 = heart attack; 0 = pas de crise cardiaque)              |
'''
None

In [None]:
# Inspect the data type of each column to spot non-numeric ones.
df_full.dtypes

Unnamed: 0,0
age,int64
sex,object
cp,int64
trtbps,int64
chol,int64
fbs,int64
restecg,int64
thalachh,int64
exng,int64
oldpeak,float64


In [None]:
# Count the missing values in every column.
df_full.isna().sum()

Unnamed: 0,0
age,0
sex,0
cp,0
trtbps,0
chol,0
fbs,0
restecg,0
thalachh,0
exng,0
oldpeak,0


In [None]:
# -> No missing values

In [None]:
# The 'sex' column is read as text ('M' / 'F') and must be turned into a
# number before modelling: 1 when the value is 'M' (male), 0 otherwise
# (female, 'F').
#
# The guard makes this cell safe to re-run. Without it, a second execution
# would compare the already-encoded integers with 'M', find no match, and
# silently overwrite the whole column with 0.
if not pd.api.types.is_numeric_dtype(df_full['sex']):
    # Vectorised comparison; same result as the row-wise lambda on first run.
    df_full['sex'] = (df_full['sex'] == 'M').astype('int64')

df_full.head(3)  # 'sex' is now encoded as 0 / 1

Unnamed: 0,age,sex,cp,trtbps,chol,fbs,restecg,thalachh,exng,oldpeak,slp,caa,thall,output
0,63,0,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,0,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,1,1,130,204,0,0,172,0,1.4,2,0,2,1


In [None]:
# Show how the two target classes are distributed.
#
# Slice names are derived from the class value itself (1 / 0) instead of being
# matched by position to the row order of value_counts(), so a slice cannot be
# mislabelled if the majority class changes. The count column is named
# explicitly so the cell does not depend on the pandas version's default
# column names after reset_index().
class_labels = {1: 'Heart Attack', 0: 'non-Heart Attack'}
class_counts = (
    df_full['output']
    .map(class_labels)
    .value_counts()
    .rename_axis('class')
    .reset_index(name='count')
)

fig = px.pie(
    class_counts,
    names='class',
    values='count',
    title='Repartition of Heart Attack / non-Heart Attack',
    width=500,
    height=500,
)
fig.update_layout(
    font=dict(color='RebeccaPurple'),
    title=dict(font=dict(size=12), automargin=True, yref='paper')
)
fig.show()

In [None]:
# Plot one histogram per column on a grid of subplots.
n_cols = 3
# Ceiling division: as many rows as needed to hold every column.
n_rows = -(-len(df_full.columns) // n_cols)

fig = make_subplots(
    rows=n_rows, cols=n_cols, subplot_titles=list(df_full.columns)
)
for position, name in enumerate(df_full.columns):
    # Fill the grid left-to-right, top-to-bottom; plotly rows/cols are 1-based.
    row, col = divmod(position, n_cols)
    # add_trace replaces the deprecated append_trace.
    fig.add_trace(
        go.Histogram(x=df_full[name], name=name, nbinsx=150),
        row=row + 1, col=col + 1,
    )

title = 'Features Distribution'
fig.update_layout(
    height=750, width=900, title_text=title,
    plot_bgcolor='#ffffff',
    font=dict(color='RebeccaPurple')
)
fig.update_yaxes(
    mirror=True, ticks='outside',
    showline=True,
    gridcolor='lightgrey'
)
# Shrink the subplot titles (they are stored as layout annotations).
fig.update_annotations(font=dict(size=10))
fig.show()

In [None]:
# Analyse each distribution.
# For example, the patients are between about 30 and 70 years old. Does that suggest heart attacks mainly concern this age range?
# Do the same for every other distribution.

In [None]:
# Correlation analysis between all columns.
# The Pearson coefficient is used: it measures the strength of the linear
# relationship between two variables, from -1 (inverse) to +1 (direct).
# Use the heatmap to detect correlated features and explain whether each
# relationship makes sense in reality.
# From the correlation figure, can we predict a heart attack?

df_full_corr = df_full.corr(method='pearson')
title = 'Pearson Feature Correlations'

fig = px.imshow(df_full_corr, text_auto='.1f')
fig.update_layout(
    title=title,
    height=700,
    width=800,
    plot_bgcolor='#ffffff',
    font={'color': 'RebeccaPurple'},
)
fig.show(config={'displayModeBar': False})

## 2. Machine Learning

In [None]:
# Name of the label column, then the predictor columns fed to the models.
target_column = 'output'
features_columns = [
    'age', 'sex', 'cp', 'trtbps', 'chol', 'fbs', 'restecg',
    'thalachh', 'exng', 'oldpeak', 'slp', 'caa', 'thall',
]

In [None]:
# Feature matrix X (NumPy array) and target vector y (plain Python list).
X = df_full[features_columns].to_numpy()
y = df_full[target_column].tolist()

In [None]:
# Split the dataset: 20% is reserved for testing. The fixed random_state
# keeps the split identical from one run to the next.
# NOTE(review): MinMaxScaler is imported but the features are never scaled;
# consider whether the distance-/margin-based models need it — TODO confirm.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Instantiate one classifier per algorithm to compare.
# random_state=42 is passed wherever a seed is set below, so repeated runs
# build the same models; KNeighborsClassifier is created with its defaults.

# Linear and distance-based models
logistic_regression = LogisticRegression(random_state=42)
svm = SVC(kernel='linear', random_state=42)
kneigh_bors = KNeighborsClassifier()

# Tree-based models
decision_tree = DecisionTreeClassifier(random_state=42)
random_forest = RandomForestClassifier(random_state=42)

# Gradient-boosting models
xgb = XGBClassifier(random_state=42)
lgbm = LGBMClassifier(random_state=42, verbose=-1)

In [None]:
# Fit every classifier on the training split, reporting each one as it finishes.
all_models = (
    svm, xgb, random_forest, logistic_regression,
    decision_tree, kneigh_bors, lgbm,
)
for model in all_models:
    model.fit(X_train, y_train)
    print(f'Train complete for {model.__class__.__name__}')

In [None]:
# Evaluate every trained model on the test set and print precision, recall,
# accuracy and F1 as an aligned text table.
#
# The header, the rows and the separator all share the same column widths.
# Previously the header padded the metric column to 15 characters while the
# rows used 20, and the separator was 50 characters wide, so the columns did
# not line up.
name_width, metric_width, value_width = 30, 20, 10
table_width = name_width + metric_width + value_width + 2 * len(' | ')

print(f"{'Nom du modèle':<{name_width}} | {'Métrique':<{metric_width}} | {'Valeur':<{value_width}}")
for model in svm, xgb, random_forest, logistic_regression, decision_tree, kneigh_bors, lgbm:
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
    recall = recall_score(y_true=y_test, y_pred=y_pred)
    precision = precision_score(y_true=y_test, y_pred=y_pred)
    f1 = f1_score(y_true=y_test, y_pred=y_pred)
    model_name = type(model).__name__

    print("-" * table_width)
    # Only the first row of each group carries the model name.
    table_rows = (
        (model_name, 'Précision', precision),
        ('', 'Rappel', recall),
        ('', 'Exactitude', accuracy),
        ('', 'Score F1', f1),
    )
    for name_cell, metric_label, value in table_rows:
        print(f"{name_cell:<{name_width}} | {metric_label:<{metric_width}} | {value:.2f}")

Nom du modèle                  | Métrique        | Valeur    
--------------------------------------------------
SVC                            | Précision            | 0.88
                               | Rappel               | 0.88
                               | Exactitude           | 0.87
                               | Score F1             | 0.88
--------------------------------------------------
XGBClassifier                  | Précision            | 0.86
                               | Rappel               | 0.78
                               | Exactitude           | 0.82
                               | Score F1             | 0.82
--------------------------------------------------
RandomForestClassifier         | Précision            | 0.85
                               | Rappel               | 0.88
                               | Exactitude           | 0.85
                               | Score F1             | 0.86
--------------------------------------------------
Lo

In [None]:
# Plot, for every trained model, its confusion matrix on the test set,
# expressed as a percentage of all test samples.
#
# Fixes compared with the previous version of this cell:
#   * values are scaled to 0-100 before the '%' suffix is appended (fractions
#     such as 0.44 used to be rounded and displayed as "0.4%");
#   * each figure is titled with the model it belongs to (the title was empty);
#   * the unrelated metrics-table header is no longer printed;
#   * axis class labels are spelled 'Positive' and the x axis is titled.
class_names = ['Negative', 'Positive']  # same order as labels=[0, 1]

for model in svm, xgb, random_forest, logistic_regression, decision_tree, kneigh_bors, lgbm:
    y_pred = model.predict(X_test)

    # Rows are true labels, columns are predicted labels.
    cm = confusion_matrix(y_test, y_pred, labels=[0, 1])
    # Share of the whole test set in each cell, in percent, one decimal.
    cm = np.round(100 * cm / cm.sum(), 1)

    model_name = type(model).__name__
    title = f'Confusion matrix - {model_name}'

    fig = go.Figure(
        go.Heatmap(
            z=cm, text=cm, texttemplate='%{text}%',
            x=class_names,
            y=class_names,
        )
    )
    fig.update_layout(
        height=500, width=500, title_text=title,
        plot_bgcolor='#ffffff',
        font=dict(color='RebeccaPurple')
    )
    fig.update_xaxes(
        title_text='predicted labels',
        mirror=True, ticks='outside',
        showline=True, linecolor='black',
    )
    fig.update_yaxes(
        title_text='true labels',
        mirror=False, ticks='outside',
        showline=False,
        gridcolor='lightgrey',
        # Put the first class at the top, as in a conventional matrix layout.
        autorange='reversed',
    )
    fig.show()

Nom du modèle                  | Métrique        | Valeur    
