### 0. Libraries

In [1]:
# Step 1: Import the required modules
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, confusion_matrix, classification_report
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import LogisticRegression
import warnings
warnings.filterwarnings('ignore')
from IPython.display import Image

### 1. Loading Data

In [2]:
# Step 2: Load the dataset
# The data directory can be overridden through the HEART_DATA_DIR environment
# variable; the default is the location originally hardcoded in this notebook.
# The file is read through an explicit path instead of os.chdir(), so the
# kernel's working directory is no longer changed as a side effect.
DATA_DIR = os.environ.get('HEART_DATA_DIR', 'C:/Users/Simplonco/Documents/jupyter-notebook/datasets')
data = pd.read_table(os.path.join(DATA_DIR, 'heart.txt'))

In [3]:
# Step 3: Work on a deep copy so the loaded `data` frame is left untouched
df = data.copy(deep=True)
df

Unnamed: 0,age,sexe,pression,cholester,sucre,electro,taux_max,angine,depression,pic,vaisseau,coeur
0,58,masculin,100,234,A,A,156,non,1,1,B,presence
1,41,feminin,130,204,A,C,172,non,14,1,A,absence
2,58,masculin,150,270,A,C,111,oui,8,1,A,presence
3,53,masculin,140,203,B,C,155,oui,31,3,A,presence
4,41,masculin,130,214,A,C,168,non,20,2,A,absence
...,...,...,...,...,...,...,...,...,...,...,...,...
265,62,feminin,140,268,A,C,160,non,36,3,C,presence
266,62,masculin,130,231,A,A,146,non,18,2,D,absence
267,64,masculin,110,211,A,C,144,oui,18,2,A,absence
268,63,feminin,150,407,A,C,154,non,40,2,D,presence


### 2. Preprocessing

In [4]:
# Display the column labels of the working frame.
df.columns

Index(['age', 'sexe', 'pression', 'cholester', 'sucre', 'electro', 'taux_max',
       'angine', 'depression', 'pic', 'vaisseau', 'coeur'],
      dtype='object')

In [5]:
# Split the columns by dtype. 'pic' is stored as a number, but it is taken out
# of the numeric frame and appended to the non-numeric frame instead.
df_quanti = df.select_dtypes(include="number").drop(columns='pic')
df_quali = pd.concat([df.select_dtypes(exclude="number"), df['pic']], axis=1)

In [6]:
# Display the numeric columns (without 'pic').
df_quanti

Unnamed: 0,age,pression,cholester,taux_max,depression
0,58,100,234,156,1
1,41,130,204,172,14
2,58,150,270,111,8
3,53,140,203,155,31
4,41,130,214,168,20
...,...,...,...,...,...
265,62,140,268,160,36
266,62,130,231,146,18
267,64,110,211,144,18
268,63,150,407,154,40


In [7]:
# Display the non-numeric columns, with 'pic' appended as the last column.
df_quali

Unnamed: 0,sexe,sucre,electro,angine,vaisseau,coeur,pic
0,masculin,A,A,non,B,presence,1
1,feminin,A,C,non,A,absence,1
2,masculin,A,C,oui,A,presence,1
3,masculin,B,C,oui,A,presence,3
4,masculin,A,C,non,A,absence,2
...,...,...,...,...,...,...,...
265,feminin,A,C,non,C,presence,3
266,masculin,A,A,non,D,absence,2
267,masculin,A,C,oui,A,absence,2
268,feminin,A,C,non,D,presence,2


In [8]:
# Binary target taken from 'coeur': 'absence' is encoded as 0, 'presence' as 1.
y = df_quali['coeur'].map({'absence': 0, 'presence': 1})
y

0      1
1      0
2      1
3      1
4      0
      ..
265    1
266    0
267    0
268    1
269    0
Name: coeur, Length: 270, dtype: int64

In [9]:
# Remove the target ('coeur') and 'pic' from the categorical frame in one call.
df_quali = df_quali.drop(columns=['coeur', 'pic'])
df_quali

Unnamed: 0,sexe,sucre,electro,angine,vaisseau
0,masculin,A,A,non,B
1,feminin,A,C,non,A
2,masculin,A,C,oui,A
3,masculin,B,C,oui,A
4,masculin,A,C,non,A
...,...,...,...,...,...
265,feminin,A,C,non,C
266,masculin,A,A,non,D
267,masculin,A,C,oui,A
268,feminin,A,C,non,D


In [10]:
# Dummy-encode every categorical column; drop_first=True drops the first level
# of each variable so that it serves as the reference category.
# dtype=int pins the indicators to 0/1 integers: without it the dtype depends
# on the installed pandas version (boolean columns from pandas 2.0 onwards).
list_var = list(df_quali.columns)
df_quali_dumy = pd.get_dummies(df_quali[list_var], drop_first=True, dtype=int)
df_quali_dumy

Unnamed: 0,sexe_masculin,sucre_B,electro_B,electro_C,angine_oui,vaisseau_B,vaisseau_C,vaisseau_D
0,1,0,0,0,0,1,0,0
1,0,0,0,1,0,0,0,0
2,1,0,0,1,1,0,0,0
3,1,1,0,1,1,0,0,0
4,1,0,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...
265,0,0,0,1,0,0,1,0
266,1,0,0,0,0,0,0,1
267,1,0,0,1,1,0,0,0
268,0,0,0,1,0,0,0,1


In [11]:
# Feature scaling: standardize the numeric columns.
# index=df_quanti.index keeps the original row labels; without it the scaled
# frame gets a fresh 0..n-1 index and the column-wise concat done afterwards
# would misalign rows whenever the data does not use a default index.
# NOTE(review): the scaler is fitted on every row, before the train/test split
# performed later in the notebook, so the test rows contribute to the scaling
# statistics. Consider fitting it on the training rows only.
sc = StandardScaler()
df_quanti = pd.DataFrame(
    sc.fit_transform(df_quanti),
    columns=df_quanti.columns,
    index=df_quanti.index,
)
df_quanti

Unnamed: 0,age,pression,cholester,taux_max,depression
0,0.392278,-1.758109,-0.303530,0.273420,-0.831083
1,-1.477460,-0.075410,-0.885033,0.965378,0.306188
2,0.392278,1.046389,0.394274,-1.672713,-0.218706
3,-0.157645,0.485490,-0.904417,0.230172,1.793389
4,-1.477460,-0.075410,-0.691199,0.792388,0.831083
...,...,...,...,...,...
265,0.832217,0.485490,0.355507,0.446409,2.230801
266,0.832217,-0.075410,-0.361681,-0.159054,0.656118
267,1.052186,-1.197209,-0.749349,-0.245549,0.656118
268,0.942201,1.046389,3.049805,0.186925,2.580731


In [12]:
# Assemble the modelling table column-wise: dummy-encoded categoricals, the
# 'pic' column as stored in `df`, the scaled numeric columns, then the target.
df = pd.concat(
    [df_quali_dumy, df['pic'], df_quanti, y],
    axis=1,
)
df

Unnamed: 0,sexe_masculin,sucre_B,electro_B,electro_C,angine_oui,vaisseau_B,vaisseau_C,vaisseau_D,pic,age,pression,cholester,taux_max,depression,coeur
0,1,0,0,0,0,1,0,0,1,0.392278,-1.758109,-0.303530,0.273420,-0.831083,1
1,0,0,0,1,0,0,0,0,1,-1.477460,-0.075410,-0.885033,0.965378,0.306188,0
2,1,0,0,1,1,0,0,0,1,0.392278,1.046389,0.394274,-1.672713,-0.218706,1
3,1,1,0,1,1,0,0,0,3,-0.157645,0.485490,-0.904417,0.230172,1.793389,1
4,1,0,0,1,0,0,0,0,2,-1.477460,-0.075410,-0.691199,0.792388,0.831083,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
265,0,0,0,1,0,0,1,0,3,0.832217,0.485490,0.355507,0.446409,2.230801,1
266,1,0,0,0,0,0,0,1,2,0.832217,-0.075410,-0.361681,-0.159054,0.656118,0
267,1,0,0,1,1,0,0,0,2,1.052186,-1.197209,-0.749349,-0.245549,0.656118,0
268,0,0,0,1,0,0,0,1,2,0.942201,1.046389,3.049805,0.186925,2.580731,1


### 3. Modelisation

In [13]:
# Target: the 'coeur' column; features: every other column.
y = df['coeur']
X = df.drop(columns='coeur')

In [14]:
# Hold out 20% of the rows as a test set; random_state=0 fixes the shuffle.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=0, test_size=0.2
)

In [15]:
# Step 5: Build the logistic regression model and fit it on the training set
classifier = LogisticRegression(solver='liblinear', random_state=0)
classifier.fit(X=X_train, y=y_train)

LogisticRegression(random_state=0, solver='liblinear')

In [16]:
# Step 6: Make predictions using the model
# Predict the class label of each row of the test set
y_pred = classifier.predict(X_test)
y_pred

array([1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1,
       0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1,
       1, 0, 0, 1, 1, 0, 0, 0, 0, 1], dtype=int64)

### 4. Evaluation

In [17]:
# Evaluation (model performance): score of the classifier on the test set
s = classifier.score(X_test,y_test)
print(f'Le score du model est :{s}')

Le score du model est :0.8703703703703703


In [18]:
# Step 7: Display metrics
# Confusion matrix (rows: actual class, columns: predicted class).
# labels=[0, 1] pins both the 2x2 shape and the class order (0 = absence,
# 1 = presence), so the labelled frame below stays valid even when one of the
# classes does not occur in y_test / y_pred.
cm = confusion_matrix(y_test, y_pred, labels=[0, 1])
cm = pd.DataFrame(cm, index=['absence','presence'], columns=['absence','presence'])
cm

Unnamed: 0,absence,presence
absence,31,6
presence,1,16


In [19]:
# Text report of the per-class metrics computed from y_test and y_pred.
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.97      0.84      0.90        37
           1       0.73      0.94      0.82        17

    accuracy                           0.87        54
   macro avg       0.85      0.89      0.86        54
weighted avg       0.89      0.87      0.87        54



In [20]:
def taux_erreur(x1, x2):
    """Return the misclassification rate between two label sequences, in percent.

    The two inputs are compared element by element, by position. The previous
    implementation read two fixed cells of a 2x2 contingency table, so it
    raised IndexError when only one class occurred on either side and ignored
    any class beyond the first two.

    Parameters
    ----------
    x1 : array-like
        Observed labels (list, NumPy array or pandas Series).
    x2 : array-like
        Predicted labels, with the same shape as ``x1``.

    Returns
    -------
    float
        Percentage of positions where the labels differ, rounded to two
        decimals.

    Raises
    ------
    ValueError
        If the inputs are empty or do not have the same shape.
    """
    # np.asarray drops any pandas index, so the comparison is purely positional.
    observed = np.asarray(x1)
    predicted = np.asarray(x2)
    if observed.shape != predicted.shape:
        raise ValueError(
            f"x1 and x2 must have the same shape, got {observed.shape} and {predicted.shape}"
        )
    if observed.size == 0:
        raise ValueError("x1 and x2 must not be empty")
    n_errors = int(np.count_nonzero(observed != predicted))
    return round(n_errors / observed.size * 100, 2)

In [21]:
# Error rate (in percent) of the predictions on the test set.
taux_erreur(y_test, y_pred)

12.96

### 5. Commentaires