In [173]:
import joblib
import numpy as np
import pandas as pd
import seaborn as sns
from scipy import stats
import matplotlib.pyplot as plt
from xgboost import XGBClassifier
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RandomizedSearchCV
from sklearn.preprocessing import OneHotEncoder, StandardScaler, MinMaxScaler
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
%matplotlib inline
sns.set_style("whitegrid")
plt.style.use("fivethirtyeight")

In [174]:
dfh = pd.read_csv("heart2.csv")
dfh.head(2)

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,52,1,0,125,212,0,1,168,0,1.0,2,2,3,0
1,53,1,0,140,203,1,0,155,1,3.1,0,0,3,0


In [175]:
pd.set_option("display.float", "{:.3f}".format)
dfh.describe()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
count,1025.0,1025.0,1025.0,1025.0,1025.0,1025.0,1025.0,1025.0,1025.0,1025.0,1025.0,1025.0,1025.0,1025.0
mean,54.434,0.696,0.942,131.612,246.0,0.149,0.53,149.114,0.337,1.072,1.385,0.754,2.324,0.513
std,9.072,0.46,1.03,17.517,51.593,0.357,0.528,23.006,0.473,1.175,0.618,1.031,0.621,0.5
min,29.0,0.0,0.0,94.0,126.0,0.0,0.0,71.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,48.0,0.0,0.0,120.0,211.0,0.0,0.0,132.0,0.0,0.0,1.0,0.0,2.0,0.0
50%,56.0,1.0,1.0,130.0,240.0,0.0,1.0,152.0,0.0,0.8,1.0,0.0,2.0,1.0
75%,61.0,1.0,2.0,140.0,275.0,0.0,1.0,166.0,1.0,1.8,2.0,1.0,3.0,1.0
max,77.0,1.0,3.0,200.0,564.0,1.0,2.0,202.0,1.0,6.2,2.0,4.0,3.0,1.0


In [176]:
dfh.target.value_counts()

1    526
0    499
Name: target, dtype: int64

In [177]:
cat_val = []
num_val = []
for column in dfh.columns:
    if len(dfh[column].unique()) <= 10:
        cat_val.append(column)
    else:
        num_val.append(column)

In [178]:
cat_val

['sex', 'cp', 'fbs', 'restecg', 'exang', 'slope', 'ca', 'thal', 'target']

In [179]:
num_val

['age', 'trestbps', 'chol', 'thalach', 'oldpeak']

In [180]:
preprocessor = ColumnTransformer(
    transformers=[
        ('sc', StandardScaler(), ['age', 'trestbps', 'chol', 'thalach', 'oldpeak']),
        ('ohe', OneHotEncoder(), ['sex', 'cp', 'fbs', 'restecg', 'exang', 'slope', 'ca', 'thal'])])

In [181]:
X = dfh.drop('target', axis=1)
y = dfh.target

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [182]:
X

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal
0,52,1,0,125,212,0,1,168,0,1.000,2,2,3
1,53,1,0,140,203,1,0,155,1,3.100,0,0,3
2,70,1,0,145,174,0,1,125,1,2.600,0,0,3
3,61,1,0,148,203,0,1,161,0,0.000,2,1,3
4,62,0,0,138,294,1,1,106,0,1.900,1,3,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1020,59,1,1,140,221,0,1,164,1,0.000,2,0,2
1021,60,1,0,125,258,0,0,141,1,2.800,1,1,3
1022,47,1,0,110,275,0,0,118,1,1.000,1,1,2
1023,50,0,0,110,254,0,0,159,0,0.000,2,0,2


In [183]:
preprocessor.fit_transform(X_train,X_test)

array([[ 0.51227152,  0.46363159, -0.48467262, ...,  0.        ,
         1.        ,  0.        ],
       [ 0.40223597, -0.2188468 , -0.58820922, ...,  0.        ,
         0.        ,  1.        ],
       [-1.1382618 , -0.78757879, -0.04981887, ...,  0.        ,
         1.        ,  0.        ],
       ...,
       [-0.36801292,  0.46363159,  1.13049845, ...,  0.        ,
         0.        ,  1.        ],
       [-1.24829735, -1.24256438, -0.69174583, ...,  0.        ,
         0.        ,  1.        ],
       [-0.25797736, -1.12881798, -0.29830673, ...,  0.        ,
         1.        ,  0.        ]])

In [184]:

def aff_score(clf, X_test, y_test):
    pred = clf.predict(X_test)
    clf_report = pd.DataFrame(classification_report(y_test, pred, output_dict=True))
    print("Resultat:\n================================================")        
    print(f"Score de précision: {accuracy_score(y_test, pred) * 100:.3f}%")
    print("_______________________________________________")
    print(f"RAPPORT DE CLASSIFICATION:\n{clf_report}")
    print("_______________________________________________")
    print(f"Matrix de Confusion: \n {confusion_matrix(y_test, pred)}\n")

In [185]:
rfc = RandomForestClassifier(n_estimators=1000, random_state=42)
rfc.fit(X_train, y_train)

aff_score(rfc, X_test, y_test)

Resultat:
Score de précision: 98.052%
_______________________________________________
RAPPORT DE CLASSIFICATION:
                0       1  accuracy  macro avg  weighted avg
precision   0.964   1.000     0.981      0.982         0.981
recall      1.000   0.960     0.981      0.980         0.981
f1-score    0.981   0.979     0.981      0.980         0.980
support   159.000 149.000     0.981    308.000       308.000
_______________________________________________
Matrix de Confusion: 
 [[159   0]
 [  6 143]]



In [186]:
test_score = accuracy_score(y_test, rfc.predict(X_test)) * 100

res = pd.DataFrame(data=[["Random Forest Classifier",  test_score]], columns=['Model', 'Test d''exactitude %'])

res

Unnamed: 0,Model,Test dexactitude %
0,Random Forest Classifier,98.052


In [187]:
xgb = XGBClassifier(use_label_encoder=False)
xgb.fit(X_train, y_train)

aff_score(xgb, X_test, y_test)

Resultat:
Score de précision: 99.026%
_______________________________________________
RAPPORT DE CLASSIFICATION:
                0       1  accuracy  macro avg  weighted avg
precision   0.981   1.000     0.990      0.991         0.990
recall      1.000   0.980     0.990      0.990         0.990
f1-score    0.991   0.990     0.990      0.990         0.990
support   159.000 149.000     0.990    308.000       308.000
_______________________________________________
Matrix de Confusion: 
 [[159   0]
 [  3 146]]



In [188]:
test_score = accuracy_score(y_test, xgb.predict(X_test)) * 100

res1 = pd.DataFrame(data=[["XGBoost Classifier",  test_score]], columns=['Model', 'Test d''exactitude %'])
res = res.append(res1, ignore_index=True)
res

Unnamed: 0,Model,Test dexactitude %
0,Random Forest Classifier,98.052
1,XGBoost Classifier,99.026


In [189]:
n_estimators = [500, 900, 1100, 1500]
max_features = ['auto', 'sqrt']

max_depth = [2, 3, 5, 10, 15, None]
min_samples_split = [2, 5, 10]
min_samples_leaf = [1, 2, 4]

params = {
    'n_estimators': n_estimators, 
    'max_features': max_features,
    'max_depth': max_depth, 
    'min_samples_split': min_samples_split,
    'min_samples_leaf': min_samples_leaf
              }

rfc = RandomForestClassifier(random_state=42)
rfcv = GridSearchCV(rfc, params, scoring="accuracy", cv=3, verbose=1, n_jobs=-1)
rfcv.fit(X_train, y_train)
best_params = rfcv.best_params_
print(f"Meilleurs parametres: {best_params}")

rfc = RandomForestClassifier(**best_params)
rfc.fit(X_train, y_train)

aff_score(rfc, X_test, y_test)

Fitting 3 folds for each of 432 candidates, totalling 1296 fits


  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(


Meilleurs parametres: {'max_depth': 10, 'max_features': 'auto', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 1100}


  warn(


Resultat:
Score de précision: 98.052%
_______________________________________________
RAPPORT DE CLASSIFICATION:
                0       1  accuracy  macro avg  weighted avg
precision   0.964   1.000     0.981      0.982         0.981
recall      1.000   0.960     0.981      0.980         0.981
f1-score    0.981   0.979     0.981      0.980         0.980
support   159.000 149.000     0.981    308.000       308.000
_______________________________________________
Matrix de Confusion: 
 [[159   0]
 [  6 143]]



In [190]:
test_score = accuracy_score(y_test, rfc.predict(X_test)) * 100

updres = pd.DataFrame(data=[["Tuned Random Forest Classifier", test_score]], columns=['Model', 'Testing Accuracy %'])
updres

Unnamed: 0,Model,Testing Accuracy %
0,Tuned Random Forest Classifier,98.052


In [191]:
param = dict(
    n_estimators=stats.randint(10, 1000),
    max_depth=stats.randint(1, 10),
    learning_rate=stats.uniform(0, 1)
)

xgb = XGBClassifier(use_label_encoder=False)
xgbcv = RandomizedSearchCV(xgb, param, cv=5, n_iter=50, scoring='accuracy', n_jobs=-1, verbose=1)
xgbcv.fit(X_train, y_train)
best_params = xgbcv.best_params_
print(f"Meilleurs parametres: {best_params}")

xgb = XGBClassifier(**best_params)
xgb.fit(X_train, y_train)


aff_score(xgb, X_test, y_test)

Fitting 5 folds for each of 50 candidates, totalling 250 fits
Meilleurs parametres: {'learning_rate': 0.6429844968371161, 'max_depth': 9, 'n_estimators': 101}
Resultat:
Score de précision: 99.026%
_______________________________________________
RAPPORT DE CLASSIFICATION:
                0       1  accuracy  macro avg  weighted avg
precision   0.981   1.000     0.990      0.991         0.990
recall      1.000   0.980     0.990      0.990         0.990
f1-score    0.991   0.990     0.990      0.990         0.990
support   159.000 149.000     0.990    308.000       308.000
_______________________________________________
Matrix de Confusion: 
 [[159   0]
 [  3 146]]



In [192]:
test_score = accuracy_score(y_test, xgb.predict(X_test)) * 100

res1 = pd.DataFrame(data=[["Tuned XGBoost Classifier", test_score]], columns=['Model','Testing Accuracy %'])
pd.concat([updres, res1],ignore_index=True)

Unnamed: 0,Model,Testing Accuracy %
0,Tuned Random Forest Classifier,98.052
1,Tuned XGBoost Classifier,99.026


In [203]:
fullpipe = Pipeline(steps =[
    
    ('preprocessor', preprocessor),
   
    ('model', XGBClassifier(learning_rate= 0.64298449683711615, max_depth= 9, n_estimators= 101))
])

In [204]:
model = XGBClassifier(learning_rate= 0.64298449683711615, max_depth= 9, n_estimators= 101)
model.fit(X_train,y_train)

In [205]:
fullpipe

In [206]:
fullpipe.fit(X_train, y_train)

In [207]:
# Pour maximiser la reproductibilité, nous aimerions utiliser ce pipeline à plusieurs reprises pour nos nouvelles données entrantes. 
# Sauvegardons le pipeline en utilisant le paquet 'joblib' pour le sauvegarder dans un fichier pickle.

joblib.dump(fullpipe, 'health_pipeline1.pkl')

# Maintenant nous pouvons appeler ce pipeline, qui inclut toutes sortes de prétraitements de données dont nous avons besoin.
loaded_pipeline = joblib.load('health_pipeline1.pkl')


In [208]:
dfh1 = pd.read_csv("heart2.csv")
dfh1 = dfh1.drop('target', axis=1)
dfh1.to_csv('test.csv', index=False)
dfh1.head(2)

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal
0,52,1,0,125,212,0,1,168,0,1.0,2,2,3
1,53,1,0,140,203,1,0,155,1,3.1,0,0,3


In [213]:
sample1 = [[34,0,1,118,210,0,1,192,0,0.7,2,0,2]]
sample2 = [[58,0,0,100,248,0,0,122,0,1.0,1,0,2]]

sample = pd.DataFrame(sample1, columns=['age', 'sex', 'cp', 'trestbps', 'chol', 'fbs', 'restecg', 'thalach', 'exang', 'oldpeak', 'slope', 'ca', 'thal'])

In [214]:

if loaded_pipeline.predict(sample) == 0:
    print('PATIENT EN ETAT NORMAL')
else:
    print('ATTENTION!!! PATIENT EN ETAT CRITIQUE')

ATTENTION!!! PATIENT EN ETAT CRITIQUE
