In [12]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from numpy import asarray


print("Bibliothèques : OK")

# Load the heart-failure data set and discard every row with a missing value.
df = pd.read_csv("../input/heart-failure-prediction/heart.csv")
df = df.dropna(axis=0)

# PRE-PROCESSING

# Integer code for each label of the categorical (string) columns.
CATEGORY_CODES = {
    'Sex': {'M': 0, 'F': 1},
    'ChestPainType': {'TA': 0, 'ATA': 1, 'NAP': 2, 'ASY': 3},
    'RestingECG': {'Normal': 0, 'ST': 1, 'ST-T': 2, 'LVH': 3},
    'ExerciseAngina': {'N': 0, 'Y': 1},
    'ST_Slope': {'Up': 0, 'Flat': 1, 'Down': 2},
}

# Encode by assigning the result back to the frame. The previous form,
# df[col].replace(..., inplace=True), mutates a temporary Series: pandas 2.x
# warns about it and, with Copy-on-Write enabled, df is left unchanged.
for column, codes in CATEGORY_CODES.items():
    encoded = df[column].map(codes)
    if encoded.isna().any():
        # A label missing from the mapping would otherwise become NaN silently.
        unknown = sorted(set(df.loc[encoded.isna(), column]))
        raise ValueError(f"Unexpected values in column {column!r}: {unknown}")
    df[column] = encoded.astype(int)

# Separate the features from the target.
y = df['HeartDisease']
X = df.drop('HeartDisease', axis=1)

# NORMALISATION - MinMaxScaler and StandardScaler
# NOTE(review): both scalers are fitted on every row of X; for an unbiased
# train/test evaluation a scaler should be fitted on the training rows only.
X_minMax = MinMaxScaler().fit_transform(X)
X_standard = StandardScaler().fit_transform(X)


# **Séparation du DataSet**

In [13]:
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier


# Build the training and test arrays.
# The raw features are split first and the scaler is fitted on the training
# rows only: splitting X_standard (standardised with statistics from all rows)
# lets test-set information leak into the training features.
# random_state and test_size are unchanged, so the same rows land in each split.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=5)
split_scaler = StandardScaler().fit(X_train_raw)
X_train = split_scaler.transform(X_train_raw)
X_test = split_scaler.transform(X_test_raw)


# **CROSS-VALIDATION**

In [14]:
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import validation_curve

# **GridSearchCV**

In [15]:
from sklearn.model_selection import GridSearchCV


# **Matrice de Confusion**

In [16]:
from sklearn.metrics import confusion_matrix

# **Learning Curve**

In [17]:
from sklearn.model_selection import learning_curve

# **GENERALISATION D'EVALUATION**

In [18]:
from sklearn.metrics import f1_score, classification_report

from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier, BaggingClassifier
from sklearn.svm import SVC

# Baseline estimators with default hyper-parameters. The four ensemble models
# are each constructed with random_state=0.
RandomForrest, AdaBoost, GB, BC = (
    estimator_cls(random_state=0)
    for estimator_cls in (
        RandomForestClassifier,
        AdaBoostClassifier,
        GradientBoostingClassifier,
        BaggingClassifier,
    )
)
SVM, KNN = SVC(), KNeighborsClassifier()

# Display name -> estimator, in the order the models are evaluated.
dict_of_models = dict(
    RandomForest=RandomForrest,
    AdaBoost=AdaBoost,
    SVM=SVM,
    KNN=KNN,
    GB=GB,
    BC=BC,
)


def evaluation(model):
    """Fit ``model`` on the training split, report test metrics, plot its learning curve.

    Uses the module-level ``X_train``, ``y_train``, ``X_test`` and ``y_test``.
    Prints the confusion matrix and the classification report computed on the
    test split, then plots the mean train and validation accuracy returned by
    a 4-fold ``learning_curve`` over ten training-set fractions (0.1 to 1.0).

    Parameters
    ----------
    model : estimator with ``fit`` and ``predict``
        Fitted in place on ``X_train`` / ``y_train``.
    """
    model.fit(X_train, y_train)
    yPred = model.predict(X_test)
    print(confusion_matrix(y_test, yPred))
    print(classification_report(y_test, yPred))

    N, train_score, val_score = learning_curve(
        model, X_train, y_train,
        train_sizes=np.linspace(0.1, 1.0, 10), cv=4, scoring='accuracy',
    )

    fig, ax = plt.subplots(figsize=(12, 8))
    # Scores have one column per CV fold; average across folds for each size.
    ax.plot(N, train_score.mean(axis=1), label='train')
    ax.plot(N, val_score.mean(axis=1), label='Validation')
    # Label the figure so plots of different models can be told apart.
    ax.set(
        title=f"Learning curve - {type(model).__name__}",
        xlabel="Training set size",
        ylabel="Accuracy",
    )
    ax.legend()
    # Flush the figure now so it is rendered right after this model's report
    # instead of all figures piling up at the end of the cell.
    plt.show()
    
# Evaluate every baseline in turn, printing its name ahead of its report.
for name in dict_of_models:
    model = dict_of_models[name]
    print(name)
    evaluation(model)
#ok

* RandomForest : surapprentissage (overfitting)
* AdaBoost : à exploiter
* SVM : à exploiter
* KNN : à oublier
* GB (GradientBoosting) : à oublier
* BC (Bagging) : à oublier


# **OPTIMISATION**

In [19]:
# AdaBoost: exhaustive search over the number of estimators and the learning
# rate, scored by accuracy with 4-fold cross-validation on the training split.
hyper_params = dict(
    n_estimators=[1, 10, 50, 100, 500],
    learning_rate=[0.0001, 0.001, 0.01, 0.1, 1.0],
)

grid = GridSearchCV(
    estimator=AdaBoost,
    param_grid=hyper_params,
    scoring='accuracy',
    cv=4,
)
grid.fit(X_train, y_train)
print(grid.best_params_)

# Report how the selected configuration performs on the held-out test split.
y_pred = grid.predict(X_test)
print(classification_report(y_test, y_pred))

In [20]:
# Run the full evaluation (metrics + learning curve) on the tuned AdaBoost model.
evaluation(model=grid.best_estimator_)

In [21]:
# Keep the tuned estimator under a short name and score it on the test split.
model = grid.best_estimator_

print(
    "Meilleur score TEST : ",
    model.score(X_test, y_test),
    " avec les paramètres : ",
    model,
)