In [None]:
import pandas as pd
from sklearn.model_selection import cross_validate, cross_val_score, cross_val_predict
import joblib
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix, make_scorer

param_grids = {
    'Random Forest': {'n_estimators': [100, 200], 'max_depth': [5, 10]},
    'Neural Network': {'hidden_layer_sizes': [(100,), (50, 50)], 'activation': ['relu', 'tanh']},
    'XGBoost': {'max_depth': [3, 5, 7], 'learning_rate': [0.1, 0.01], 'n_estimators': [100, 200]},
}
#Xgboost
#random forest
#Reseau de neurone
#Linear regression

# Validation croisée avec plusieurs métriques
scoring = {
    'accuracy': make_scorer(accuracy_score),
    'precision': make_scorer(precision_score),
    'recall': make_scorer(recall_score),
    'f1': make_scorer(f1_score),
    'roc_auc': make_scorer(roc_auc_score)
}

numerical_col = ['Temperature[C]','Humidity[%]','TVOC[ppb]','eCO2[ppm]','Raw H2','Raw Ethanol','Pressure[hPa]','PM1.0','PM2.5','NC0.5','NC1.0','NC2.5']
data = pd.read_csv('../data/processed/clean_dataset_.csv')
data_inc = data[data['Fire Alarm']==1]
data_no_inc = data[data['Fire Alarm']== 0]
mean_values_inc = data_inc[numerical_col].mean()
mean_values_no_inc = data_no_inc[numerical_col].mean()
print("valeur moyenne quand il y a incendie \n", mean_values_inc, "\n")
print("valeur moyenne quand il y a pas incendie \n", mean_values_no_inc)

components = pd.read_csv('../data/processed/principalComponents_.csv')

X = components #composantes principales obtenu après l'ACP
y = data['Fire Alarm'] #variable cible

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


#fonction qui permet d'évaluer chaque modèle
def evaluer_modele(y_test, y_pred, y_pred_proba):
    """
    Calcule et affiche les principales métriques d'évaluation d'un modèle de classification.
    Args:
        y_test (array-like): Les vraies étiquettes.
        y_pred (array-like): Les étiquettes prédites par le modèle.
        y_pred_proba (array-like): Les probabilités prédites pour la classe positive.
    """

    # Calcul des métriques
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    roc_auc = roc_auc_score(y_test, y_pred_proba)
    conf_matrix = confusion_matrix(y_test, y_pred)

    # Création du DataFrame pour afficher les résultats
    data = {'Métrique': ['Accuracy', 'Precision', 'Recall', 'F1-score', 'AUC', 'Matrice de confusion'],
            'Valeur': [accuracy, precision, recall, f1, roc_auc, conf_matrix]}
    df = pd.DataFrame(data)

    print(df)




valeur moyenne quand il y a incendie 
 Temperature[C]       14.901369
Humidity[%]          51.367693
TVOC[ppb]           810.682730
eCO2[ppm]           405.007016
Raw H2            13025.023771
Raw Ethanol       19675.249566
Pressure[hPa]       939.071326
PM1.0                 1.593498
PM2.5                 1.655554
NC0.5                10.967304
NC1.0                 1.710226
NC2.5                 0.038628
dtype: float64 

valeur moyenne quand il y a pas incendie 
 Temperature[C]       18.200738
Humidity[%]          48.546970
TVOC[ppb]            76.003601
eCO2[ppm]           405.579496
Raw H2            12945.476028
Raw Ethanol       20269.400985
Pressure[hPa]       938.676613
PM1.0                 1.193409
PM2.5                 1.241376
NC0.5                 8.210595
NC1.0                 1.282602
NC2.5                 0.030301
dtype: float64


In [None]:
from sklearn.model_selection import GridSearchCV

param_grid={
    'n_estimators': [10, 30, 50, 100],
    'max_depth': [None, 10, 20]
}

grid = GridSearchCV(RandomForestClassifier(random_state=42), param_grid, cv=5)
grid.fit(X_train, y_train)
print(f"meilleur paramètres: ", grid.best_params_)
print(f"meilleur score: ", grid.best_score_)

#modèle le plus performants

# Créer un modèle de forêt aléatoire
model1 = grid.best_estimator_
# Entraîner le modèle sur toutes les données (après avoir ajusté les hyperparamètres si nécessaire)
model1.fit(X_train, y_train)
#validation croisée avec multiscoring
scores = cross_validate(model1, X_train, y_train, cv=10, scoring=scoring)
# Afficher les scores moyens
print("----------------------------------Score moyen d'entrainement---------------------------------------")
print("Accuracy moyenne:", scores['test_accuracy'].mean())
print("Precision moyenne:", scores['test_precision'].mean())
print("Recall moyenne:", scores['test_recall'].mean())
print("F1-score moyen:", scores['test_f1'].mean())
print("AUC moyenne:", scores['test_roc_auc'].mean())

y_pred1 = model1.predict(X_test)

scores = cross_validate(model1, X_test, y_test, cv=10, scoring=scoring)
# Afficher les scores moyens
print("----------------------------------Score moyen après prédictions---------------------------------------")
print("Accuracy moyenne:", scores['test_accuracy'].mean())
print("Precision moyenne:", scores['test_precision'].mean())
print("Recall moyenne:", scores['test_recall'].mean())
print("F1-score moyen:", scores['test_f1'].mean())
print("AUC moyenne:", scores['test_roc_auc'].mean())
mat_conf = confusion_matrix(y_test, y_pred1)
print("\n", mat_conf)
joblib.dump(model1, '../models/randomForest.joblib')

meilleur paramètres:  {'max_depth': None, 'n_estimators': 100}
meilleur score:  0.9694358220883116
----------------------------------Score moyen d'entrainement---------------------------------------
Accuracy moyenne: 0.9698739964398483
Precision moyenne: 0.9763835407588971
Recall moyenne: 0.9828560413404889
F1-score moyen: 0.979607206692552
AUC moyenne: 0.958255153717866
----------------------------------Score moyen après prédictions---------------------------------------
Accuracy moyenne: 0.9630510882964609
Precision moyenne: 0.9712295473891164
Recall moyenne: 0.978654648347481
F1-score moyen: 0.9749151763403414
AUC moyenne: 0.9493835736201042

 [[1992  136]
 [ 108 5748]]


['../models/randomForest.joblib']

In [5]:
#Réseau de neurone




In [None]:
#XGBoost
param_grid = {
    'max_depth': [3, 5, 7], 
    'learning_rate': [0.1, 0.01], 
    'n_estimators': [100, 200]
}


grid2 = GridSearchCV(XGBClassifier(), param_grid, cv=10)
grid2.fit(X_train, y_train)

print(f"meilleur paramètres: ", grid.best_params_)
print(f"meilleur score: ", grid.best_score_)



# Créer un modèle XGBoost
model2 = grid2.best_estimator_
scores2 = cross_validate(model2,X_train, y_train, cv=10, scoring=scoring)


# Entraîner le modèle
model2.fit(X_train, y_train)
# Afficher les scores moyens d'entrainement
print("------------- Scores moyen d'entraînement -------------- \n")
print("Accuracy moyenne:", scores2['test_accuracy'].mean())
print("Precision moyenne:", scores2['test_precision'].mean())
print("Recall moyenne:", scores2['test_recall'].mean())
print("F1-score moyen:", scores2['test_f1'].mean())
print("AUC moyenne: ", scores2['test_roc_auc'].mean(),"\n")
# Faire des prédictions
y_pred2 = model2.predict(X_test)
score2_ = cross_validate(model2, X_test, y_test, cv=10, scoring=scoring)

#afficher les scores moyens après prédictions
print("--------------- Score moyen après prédictions -----------------")
print("Accuracy moyenne:", score2_['test_accuracy'].mean())
print("Precision moyenne:", score2_['test_precision'].mean())
print("Recall moyenne:", score2_['test_recall'].mean())
print("F1-score moyen:", score2_['test_f1'].mean())
print("AUC moyenne:", score2_['test_roc_auc'].mean())

#Matrice de confusion du modèle de xgboost
matr_conf = confusion_matrix(y_test, y_pred2)
print("\n", matr_conf)
joblib.dump(model2, '../models/xgboost.joblib')


meilleur paramètres:  {'max_depth': None, 'n_estimators': 100}
meilleur score:  0.9694358220883116
------------- Scores moyen avant d'entraînement -------------- 

Accuracy moyenne: 0.9678698373731989
Precision moyenne: 0.9741961096439139
Recall moyenne: 0.9823880467343005
F1-score moyen: 0.9782717693602182
AUC moyenne:  0.9548766352357454 

--------------- Score moyen après prédictions -----------------
Accuracy moyenne: 0.959671236915819
Precision moyenne: 0.9694951996227061
Recall moyenne: 0.9757533327499198
F1-score moyen: 0.9725986985214721
AUC moyenne: 0.9455810688906834

 [[1982  146]
 [  90 5766]]


['../models/xgboost.joblib']

In [5]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split, cross_validate
from sklearn.metrics import mean_squared_error, r2_score

# Créer un modèle de régression linéaire
model3 = LinearRegression()

# Validation croisée
scores3 = cross_validate(model3, X_train, y_train, cv=10, scoring={'mse': 'neg_mean_squared_error', 'r2': 'r2'})

print('Erreur quadratique moyenne (MSE):', -scores3['test_mse'].mean())
print('Coefficient de détermination (R²):', scores3['test_r2'].mean())

# Entraîner le modèle
model3.fit(X_train, y_train)

# Faire des prédictions
y_pred3 = model3.predict(X_test)

# Évaluer le modèle
mse = mean_squared_error(y_test, y_pred3)
r2 = r2_score(y_test, y_pred3)
print("Mean Squared Error:", mse)
print("R-squared:", r2)

joblib.dump(model3, '../models/regressionLineaire.joblib')

Erreur quadratique moyenne (MSE): 0.08111904845057022
Coefficient de détermination (R²): 0.5822101752033528
Mean Squared Error: 0.08033216155873989
R-squared: 0.5890794905236905


['../models/regressionLineaire.joblib']