Import des librairies

In [1]:
import pandas as pd
import os
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import make_scorer, balanced_accuracy_score

Fonction de lecture du fichier CSV

In [2]:
def data_read(file_name, data_dir='../data'):
    """Load a CSV file and split it into a feature table and a target column.

    Parameters
    ----------
    file_name : str
        Name of the CSV file, resolved relative to ``data_dir``.
    data_dir : str, optional
        Directory containing the file. Defaults to ``'../data'``, the
        location that used to be hard-coded.

    Returns
    -------
    X : pandas.DataFrame
        Every column of the file except the last one.
    Y : pandas.Series
        The last column of the file.
    """
    path = os.path.join(data_dir, file_name)
    df = pd.read_csv(path, delimiter=',')
    # By construction the last column is the target, everything before it
    # is treated as a feature.
    X = df.iloc[:, :-1]
    Y = df.iloc[:, -1]
    return X, Y

Fonction de création des ensembles d'entraînement et de test

In [3]:
def data_process(X, Y, balance=None, test_size=0.3, random_state=1):
    """Split the data into train/test sets and optionally rebalance the train set.

    The split is stratified on ``Y``. Resampling is applied to the training
    portion only; the test portion is returned exactly as produced by the
    split.

    Parameters
    ----------
    X : features, passed to ``train_test_split``.
    Y : target labels, also used for stratification.
    balance : {None, "smote", "tomek"}, optional
        ``"smote"`` over-samples the training set with SMOTE, ``"tomek"``
        under-samples it with Tomek links, ``None`` leaves it untouched.
        Any other value triggers a warning and is treated like ``None``.
    test_size : float, optional
        Fraction of the data held out for testing (default 0.3).
    random_state : int, optional
        Seed used for the split and for SMOTE (default 1).

    Returns
    -------
    X_train, X_test, y_train, y_test
    """
    # The split is the same whatever the balancing strategy, so do it once.
    X_train, X_test, y_train, y_test = train_test_split(
        X, Y, test_size=test_size, random_state=random_state, stratify=Y
    )

    if balance == "smote":
        # imblearn is imported lazily so it is only required when a
        # resampling strategy is actually requested.
        from imblearn.over_sampling import SMOTE
        sampler = SMOTE(sampling_strategy='auto', random_state=random_state)
        X_train, y_train = sampler.fit_resample(X_train, y_train)
    elif balance == "tomek":
        from imblearn.under_sampling import TomekLinks
        sampler = TomekLinks(sampling_strategy='auto')
        X_train, y_train = sampler.fit_resample(X_train, y_train)
    elif balance is not None:
        # Previously an unknown value (e.g. a typo such as "SMOTE") silently
        # produced unbalanced data; keep that fallback but make it visible.
        import warnings
        warnings.warn(
            "Unknown balance strategy %r; no resampling applied." % (balance,)
        )

    return X_train, X_test, y_train, y_test

Fonction de recherche du meilleur modèle par grid search (algorithme, hyperparamètres et méthode de rééquilibrage)

In [4]:
def grid_search_best_model(X, Y, param_grids, balances=None, cv=3):
    """Select the best (balancing strategy, estimator, hyper-parameters) combination.

    For every balancing strategy the data is split with ``data_process`` and a
    ``GridSearchCV`` is run for every entry of ``param_grids`` using balanced
    accuracy as the score. The candidate with the highest cross-validated
    score is then evaluated on the held-out test set.

    Parameters
    ----------
    X, Y : features and target, forwarded to ``data_process``.
    param_grids : iterable of dict
        Each dict has a ``'model'`` key (an estimator) and a ``'param_grid'``
        key (the grid handed to ``GridSearchCV``).
    balances : iterable or None, optional
        Balancing strategies forwarded to ``data_process``. ``None`` means
        "no balancing only" (previously this default crashed the loop).
    cv : int, optional
        Number of cross-validation folds (default 3).

    Returns
    -------
    model : the selected estimator, fitted on the training set of the
        balancing strategy it was selected with.
    test_score : float
        Balanced accuracy of that estimator on the test set.

    Raises
    ------
    ValueError
        If no candidate could be evaluated (empty ``param_grids`` or
        ``balances``, or no valid score).
    """
    if balances is None:
        balances = [None]

    # Use balanced accuracy as the model-selection score.
    scorer = make_scorer(balanced_accuracy_score)

    best_model = None
    # -inf rather than 0 so that a candidate scoring exactly 0 is still kept.
    best_score = float("-inf")
    best_test_set = None

    for balance in balances:
        # Build the train/test sets for this balancing strategy.
        # NOTE(review): resampling happens before cross-validation, so the CV
        # folds contain resampled data; scores obtained under different
        # strategies may not be strictly comparable - confirm this is intended.
        X_train, X_test, Y_train, Y_test = data_process(X, Y, balance)

        # Run one grid search per candidate algorithm.
        for param_grid_info in param_grids:
            grid_search = GridSearchCV(
                param_grid_info['model'],
                param_grid=param_grid_info['param_grid'],
                scoring=scorer,
                cv=cv,
            )
            grid_search.fit(X_train, Y_train)

            if grid_search.best_score_ > best_score:
                # With the default refit=True, best_estimator_ is already
                # refitted on the full training set of *this* strategy, so no
                # extra fit is needed later (the former final fit reused the
                # training set of the last strategy in the loop instead).
                best_model = grid_search.best_estimator_
                best_score = grid_search.best_score_
                best_test_set = (X_test, Y_test)

    if best_model is None:
        raise ValueError(
            "No model could be selected: check that param_grids and balances "
            "are not empty."
        )

    # Evaluate the selected model on the held-out test set.
    X_test, Y_test = best_test_set
    test_score = balanced_accuracy_score(Y_test, best_model.predict(X_test))

    return best_model, test_score


In [None]:
# Hyper-parameter grid explored for each candidate estimator.
# The tree ensembles are seeded so that re-running the cell selects the same
# model and reports the same score.
param_grids = [
    {'model': SVC(),
     'param_grid': {'C': [0.1, 1, 10], 'kernel': ['linear', 'rbf']}},
    {'model': RandomForestClassifier(random_state=1),
     'param_grid': {'n_estimators': [50, 100, 200], 'max_depth': [None, 10, 20]}},
    {'model': GradientBoostingClassifier(random_state=1),
     'param_grid': {'n_estimators': [50, 100, 200], 'learning_rate': [0.01, 0.1, 0.2]}},
]
# Class-balancing strategies to compare (None keeps the original distribution).
datatype = [None, 'smote', 'tomek']
X, Y = data_read("ref_data.csv")
model, score = grid_search_best_model(X, Y, param_grids, datatype)
