## TD 3

### GRID et RANDOM SEARCH CV

In [47]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Chargement des données
horses = pd.read_csv('data/horse_clean.csv')

# Séparation des données en train et test
X_train, X_test, y_train, y_test = train_test_split(
    horses.drop('surgical_lesion_yes', axis=1),
    horses['surgical_lesion_yes'],
    test_size=0.2, random_state=42
    )

En utilisant la fonction `GridSearchCV` de `sklearn.model_selection`, déterminer les meilleurs hyperparamètres pour les algorithmes suivants :

DecisionTreeClassifier,
RandomForestClassifier,
GradientBoostingClassifier


In [39]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

parameter_grid = {
    'max_depth': (5, 15, 30, 50),
    'n_estimators': (100, 200, 300, 400, 500),
}

gridSearch = GridSearchCV(estimator=RandomForestClassifier(),
                param_grid=parameter_grid,
                cv=5,
                n_jobs=-1,
                verbose=1)

In [40]:
#visualisation des résultats
gridSearch.fit(X_train, y_train) #

print(gridSearch.best_score_) # Meilleur score
print(gridSearch.best_params_) # Meilleurs paramètres

Fitting 5 folds for each of 20 candidates, totalling 100 fits
0.7995567375886525
{'max_depth': 5, 'n_estimators': 100}


In [41]:
pd.DataFrame(gridSearch.cv_results_
             ).sort_values(by='rank_test_score'
                           ).head(10) # Résultats de la recherche

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_max_depth,param_n_estimators,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.30125,0.077987,0.028876,0.01092,5,100,"{'max_depth': 5, 'n_estimators': 100}",0.791667,0.729167,0.8125,0.770833,0.893617,0.799557,0.05451,1
4,1.575265,0.434926,0.120599,0.053778,5,500,"{'max_depth': 5, 'n_estimators': 500}",0.8125,0.708333,0.8125,0.729167,0.893617,0.791223,0.066501,2
3,1.189761,0.256361,0.084757,0.023096,5,400,"{'max_depth': 5, 'n_estimators': 400}",0.8125,0.6875,0.8125,0.729167,0.87234,0.782801,0.06594,3
2,0.840214,0.20171,0.069009,0.014211,5,300,"{'max_depth': 5, 'n_estimators': 300}",0.8125,0.708333,0.8125,0.729167,0.851064,0.782713,0.05449,4
19,3.719037,0.781873,0.166139,0.020127,50,500,"{'max_depth': 50, 'n_estimators': 500}",0.8125,0.708333,0.8125,0.770833,0.808511,0.782535,0.040282,5
14,3.30814,1.047893,0.196605,0.061524,30,500,"{'max_depth': 30, 'n_estimators': 500}",0.8125,0.6875,0.8125,0.729167,0.851064,0.778546,0.060485,6
1,0.577597,0.150176,0.042021,0.011964,5,200,"{'max_depth': 5, 'n_estimators': 200}",0.791667,0.6875,0.8125,0.75,0.851064,0.778546,0.056014,6
16,1.488379,0.414894,0.108971,0.020404,50,200,"{'max_depth': 50, 'n_estimators': 200}",0.791667,0.708333,0.8125,0.75,0.829787,0.778457,0.044035,8
5,0.359668,0.056955,0.030913,0.014846,15,100,"{'max_depth': 15, 'n_estimators': 100}",0.791667,0.708333,0.8125,0.791667,0.787234,0.77828,0.036061,9
11,1.211508,0.25699,0.071088,0.024032,30,200,"{'max_depth': 30, 'n_estimators': 200}",0.8125,0.708333,0.8125,0.791667,0.765957,0.778191,0.038899,10


### RandomizedSearchCV

Reprendre la question précédente en utilisant la fonction `RandomizedSearchCV` de `sklearn.model_selection`.

Comparer les résultats et le temps d'exécution des deux méthodes.

In [53]:
from sklearn.model_selection import RandomizedSearchCV

parameter_grid = {
    'max_depth': (1, 5, 15, 30),
    'min_samples_split': (2, 4, 8),
    'min_samples_leaf': (1,2,3,5),
    'max_features': (None, 'sqrt', 'log2'),
    'n_estimators': (200, 300, 500),
}

gridRandom = RandomizedSearchCV(estimator=RandomForestClassifier(),
                param_distributions=parameter_grid,
                n_iter=72,
                cv=5,
                n_jobs=-1,
                verbose=1)

In [54]:
#visualisation des résultats
gridRandom.fit(X_train, y_train) #

print(gridRandom.best_score_) # Meilleur score
print(gridRandom.best_params_) # Meilleurs paramètres

Fitting 5 folds for each of 72 candidates, totalling 360 fits
0.7911347517730497
{'n_estimators': 500, 'min_samples_split': 4, 'min_samples_leaf': 1, 'max_features': 'log2', 'max_depth': 5}
RandomForestClassifier(max_depth=5, max_features='log2', min_samples_split=4,
                       n_estimators=500)
{'mean_fit_time': array([2.05676737, 3.00581741, 3.57726803, 2.25060883, 2.17444639,
       1.15558686, 1.20060291, 1.77311616, 1.02036967, 2.62432055,
       1.15729222, 1.22279119, 1.25638804, 0.90887704, 1.93578486,
       0.88248158, 0.96939626, 1.68295894, 1.92712278, 0.68778472,
       2.43667846, 0.78495345, 0.66396189, 1.01185751, 1.18545108,
       2.6329936 , 1.7437746 , 0.68036213, 1.00166745, 3.01740618,
       1.78249688, 1.33847175, 1.16724172, 2.93590546, 1.51045699,
       0.95521603, 2.55437074, 0.72223258, 1.71553726, 0.61625214,
       0.89523096, 1.81478558, 1.04721594, 1.00399861, 0.64071875,
       1.86188974, 0.83673825, 2.30283656, 0.96395454, 0.9112236 ,
   

Visualisez les diférents les scores obtenus pour les différents paramètres testés.

### Pipelines

In [85]:
horses = pd.read_csv('data/horse.csv')

# suppression des colonnes avec trop de valeurs manquantes & inutiles

horses.drop(['surgery', 'hospital_number', 'outcome', 'lesion_1',
             'lesion_2', 'lesion_3', 'cp_data'],
               axis=1, inplace=True
               )


horses.dropna(thresh=0.6*len(horses), axis=1, inplace=True)
# encodage des variables catégorielles

# suppression des lignes avec trop de valeurs manquantes
horsesf = pd.get_dummies(horses, drop_first=True)

# Séparation des données en train et test
X_train, X_test, y_train, y_test = train_test_split(
    horsesf.drop('surgical_lesion_yes', axis=1),
    horsesf['surgical_lesion_yes'],
    test_size=0.2, random_state=42)



A partir du dataframe, on va créer un pipeline qui va permettre de faire les transformations suivantes :
- Remplacer les valeurs manquantes à l'aide d'un [SimpleImputer](https://scikit-learn.org/stable/modules/generated/sklearn.impute.SimpleImputer.html)
- Entrainer un [DecisionTreeClassifier](https://scikit-learn.org/stable/modules/generated/sklearn.tree.DecisionTreeClassifier.html)

Assemblez une pipeline à l'aide des fonctions çi-dessous.

In [83]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

In [86]:
# On crée notre pipeline
pipe = Pipeline(steps=[
    ('imputer', SimpleImputer()),
    ('model', DecisionTreeClassifier())
])

On importe notre fichier initial, et on créee nos échantillons d'entrainement et de test

Entrainons notre pipeline sur l'échantillon d'entrainement

In [88]:
pipe.fit(X_train, y_train)

Prédire les valeurs de l'échantillon de test, et calculer la précision de notre modèle.

In [89]:
pipe.score(X_test, y_test)

0.6333333333333333

#### Pipeline avec GridSearchCV

On va maintenant utiliser un GridSearchCV pour trouver les meilleurs paramètres pour notre modèle.

On va utiliser les paramètres suivants :
Pour le SimpleImputer :
- strategy : ['mean', 'median', 'most_frequent']

Pour le DecisionTreeClassifier :
- max_depth : [3, 10, 20,  30]

In [90]:
from sklearn.model_selection import GridSearchCV

parameter_grid = {
    'imputer__strategy': ('mean', 'median', 'most_frequent'),
    'model__max_depth': (3, 10, 20, 30),
}

gridSearch = GridSearchCV(
    estimator=pipe,
    param_grid=parameter_grid,
    cv=5,
    n_jobs=-1,
    verbose=1
    )

gridSearch.fit(X_train, y_train)

print(gridSearch.best_score_) # Meilleur score
print(gridSearch.best_params_) # Meilleurs paramètres

Fitting 5 folds for each of 12 candidates, totalling 60 fits
0.7073581560283688
{'imputer__strategy': 'most_frequent', 'model__max_depth': 3}


### Encore plus de pipelines

On va maintenant essayer de créer une pipeline pour la totalité des transformations que l'on a faites jusqu'à présent sur nos données.

On va donc créer une pipeline qui va :
- Retirer les colonnes inutiles
- (optionel) Retirer les colonnes avec trop de valeurs manquantes
- Imputer les valeurs manquantes
- Transformer les variables catégorielles en variables numériques
    - Avec une regle pour les variables nominales
    - Avec une regle pour les variables ordinales
- Entrainer un DecisionTreeClassifier


In [164]:
import pandas as pd

# Pour faciliter la suite, on peut préciser le type de chaque colonne
horses = pd.read_csv('data/horse.csv',
            dtype= {'surgery': 'category',
                'age': 'category',
                'hospital_number': 'int64',
                'rectal_temp': 'float64',
                'pulse': 'float64',
                'respiratory_rate': 'float64',
                'temp_of_extremities': 'category',
                'peripheral_pulse': 'category',
                'mucous_membrane': 'category',
                'capillary_refill_time': 'category',
                'pain': 'category',
                'peristalsis': 'category',
                'abdominal_distention': 'category',
                'nasogastric_tube': 'category',
                'nasogastric_reflux': 'category',
                'nasogastric_reflux_ph': 'float64',
                'rectal_exam_feces': 'category',
                'abdomen': 'category',
                'packed_cell_volume': 'float64',
                'total_protein': 'float64',
                'abdomo_appearance': 'category',
                'abdomo_protein': 'float64',
                'outcome': 'category',
                'surgical_lesion': 'category',
                'lesion_1': 'category',
                'lesion_2': 'category',
                'lesion_3': 'category',
                'cp_data': 'category'
                }
            )

# echantillons
from sklearn.model_selection import train_test_split

# Séparation des données en train et test
X_train, X_test, y_train, y_test = train_test_split(
    horses.drop('surgical_lesion', axis=1),
    horses['surgical_lesion'],
    test_size=0.3,
    random_state=42
    )
print(X_train.shape)
print(X_test.shape)
y_train = pd.Series([1 if x == 'yes' else 0 for x in y_train])
y_test = pd.Series([1 if x == 'yes' else 0 for x in y_test])

(209, 27)
(90, 27)


In [185]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import FeatureUnion, Pipeline
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder

class DropColumns(TransformerMixin):
    def __init__(self, columnsToDrop=[]) -> None:
        self.columnsToDrop = columnsToDrop

    def fit(self, X, y=None):
        return self
    
    def transform(self, X, y=None):
        return X.drop(self.columnsToDrop, axis=1)
    
class DropNa(TransformerMixin):
    def __init__(self, threshold=0.6) -> None:
        self.threshold = threshold

    def fit(self, X, y=None):
        self.columns_to_drop = X.columns[X.isna().mean() > self.threshold]
        return self
    
    def transform(self, X, y=None):
        return X.drop(self.columns_to_drop, axis=1)

class Imputer_perso(TransformerMixin):
    def __init__(self) -> None:
        return None
    
    def fit(self, X, y=None):
        return self
    
    def transform(self, X, y=None):
        X = X.copy()
        for col in X.columns:
            if X[col].dtype in ['float64', 'int64']:
                # Remplacement des valeurs manquantes par la moyenne pour les variables numériques
                X[col] = X.loc[:,col].fillna(X[col].mean())
            else:
                # Remplacement des valeurs manquantes par le mode pour les variables catégorielles mode = valeur la plus fréquente
                X[col] = X.loc[:,col].fillna(X[col].mode()[0])
        return X
  
class EncodeOrdinal(TransformerMixin):
    def __init__(self, columnsToEncode : list, categories : list) -> None:
        self.columnsToEncode = columnsToEncode
        self.categories = categories

    def fit(self, X, y=None):
        return self
    
    def transform(self, X, y=None):
        # On crée une copie de X pour ne pas modifier le DataFrame original
        df = X.copy()
        dfToEncode = df[self.columnsToEncode]
        df.drop(self.columnsToEncode, axis=1, inplace=True)
        # On encode chaque colonne catégorielle avec un encodeur ordinal
        encoder = OrdinalEncoder(categories=self.categories)
        dfEncoded = encoder.fit_transform(dfToEncode)
        # On recrée un DataFrame avec les colonnes encodées
        dfEncoded = pd.DataFrame(dfEncoded, columns=self.columnsToEncode)
        # On retourne le DataFrame avec les colonnes encodées concaténé avec les colonnes ignorées
        return df.join(dfEncoded)
    
class EncodeOneHot(TransformerMixin):
    def __init__(self, columnsToEncode : str | list  = 'cat') -> None:
        self.columnsToEncode = columnsToEncode

    def fit(self, X, y=None):
        # si columnsToEncode est 'cat' on encode toutes les colonnes catégorielles
        if type(self.columnsToEncode) == str:
            if self.columnsToEncode == 'cat':
                self.columnsToEncode = X.select_dtypes(include='category').columns
            else:
                exception = f"columnsToEncode must be a list or 'cat', not {type(self.columnsToEncode)}"
                raise Exception(exception)
        return self
    
    def transform(self, X, y=None):
        # On crée une copie de X pour ne pas modifier le DataFrame original
        df = X.copy()
        dfToEncode = df[self.columnsToEncode]
        df.drop(self.columnsToEncode, axis=1, inplace=True)
        # On encode nos colonnes catégorielles avec un encodeur one-hot
        # sparse=False pour avoir un tableau NumPy et non une matrice creuse
        # set_output pour avoir un DataFrame en sortie
        OnehotEncoder = OneHotEncoder(drop='first', sparse_output=False).set_output(transform="pandas")
        dfToEncode = OnehotEncoder.fit_transform(dfToEncode)
        # On retourne le DataFrame avec les colonnes encodées concaténé avec les colonnes ignorées
        return df.join(dfToEncode)

In [182]:
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeClassifier

columns_to_drop = ['surgery', 'hospital_number', 'outcome',
                   'lesion_1','lesion_2', 'lesion_3', 'cp_data']

tresh = 0.6

ordinalColumns = ['temp_of_extremities', 'pain']
categories = [['cold', 'cool', 'normal', 'warm'],
              ['alert', 'depressed', 'mild_pain', 'severe_pain', 'extreme_pain']]

pipeline = Pipeline([
    ('drop_columns', DropColumns(columnsToDrop = columns_to_drop)),
    ('dropNa', DropNa(threshold=tresh)),
    ('imputer', Imputer_perso()),
    ('encoderOrdinal', EncodeOrdinal(columnsToEncode=ordinalColumns, categories=categories)),
    ('encoderOneHot', EncodeOneHot(columnsToEncode='cat')),
    ('model', DecisionTreeClassifier())
])

In [183]:
pipeline.fit(X_train, y_train)

In [184]:
pipeline.score(X_test, y_test)

0.6777777777777778

avec des ColumnTransformer

In [200]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 209 entries, 224 to 102
Data columns (total 27 columns):
 #   Column                 Non-Null Count  Dtype   
---  ------                 --------------  -----   
 0   surgery                209 non-null    category
 1   age                    209 non-null    category
 2   hospital_number        209 non-null    int64   
 3   rectal_temp            163 non-null    float64 
 4   pulse                  193 non-null    float64 
 5   respiratory_rate       173 non-null    float64 
 6   temp_of_extremities    166 non-null    category
 7   peripheral_pulse       159 non-null    category
 8   mucous_membrane        172 non-null    category
 9   capillary_refill_time  188 non-null    category
 10  pain                   169 non-null    category
 11  peristalsis            176 non-null    category
 12  abdominal_distention   166 non-null    category
 13  nasogastric_tube       140 non-null    category
 14  nasogastric_reflux     136 non-null    c

In [218]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder

columns_to_drop = ['surgery', 'hospital_number', 'outcome',
                     'lesion_1','lesion_2', 'lesion_3', 'cp_data']
print()
tresh = 0.6

ordinalColumns = ['temp_of_extremities', 'pain']

nominalColumns = list(set(X_train.select_dtypes(
    include='category')) - set(ordinalColumns) - set(columns_to_drop))

categories = [['cold', 'cool', 'normal', 'warm'],
                ['alert', 'depressed', 'mild_pain', 'severe_pain', 'extreme_pain']]

pretraitement = Pipeline([
    ('drop_columns', DropColumns(columnsToDrop = columns_to_drop)),
    #('dropNa', DropNa(threshold=tresh)),
    ('imputer', Imputer_perso())
])

encodage = ColumnTransformer([
    ('encoderOrdinal', 
        OrdinalEncoder(categories=categories),
        ordinalColumns),
    ('encoderOneHot', 
        OneHotEncoder(drop='first', sparse=False),
        nominalColumns)
], remainder='passthrough'
) 

pipeline = Pipeline([
    ('pretraitement', pretraitement),
    ('Encodage', encodage),
    ('model', DecisionTreeClassifier())
])




In [219]:
pipeline

In [220]:
pipeline.fit(X_train, y_train)



In [221]:
pipeline.score(X_test, y_test)

0.6777777777777778