In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from sklearn.linear_model import SGDClassifier

# 1. SimpleImputer

In [None]:
from sklearn.impute import SimpleImputer

In [None]:
# Toy training matrix with two feature columns; the last row is missing
# its first value (np.nan).
X = np.array([
    [10, 3],
    [0, 4],
    [5, 3],
    [np.nan, 3],
])

In [None]:
# Replace np.nan entries using the 'mean' strategy; the statistics are
# learned from X during fit, and the transformed array is the cell output.
imputer = SimpleImputer(strategy='mean', missing_values=np.nan)
imputer.fit_transform(X)

In [None]:
# Toy test matrix; its last row has both values missing (np.nan).
X_test = np.array([
    [12, 5],
    [40, 2],
    [5, 5],
    [np.nan, np.nan],
])

In [None]:
# Apply the already-fitted imputer to X_test (transform only, no refit).
imputer.transform(X_test)

L'imputer réutilise la moyenne calculée sur le train set (ici 5 pour la première colonne et 3,25 pour la seconde), car il ne faut pas utiliser d'informations provenant du test set pour entraîner le modèle.

# 2. KNNImputer

In [None]:
from sklearn.impute import KNNImputer

In [None]:
# Toy matrix for KNN imputation; only the first value of the last row
# is missing (np.nan).
X = np.array([
    [1, 100],
    [2, 30],
    [3, 15],
    [np.nan, 20],
])

In [None]:
# KNN-based imputer configured with a single neighbor (n_neighbors=1);
# fit on X and display the imputed array.
imputer = KNNImputer(n_neighbors=1)
imputer.fit_transform(X)

# 3. MissingIndicator

In [None]:
from sklearn.impute import MissingIndicator
from sklearn.pipeline import make_union

In [None]:
# Toy matrix for the MissingIndicator demo; the last row is entirely
# missing (np.nan in both columns).
X = np.array([
    [1, 100],
    [2, 30],
    [3, 15],
    [np.nan, np.nan],
])

In [None]:
# Fit a default MissingIndicator on X and display the indicator matrix it returns.
MissingIndicator().fit_transform(X)

Ce manque d'information est une information en soi : on pourrait avoir envie de visualiser, dans une nouvelle colonne, les individus pour lesquels il manque telle ou telle information.

In [None]:
# Union of two transformers applied to the same input:
#   - a SimpleImputer that fills missing values with the constant -99
#   - a MissingIndicator
pipeline = make_union(
    SimpleImputer(fill_value=-99, strategy='constant'),
    MissingIndicator(),
)

In [None]:
# Fit the union on X and display its transformed output.
pipeline.fit_transform(X)

# 4. Application

Un avantage de cette solution par rapport à des fonctions pandas est qu'on va pouvoir utiliser GridSearchCV pour optimiser les paramètres de l'Imputer.

In [None]:
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import GridSearchCV, train_test_split
import seaborn as sns

In [None]:
# Load the 'titanic' dataset through seaborn and preview its first rows.
titanic = sns.load_dataset('titanic')
titanic.head()

In [None]:
# Target is the 'survived' column; features are the 'pclass' and 'age' columns.
y = titanic['survived']
X = titanic[['pclass', 'age']]

In [None]:
# Hold out a test split. random_state is pinned so that restarting the kernel
# and re-running the notebook yields the same train/test partition, which
# keeps the downstream grid-search results comparable between runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

In [None]:
# Pipeline: KNN imputation of missing values, then a linear classifier
# trained with SGD. SGDClassifier's training is randomized, so its
# random_state is fixed to make the fitted model (and the grid-search
# ranking built on it) reproducible across runs.
model = make_pipeline(KNNImputer(), SGDClassifier(random_state=0))

In [None]:
# Candidate neighbor counts (1 through 4) for the pipeline's KNNImputer step.
params = {'knnimputer__n_neighbors': list(range(1, 5))}

In [None]:
# Grid search over `params` for `model`, scored with 5-fold cross-validation.
grid = GridSearchCV(
    model,
    param_grid=params,
    cv=5,
)

In [None]:
# Run the grid search on the training split only (X_train / y_train).
grid.fit(X_train, y_train)

In [None]:
# Display the parameter setting selected by the fitted grid search.
grid.best_params_

In [None]:
# Build a standalone imputer with the neighbor count selected by the grid
# search instead of a hard-coded value, so this cell stays consistent with
# grid.best_params_ whenever the search outcome changes.
best_n_neighbors = grid.best_params_['knnimputer__n_neighbors']
imputer = KNNImputer(n_neighbors=best_n_neighbors)
# The imputed array is only displayed; it is not assigned to a variable.
imputer.fit_transform(X_train)

In [None]:
# Display the X_train variable as produced by train_test_split; the
# fit_transform call in the previous cell did not assign its result, so this
# is not the imputed array.
X_train