# **Complete preprocessing pipeline**

In [1]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import (
    FunctionTransformer,
    OneHotEncoder,
    OrdinalEncoder,
    PolynomialFeatures,
    MinMaxScaler,
    StandardScaler,
)
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.model_selection import train_test_split

import pandas as pd
import seaborn as sns

In [2]:
# Load the example dataset named "mpg" through seaborn and preview its first rows
df = sns.load_dataset("mpg")
df.head()

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model_year,origin,name
0,18.0,8,307.0,130.0,3504,12.0,70,usa,chevrolet chevelle malibu
1,15.0,8,350.0,165.0,3693,11.5,70,usa,buick skylark 320
2,18.0,8,318.0,150.0,3436,11.0,70,usa,plymouth satellite
3,16.0,8,304.0,150.0,3433,12.0,70,usa,amc rebel sst
4,17.0,8,302.0,140.0,3449,10.5,70,usa,ford torino


In [3]:
# Column dtypes and non-null counts
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 398 entries, 0 to 397
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   mpg           398 non-null    float64
 1   cylinders     398 non-null    int64  
 2   displacement  398 non-null    float64
 3   horsepower    392 non-null    float64
 4   weight        398 non-null    int64  
 5   acceleration  398 non-null    float64
 6   model_year    398 non-null    int64  
 7   origin        398 non-null    object 
 8   name          398 non-null    object 
dtypes: float64(4), int64(3), object(2)
memory usage: 28.1+ KB


#### **1. Elimination des na**

In [4]:
# Number of missing values in each column
df.isna().sum()

mpg             0
cylinders       0
displacement    0
horsepower      6
weight          0
acceleration    0
model_year      0
origin          0
name            0
dtype: int64

- La colonne `horsepower` est la seule à contenir des valeurs manquantes, elles sont au nombre de 6.

In [5]:
# Index labels of the rows that contain at least one missing value
na_index = df.loc[df.isna().any(axis="columns")].index
na_index

Index([32, 126, 330, 336, 354, 374], dtype='int64')

In [6]:
# Drop every row that has at least one missing value.
# Rebinding (instead of inplace=True) avoids mutating the loaded frame in place.
df = df.dropna()

#### **2. Séparation X, y (features/target)**

In [7]:
# Target kept as a one-column frame; features are every other column
y = df[["mpg"]]
X = df.drop(columns=["mpg"])

X.shape, y.shape

((392, 8), (392, 1))

#### **3. Train test split**

In [8]:
# Split, then give every part a fresh default index so the manual column-wise
# concatenations done later line up row by row
X_train, X_test, y_train, y_test = (
    part.reset_index(drop=True)
    for part in train_test_split(X, y, random_state=42)
)

#### **4. Extraction du constructeur automobile de la variable `name`**

- Après analyse des données, il s'avère que le 1er mot de la colonne `name` est le constructeur.
- Parfois ce 1er mot est mal orthographié (Maxda au lieu de Mazda), abrégé (VW pour Volkswagen), ou renvoie à un autre constructeur (Capri est une marque de Mercury).

In [9]:
# Candidate makers: first token of each name, splitting on spaces and hyphens
(
    df["name"]
    .str.split(r"[ -]")
    .str.get(0)
    .sort_values()
    .unique()
)

array(['amc', 'audi', 'bmw', 'buick', 'cadillac', 'capri', 'chevroelt',
       'chevrolet', 'chevy', 'chrysler', 'datsun', 'dodge', 'fiat',
       'ford', 'hi', 'honda', 'maxda', 'mazda', 'mercedes', 'mercury',
       'nissan', 'oldsmobile', 'opel', 'peugeot', 'plymouth', 'pontiac',
       'renault', 'saab', 'subaru', 'toyota', 'toyouta', 'triumph',
       'vokswagen', 'volkswagen', 'volvo', 'vw'], dtype=object)

- `hi` : vraisemblablement International Harvester (hypothèse non vérifiée).

In [10]:
def GetConstructor(col):
    """Derive the car maker from a single-column DataFrame of model names.

    The first whitespace-separated word of each name is taken as the maker.
    Known misspellings, abbreviations and sub-brands are mapped to a canonical
    maker, and any value outside the list of known makers becomes "unknown".

    Parameters
    ----------
    col : pandas.DataFrame
        Frame with exactly one column of string model names (the "name"
        column in this notebook). The input is not modified.

    Returns
    -------
    pandas.DataFrame
        A single column named "constructor", with the same index as the input.

    Raises
    ------
    ValueError
        If the input is not two-dimensional with exactly one column.
    """
    # Check ndim first so that a 1-D input (e.g. a Series) gets the intended
    # ValueError rather than an IndexError from shape[1].
    if getattr(col, "ndim", None) != 2 or col.shape[1] != 1:
        raise ValueError("Input must have a single column (name)")

    # Positional access: the single column is used whatever its label is
    names = col.iloc[:, 0]

    # The first word of the name is the maker
    constructor = names.str.split().str[0]

    # Misspellings, abbreviations and sub-brands -> canonical maker
    constructor_mapping = {
        "capri": "mercury",
        "chevroelt": "chevrolet",
        "chevy": "chevrolet",
        "maxda": "mazda",
        "mercedes-benz": "mercedes",
        "toyouta": "toyota",
        "vokswagen": "volkswagen",
        "vw": "volkswagen",
    }
    constructor = constructor.replace(constructor_mapping)

    # Known makers
    constructors = [
        "amc",
        "audi",
        "bmw",
        "buick",
        "cadillac",
        "chevrolet",
        "chrysler",
        "datsun",
        "dodge",
        "fiat",
        "ford",
        "hi",
        "honda",
        "mazda",
        "mercedes",
        "mercury",
        "nissan",
        "oldsmobile",
        "opel",
        "peugeot",
        "plymouth",
        "pontiac",
        "renault",
        "saab",
        "subaru",
        "toyota",
        "triumph",
        "volkswagen",
        "volvo",
    ]

    # Anything that is not a known maker (including missing values) -> "unknown"
    constructor = constructor.where(constructor.isin(constructors), "unknown")

    return constructor.to_frame(name="constructor")


# The output column names have to be stated explicitly
def output_feature_names(transformer, input_features):
    """Return the output feature names: always the single name "constructor".

    Both arguments are accepted for the ``feature_names_out`` callable
    signature and are ignored.
    """
    feature_names = ["constructor"]
    return feature_names


# Wrap the function as a transformer so it can be used as a pipeline step.
# NOTE: this rebinds the name GetConstructor from the function to the transformer.
GetConstructor = FunctionTransformer(
    func=GetConstructor,
    feature_names_out=output_feature_names,
)

In [11]:
# Try the transformer on the training names
X_train_constructor = GetConstructor.transform(X_train[["name"]])

# Rebuild the training frame: "constructor" first, then every column except "name"
X_train = pd.concat(
    [X_train_constructor, X_train.drop(columns="name")],
    axis=1,
)

X_train.head()

Unnamed: 0,constructor,cylinders,displacement,horsepower,weight,acceleration,model_year,origin
0,toyota,4,134.0,95.0,2560,14.2,78,japan
1,amc,6,199.0,97.0,2774,15.5,70,usa
2,mercury,8,429.0,208.0,4633,11.0,72,usa
3,buick,6,231.0,110.0,3907,21.0,75,usa
4,plymouth,8,440.0,215.0,4312,8.5,70,usa


#### **5. Encodage des catégories**

- Après le remplacement du nom de modèle par le constructeur, les variables catégorielles sont :
  - `origin` : 3 catégories, nous utiliserons un OneHotEncoder,
  - `constructor` : 30 catégories, nous utiliserons un OrdinalEncoder.

In [12]:
# Known origins, in the order used for the one-hot columns
origins = ["europe", "japan", "usa"]

# One-hot encode "origin"; drop="first" removes the first listed category
origin_encoder = OneHotEncoder(
    handle_unknown="error",
    sparse_output=False,
    drop="first",
    categories=[origins],
)
origin_encoded = pd.DataFrame(
    data=origin_encoder.fit_transform(X_train[["origin"]]),
    columns=origin_encoder.get_feature_names_out(),
)
origin_encoded.head()

Unnamed: 0,origin_japan,origin_usa
0,1.0,0.0
1,0.0,1.0
2,0.0,1.0
3,0.0,1.0
4,0.0,1.0


In [13]:
# Known makers; their position in this list is the ordinal code.
# "unknown" comes last: it is the catch-all value, playing the role that
# handle_unknown="use_encoded_value" would otherwise play.
constructors = [
    "amc", "audi", "bmw", "buick", "cadillac",
    "chevrolet", "chrysler", "datsun", "dodge", "fiat",
    "ford", "hi", "honda", "mazda", "mercedes",
    "mercury", "nissan", "oldsmobile", "opel", "peugeot",
    "plymouth", "pontiac", "renault", "saab", "subaru",
    "toyota", "triumph", "volkswagen", "volvo", "unknown",
]

# Ordinal-encode "constructor" with that fixed category order
constructor_encoder = OrdinalEncoder(handle_unknown="error", categories=[constructors])
constructor_encoded = pd.DataFrame(
    data=constructor_encoder.fit_transform(X_train[["constructor"]]),
    columns=constructor_encoder.get_feature_names_out(),
)
constructor_encoded.head()

Unnamed: 0,constructor
0,25.0
1,0.0
2,15.0
3,3.0
4,20.0


In [14]:
# Swap the raw categorical columns for their encoded versions
X_train = pd.concat(
    [
        X_train.drop(columns=["origin", "constructor"]),
        origin_encoded,
        constructor_encoded,
    ],
    axis=1,
)
X_train.head()

Unnamed: 0,cylinders,displacement,horsepower,weight,acceleration,model_year,origin_japan,origin_usa,constructor
0,4,134.0,95.0,2560,14.2,78,1.0,0.0,25.0
1,6,199.0,97.0,2774,15.5,70,0.0,1.0,0.0
2,8,429.0,208.0,4633,11.0,72,0.0,1.0,15.0
3,6,231.0,110.0,3907,21.0,75,0.0,1.0,3.0
4,8,440.0,215.0,4312,8.5,70,0.0,1.0,20.0


#### **6. PolynomialFeatures de degré 2 sur les variables `horsepower`, `weight` et `acceleration`**

In [15]:
# Degree-2 expansion (no bias column) of the three numeric columns
polynomial_transformer = PolynomialFeatures(include_bias=False, degree=2)
polynomial_features = pd.DataFrame(
    data=polynomial_transformer.fit_transform(
        X_train[["horsepower", "weight", "acceleration"]]
    ),
    columns=polynomial_transformer.get_feature_names_out(),
)
polynomial_features.head()

Unnamed: 0,horsepower,weight,acceleration,horsepower^2,horsepower weight,horsepower acceleration,weight^2,weight acceleration,acceleration^2
0,95.0,2560.0,14.2,9025.0,243200.0,1349.0,6553600.0,36352.0,201.64
1,97.0,2774.0,15.5,9409.0,269078.0,1503.5,7695076.0,42997.0,240.25
2,208.0,4633.0,11.0,43264.0,963664.0,2288.0,21464689.0,50963.0,121.0
3,110.0,3907.0,21.0,12100.0,429770.0,2310.0,15264649.0,82047.0,441.0
4,215.0,4312.0,8.5,46225.0,927080.0,1827.5,18593344.0,36652.0,72.25


In [16]:
# Swap the three raw columns for their polynomial expansion
X_train = pd.concat(
    [
        X_train.drop(columns=["horsepower", "weight", "acceleration"]),
        polynomial_features,
    ],
    axis=1,
)
X_train.head()

Unnamed: 0,cylinders,displacement,model_year,origin_japan,origin_usa,constructor,horsepower,weight,acceleration,horsepower^2,horsepower weight,horsepower acceleration,weight^2,weight acceleration,acceleration^2
0,4,134.0,78,1.0,0.0,25.0,95.0,2560.0,14.2,9025.0,243200.0,1349.0,6553600.0,36352.0,201.64
1,6,199.0,70,0.0,1.0,0.0,97.0,2774.0,15.5,9409.0,269078.0,1503.5,7695076.0,42997.0,240.25
2,8,429.0,72,0.0,1.0,15.0,208.0,4633.0,11.0,43264.0,963664.0,2288.0,21464689.0,50963.0,121.0
3,6,231.0,75,0.0,1.0,3.0,110.0,3907.0,21.0,12100.0,429770.0,2310.0,15264649.0,82047.0,441.0
4,8,440.0,70,0.0,1.0,20.0,215.0,4312.0,8.5,46225.0,927080.0,1827.5,18593344.0,36652.0,72.25


#### **7. Normalisation des variables**

##### **7.1 `cylinders`, `model_year`, et `constructor` avec MinMaxScaler**

In [17]:
# Min-max scale the three discrete columns
min_max_scaler = MinMaxScaler()
min_max_scaled = pd.DataFrame(
    data=min_max_scaler.fit_transform(
        X_train[["cylinders", "model_year", "constructor"]]
    ),
    columns=min_max_scaler.get_feature_names_out(),
)
min_max_scaled.head()

Unnamed: 0,cylinders,model_year,constructor
0,0.2,0.666667,0.892857
1,0.6,0.0,0.0
2,1.0,0.166667,0.535714
3,0.6,0.416667,0.107143
4,1.0,0.0,0.714286


##### **7.2 Les autres variables (sauf `origin_japan` et `origin_usa`) avec StandardScaler**

Ces deux variables sont déjà binaires, donc à une échelle min max convenable.

In [18]:
# Standardize every column that is neither min-max scaled nor a binary origin flag
standard_scaler = StandardScaler()
standard_scaled = pd.DataFrame(
    data=standard_scaler.fit_transform(
        X_train.drop(
            columns=[
                "cylinders",
                "model_year",
                "constructor",
                "origin_japan",
                "origin_usa",
            ]
        )
    ),
    columns=standard_scaler.get_feature_names_out(),
)
standard_scaled.head()

Unnamed: 0,displacement,horsepower,weight,acceleration,horsepower^2,horsepower weight,horsepower acceleration,weight^2,weight acceleration,acceleration^2
0,-0.588476,-0.245826,-0.490076,-0.458303,-0.345994,-0.437353,-0.498041,-0.553376,-0.720352,-0.512419
1,0.033663,-0.193925,-0.235417,-0.002504,-0.306622,-0.318273,-0.103278,-0.341716,-0.173384,-0.091169
2,2.235078,2.68661,1.976787,-1.58027,3.164626,2.877935,1.901196,2.211536,0.48232,-1.392234
3,0.339946,0.143435,1.112849,1.925875,-0.030706,0.421168,1.957408,1.061884,3.040931,2.099093
4,2.340363,2.868265,1.594798,-2.456806,3.468226,2.70959,0.724573,1.679112,-0.695658,-1.924116


In [19]:
# Reassemble: unscaled binary origin flags, then min-max-scaled, then standardized columns
X_train = pd.concat(
    objs=[
        X_train.loc[:, ["origin_japan", "origin_usa"]],
        min_max_scaled,
        standard_scaled,
    ],
    axis="columns",
)

#### **8. Sélection des 10 variables les plus corrélées à notre target `mpg` à l'aide de SelectKBest**

In [20]:
# Keep the 10 features with the highest f_regression score
features_selector = SelectKBest(k=10, score_func=f_regression)

# The target is passed as a 1-D Series (first and only column of y_train)
selected_features = features_selector.fit_transform(X_train, y_train.iloc[:, 0])

X_train = pd.DataFrame(
    data=selected_features,
    columns=features_selector.get_feature_names_out(),
)
X_train.head()

Unnamed: 0,origin_usa,cylinders,model_year,displacement,horsepower,weight,horsepower^2,horsepower weight,horsepower acceleration,weight^2
0,0.0,0.2,0.666667,-0.588476,-0.245826,-0.490076,-0.345994,-0.437353,-0.498041,-0.553376
1,1.0,0.6,0.0,0.033663,-0.193925,-0.235417,-0.306622,-0.318273,-0.103278,-0.341716
2,1.0,1.0,0.166667,2.235078,2.68661,1.976787,3.164626,2.877935,1.901196,2.211536
3,1.0,0.6,0.416667,0.339946,0.143435,1.112849,-0.030706,0.421168,1.957408,1.061884
4,1.0,1.0,0.0,2.340363,2.868265,1.594798,3.468226,2.70959,0.724573,1.679112


In [21]:
# f_regression score and p-value of every candidate feature, with a flag for the
# ones SelectKBest kept, ordered from highest to lowest score
selection_report = pd.DataFrame(
    dict(
        column=features_selector.feature_names_in_,
        score=features_selector.scores_,
        p_value=features_selector.pvalues_,
        selected=features_selector.get_support(),
    )
)
selection_report = selection_report.sort_values(by="score", ascending=False)

selection_report

Unnamed: 0,column,score,p_value,selected
7,weight,667.593038,2.0333029999999998e-77,True
5,displacement,576.499649,4.391824e-71,True
12,weight^2,547.785819,6.000144000000001e-69,True
11,horsepower acceleration,501.036945,2.6108919999999997e-65,True
10,horsepower weight,496.223536,6.3618289999999995e-65,True
2,cylinders,466.749863,1.6786560000000002e-62,True
6,horsepower,462.760784,3.6300330000000003e-62,True
9,horsepower^2,310.62236,7.424995999999999e-48,True
3,model_year,155.719627,6.229907e-29,True
1,origin_usa,145.996907,1.5691270000000001e-27,True


- Le poids (`weight`), la cylindrée (`displacement`), le nombre de cylindres (`cylinders`) et la puissance (`horsepower`) sont les variables les plus corrélées à notre target `mpg`.
- Les contributions (en termes de variance expliquée) de `acceleration` et de `constructor`, la variable créée à partir de `name`, sont minimes comparées à celles des autres variables.

#### **9. Représentation du pipeline**

- Étant donné les multiples transformations effectuées, nous avons jugé nécessaire de partir des variables individuelles et d'appliquer les transformations affectant différents sous-groupes au fur et à mesure (un peu à la manière d'un clustering hiérarchique ascendant).
- Ceci nous permet d'avoir une meilleure vision globale de notre preprocessing.

![Complete preprocessing pipeline](exercice-5-complete-preprocessing-pipeline.png)

#### **10. Création du pipeline sklearn (pas simple !)**

##### **10.1 Pipeline 1 (gauche)**

In [22]:
# Known makers; their position in this list is the ordinal code.
# "unknown" comes last: it is the catch-all value, playing the role that
# handle_unknown="use_encoded_value" would otherwise play.
constructors = [
    "amc", "audi", "bmw", "buick", "cadillac",
    "chevrolet", "chrysler", "datsun", "dodge", "fiat",
    "ford", "hi", "honda", "mazda", "mercedes",
    "mercury", "nissan", "oldsmobile", "opel", "peugeot",
    "plymouth", "pontiac", "renault", "saab", "subaru",
    "toyota", "triumph", "volkswagen", "volvo", "unknown",
]

# "name" -> "constructor" -> ordinal code
name_pipeline = Pipeline(
    [
        ("GetConstructor", GetConstructor),
        (
            "OrdinalEncoder",
            OrdinalEncoder(handle_unknown="error", categories=[constructors]),
        ),
    ]
)

# Encoded constructor next to the untouched "model_year" and "cylinders"
name_transformer = ColumnTransformer(
    [
        ("NamePipeline", name_pipeline, ["name"]),
        ("Passthrough", "passthrough", ["model_year", "cylinders"]),
    ]
)

# Full branch for "model_year", "cylinders" and "constructor" (mcc)
mcc_pipeline = Pipeline(
    [
        ("NameTransformer", name_transformer),
        ("MinMaxScaler", MinMaxScaler()),
    ]
)

##### **10.2 Pipeline 2 (centre)**

Directement intégré dans le pipeline final.

##### **10.3 Pipeline 3 (droite)**

In [23]:
# Polynomial expansion of "horsepower", "weight" and "acceleration" (hwa);
# "displacement" is passed through unchanged
hwa_transformer = ColumnTransformer(
    [
        (
            "PolynomialFeatures",
            PolynomialFeatures(include_bias=False, degree=2),
            ["horsepower", "weight", "acceleration"],
        ),
        ("Passthrough", "passthrough", ["displacement"]),
    ]
)

# Full branch for "horsepower", "weight", "acceleration" and "displacement" (hwad)
hwad_pipeline = Pipeline(
    [
        ("HWATransformer", hwa_transformer),
        ("StandardScaler", StandardScaler()),
    ]
)

##### **10.4 Pipeline final**

In [24]:
# Known origins, in the order used for the one-hot columns
origins = ["europe", "japan", "usa"]

# One-hot encoder for "origin" (middle branch)
origin_one_hot = OneHotEncoder(
    handle_unknown="error",
    sparse_output=False,
    drop="first",
    categories=[origins],
)

# The three branches, each applied to its own subset of columns
column_transformer = ColumnTransformer(
    [
        # model_year, cylinders, constructor (mcc)
        ("MinMaxScaler", mcc_pipeline, ["model_year", "cylinders", "name"]),
        # origin
        ("OneHotEncoder", origin_one_hot, ["origin"]),
        # horsepower, weight, acceleration, displacement (hwad)
        (
            "StandardScaler",
            hwad_pipeline,
            ["horsepower", "weight", "acceleration", "displacement"],
        ),
    ]
)

# Complete preprocessing: column transformations, then selection of the 10 best features
preprocessor = Pipeline(
    [
        ("ColumnTransformer", column_transformer),
        ("SelectKBest", SelectKBest(k=10, score_func=f_regression)),
    ]
)
preprocessor

##### **10.5 Test**

In [25]:
# Split again with the same seed so that the pipeline output can be compared with
# the result of the step-by-step transformations above
X_train_2, X_test_2, y_train_2, y_test_2 = train_test_split(X, y, random_state=42)

# Fit the pipeline on the training part and transform it (target passed as 1-D)
X_train_2 = preprocessor.fit_transform(X_train_2, y=y_train_2.iloc[:, 0])

# Names of the selected output columns
preprocessor.get_feature_names_out()

array(['MinMaxScaler__Passthrough__model_year',
       'MinMaxScaler__Passthrough__cylinders',
       'OneHotEncoder__origin_usa',
       'StandardScaler__PolynomialFeatures__horsepower',
       'StandardScaler__PolynomialFeatures__weight',
       'StandardScaler__PolynomialFeatures__horsepower^2',
       'StandardScaler__PolynomialFeatures__horsepower weight',
       'StandardScaler__PolynomialFeatures__horsepower acceleration',
       'StandardScaler__PolynomialFeatures__weight^2',
       'StandardScaler__Passthrough__displacement'], dtype=object)

In [26]:
# First 10 column names of the report (it is sorted by decreasing score)
cols = selection_report["column"].head(10)

# Column names produced by the manual transformations
X_train_cols = X_train.columns

# Column names produced by the pipeline, with the "step__" prefixes stripped
X_train_2_cols = [
    prefixed.split("__")[-1] for prefixed in preprocessor.get_feature_names_out()
]

# Both sets of names must match the report
print(set(cols) == set(X_train_cols))
print(set(cols) == set(X_train_2_cols))

True
True


In [27]:
# Wrap the pipeline output in a DataFrame and put its columns in the same order
# as X_train (manual run)
X_train_2 = pd.DataFrame(data=X_train_2, columns=X_train_2_cols).loc[:, X_train_cols]

In [28]:
# Count, per column, the entries where the manual and pipeline results differ by
# more than 1e-14 in absolute value. The absolute value matters: without it, any
# entry where X_train is smaller than X_train_2 would never be counted.
# All-zero counts mean the two datasets agree up to floating-point rounding.
((X_train - X_train_2).abs() > 1e-14).sum()

origin_usa                 0
cylinders                  0
model_year                 0
displacement               0
horsepower                 0
weight                     0
horsepower^2               0
horsepower weight          0
horsepower acceleration    0
weight^2                   0
dtype: int64

In [29]:
# Transform the test set with the fitted pipeline (columns in pipeline output order)
X_test_2 = pd.DataFrame(
    data=preprocessor.transform(X_test_2),
    columns=X_train_2_cols,
)
X_test_2.head()

Unnamed: 0,model_year,cylinders,origin_usa,horsepower,weight,horsepower^2,horsepower weight,horsepower acceleration,weight^2,displacement
0,0.166667,0.2,0.0,-0.920546,-0.931565,-0.783194,-0.861432,-0.771436,-0.880076,-0.952189
1,0.666667,0.2,0.0,0.273189,-0.210427,0.084644,-0.077393,0.668361,-0.320031,-0.712904
2,0.666667,0.2,0.0,-1.154103,-1.394474,-0.902235,-1.059489,-1.430651,-1.167806,-1.000045
3,0.083333,0.2,1.0,-0.894595,-1.210024,-0.768942,-0.926733,-0.278302,-1.059883,-1.000045
4,1.0,0.2,1.0,-0.479383,-0.216377,-0.51302,-0.452354,-0.516948,-0.325209,-0.531048
