In [None]:
from scipy.io import arff
import pandas as pd

# Load the ARFF file. loadarff returns a (records, metadata) tuple; unpack it
# into dedicated names so that `data` is not bound to two different kinds of
# object over the course of the notebook.
arff_records, arff_meta = arff.loadarff('phpMawTba.arff')

# Convert the records into a pandas DataFrame.
# Nominal attributes are loaded as bytes objects (e.g. b'Private').
adult_census = pd.DataFrame(arff_records)

# Preview the first rows of the DataFrame.
print(adult_census.head())

    age     workclass    fnlwgt        education  education-num  \
0  25.0    b'Private'  226802.0          b'11th'            7.0   
1  38.0    b'Private'   89814.0       b'HS-grad'            9.0   
2  28.0  b'Local-gov'  336951.0    b'Assoc-acdm'           12.0   
3  44.0    b'Private'  160323.0  b'Some-college'           10.0   
4  18.0          b'?'  103497.0  b'Some-college'           10.0   

          marital-status            occupation  relationship      race  \
0       b'Never-married'  b'Machine-op-inspct'  b'Own-child'  b'Black'   
1  b'Married-civ-spouse'    b'Farming-fishing'    b'Husband'  b'White'   
2  b'Married-civ-spouse'    b'Protective-serv'    b'Husband'  b'White'   
3  b'Married-civ-spouse'  b'Machine-op-inspct'    b'Husband'  b'Black'   
4       b'Never-married'                  b'?'  b'Own-child'  b'White'   

         sex  capital-gain  capital-loss  hours-per-week    native-country  \
0    b'Male'           0.0           0.0            40.0  b'United-States'

In [None]:
# As seen in the module 1 notebook (Machine Learning with scikit-learn), the
# "education-num" column is redundant, so we drop it.
# errors="ignore" keeps this cell safe to re-run once the column is gone.
adult_census = adult_census.drop(columns="education-num", errors="ignore")

target_name = "class"

# Convert the byte-string labels to integers on a new Series rather than
# calling replace(..., inplace=True) on a column pulled out of the DataFrame.
# Any label missing from the mapping would become NaN.
target = adult_census[target_name].map({b'<=50K': 0, b'>50K': 1})
print(target.head())

# Feature table: every column except the target.
data = adult_census.drop(columns=[target_name])

0    0
1    0
2    1
3    1
4    0
Name: class, dtype: int64


Nous allons afficher une série donnant le nombre d'occurrences de chaque valeur unique de la colonne "native-country" du DataFrame data, triée selon son index, c'est-à-dire ici par ordre alphabétique des catégories.

In [None]:
# Occurrences of each "native-country" value, ordered by category label.
(
    data.loc[:, "native-country"]
    .value_counts()
    .sort_index()
)

b'?'                               857
b'Cambodia'                         28
b'Canada'                          182
b'China'                           122
b'Columbia'                         85
b'Cuba'                            138
b'Dominican-Republic'              103
b'Ecuador'                          45
b'El-Salvador'                     155
b'England'                         127
b'France'                           38
b'Germany'                         206
b'Greece'                           49
b'Guatemala'                        88
b'Haiti'                            75
b'Holand-Netherlands'                1
b'Honduras'                         20
b'Hong'                             30
b'Hungary'                          19
b'India'                           151
b'Iran'                             59
b'Ireland'                          37
b'Italy'                           105
b'Jamaica'                         106
b'Japan'                            92
b'Laos'                  

In [None]:
# Dtype of each column of the feature DataFrame
data.dtypes

age               float64
workclass          object
fnlwgt            float64
education          object
marital-status     object
occupation         object
relationship       object
race               object
sex                object
capital-gain      float64
capital-loss      float64
hours-per-week    float64
native-country     object
dtype: object

In [None]:
from sklearn.compose import make_column_selector as selector

# Build a selector that matches object-dtype columns, apply it to the feature
# table to get the categorical column names, then keep only those columns.
categorical_columns_selector = selector(dtype_include=object)
categorical_columns = categorical_columns_selector(data)
data_categorical = data.loc[:, categorical_columns]
data_categorical.head()

Unnamed: 0,workclass,education,marital-status,occupation,relationship,race,sex,native-country
0,b'Private',b'11th',b'Never-married',b'Machine-op-inspct',b'Own-child',b'Black',b'Male',b'United-States'
1,b'Private',b'HS-grad',b'Married-civ-spouse',b'Farming-fishing',b'Husband',b'White',b'Male',b'United-States'
2,b'Local-gov',b'Assoc-acdm',b'Married-civ-spouse',b'Protective-serv',b'Husband',b'White',b'Male',b'United-States'
3,b'Private',b'Some-college',b'Married-civ-spouse',b'Machine-op-inspct',b'Husband',b'Black',b'Male',b'United-States'
4,b'?',b'Some-college',b'Never-married',b'?',b'Own-child',b'White',b'Female',b'United-States'


In [None]:
# Report how many categorical columns were selected.
print(f"The dataset is composed of {len(data_categorical.columns)} features")

The dataset is composed of 8 features


**ENCODAGE**


La stratégie la plus intuitive consiste à coder chaque catégorie avec un numéro différent. L'OrdinalEncoder transforme les données de cette manière. Nous commençons par encoder une seule colonne pour comprendre comment fonctionne l'encodage.

In [None]:
from sklearn.preprocessing import OrdinalEncoder

# Double brackets keep a one-column DataFrame rather than a Series.
education_column = data_categorical.loc[:, ["education"]]

# Configure the encoder so that transform returns a pandas DataFrame, then
# fit it on the single column and display the encoded result.
encoder = OrdinalEncoder()
encoder.set_output(transform="pandas")
education_encoded = encoder.fit_transform(education_column)
education_encoded

Unnamed: 0,education
0,1.0
1,11.0
2,7.0
3,15.0
4,15.0
...,...
48837,7.0
48838,11.0
48839,11.0
48840,11.0


In [None]:
# We can check the mapping between categories and numeric codes by inspecting
# the fitted attribute `categories_`.
encoder.categories_

[array([b'10th', b'11th', b'12th', b'1st-4th', b'5th-6th', b'7th-8th',
        b'9th', b'Assoc-acdm', b'Assoc-voc', b'Bachelors', b'Doctorate',
        b'HS-grad', b'Masters', b'Preschool', b'Prof-school',
        b'Some-college'], dtype=object)]

In [None]:
# Encode all categorical columns with the current `encoder`
# (the OrdinalEncoder when the cells are run in order).
data_encoded = encoder.fit_transform(data_categorical)
print(f"The dataset encoded contains {data_encoded.shape[1]} features")

# The preview must be the cell's last expression for the notebook to render
# it; a bare expression in the middle of a cell is evaluated and discarded.
data_encoded[:5]

The dataset encoded contains 8 features


Encodage des catégories nominales (sans supposer aucun ordre):
OneHotEncoder

In [None]:
# Demonstration of one-hot encoding on the "education" column
from sklearn.preprocessing import OneHotEncoder

# sparse_output=False yields a dense result; transform is configured to
# return a pandas DataFrame.
encoder = OneHotEncoder(sparse_output=False)
encoder.set_output(transform="pandas")
education_encoded = encoder.fit_transform(education_column)
education_encoded

Unnamed: 0,education_b'10th',education_b'11th',education_b'12th',education_b'1st-4th',education_b'5th-6th',education_b'7th-8th',education_b'9th',education_b'Assoc-acdm',education_b'Assoc-voc',education_b'Bachelors',education_b'Doctorate',education_b'HS-grad',education_b'Masters',education_b'Preschool',education_b'Prof-school',education_b'Some-college'
0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48837,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
48838,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
48839,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
48840,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0


In [None]:
print(f"The dataset is composed of {data_categorical.shape[1]} features")
data_categorical.head()

The dataset is composed of 8 features


Unnamed: 0,workclass,education,marital-status,occupation,relationship,race,sex,native-country
0,b'Private',b'11th',b'Never-married',b'Machine-op-inspct',b'Own-child',b'Black',b'Male',b'United-States'
1,b'Private',b'HS-grad',b'Married-civ-spouse',b'Farming-fishing',b'Husband',b'White',b'Male',b'United-States'
2,b'Local-gov',b'Assoc-acdm',b'Married-civ-spouse',b'Protective-serv',b'Husband',b'White',b'Male',b'United-States'
3,b'Private',b'Some-college',b'Married-civ-spouse',b'Machine-op-inspct',b'Husband',b'Black',b'Male',b'United-States'
4,b'?',b'Some-college',b'Never-married',b'?',b'Own-child',b'White',b'Female',b'United-States'


In [None]:
# One-hot encode every categorical column and preview the first five rows.
data_encoded = encoder.fit_transform(data_categorical)
data_encoded.iloc[:5]

Unnamed: 0,workclass_b'?',workclass_b'Federal-gov',workclass_b'Local-gov',workclass_b'Never-worked',workclass_b'Private',workclass_b'Self-emp-inc',workclass_b'Self-emp-not-inc',workclass_b'State-gov',workclass_b'Without-pay',education_b'10th',...,native-country_b'Portugal',native-country_b'Puerto-Rico',native-country_b'Scotland',native-country_b'South',native-country_b'Taiwan',native-country_b'Thailand',native-country_b'Trinadad&Tobago',native-country_b'United-States',native-country_b'Vietnam',native-country_b'Yugoslavia'
0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
3,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
4,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


In [None]:
# Number of columns produced by the encoding.
print(f"The encoded dataset contains {len(data_encoded.columns)} features")

The encoded dataset contains 102 features


Stratégie de choix de l'encodeur : en général, OneHotEncoder est la stratégie d'encodage utilisée lorsque les modèles en aval sont des modèles linéaires, tandis qu'OrdinalEncoder est souvent une bonne stratégie avec des modèles à base d'arbres.

Intégration de cet encodeur dans un pipeline de machine learning

In [None]:
# Frequency of each "native-country" value, most frequent first
data.loc[:, "native-country"].value_counts()

b'United-States'                 43832
b'Mexico'                          951
b'?'                               857
b'Philippines'                     295
b'Germany'                         206
b'Puerto-Rico'                     184
b'Canada'                          182
b'El-Salvador'                     155
b'India'                           151
b'Cuba'                            138
b'England'                         127
b'China'                           122
b'South'                           115
b'Jamaica'                         106
b'Italy'                           105
b'Dominican-Republic'              103
b'Japan'                            92
b'Guatemala'                        88
b'Poland'                           87
b'Vietnam'                          86
b'Columbia'                         85
b'Haiti'                            75
b'Portugal'                         67
b'Taiwan'                           65
b'Iran'                             59
b'Greece'                

On constate que la catégorie « Holand-Netherlands » est très rare (une seule occurrence). Cela posera un problème lors de la validation croisée : si cet échantillon se retrouve dans l'ensemble de test lors du découpage, l'encodeur n'aura pas vu la catégorie pendant l'entraînement et ne pourra pas l'encoder.

Dans scikit-learn, il existe quelques solutions possibles pour contourner ce problème :

répertorier toutes les catégories possibles et les fournir à l'encodeur via l'argument mot-clé categories, au lieu de laisser l'estimateur les déterminer automatiquement à partir des données d'entraînement lors de l'appel de fit ;
définir le paramètre handle_unknown="ignore" : si une catégorie inconnue est rencontrée lors de la transformation, les colonnes one-hot correspondant à cette variable ne contiendront que des zéros ;
ajuster le paramètre min_frequency pour regrouper les catégories les plus rares observées dans les données d'entraînement en une seule colonne one-hot. Si vous activez cette option, vous pouvez également définir handle_unknown="infrequent_if_exist" pour encoder les catégories inconnues (catégories observées uniquement au moment de la prédiction) dans cette même colonne.
Dans ce notebook, nous explorons uniquement la deuxième option, à savoir OneHotEncoder(handle_unknown="ignore").

In [None]:
# Machine-learning pipeline: one-hot encoding followed by logistic regression.
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LogisticRegression

model = make_pipeline(
    # Categories not seen during fit are ignored at transform time.
    OneHotEncoder(handle_unknown="ignore"),
    LogisticRegression(max_iter=500),
)

In [None]:
from sklearn.model_selection import cross_validate

# Cross-validate the pipeline on the categorical features only and show the
# resulting dictionary of fit times, score times and test scores.
cv_results = cross_validate(model, X=data_categorical, y=target)
cv_results

{'fit_time': array([0.82200098, 0.74235368, 0.81101537, 0.75018239, 0.71703768]),
 'score_time': array([0.02329087, 0.02272391, 0.02424455, 0.02586508, 0.0228045 ]),
 'test_score': array([0.83222438, 0.83560242, 0.82872645, 0.83312858, 0.83466421])}

In [None]:
# Summarise the per-fold test scores as mean ± standard deviation.
scores = cv_results["test_score"]
mean_score = scores.mean()
std_score = scores.std()
print(f"The accuracy is: {mean_score:.3f} ± {std_score:.3f}")

The accuracy is: 0.833 ± 0.002
