In [1]:
import pandas as pd

In [2]:
# Read the adult census CSV; the path is relative to the working directory.
census_csv_path = "data/adult-census.csv"
adult_census = pd.read_csv(census_csv_path)

In [3]:
# Split the frame into the label column and the feature columns.
target_name = "class"
target = adult_census[target_name]
# "education-num" is removed from the features together with the label.
dropped_columns = [target_name, "education-num"]
data = adult_census.drop(columns=dropped_columns)

In [4]:
from sklearn.compose import make_column_selector as selector

In [5]:
# Build a selector matching object-dtype columns, which this notebook
# treats as the categorical features.
categorical_columns_selector = selector(dtype_include=object)
# Calling the selector on the frame returns the matching column names.
categorical_columns = categorical_columns_selector(data)
data_categorical = data.loc[:, categorical_columns]

In [6]:
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import OrdinalEncoder
from sklearn.linear_model import LogisticRegression

In [7]:
# Integer-encode each category; categories unseen during fit map to -1.
ordinal_encoder = OrdinalEncoder(
    handle_unknown="use_encoded_value",
    unknown_value=-1,
)
# max_iter is raised to 500 for the logistic regression solver.
classifier = LogisticRegression(max_iter=500)
model = make_pipeline(ordinal_encoder, classifier)

In [8]:
from sklearn.model_selection import cross_validate

In [9]:
# Cross-validate the pipeline on the categorical features only, using the
# default splitting strategy and the estimator's default scorer.
cv_results = cross_validate(estimator=model, X=data_categorical, y=target)
cv_results

{'fit_time': array([0.39160371, 0.29522896, 0.38874698, 0.39025402, 0.34215617]),
 'score_time': array([0.02510691, 0.02889657, 0.02599669, 0.02719402, 0.02619791]),
 'test_score': array([0.75514382, 0.75555328, 0.75573301, 0.75307125, 0.75788288])}

In [10]:
# Summarise the per-fold test scores as mean +/- standard deviation.
scores = cv_results["test_score"]
print("The mean cross-validation accuracy is: "
      f"{scores.mean():.3f} +/- {scores.std():.3f}")

The mean cross-validation accuracy is: 0.755 +/- 0.002


In [11]:
from sklearn.dummy import DummyClassifier

In [12]:
# Baseline: always predict the most frequent class, ignoring the features.
baseline = DummyClassifier(strategy="most_frequent")
cv_results = cross_validate(baseline, data_categorical, target)
scores = cv_results["test_score"]
mean_score, std_score = scores.mean(), scores.std()
print(f"The mean cross-validation accuracy is: {mean_score:.3f} +/- {std_score:.3f}")

The mean cross-validation accuracy is: 0.761 +/- 0.000


Using an arbitrary mapping from string labels to integers, as done here, causes the linear model to make incorrect assumptions about the relative ordering of the categories.

This prevents the model from learning anything predictive enough: its cross-validated accuracy (0.755) is even lower than the baseline (0.761) that we obtained by ignoring the input data and always predicting the most frequent class.

In [13]:
from sklearn.preprocessing import OneHotEncoder

In [14]:
# One-hot encode each category; handle_unknown="ignore" avoids an error on
# categories that were not seen during fit.
one_hot_encoder = OneHotEncoder(handle_unknown="ignore")
classifier = LogisticRegression(max_iter=500)
model = make_pipeline(one_hot_encoder, classifier)

In [15]:
# Same cross-validation call as for the ordinal pipeline; `model` is now
# the one-hot pipeline.
cv_results = cross_validate(
    estimator=model,
    X=data_categorical,
    y=target,
)
cv_results

{'fit_time': array([0.68812156, 0.6243279 , 0.64231086, 0.71611238, 0.64629817]),
 'score_time': array([0.02393532, 0.03293538, 0.02493715, 0.02590227, 0.02393675]),
 'test_score': array([0.83222438, 0.83560242, 0.82872645, 0.83312858, 0.83466421])}

In [16]:
# Report mean +/- standard deviation of the per-fold test scores.
scores = cv_results["test_score"]
mean_score = scores.mean()
std_score = scores.std()
print(f"The mean cross-validation accuracy is: {mean_score:.3f} +/- {std_score:.3f}")

The mean cross-validation accuracy is: 0.833 +/- 0.002


With the linear classifier chosen, using an encoding that does not assume any ordering leads to much better results.

The important message here is: a linear model and `OrdinalEncoder` should be used together only for ordinal categorical features, i.e. features whose categories have a meaningful order. Otherwise, the model will perform poorly.