## Using numerical and categorical variables together

In [1]:
import pandas as pd

In [3]:
# Load the census data and immediately drop the duplicated column
# `"education-num"`, as stated in the first notebook.
adult_census = (
    pd.read_csv("data/adult-census.csv")
    .drop(columns="education-num")
)

# Separate the column to predict from the explanatory variables.
target_name = "class"
target = adult_census[target_name]
data = adult_census.drop(columns=[target_name])

## Selection based on data types

In [5]:
from sklearn.compose import make_column_selector as selector

In [6]:
# Numerical features: every column whose dtype is *not* `object`.
numerical_columns_selector = selector(dtype_exclude=object)
numerical_columns = numerical_columns_selector(data)

# Categorical features: every column whose dtype *is* `object`.
categorical_columns_selector = selector(dtype_include=object)
categorical_columns = categorical_columns_selector(data)

In [7]:
# Show which columns the dtype-based selector returned as numerical.
numerical_columns

['age', 'fnlwgt', 'capital-gain', 'capital-loss', 'hours-per-week']

In [8]:
# Show which columns the dtype-based selector returned as categorical.
categorical_columns

['workclass',
 'education',
 'marital-status',
 'occupation',
 'relationship',
 'race',
 'sex',
 'native-country']

## Dispatch columns to a specific processor

In [9]:
from sklearn.preprocessing import OneHotEncoder, StandardScaler

In [10]:
# One transformer per kind of column; they are dispatched to the matching
# columns by the ColumnTransformer built afterwards.
numerical_preprocessor = StandardScaler()
# handle_unknown="ignore": do not raise on categories not seen during `fit`.
categorical_preprocessor = OneHotEncoder(handle_unknown="ignore")

In [11]:
from sklearn.compose import ColumnTransformer

In [12]:
# Each entry is (name, transformer, columns): the categorical columns go to
# the one-hot encoder and the numerical columns to the standard scaler.
preprocessor = ColumnTransformer(
    transformers=[
        ('one-hot-encoder', categorical_preprocessor, categorical_columns),
        ('standard-scaler', numerical_preprocessor, numerical_columns),
    ]
)

In [13]:
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline

In [14]:
# Chain the column-wise preprocessing with a logistic regression classifier.
model = make_pipeline(
    preprocessor,
    LogisticRegression(max_iter=500),
)

In [16]:
from sklearn import set_config
# Switch scikit-learn to its 'diagram' display mode, then end the cell with
# `model` so that the pipeline is shown as the cell output.
set_config(display='diagram')
model

In [17]:
from sklearn.model_selection import train_test_split

In [18]:
# Hold out a test set; the fixed `random_state` keeps the split identical
# from one run to the next.
data_train, data_test, target_train, target_test = train_test_split(
    data, target, random_state=42
)

In [19]:
# Fit the whole pipeline on the training split. The return value is bound to
# `_` so that the cell does not echo it as output.
_ = model.fit(data_train, target_train)

In [20]:
# Peek at the first rows of the held-out features.
data_test.head()

Unnamed: 0,age,workclass,fnlwgt,education,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country
7762,56,Private,33115,HS-grad,Divorced,Other-service,Unmarried,White,Female,0,0,40,United-States
23881,25,Private,112847,HS-grad,Married-civ-spouse,Transport-moving,Own-child,Other,Male,0,0,40,United-States
30507,43,Private,170525,Bachelors,Divorced,Prof-specialty,Not-in-family,White,Female,14344,0,40,United-States
28911,32,Private,186788,HS-grad,Married-civ-spouse,Transport-moving,Husband,White,Male,0,0,40,United-States
19484,39,Private,277886,Bachelors,Married-civ-spouse,Sales,Wife,White,Female,0,0,30,United-States


In [21]:
# Predictions of the fitted pipeline for the first five test rows.
model.predict(data_test)[:5]

array([' <=50K', ' <=50K', ' >50K', ' <=50K', ' >50K'], dtype=object)

In [22]:
# True labels of the first five test rows, to compare with the predictions.
target_test[:5]

7762      <=50K
23881     <=50K
30507      >50K
28911     <=50K
19484     <=50K
Name: class, dtype: object

In [23]:
# Score of the fitted pipeline on the held-out test split.
model.score(data_test, target_test)

0.8582425681762346

## Evaluation of the model with cross-validation

In [24]:
from sklearn.model_selection import cross_validate

In [25]:
# 5-fold cross-validation of the full pipeline on the whole dataset; the
# returned dict is displayed as the cell output.
cv_results = cross_validate(
    estimator=model,
    X=data,
    y=target,
    cv=5,
)
cv_results

{'fit_time': array([0.93444324, 0.8866179 , 0.86366129, 0.90995455, 0.91606927]),
 'score_time': array([0.03691268, 0.03134608, 0.03199244, 0.03401852, 0.03193045]),
 'test_score': array([0.85228785, 0.85105947, 0.84930385, 0.85257985, 0.85667486])}

In [26]:
# Summarise the per-fold test scores as mean +/- standard deviation.
scores = cv_results["test_score"]
mean_accuracy = scores.mean()
std_accuracy = scores.std()
print("The mean cross-validation accuracy is: "
      f"{mean_accuracy:.3f} +/- {std_accuracy:.3f}")

The mean cross-validation accuracy is: 0.852 +/- 0.002


## Fitting a more powerful model

In [27]:
from sklearn.experimental import enable_hist_gradient_boosting
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.preprocessing import OrdinalEncoder

In [28]:
# Encode each category as an integer code for the gradient-boosting model.
# Categories unseen during `fit` are mapped to -1 instead of raising an error.
categorical_preprocessor = OrdinalEncoder(handle_unknown="use_encoded_value",
                                          unknown_value=-1)

# Only the categorical columns are transformed; all remaining columns are
# passed through unchanged (`remainder="passthrough"`).
preprocessor = ColumnTransformer([
    ('categorical', categorical_preprocessor, categorical_columns)],
    remainder="passthrough")

# Seed the classifier, as is already done for `train_test_split`. When early
# stopping is active (the default for training sets above 10 000 samples),
# HistGradientBoostingClassifier draws a random internal validation split, so
# without a fixed `random_state` the fitted model and its score would change
# from one run to the next.
model = make_pipeline(preprocessor,
                      HistGradientBoostingClassifier(random_state=42))

In [29]:
%%time
# Fit the gradient-boosting pipeline on the training split; `%%time` reports
# how long the cell takes. The return value is bound to `_` so it is not echoed.
_ = model.fit(data_train, target_train)

Wall time: 2.04 s


In [30]:
# Score of the gradient-boosting pipeline on the same held-out test split.
model.score(data_test, target_test)

0.8796167390058144

In this notebook we:
<ul>
<li>used a ColumnTransformer to apply different preprocessing for categorical and numerical variables;</li>
<li>used a pipeline to chain the ColumnTransformer preprocessing and logistic regression fitting;</li>
<li>saw that gradient boosting methods can outperform linear models.</li>
</ul>