In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split

In [2]:
# Read the adult census dataset from the local CSV file.
adult_census = pd.read_csv('data/adult-census.csv')

# Name of the column holding the label to predict.
target_name = "class"

# Labels on one side, features on the other. "education-num" is removed from
# the features together with the label column.
target = adult_census[target_name]
data = adult_census.drop([target_name, "education-num"], axis="columns")

# Hold-out split: only 20% of the rows go to the training set; the fixed seed
# keeps the split identical from run to run.
data_train, data_test, target_train, target_test = train_test_split(
    data,
    target,
    random_state=42,
    train_size=0.2,
)

In [3]:
from sklearn.compose import ColumnTransformer
from sklearn.compose import make_column_selector as selector
from sklearn.preprocessing import OrdinalEncoder

In [4]:
# Integer-encode categorical columns; with handle_unknown="use_encoded_value",
# categories not seen during fit are encoded as `unknown_value` (-1).
categorical_preprocessor = OrdinalEncoder(
    unknown_value=-1,
    handle_unknown="use_encoded_value",
)

# Route every object-dtype column through the ordinal encoder and let the
# remaining columns pass through untouched.
preprocessor = ColumnTransformer(
    transformers=[
        (
            'cat-preprocessor',
            categorical_preprocessor,
            selector(dtype_include=object),
        ),
    ],
    remainder='passthrough',
    sparse_threshold=0,
)

In [5]:
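# Needed on scikit-learn < 1.0, where HistGradientBoostingClassifier is still
# experimental; from 1.0 onwards the estimator is stable and this import is no
# longer required.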
from sklearn.experimental import enable_hist_gradient_boosting
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.pipeline import Pipeline

In [6]:
# Full predictive pipeline: preprocessing followed by the gradient-boosting
# classifier. The step names ("preprocessor", "classifier") are the prefixes
# used later to address nested parameters, e.g. classifier__learning_rate.
model = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        ("classifier", HistGradientBoostingClassifier(random_state=42)),
    ]
)

## Personal Work

In [7]:
from sklearn.model_selection import cross_val_score

In [8]:
# Candidate values explored for the classifier's `learning_rate`.
learning_rate_values = [
    0.01,
    0.1,
    1,
    10,
]

# Candidate values explored for the classifier's `max_leaf_nodes`.
max_leaf_nodes_values = [
    3,
    10,
    30,
]

In [9]:
# Evaluate every (learning_rate, max_leaf_nodes) combination with 2-fold
# cross-validation on the training set and print the mean and standard
# deviation of the fold scores.
# Note: set_params modifies `model` in place, so once the loop finishes the
# pipeline keeps the last combination that was tried.
for rate, value in (
    (lr_candidate, leaves_candidate)
    for lr_candidate in learning_rate_values
    for leaves_candidate in max_leaf_nodes_values
):
    model.set_params(
        classifier__learning_rate=rate,
        classifier__max_leaf_nodes=value,
    )
    scores = cross_val_score(model, data_train, target_train, cv=2)
    print(f"Accuracy score via cross-validation with learning_rate={rate} and max_leaf_nodes={value}:\n"
          f"{scores.mean():.3f} +/- {scores.std():.3f}")

Accuracy score via cross-validation with learning_rate=0.01 and max_leaf_nodes=3:
0.789 +/- 0.001
Accuracy score via cross-validation with learning_rate=0.01 and max_leaf_nodes=10:
0.813 +/- 0.002
Accuracy score via cross-validation with learning_rate=0.01 and max_leaf_nodes=30:
0.842 +/- 0.001
Accuracy score via cross-validation with learning_rate=0.1 and max_leaf_nodes=3:
0.847 +/- 0.002
Accuracy score via cross-validation with learning_rate=0.1 and max_leaf_nodes=10:
0.859 +/- 0.001
Accuracy score via cross-validation with learning_rate=0.1 and max_leaf_nodes=30:
0.857 +/- 0.003
Accuracy score via cross-validation with learning_rate=1 and max_leaf_nodes=3:
0.851 +/- 0.000
Accuracy score via cross-validation with learning_rate=1 and max_leaf_nodes=10:
0.833 +/- 0.002
Accuracy score via cross-validation with learning_rate=1 and max_leaf_nodes=30:
0.804 +/- 0.028
Accuracy score via cross-validation with learning_rate=10 and max_leaf_nodes=3:
0.288 +/- 0.007
Accuracy score via cross-val

In [10]:
# Display the pipeline's current parameters. Because the search loop calls
# set_params on `model` itself, the classifier shown here carries the last
# combination tried, not the best one.
model.get_params()

{'memory': None,
 'steps': [('preprocessor',
   ColumnTransformer(remainder='passthrough', sparse_threshold=0,
                     transformers=[('cat-preprocessor',
                                    OrdinalEncoder(handle_unknown='use_encoded_value',
                                                   unknown_value=-1),
                                    <sklearn.compose._column_transformer.make_column_selector object at 0x00000250F8659D90>)])),
  ('classifier',
   HistGradientBoostingClassifier(learning_rate=10, max_leaf_nodes=30,
                                  random_state=42))],
 'verbose': False,
 'preprocessor': ColumnTransformer(remainder='passthrough', sparse_threshold=0,
                   transformers=[('cat-preprocessor',
                                  OrdinalEncoder(handle_unknown='use_encoded_value',
                                                 unknown_value=-1),
                                  <sklearn.compose._column_transformer.make_column_selector object 

## Solution

In [None]:
# Candidate values for the two hyperparameters being tuned.
learning_rate = [0.01, 0.1, 1, 10]
max_leaf_nodes = [3, 10, 30]

# Best mean cross-validation score seen so far, and the combination behind it.
best_score, best_params = 0, {}

# Walk the full grid (learning rate in the outer position, max leaf nodes in
# the inner one) and score each combination with 2-fold cross-validation.
for lr, mln in (
    (lr_candidate, mln_candidate)
    for lr_candidate in learning_rate
    for mln_candidate in max_leaf_nodes
):
    # end="" keeps the score on the same output line as the announcement.
    print(f"Evaluating model with learning rate {lr:.3f}"
          f" and max leaf nodes {mln}... ", end="")
    model.set_params(classifier__learning_rate=lr,
                     classifier__max_leaf_nodes=mln)
    scores = cross_val_score(model, data_train, target_train, cv=2)
    mean_score = scores.mean()
    print(f"score: {mean_score:.3f}")
    # Strict comparison: on a tie the earlier combination is kept.
    if mean_score > best_score:
        best_score, best_params = (
            mean_score,
            {'learning-rate': lr, 'max leaf nodes': mln},
        )
        print(f"Found new best model with score {best_score:.3f}!")

# Final report of the search.
print(f"The best accuracy obtained is {best_score:.3f}")
print(f"The best parameters found are:\n {best_params}")

Evaluating model with learning rate 0.010 and max leaf nodes 3... score: 0.789
Found new best model with score 0.789!
Evaluating model with learning rate 0.010 and max leaf nodes 10... score: 0.813
Found new best model with score 0.813!
Evaluating model with learning rate 0.010 and max leaf nodes 30... score: 0.842
Found new best model with score 0.842!
Evaluating model with learning rate 0.100 and max leaf nodes 3... score: 0.847
Found new best model with score 0.847!
Evaluating model with learning rate 0.100 and max leaf nodes 10... score: 0.859
Found new best model with score 0.859!
Evaluating model with learning rate 0.100 and max leaf nodes 30... score: 0.857
Evaluating model with learning rate 1.000 and max leaf nodes 3... 