# Preparing data

In [1]:
import catboost
from catboost.datasets import titanic

titanic_train, titanic_test = titanic()
titanic_train_target = titanic_train.Survived

titanic_train.drop(['PassengerId', 'Survived', 'Name', 'Parch', 'Ticket', 'Cabin', 'Embarked'], axis=1, inplace=True)
titanic_test.drop(['PassengerId', 'Name', 'Parch', 'Ticket', 'Cabin', 'Embarked'], axis=1, inplace=True)

titanic_train.head(3)

Unnamed: 0,Pclass,Sex,Age,SibSp,Fare
0,3,male,22.0,1,7.25
1,1,female,38.0,1,71.2833
2,3,female,26.0,0,7.925


# Simple grid search

In [2]:
titanic_model = catboost.CatBoostClassifier(
    iterations=1000)

In [3]:
train_pool = catboost.Pool(titanic_train, titanic_train_target, cat_features=['Pclass', 'Sex', 'SibSp'])
test_pool = catboost.Pool(titanic_test, cat_features=['Pclass', 'Sex', 'SibSp'])

In [4]:
grid = {
    'learning_rate': [0.03, 0.1],
    'depth':[4, 6, 10],
    'l2_leaf_reg': [1, 3, 5, 7, 9]
}
grid_search_results = titanic_model.grid_search(grid, train_pool, shuffle=False, verbose=3)

0:	loss: 0.3602654	best: 0.3602654 (0)	total: 25.9s	remaining: 12m 32s
1:	loss: 0.3494103	best: 0.3494103 (1)	total: 51.5s	remaining: 12m 1s
2:	loss: 0.3641982	best: 0.3494103 (1)	total: 1m 18s	remaining: 11m 44s
3:	loss: 0.3607842	best: 0.3494103 (1)	total: 1m 44s	remaining: 11m 18s
4:	loss: 0.3614755	best: 0.3494103 (1)	total: 2m 11s	remaining: 10m 56s
5:	loss: 0.3688638	best: 0.3494103 (1)	total: 2m 37s	remaining: 10m 30s
6:	loss: 0.3661836	best: 0.3494103 (1)	total: 3m 4s	remaining: 10m 6s
7:	loss: 0.3589304	best: 0.3494103 (1)	total: 3m 30s	remaining: 9m 39s
8:	loss: 0.3679994	best: 0.3494103 (1)	total: 3m 57s	remaining: 9m 13s
9:	loss: 0.3648804	best: 0.3494103 (1)	total: 4m 23s	remaining: 8m 47s
10:	loss: 0.3544873	best: 0.3494103 (1)	total: 4m 51s	remaining: 8m 24s
11:	loss: 0.3730255	best: 0.3494103 (1)	total: 5m 20s	remaining: 8m 1s
12:	loss: 0.3527641	best: 0.3494103 (1)	total: 5m 49s	remaining: 7m 37s
13:	loss: 0.3589757	best: 0.3494103 (1)	total: 6m 19s	remaining: 7m 13s
1

Parameters giving the best value of the loss function:

In [5]:
grid_search_results['params']

{'depth': 4, 'l2_leaf_reg': 1, 'learning_rate': 0.1}

Quality estimated using cross-validation:

In [9]:
grid_search_results['cv_results']

defaultdict(list,
            {'iterations': [0,
              1,
              2,
              3,
              4,
              5,
              6,
              7,
              8,
              9,
              10,
              11,
              12,
              13,
              14,
              15,
              16,
              17,
              18,
              19,
              20,
              21,
              22,
              23,
              24,
              25,
              26,
              27,
              28,
              29,
              30,
              31,
              32,
              33,
              34,
              35,
              36,
              37,
              38,
              39,
              40,
              41,
              42,
              43,
              44,
              45,
              46,
              47,
              48,
              49,
              50,
              51,
              52,
              53,
      

Model is ready to use after searching:

In [10]:
predicted = titanic_model.predict_proba(test_pool)
predicted[:3]

array([[8.99675512e-01, 1.00324488e-01],
       [9.99798150e-01, 2.01850144e-04],
       [7.35134536e-01, 2.64865464e-01]])

# Searching over several grids

In [11]:
grid_1 = {
    'learning_rate': [0.03, 0.1, 0.3, 0.9],
    'bootstrap_type':['Bayesian'],
    'bagging_temperature': [0, 1, 10]
}
grid_2 = {
    'learning_rate': [0.03, 0.1, 0.3, 0.9],
    'bootstrap_type':['Bernoulli'],
    'subsample': [0.66, 0.7, 0.8]
}
grid_search_results = titanic_model.grid_search([grid_1, grid_2], train_pool, shuffle=False, verbose=4)

Grid #0
0:	loss: 0.3822197	best: 0.3822197 (0)	total: 25.7s	remaining: 4m 42s
1:	loss: 0.3643472	best: 0.3643472 (1)	total: 49.6s	remaining: 4m 8s
2:	loss: 0.3636651	best: 0.3636651 (2)	total: 1m 12s	remaining: 3m 36s
3:	loss: 0.3690042	best: 0.3636651 (2)	total: 1m 34s	remaining: 3m 9s
4:	loss: 0.3602654	best: 0.3602654 (4)	total: 2m	remaining: 2m 48s
5:	loss: 0.3494103	best: 0.3494103 (5)	total: 2m 25s	remaining: 2m 25s
6:	loss: 0.3334654	best: 0.3334654 (6)	total: 2m 50s	remaining: 2m 1s
7:	loss: 0.4026886	best: 0.3334654 (6)	total: 3m 13s	remaining: 1m 36s
8:	loss: 0.3625327	best: 0.3334654 (6)	total: 3m 39s	remaining: 1m 13s
9:	loss: 0.3500318	best: 0.3334654 (6)	total: 4m 6s	remaining: 49.2s
10:	loss: 0.3486949	best: 0.3334654 (6)	total: 4m 30s	remaining: 24.6s
11:	loss: 0.3626046	best: 0.3334654 (6)	total: 4m 53s	remaining: 0us
Grid #1
0:	loss: 0.3603296	best: 0.3603296 (0)	total: 25.7s	remaining: 4m 42s
1:	loss: 0.3568793	best: 0.3568793 (1)	total: 51.1s	remaining: 4m 15s
2:	lo

In [12]:
grid_search_results['params']

{'bootstrap_type': 'Bernoulli', 'learning_rate': 0.3, 'subsample': 0.7}

# Randomized search

In [13]:
from scipy import stats

class StrangeDistribution:
    def __init__(self, values):
        self.values = values

    def rvs(self):
        return self.values[0]

param_distribution = {
    'one_hot_max_size': stats.bernoulli(p=0.2, loc=2),
    'learning_rate': StrangeDistribution([0.03, 0.1]),
    'l2_leaf_reg': [1, 3, 5, 7, 9],
    'depth': stats.binom(n=10, p=0.2)
}

randomized_search_results = titanic_model.randomized_search(
    param_distribution,
    train_pool,
    n_iter=12,
    shuffle=False
)

0:	loss: 0.4226621	best: 0.4226621 (0)	total: 20.6s	remaining: 3m 47s
1:	loss: 0.4230480	best: 0.4226621 (0)	total: 41.5s	remaining: 3m 27s
2:	loss: 0.3748575	best: 0.3748575 (2)	total: 1m 5s	remaining: 3m 15s
3:	loss: 0.6544129	best: 0.3748575 (2)	total: 1m 21s	remaining: 2m 42s
4:	loss: 0.4227770	best: 0.3748575 (2)	total: 1m 43s	remaining: 2m 24s
5:	loss: 0.4227770	best: 0.3748575 (2)	total: 2m 5s	remaining: 2m 5s
6:	loss: 0.6544129	best: 0.3748575 (2)	total: 2m 21s	remaining: 1m 41s
7:	loss: 0.4227770	best: 0.3748575 (2)	total: 2m 42s	remaining: 1m 21s
8:	loss: 0.3754914	best: 0.3748575 (2)	total: 3m 7s	remaining: 1m 2s
9:	loss: 0.3782714	best: 0.3748575 (2)	total: 3m 31s	remaining: 42.4s
10:	loss: 0.4246879	best: 0.3748575 (2)	total: 3m 53s	remaining: 21.2s
11:	loss: 0.4246879	best: 0.3748575 (2)	total: 4m 15s	remaining: 0us
Estimating final quality...


In [14]:
randomized_search_results['params']

{'depth': 2.0,
 'l2_leaf_reg': 3,
 'learning_rate': 0.03,
 'one_hot_max_size': 2.0}

In [15]:
# If search_by_train_test_split=False, every iteration of grid search evaluates results on cross-validation.
randomized_search_results = titanic_model.randomized_search(
    param_distribution,
    train_pool,
    n_iter=6,
    shuffle=False,
    search_by_train_test_split=False
)

0:	loss: 0.4435890	best: 0.4435890 (0)	total: 2m 11s	remaining: 10m 56s
1:	loss: 0.4418447	best: 0.4418447 (1)	total: 4m 25s	remaining: 8m 51s
2:	loss: 0.4395964	best: 0.4395964 (2)	total: 6m 41s	remaining: 6m 41s
3:	loss: 0.4355671	best: 0.4355671 (3)	total: 10m 6s	remaining: 5m 3s
4:	loss: 0.4650220	best: 0.4355671 (3)	total: 12m 31s	remaining: 2m 30s
5:	loss: 0.4382148	best: 0.4355671 (3)	total: 14m 42s	remaining: 0us


In [16]:
randomized_search_results['params']

{'depth': 3.0,
 'l2_leaf_reg': 5,
 'learning_rate': 0.03,
 'one_hot_max_size': 2.0}