# Preparing data

In [1]:
import catboost
from catboost.datasets import titanic

titanic_train, titanic_test = titanic()
titanic_train_target = titanic_train.Survived

titanic_train.drop(['PassengerId', 'Survived', 'Name', 'Parch', 'Ticket', 'Cabin', 'Embarked'], axis=1, inplace=True)
titanic_test.drop(['PassengerId', 'Name', 'Parch', 'Ticket', 'Cabin', 'Embarked'], axis=1, inplace=True)

titanic_train.head(3)

Unnamed: 0,Pclass,Sex,Age,SibSp,Fare
0,3,male,22.0,1,7.25
1,1,female,38.0,1,71.2833
2,3,female,26.0,0,7.925


# Simple grid search

In [2]:
titanic_model = catboost.CatBoostClassifier(
    iterations=1000)

In [3]:
train_pool = catboost.Pool(titanic_train, titanic_train_target, cat_features=['Pclass', 'Sex', 'SibSp'])
test_pool = catboost.Pool(titanic_test, cat_features=['Pclass', 'Sex', 'SibSp'])

In [4]:
grid = {
    'learning_rate': [0.03, 0.1],
    'depth':[4, 6, 10],
    'l2_leaf_reg': [1, 3, 5, 7, 9]
}
grid_search_results = titanic_model.grid_search(grid, train_pool, shuffle=False, verbose=3)

0:	loss: 0.3602654	best: 0.3602654 (0)	total: 26.9s	remaining: 12m 58s
1:	loss: 0.3494103	best: 0.3494103 (1)	total: 54.4s	remaining: 12m 42s
2:	loss: 0.3641982	best: 0.3494103 (1)	total: 1m 22s	remaining: 12m 22s
3:	loss: 0.3607842	best: 0.3494103 (1)	total: 1m 51s	remaining: 12m 1s
4:	loss: 0.3614755	best: 0.3494103 (1)	total: 2m 19s	remaining: 11m 36s
5:	loss: 0.3688638	best: 0.3494103 (1)	total: 2m 47s	remaining: 11m 9s
6:	loss: 0.3661836	best: 0.3494103 (1)	total: 3m 15s	remaining: 10m 42s
7:	loss: 0.3589304	best: 0.3494103 (1)	total: 3m 44s	remaining: 10m 16s
8:	loss: 0.3679994	best: 0.3494103 (1)	total: 4m 11s	remaining: 9m 45s
9:	loss: 0.3648804	best: 0.3494103 (1)	total: 4m 36s	remaining: 9m 13s
10:	loss: 0.3544873	best: 0.3494103 (1)	total: 5m 5s	remaining: 8m 47s
11:	loss: 0.3730255	best: 0.3494103 (1)	total: 5m 34s	remaining: 8m 21s
12:	loss: 0.3527641	best: 0.3494103 (1)	total: 6m 2s	remaining: 7m 54s
13:	loss: 0.3589757	best: 0.3494103 (1)	total: 6m 30s	remaining: 7m 26s


Parameters giving the best value of the loss function:

In [5]:
grid_search_results['params']

{'depth': 4, 'l2_leaf_reg': 1, 'learning_rate': 0.1}

Available cross-validation statistics

In [6]:
grid_search_results['cv_results'].keys()

dict_keys(['test-Logloss-mean', 'test-Logloss-std', 'train-Logloss-mean', 'train-Logloss-std', 'iterations'])

Quality estimated using cross-validation:

In [7]:
grid_search_results['cv_results']['test-Logloss-mean'][-1]

0.4919203237312826

Model is ready to use after searching:

In [8]:
predicted = titanic_model.predict_proba(test_pool)
predicted[:3]

array([[8.99675512e-01, 1.00324488e-01],
       [9.99798150e-01, 2.01850144e-04],
       [7.35134536e-01, 2.64865464e-01]])

# Searching over several grids

In [9]:
grid_1 = {
    'learning_rate': [0.03, 0.1, 0.3, 0.9],
    'bootstrap_type':['Bayesian'],
    'bagging_temperature': [0, 1, 10]
}
grid_2 = {
    'learning_rate': [0.03, 0.1, 0.3, 0.9],
    'bootstrap_type':['Bernoulli'],
    'subsample': [0.66, 0.7, 0.8]
}
grid_search_results = titanic_model.grid_search([grid_1, grid_2], train_pool, shuffle=False, verbose=4)

Grid #0
0:	loss: 0.3822197	best: 0.3822197 (0)	total: 25.8s	remaining: 4m 44s
1:	loss: 0.3643472	best: 0.3643472 (1)	total: 50.6s	remaining: 4m 13s
2:	loss: 0.3636651	best: 0.3636651 (2)	total: 1m 14s	remaining: 3m 42s
3:	loss: 0.3690042	best: 0.3636651 (2)	total: 1m 36s	remaining: 3m 13s
4:	loss: 0.3602654	best: 0.3602654 (4)	total: 2m 2s	remaining: 2m 51s
5:	loss: 0.3494103	best: 0.3494103 (5)	total: 2m 28s	remaining: 2m 28s
6:	loss: 0.3334654	best: 0.3334654 (6)	total: 2m 53s	remaining: 2m 3s
7:	loss: 0.4026886	best: 0.3334654 (6)	total: 3m 17s	remaining: 1m 38s
8:	loss: 0.3625327	best: 0.3334654 (6)	total: 3m 44s	remaining: 1m 14s
9:	loss: 0.3500318	best: 0.3334654 (6)	total: 4m 10s	remaining: 50.2s
10:	loss: 0.3486949	best: 0.3334654 (6)	total: 4m 35s	remaining: 25.1s
11:	loss: 0.3626046	best: 0.3334654 (6)	total: 4m 59s	remaining: 0us
Grid #1
0:	loss: 0.3603296	best: 0.3603296 (0)	total: 26.1s	remaining: 4m 47s
1:	loss: 0.3568793	best: 0.3568793 (1)	total: 52.3s	remaining: 4m 21s

In [10]:
grid_search_results['params']

{'bootstrap_type': 'Bernoulli', 'learning_rate': 0.3, 'subsample': 0.7}

# Randomized search

In [11]:
from scipy import stats

class StrangeDistribution:
    def __init__(self, values):
        self.values = values

    def rvs(self):
        return self.values[0]

param_distribution = {
    'one_hot_max_size': stats.bernoulli(p=0.2, loc=2),
    'learning_rate': StrangeDistribution([0.03, 0.1]),
    'l2_leaf_reg': [1, 3, 5, 7, 9],
    'depth': stats.binom(n=10, p=0.2)
}

randomized_search_results = titanic_model.randomized_search(
    param_distribution,
    train_pool,
    n_iter=12,
    shuffle=False
)

0:	loss: 0.3750468	best: 0.3750468 (0)	total: 23.6s	remaining: 4m 19s
1:	loss: 0.6544126	best: 0.3750468 (0)	total: 39.3s	remaining: 3m 16s
2:	loss: 0.3748575	best: 0.3748575 (2)	total: 1m 3s	remaining: 3m 11s
3:	loss: 0.6544129	best: 0.3748575 (2)	total: 1m 19s	remaining: 2m 39s
4:	loss: 0.3746511	best: 0.3746511 (4)	total: 1m 46s	remaining: 2m 28s
5:	loss: 0.4227691	best: 0.3746511 (4)	total: 2m 7s	remaining: 2m 7s
6:	loss: 0.3649895	best: 0.3649895 (6)	total: 2m 34s	remaining: 1m 50s
7:	loss: 0.6544129	best: 0.3649895 (6)	total: 2m 50s	remaining: 1m 25s
8:	loss: 0.3717568	best: 0.3649895 (6)	total: 3m 16s	remaining: 1m 5s
9:	loss: 0.3705661	best: 0.3649895 (6)	total: 3m 41s	remaining: 44.4s
10:	loss: 0.3680459	best: 0.3649895 (6)	total: 4m 9s	remaining: 22.7s
11:	loss: 0.4246879	best: 0.3649895 (6)	total: 4m 32s	remaining: 0us
Estimating final quality...


In [12]:
randomized_search_results['params']

{'depth': 4.0,
 'l2_leaf_reg': 5,
 'learning_rate': 0.03,
 'one_hot_max_size': 2.0}

In [13]:
# If search_by_train_test_split=False, every iteration of grid search evaluates results on cross-validation.
randomized_search_results = titanic_model.randomized_search(
    param_distribution,
    train_pool,
    n_iter=6,
    shuffle=False,
    search_by_train_test_split=False
)

0:	loss: 0.4434842	best: 0.4434842 (0)	total: 2m 7s	remaining: 10m 39s
1:	loss: 0.4402693	best: 0.4402693 (1)	total: 4m 19s	remaining: 8m 38s
2:	loss: 0.4644695	best: 0.4402693 (1)	total: 6m 30s	remaining: 6m 30s
3:	loss: 0.4422576	best: 0.4402693 (1)	total: 8m 48s	remaining: 4m 24s
4:	loss: 0.6659128	best: 0.4402693 (1)	total: 10m 38s	remaining: 2m 7s
5:	loss: 0.4351239	best: 0.4351239 (5)	total: 14m 12s	remaining: 0us


In [14]:
randomized_search_results['params']

{'depth': 3.0,
 'l2_leaf_reg': 7,
 'learning_rate': 0.03,
 'one_hot_max_size': 3.0}