In [1]:
import platform

import catboost

# Report versions from inside the running kernel. A shell call such as
# `!python --version` reports whichever `python` is first on PATH, which is
# not necessarily the interpreter this notebook is executing in.
print(catboost.__version__)
print('Python', platform.python_version())

0.16
Python 3.5.2


# Preparing data

In [2]:
from catboost.datasets import titanic

titanic_train, titanic_test = titanic()

# Keep the label aside before it is removed from the feature frame.
titanic_train_target = titanic_train.Survived

# Columns dropped from both splits; the train split additionally loses the
# 'Survived' label column. Listing them once keeps train and test in sync.
unused_columns = ['PassengerId', 'Name', 'Parch', 'Ticket', 'Cabin', 'Embarked']

# Non-mutating drops: each frame is rebound to a new object instead of being
# edited in place.
titanic_train = titanic_train.drop(unused_columns + ['Survived'], axis=1)
titanic_test = titanic_test.drop(unused_columns, axis=1)

titanic_train.head(3)

Unnamed: 0,Pclass,Sex,Age,SibSp,Fare
0,3,male,22.0,1,7.25
1,1,female,38.0,1,71.2833
2,3,female,26.0,0,7.925


# Simple grid search

In [3]:
# Classifier that the parameter searches below are run on; it is created with
# 20 boosting iterations.
titanic_model = catboost.CatBoostClassifier(iterations=20)

In [4]:
# Training pool: features plus the 'Survived' labels.
train_pool = catboost.Pool(
    data=titanic_train,
    label=titanic_train_target,
    cat_features=['Pclass', 'Sex', 'SibSp'],
)

# Test pool: features only, with the same categorical columns as the training pool.
test_pool = catboost.Pool(
    data=titanic_test,
    cat_features=['Pclass', 'Sex', 'SibSp'],
)

In [5]:
# Each key is a training parameter, each list the candidate values to try:
# 2 * 2 * 2 * 5 = 40 combinations in total.
grid = {
    'feature_border_type': ['Median', 'Uniform'],
    'one_hot_max_size': [2, 3],
    'learning_rate': [0.03, 0.1],
    'l2_leaf_reg': [1, 3, 5, 7, 9],
}

# verbose=3 prints a progress line for every third combination.
grid_search_results = titanic_model.grid_search(
    grid,
    train_pool,
    shuffle=False,
    verbose=3,
)

0:	Params searching. objective: 0.5068569	best: 0.5068569 (0)	total: 641ms	remaining: 25s
3:	Params searching. objective: 0.5037151	best: 0.4962506 (1)	total: 2.46s	remaining: 22.1s
6:	Params searching. objective: 0.4929148	best: 0.4882844 (5)	total: 4.15s	remaining: 19.6s
9:	Params searching. objective: 0.4995569	best: 0.4882844 (5)	total: 5.82s	remaining: 17.5s
12:	Params searching. objective: 0.4152935	best: 0.4152935 (12)	total: 7.5s	remaining: 15.6s
15:	Params searching. objective: 0.4151681	best: 0.4151681 (15)	total: 9.12s	remaining: 13.7s
18:	Params searching. objective: 0.4105241	best: 0.4085159 (16)	total: 10.8s	remaining: 11.9s
21:	Params searching. objective: 0.5032700	best: 0.4085159 (16)	total: 12.5s	remaining: 10.3s
24:	Params searching. objective: 0.5067932	best: 0.4085159 (16)	total: 14.2s	remaining: 8.51s
27:	Params searching. objective: 0.4962632	best: 0.4085159 (16)	total: 15.8s	remaining: 6.79s
30:	Params searching. objective: 0.4384475	best: 0.4085159 (16)	total: 

The parameters that gave the best value of the loss function:

In [6]:
# 'params' entry of the dict returned by grid_search: the best parameter combination found.
grid_search_results['params']

{'feature_border_type': 'Median',
 'l2_leaf_reg': 3,
 'learning_rate': 0.1,
 'one_hot_max_size': 3}

The model is ready to use once the search has finished:

In [7]:
# Class probabilities for the test pool; each row holds one probability per class.
predicted = titanic_model.predict_proba(test_pool)
# Display only the first three rows.
predicted[:3]

array([[0.81777377, 0.18222623],
       [0.50297593, 0.49702407],
       [0.75946883, 0.24053117]])

# Searching over several grids

In [8]:
# First grid: 2 * 2 * 2 * 2 = 16 combinations, including 'one_hot_max_size'.
grid_1 = {
    'feature_border_type': ['Median', 'Uniform'],
    'one_hot_max_size': [2, 3],
    'learning_rate': [0.03, 0.1],
    'l2_leaf_reg': [1, 3],
}

# Second grid: 2 * 3 * 3 = 18 combinations, with larger learning rates and
# stronger regularisation, and no 'one_hot_max_size' entry.
grid_2 = {
    'feature_border_type': ['Median', 'Uniform'],
    'learning_rate': [0.1, 0.2, 0.3],
    'l2_leaf_reg': [5, 7, 9],
}

# A list of grids is searched grid by grid; verbose=4 prints a progress line
# for every fourth combination.
grid_search_results = titanic_model.grid_search(
    [grid_1, grid_2],
    train_pool,
    shuffle=False,
    verbose=4,
)

Grid #0
0:	Params searching. objective: 0.5068569	best: 0.5068569 (0)	total: 634ms	remaining: 9.52s
4:	Params searching. objective: 0.4339690	best: 0.4339690 (4)	total: 2.84s	remaining: 6.25s
8:	Params searching. objective: 0.5113267	best: 0.4085159 (7)	total: 5.17s	remaining: 4.02s
12:	Params searching. objective: 0.4384475	best: 0.4085159 (7)	total: 7.65s	remaining: 1.76s
15:	Params searching. objective: 0.4127494	best: 0.4085159 (7)	total: 9.44s	remaining: 0us
Grid #1
0:	Params searching. objective: 0.4150399	best: 0.4150399 (0)	total: 625ms	remaining: 10.6s
4:	Params searching. objective: 0.3953455	best: 0.3953455 (4)	total: 3.03s	remaining: 7.87s
8:	Params searching. objective: 0.3997993	best: 0.3667056 (7)	total: 5.41s	remaining: 5.41s
12:	Params searching. objective: 0.3849115	best: 0.3667056 (7)	total: 7.86s	remaining: 3.02s
16:	Params searching. objective: 0.3954113	best: 0.3667056 (7)	total: 10.2s	remaining: 599ms
17:	Params searching. objective: 0.3778527	best: 0.3667056 (7)

In [9]:
# Best parameter combination found across the searched grids.
grid_search_results['params']

{'feature_border_type': 'Median', 'l2_leaf_reg': 7, 'learning_rate': 0.3}

# Randomized search

In [10]:
from scipy import stats

class StrangeDistribution:
    """Minimal custom "distribution" exposing an ``rvs()`` method.

    It is deliberately degenerate: every draw yields the first element of
    ``values``; the remaining elements are never returned.
    """

    def __init__(self, values):
        """Store the indexable collection of candidate values."""
        self.values = values

    def rvs(self):
        """Return one "sample", which is always ``self.values[0]``."""
        first_value = self.values[0]
        return first_value

import numpy as np

# scipy's frozen distributions draw from NumPy's global random state unless
# they are given their own, so seed it to make the sampled candidates
# repeatable across "Restart & Run All".
np.random.seed(0)

# Each entry is either a list of candidate values or an object with an
# `rvs()` method that produces one sample per call.
param_distribution = {
    # bernoulli shifted by loc=2: yields 2 (probability 0.8) or 3 (probability 0.2).
    'one_hot_max_size': stats.bernoulli(p=0.2, loc=2),
    # Custom object with `rvs()`; always yields the first listed value (20).
    'iterations': StrangeDistribution([20, 500, 1000]),
    'border_count': [10, 6, 20, 4],
    # NOTE(review): binom(n=10, p=0.2) can draw 0 -- confirm that a depth of 0
    # is acceptable, or shift the distribution with `loc=1`.
    'depth': stats.binom(n=10, p=0.2),
}

# n_iter=12 parameter sets are sampled; verbose=1 prints a line for each one.
grid_search_results = titanic_model.randomized_search(
    param_distribution,
    train_pool,
    n_iter=12,
    shuffle=False,
    verbose=1,
)


0:	Params searching. objective: 0.4400946	best: 0.4400946 (0)	total: 571ms	remaining: 6.28s
1:	Params searching. objective: 0.4487784	best: 0.4400946 (0)	total: 1.1s	remaining: 5.51s
2:	Params searching. objective: 0.4156919	best: 0.4156919 (2)	total: 1.69s	remaining: 5.07s
3:	Params searching. objective: 0.3571691	best: 0.3571691 (3)	total: 2.33s	remaining: 4.67s
4:	Params searching. objective: 0.4487784	best: 0.3571691 (3)	total: 2.81s	remaining: 3.93s
5:	Params searching. objective: 0.3944103	best: 0.3571691 (3)	total: 3.36s	remaining: 3.36s
6:	Params searching. objective: 0.4453991	best: 0.3571691 (3)	total: 3.86s	remaining: 2.76s
7:	Params searching. objective: 0.4013761	best: 0.3571691 (3)	total: 4.42s	remaining: 2.21s
8:	Params searching. objective: 0.4086284	best: 0.3571691 (3)	total: 5.03s	remaining: 1.68s
9:	Params searching. objective: 0.4451125	best: 0.3571691 (3)	total: 5.51s	remaining: 1.1s
10:	Params searching. objective: 0.4127659	best: 0.3571691 (3)	total: 6.05s	remain

In [11]:
# Best sampled parameter set from the randomized search.
grid_search_results['params']

{'border_count': 10, 'depth': 4.0, 'iterations': 20.0, 'one_hot_max_size': 3.0}