<a href="https://colab.research.google.com/github/ariahosseini/TradML/blob/main/ML_CatBoost_ProjOne.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import hyperopt
import numpy as np
import pandas as pd
from numpy.random import RandomState
from sklearn.metrics import accuracy_score
from catboost.datasets import titanic
from catboost import CatBoostClassifier, MetricVisualizer, Pool, metrics, cv
from sklearn.model_selection import train_test_split

In [None]:
# Load the Titanic train/test frames shipped with catboost.datasets and
# preview the first training rows.
train_df, test_df = titanic()
train_df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [None]:
# Column dtypes and non-null counts of the training frame.
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [None]:
# Column dtypes and non-null counts of the test frame (it has no 'Survived' column).
test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  418 non-null    int64  
 1   Pclass       418 non-null    int64  
 2   Name         418 non-null    object 
 3   Sex          418 non-null    object 
 4   Age          332 non-null    float64
 5   SibSp        418 non-null    int64  
 6   Parch        418 non-null    int64  
 7   Ticket       418 non-null    object 
 8   Fare         417 non-null    float64
 9   Cabin        91 non-null     object 
 10  Embarked     418 non-null    object 
dtypes: float64(2), int64(4), object(5)
memory usage: 36.1+ KB


In [None]:
# Report how many distinct values each training column holds.
print("Number of unique values:")
for column_name, distinct_count in train_df.nunique().items():
    print(column_name, ":", distinct_count)

Number of unique values:
PassengerId : 891
Survived : 2
Pclass : 3
Name : 891
Sex : 2
Age : 88
SibSp : 7
Parch : 7
Ticket : 681
Fare : 248
Cabin : 147
Embarked : 3


In [None]:
# Count missing entries per column and show only the columns that have any.
print("Number of null values:")
null_vals = train_df.isna().sum()
null_vals[null_vals.ne(0)]

Number of null values:


Age         177
Cabin       687
Embarked      2
dtype: int64

In [None]:
# Replace every missing value in both frames with the sentinel -9999.
# The frames are modified in place, so cells above that displayed them now
# show stale content.
# NOTE(review): the sentinel is an int even for the string columns
# (Cabin, Embarked), which leaves those columns with mixed value types.
train_df.fillna(-9999, inplace=True)
test_df.fillna(-9999, inplace=True)

In [None]:
# Repeat the missing-value count after the sentinel fill; the result should be empty.
print("Number of null values:")
null_vals = train_df.isna().sum()
null_vals[null_vals.ne(0)]

Number of null values:


Series([], dtype: int64)

In [None]:
# Separate the label column from the feature columns.
y = train_df['Survived']
X = train_df.drop(columns=['Survived'])

In [None]:
# Inspect feature dtypes; the next cell uses them to pick categorical columns.
X.dtypes

PassengerId      int64
Pclass           int64
Name            object
Sex             object
Age            float64
SibSp            int64
Parch            int64
Ticket          object
Fare           float64
Cabin           object
Embarked        object
dtype: object

In [None]:
# Positional indices of every column whose dtype is not float. Later cells
# pass these to CatBoost as the categorical features, so integer columns such
# as PassengerId and Pclass are treated as categorical too.
categorical_features_indices = np.flatnonzero((X.dtypes != float).to_numpy())

In [None]:
# Show the selected column positions.
categorical_features_indices

array([ 0,  1,  2,  3,  5,  6,  7,  9, 10])

In [None]:
# Hold out 25% of the labelled rows for validation (fixed seed for repeatability).
X_train, X_validation, y_train, y_validation = train_test_split(X, y, train_size=0.75, random_state=42)
# The unlabelled test frame is used as-is for final predictions.
X_test = test_df

In [None]:
from catboost import CatBoostClassifier, Pool, metrics, cv
from sklearn.metrics import accuracy_score

In [None]:
# Baseline classifier: Accuracy is tracked as an additional metric, the seed is
# fixed, logging is silenced and early_stopping_rounds is set to 50; all other
# hyper-parameters are left at the library defaults.
model = CatBoostClassifier(
    custom_loss=[metrics.Accuracy()],
    random_seed=42,
    logging_level='Silent',
    early_stopping_rounds=50
)
# Fit on the training split, using the validation split as eval_set.
# The non-float columns are declared categorical, and plot=True requests the
# interactive MetricVisualizer chart shown in the cell output.
# NOTE(review): logging_level='Silent' is already set on the constructor;
# passing it again here looks redundant.
model.fit(
    X_train, y_train,
    cat_features=categorical_features_indices,
    eval_set=(X_validation, y_validation),
    logging_level='Silent',
    plot=True
)

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

<catboost.core.CatBoostClassifier at 0x118bf1d10>

In [None]:
# Take the parameters the baseline model was constructed with; they are
# reused as the cross-validation configuration in the next cell.
cv_params = model.get_params()
cv_params

{'random_seed': 42,
 'logging_level': 'Silent',
 'custom_loss': ['Accuracy'],
 'early_stopping_rounds': 50}

In [None]:
# Cross-validate on the full labelled set with the baseline model's
# parameters, naming Logloss explicitly as the optimised loss.
cv_params['loss_function'] = metrics.Logloss()
cv_data = cv(
    pool=Pool(X, y, cat_features=categorical_features_indices),
    params=cv_params,
    plot=True,
)

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

In [None]:
# Per-iteration mean/std of each metric across the cross-validation folds.
cv_data.head()

Unnamed: 0,iterations,test-Logloss-mean,test-Logloss-std,train-Logloss-mean,train-Logloss-std,test-Accuracy-mean,test-Accuracy-std,train-Accuracy-mean,train-Accuracy-std
0,0,0.676936,0.001133,0.676477,0.003152,0.794613,0.003367,0.798541,0.020778
1,1,0.660661,0.000697,0.659381,0.003172,0.795735,0.030365,0.812009,0.010286
2,2,0.646543,0.00192,0.645228,0.004168,0.803591,0.028636,0.812009,0.014119
3,3,0.632857,0.003376,0.631048,0.004247,0.804714,0.026725,0.81257,0.012179
4,4,0.61975,0.004936,0.617523,0.005041,0.803591,0.026153,0.813692,0.011459


In [None]:
# Locate the iteration with the highest mean test accuracy once, then report
# it together with the fold-to-fold standard deviation at that iteration.
accuracy_mean = cv_data['test-Accuracy-mean']
best_step = np.argmax(accuracy_mean)
best_accuracy = np.max(accuracy_mean)
print('Best validation accuracy score: {:.2f}±{:.2f} on step {}'.format(
    best_accuracy,
    cv_data['test-Accuracy-std'][best_step],
    best_step,
))
print('Precise validation accuracy score: {}'.format(best_accuracy))

Best validation accuracy score: 0.82±0.02 on step 110
Precise validation accuracy score: 0.8159371492704827


In [None]:
# Predict class labels and per-class probabilities for the unlabelled test
# frame with the baseline model, then show the first ten of each.
predictions = model.predict(X_test)
predictions_probs = model.predict_proba(X_test)
print(predictions[:10])
print(predictions_probs[:10])

[0 0 0 0 1 0 1 0 1 0]
[[0.85963998 0.14036002]
 [0.73941007 0.26058993]
 [0.88495066 0.11504934]
 [0.89117181 0.10882819]
 [0.36045113 0.63954887]
 [0.90335455 0.09664545]
 [0.35404814 0.64595186]
 [0.76848491 0.23151509]
 [0.40015216 0.59984784]
 [0.94172558 0.05827442]]


In [None]:
# Shared training configuration reused (and selectively overridden) by the
# following cells.
params = dict(
    iterations=500,
    learning_rate=0.1,
    eval_metric=metrics.Accuracy(),
    random_seed=42,
    logging_level='Silent',
    use_best_model=False,
)
# Wrap the train and validation splits in Pools that carry the categorical
# column indices, so later cells can pass them straight to fit().
train_pool, validate_pool = (
    Pool(features, labels, cat_features=categorical_features_indices)
    for features, labels in ((X_train, y_train), (X_validation, y_validation))
)

In [None]:
# Reference model: trained with the shared params (use_best_model=False).
model = CatBoostClassifier(**params)
model.fit(train_pool, eval_set=validate_pool)

# Same configuration, but with use_best_model switched on.
best_model_params = {**params, 'use_best_model': True}
best_model = CatBoostClassifier(**best_model_params)
best_model.fit(train_pool, eval_set=validate_pool);

# Compare both models on the validation split.
simple_accuracy = accuracy_score(y_validation, model.predict(X_validation))
best_accuracy = accuracy_score(y_validation, best_model.predict(X_validation))

print('Simple model validation accuracy: {:.4}'.format(simple_accuracy))
print('')
print('Best model validation accuracy: {:.4}'.format(best_accuracy))

Simple model validation accuracy: 0.7982

Best model validation accuracy: 0.8251


In [None]:
# Reference model: trained with the shared params.
model = CatBoostClassifier(**params)
model.fit(train_pool, eval_set=validate_pool)

# Same configuration plus the overfitting detector (type 'Iter', wait 40).
earlystop_params = {**params, 'od_type': 'Iter', 'od_wait': 40}
earlystop_model = CatBoostClassifier(**earlystop_params)
earlystop_model.fit(train_pool, eval_set=validate_pool);

# Compare tree counts and validation accuracy of the two models.
simple_accuracy = accuracy_score(y_validation, model.predict(X_validation))
earlystop_accuracy = accuracy_score(y_validation, earlystop_model.predict(X_validation))

print('Simple model tree count: {}'.format(model.tree_count_))
print('Simple model validation accuracy: {:.4}'.format(simple_accuracy))

print('')

print('Early-stopped model tree count: {}'.format(earlystop_model.tree_count_))
print('Early-stopped model validation accuracy: {:.4}'.format(earlystop_accuracy))

Simple model tree count: 500
Simple model validation accuracy: 0.7982

Early-stopped model tree count: 82
Early-stopped model validation accuracy: 0.8072


In [None]:
# Train a short 10-iteration model, take its raw scores on the training rows,
# and refit the same estimator with those scores supplied as the baseline.
current_params = {**params, 'iterations': 10}
model = CatBoostClassifier(**current_params)
model.fit(X_train, y_train, cat_features=categorical_features_indices)
baseline = model.predict(X_train, prediction_type='RawFormulaVal')
model.fit(X_train, y_train, cat_features=categorical_features_indices, baseline=baseline);

In [None]:
# First run: 5 verbose iterations at learning rate 0.5, saving a snapshot.
# NOTE(review): the captured output shows the second run continuing at
# iteration 5, i.e. resuming from that snapshot; a snapshot left over from an
# earlier execution of this cell may affect a re-run — confirm.
params_with_snapshot = {
    **params,
    'iterations': 5,
    'learning_rate': 0.5,
    'logging_level': 'Verbose',
}
model = CatBoostClassifier(**params_with_snapshot)
model.fit(train_pool, eval_set=validate_pool, save_snapshot=True)

# Second run: raise the budget to 10 iterations and lower the learning rate.
params_with_snapshot['iterations'] = 10
params_with_snapshot['learning_rate'] = 0.1
model = CatBoostClassifier(**params_with_snapshot)
model.fit(train_pool, eval_set=validate_pool, save_snapshot=True)

0:	learn: 0.8053892	test: 0.7937220	best: 0.7937220 (0)	total: 3.35ms	remaining: 13.4ms
1:	learn: 0.8008982	test: 0.7982063	best: 0.7982063 (1)	total: 6.1ms	remaining: 9.15ms
2:	learn: 0.8008982	test: 0.7937220	best: 0.7982063 (1)	total: 8.62ms	remaining: 5.75ms
3:	learn: 0.8113772	test: 0.7892377	best: 0.7982063 (1)	total: 11.4ms	remaining: 2.86ms
4:	learn: 0.8173653	test: 0.8026906	best: 0.8026906 (4)	total: 13.4ms	remaining: 0us

bestTest = 0.802690583
bestIteration = 4

5:	learn: 0.8173653	test: 0.8026906	best: 0.8026906 (4)	total: 18.6ms	remaining: 20.9ms
6:	learn: 0.8248503	test: 0.8026906	best: 0.8026906 (4)	total: 20.4ms	remaining: 10.5ms
7:	learn: 0.8233533	test: 0.8026906	best: 0.8026906 (4)	total: 23.3ms	remaining: 6.6ms
8:	learn: 0.8233533	test: 0.8026906	best: 0.8026906 (4)	total: 24.3ms	remaining: 2.72ms
9:	learn: 0.8233533	test: 0.8026906	best: 0.8026906 (4)	total: 26.1ms	remaining: 0us

bestTest = 0.802690583
bestIteration = 4



In [None]:
class LoglossObjective(object):
    """User-defined binary log-likelihood objective for CatBoost.

    Exposes ``calc_ders_range``, which returns per-object first and second
    derivatives of the objective with respect to the raw prediction.
    """

    def calc_ders_range(self, approxes, targets, weights):
        """Return a list of ``(der1, der2)`` pairs, one per object.

        approxes, targets, weights are indexed containers of floats
        (containers which have only __len__ and __getitem__ defined).
        weights can be None, in which case every object has weight 1.

        approxes holds the current raw predictions for the subset of the
        dataset being processed; targets holds the matching labels.

        The objective is the log-likelihood
            target * log(sigmoid(approx))
                + (1 - target) * log(1 - sigmoid(approx))
        where sigmoid(x) = 1 / (1 + e^(-x)). Its derivatives with respect to
        approx are
            der1 = target - sigmoid(approx)
            der2 = -sigmoid(approx) * (1 - sigmoid(approx))
        """
        assert len(approxes) == len(targets)
        if weights is not None:
            assert len(weights) == len(approxes)

        result = []
        for index in range(len(targets)):
            approx = approxes[index]
            # Numerically stable sigmoid: only ever exponentiate a
            # non-positive number, so large |approx| cannot overflow to
            # inf and turn the probability into NaN.
            if approx >= 0:
                p = 1.0 / (1.0 + np.exp(-approx))
            else:
                e = np.exp(approx)
                p = e / (1.0 + e)

            # Any positive target counts as the positive class.
            der1 = (1 - p) if targets[index] > 0.0 else -p
            der2 = -p * (1 - p)

            if weights is not None:
                der1 *= weights[index]
                der2 *= weights[index]

            result.append((der1, der2))
        return result
# Train a 10-iteration classifier that optimises the user-defined
# LoglossObjective while reporting the built-in Logloss as the eval metric.
model = CatBoostClassifier(
    iterations=10,
    random_seed=42,
    loss_function=LoglossObjective(),
    eval_metric=metrics.Logloss()
)
model.fit(train_pool)
# Request raw formula values (not class labels) for the test frame.
preds_raw = model.predict(X_test, prediction_type='RawFormulaVal')

0:	learn: 0.6827074	total: 444ms	remaining: 3.99s
1:	learn: 0.6723302	total: 446ms	remaining: 1.78s
2:	learn: 0.6619449	total: 446ms	remaining: 1.04s
3:	learn: 0.6521466	total: 448ms	remaining: 671ms
4:	learn: 0.6435227	total: 449ms	remaining: 449ms
5:	learn: 0.6353848	total: 450ms	remaining: 300ms
6:	learn: 0.6277210	total: 452ms	remaining: 194ms
7:	learn: 0.6210282	total: 453ms	remaining: 113ms
8:	learn: 0.6141958	total: 455ms	remaining: 50.5ms
9:	learn: 0.6073236	total: 456ms	remaining: 0us


In [None]:
class LoglossMetric(object):
    """User-defined Logloss evaluation metric for CatBoost (lower is better)."""

    def get_final_error(self, error, weight):
        """Turn the accumulated (error, weight) pair into the reported value.

        The tiny constant keeps the division defined when weight is 0.
        """
        return error / (weight + 1e-38)

    def is_max_optimal(self):
        """Return False: smaller metric values are better."""
        return False

    def evaluate(self, approxes, target, weight):
        """Return ``(weighted error sum, weight sum)`` for a batch of objects.

        approxes is a list of indexed containers (containers with only
        __len__ and __getitem__ defined), one container per approx
        dimension; exactly one dimension is expected here.
        target is an indexed container of labels with the same length.
        weight is a one dimensional indexed container, or None for unit
        weights.

        Per object the error is
            -(target * approx - log(1 + exp(approx)))
        i.e. the negative log-likelihood expressed through the raw score.
        """
        assert len(approxes) == 1
        assert len(target) == len(approxes[0])

        approx = approxes[0]

        error_sum = 0.0
        weight_sum = 0.0

        for i in range(len(approx)):
            w = 1.0 if weight is None else weight[i]
            weight_sum += w
            # logaddexp(0, a) == log(1 + exp(a)) but does not overflow to inf
            # for large positive a, which would make the whole sum infinite.
            error_sum += -w * (target[i] * approx[i] - np.logaddexp(0.0, approx[i]))

        return error_sum, weight_sum

# Train a 10-iteration classifier that optimises the built-in Logloss while
# reporting the user-defined LoglossMetric as the eval metric.
model = CatBoostClassifier(
    iterations=10,
    random_seed=42,
    loss_function=metrics.Logloss(),
    eval_metric=LoglossMetric()
)
model.fit(train_pool)
# Request raw formula values (not class labels) for the test frame.
preds_raw = model.predict(X_test, prediction_type='RawFormulaVal')

Learning rate set to 0.5
0:	learn: 0.5521578	total: 201ms	remaining: 1.81s
1:	learn: 0.4885686	total: 203ms	remaining: 811ms
2:	learn: 0.4607664	total: 205ms	remaining: 478ms
3:	learn: 0.4418819	total: 206ms	remaining: 310ms
4:	learn: 0.4278162	total: 208ms	remaining: 208ms
5:	learn: 0.4151036	total: 210ms	remaining: 140ms
6:	learn: 0.4099336	total: 212ms	remaining: 90.7ms
7:	learn: 0.4095363	total: 213ms	remaining: 53.2ms
8:	learn: 0.4032867	total: 215ms	remaining: 23.8ms
9:	learn: 0.3929586	total: 216ms	remaining: 0us


In [None]:
# Fit a 10-tree model, then request probabilities from partial ensembles:
# tree counts ntree_start, ntree_start + eval_period, ... below ntree_end.
model = CatBoostClassifier(iterations=10, random_seed=42, logging_level='Silent')
model.fit(train_pool)
ntree_start, ntree_end, eval_period = 3, 9, 2
predictions_iterator = model.staged_predict(
    validate_pool,
    prediction_type='Probability',
    ntree_start=ntree_start,
    ntree_end=ntree_end,
    eval_period=eval_period,
)
# Pair each staged prediction with the tree count it corresponds to.
tree_counts = range(ntree_start, ntree_end, eval_period)
for preds, tree_count in zip(predictions_iterator, tree_counts):
    print('First class probabilities using the first {} trees: {}'.format(tree_count, preds[:5, 1]))

First class probabilities using the first 3 trees: [0.53597869 0.41039128 0.42057479 0.64281031 0.46576685]
First class probabilities using the first 5 trees: [0.63722688 0.42492029 0.46209302 0.70926021 0.44280772]
First class probabilities using the first 7 trees: [0.66964764 0.42409144 0.46124982 0.76101033 0.47205986]


In [None]:
# Fit a 50-tree model and list its feature importances, highest first.
model = CatBoostClassifier(iterations=50, random_seed=42, logging_level='Silent')
model.fit(train_pool)
feature_importances = model.get_feature_importance(train_pool)
feature_names = X_train.columns
ranked_features = sorted(zip(feature_importances, feature_names), reverse=True)
for score, name in ranked_features:
    print('{}: {}'.format(name, score))

Sex: 59.0040920142686
Pclass: 16.340887169747038
Ticket: 6.028107169932206
Cabin: 3.8347242202560192
Fare: 3.712969667934385
Age: 3.4844512041824824
Parch: 3.378089740355865
Embarked: 2.313999407289956
SibSp: 1.902679406033451
PassengerId: 0.0
Name: 0.0


In [None]:
# Refit a 50-iteration model and compute AUC on the validation pool;
# plot=True also requests the interactive chart shown in the cell output.
model = CatBoostClassifier(iterations=50, random_seed=42, logging_level='Silent').fit(train_pool)
eval_metrics = model.eval_metrics(validate_pool, [metrics.AUC()], plot=True)
# Show the first six AUC values (presumably one per boosting iteration — confirm).
print(eval_metrics['AUC'][:6])

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

[0.8627368774106994, 0.8623176253563642, 0.8602213650846889, 0.8514170719436525, 0.8495723629045783, 0.8569092738554419]


In [None]:
# Train two classifiers that differ only in tree depth (1 vs 5), each writing
# its training logs to its own train_dir, then chart both runs together.
# NOTE(review): unlike the other cells, no random_seed is set here — confirm
# that run-to-run variation is acceptable for this comparison.
model1 = CatBoostClassifier(iterations=100, depth=1, train_dir='model_depth_1/', logging_level='Silent')
model1.fit(train_pool, eval_set=validate_pool)
model2 = CatBoostClassifier(iterations=100, depth=5, train_dir='model_depth_5/', logging_level='Silent')
model2.fit(train_pool, eval_set=validate_pool);
# The visualizer is pointed at the two train_dir folders written above.
widget = MetricVisualizer(['model_depth_1', 'model_depth_5'])
widget.start()

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

In [None]:
# Train a small model; the commented-out lines below would save it to disk and
# load it back into a fresh estimator.
model = CatBoostClassifier(iterations=10, random_seed=42, logging_level='Silent').fit(train_pool)
# # model.save_model('catboost_model.dump')
# NOTE(review): with save/load disabled, the next line discards the fitted
# model and leaves `model` bound to an unfitted CatBoostClassifier.
model = CatBoostClassifier()
# model.load_model('catboost_model.dump');

In [None]:
def hyperopt_objective(params):
    """Score one sampled hyper-parameter configuration for hyperopt.

    params is a mapping with the keys 'l2_leaf_reg' (truncated to int) and
    'learning_rate'. The configuration is cross-validated on the full
    labelled data (module-level X, y), and the value returned is
    1 - (highest mean test accuracy over all iterations), so a lower
    return value means a better configuration.
    """
    # A throwaway estimator is built only to obtain a CatBoost params dict.
    candidate = CatBoostClassifier(
        l2_leaf_reg=int(params['l2_leaf_reg']),
        learning_rate=params['learning_rate'],
        iterations=500,
        eval_metric=metrics.Accuracy(),
        random_seed=42,
        verbose=False,
        loss_function=metrics.Logloss(),
    )

    full_pool = Pool(X, y, cat_features=categorical_features_indices)
    fold_scores = cv(full_pool, candidate.get_params(), logging_level='Silent')

    return 1 - np.max(fold_scores['test-Accuracy-mean'])

# Search space: l2_leaf_reg is drawn log-uniformly and quantised to whole
# numbers (qloguniform with low=0, high=2, q=1); learning_rate is uniform on
# [1e-3, 5e-1].
params_space = {
    'l2_leaf_reg': hyperopt.hp.qloguniform('l2_leaf_reg', 0, 2, 1),
    'learning_rate': hyperopt.hp.uniform('learning_rate', 1e-3, 5e-1),
}

# Records every evaluated configuration and its loss.
trials = hyperopt.Trials()

# Run 50 TPE-guided evaluations of hyperopt_objective.
# hyperopt >= 0.2.7 requires `rstate` to be a numpy.random.Generator; the
# legacy numpy RandomState object raises AttributeError there because fmin
# calls rstate.integers(...).
best = hyperopt.fmin(
    hyperopt_objective,
    space=params_space,
    algo=hyperopt.tpe.suggest,
    max_evals=50,
    trials=trials,
    rstate=np.random.default_rng(123)
)
print(best)

In [None]:
# Final model: same fixed settings as in the hyperopt objective, with the two
# tuned values taken from the best configuration found by the search.
model = CatBoostClassifier(
    l2_leaf_reg=int(best['l2_leaf_reg']),
    learning_rate=best['learning_rate'],
    iterations=500,
    eval_metric=metrics.Accuracy(),
    random_seed=42,
    verbose=False,
    loss_function=metrics.Logloss(),
)
# Cross-validate the chosen configuration on all labelled rows and report the
# highest mean test accuracy over the iterations.
cv_data = cv(Pool(X, y, cat_features=categorical_features_indices), model.get_params())
print('Precise validation accuracy score: {}'.format(np.max(cv_data['test-Accuracy-mean'])))

# Refit on every labelled row, then predict the unlabelled test frame.
model.fit(X, y, cat_features=categorical_features_indices)

# Write one row per test passenger: its id and the predicted label.
submission = pd.DataFrame()
submission['PassengerId'] = X_test['PassengerId']
submission['Survived'] = model.predict(X_test)
submission.to_csv('submission.csv', index=False)

# Misspelled legacy name kept as an alias in case other cells still use it.
submisstion = submission