In [1]:
import gc
import warnings
import numpy as np
import pandas as pd
import catboost as cb
import matplotlib.pyplot as plt
import seaborn as sns

from tqdm import tqdm

from sklearn.linear_model import LogisticRegression, RidgeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    confusion_matrix,
    ConfusionMatrixDisplay,
    classification_report,
    precision_recall_curve,
    roc_auc_score,
    PrecisionRecallDisplay,
    log_loss,
    RocCurveDisplay
)

from intent_model.preprocessing.filters import only_successful_trips
from intent_model.preprocessing.targets import rh_vs_rest_target

In [2]:
# Silence every warning category for the rest of the session.
# NOTE(review): this blanket filter also hides pandas SettingWithCopyWarning and
# library deprecation warnings — consider narrowing it to specific categories.
warnings.filterwarnings('ignore')

In [3]:
# Load the modelling dataset; the relative path is resolved against the
# notebook's working directory.
data = pd.read_parquet('dataset_binary_v2.pq')

In [4]:
# List the columns available in the raw frame.
data.columns

Index(['valid_date', 'ts', 'sessionuuid', 'customer_id', 'booking_id',
       'latitude', 'longitude', 'is_trip_ended', 'quantile', 'rh',
       'min_dist_to_known_loc', 'known_loc_occ', 'is_freq', 'rh_frac',
       'norm_week', 'norm_hour', 'dist_to_most_freq', 'weekday', 'time_sin',
       'time_cos'],
      dtype='object')

In [5]:
# Preview the first rows of the raw frame.
# NOTE(review): the rendered output shows customer_id, booking_id and
# latitude/longitude values — clear it before sharing the notebook.
data.head()

Unnamed: 0,valid_date,ts,sessionuuid,customer_id,booking_id,latitude,longitude,is_trip_ended,quantile,rh,min_dist_to_known_loc,known_loc_occ,is_freq,rh_frac,norm_week,norm_hour,dist_to_most_freq,weekday,time_sin,time_cos
0,2023-07-02,2023-07-02 12:13:03,0000011f-5182-48be-b496-c1a50891655d,53671267,1777558859,25.09,55.175,1,0.95,1,0.0,0.804878,1,0.286713,0.229039,0.558661,0.0,6,-0.056693,-0.998392
1,2023-05-23,2023-05-23 08:23:08,6865AEC9-13A5-4164-B861-B4239FA4CD86,56071529,1757410568,25.207,55.248,1,0.96,1,0.0,0.888889,0,0.789474,0.21725,0.084819,0.0,1,0.811574,-0.58425
2,2023-05-31,2023-05-31 20:57:23,0000014D-935E-4BFB-8582-6107004CCAC5,48719113,1762031191,25.095,55.154,1,0.91,1,0.0,0.384615,1,0.866667,0.342997,0.615457,0.09051,2,-0.716302,0.69779
3,2023-05-23,2023-05-23 09:03:47,32d45eed-6a93-4a9b-8df4-34cf8a94a36f,61105461,1757431140,25.247,55.304,1,0.82,1,0.001414,0.769231,1,1.0,0.780869,0.384111,0.001414,1,0.69779,-0.716302
4,2023-05-23,2023-05-23 09:31:08,215a96f7-0b0c-4785-8f15-e2f155ea4f13,16459670,1757474445,31.951,35.867,1,0.95,1,0.001,0.72093,1,1.0,0.38949,0.885044,0.026,1,0.605294,-0.796002


In [6]:
# Columns passed to CatBoost as categorical features.
CAT = ['weekday']

# Columns stripped from a frame before it is used as a feature matrix.
TO_DROP = [
    # date / timestamp / identifier columns
    'valid_date',
    'ts',
    'sessionuuid',
    'customer_id',
    'booking_id',
    # columns read when building the sample weights, plus `is_freq`
    'is_trip_ended',
    'rh',
    'is_freq',
    # the label itself
    'target',
]

In [7]:
# Attach the label: after this call `data` carries a `target` column, which the
# following cells read.
# NOTE(review): `data` is rebound from itself, so re-running this cell alone
# feeds the already-labelled frame back in — confirm rh_vs_rest_target
# tolerates that.
data = rh_vs_rest_target(data)

In [8]:
# Class balance of the label over the whole dataset.
data.target.value_counts()

1    7374273
0    2709396
Name: target, dtype: int64

### Preparation

In [9]:
# Chronological split on `valid_date`: the first 60 distinct dates form the
# training window, the next 15 the validation window, and the rest the test
# window.
TRAIN_END, VAL_END = 60, 75

dates = np.sort(data.valid_date.unique())
train_days = dates[:TRAIN_END]
val_days = dates[TRAIN_END:VAL_END]
test_days = dates[VAL_END:]

train = data[data.valid_date.isin(train_days)]
val = data[data.valid_date.isin(val_days)]
test = data[data.valid_date.isin(test_days)]

In [10]:
# Class balance of the label inside the training window.
train.target.value_counts()

1    4878116
0    1752148
Name: target, dtype: int64

In [11]:
# Per-row training weight: rows with rh == 1 whose trip has not ended
# (is_trip_ended == 0) count half, every other row keeps weight 1.0.
#
# `train` was obtained by boolean-indexing `data`, so writing a column into it
# in place is a chained assignment (pandas raises SettingWithCopyWarning for
# it). `assign` builds a new frame instead, and re-running the cell yields the
# same result.
unfinished_rh = (train.rh == 1) & (train.is_trip_ended == 0)
train = train.assign(sample_weight=np.where(unfinished_rh, 0.5, 1.0))

In [12]:
# Rebalance the training window: keep every negative row and a random half of
# the positive rows, stratified on `sample_weight` so the share of
# down-weighted rows among the positives is preserved.
# NOTE: `train` is rebound from itself, so running this cell twice halves the
# positives again — re-run the split and weighting cells first.
train_neg = train[train.target == 0]
train_pos_all = train[train.target == 1]

train_pos, _ = train_test_split(
    train_pos_all,
    test_size=0.5,
    stratify=train_pos_all['sample_weight'],
    random_state=111
)

# Shuffle with a fixed seed so the resulting row order is reproducible (the
# original shuffle was unseeded and differed on every run).
train = (
    pd.concat([train_neg, train_pos])
    .sample(frac=1.0, random_state=111)
    .reset_index(drop=True)
)

In [13]:
# Class balance of the training set after the positive class was halved.
train.target.value_counts()

1    2439058
0    1752148
Name: target, dtype: int64

In [14]:
# Order every split by timestamp, then customer, before handing it to CatBoost
# (presumably so the `has_time` option sees rows in chronological order —
# TODO confirm). Each frame is sorted exactly once; features, labels and
# weights are all taken from the same sorted frame so they stay row-aligned.
SORT_KEYS = ['ts', 'customer_id']

train_sorted = train.sort_values(SORT_KEYS)
val_sorted = val.sort_values(SORT_KEYS)
test_sorted = test.sort_values(SORT_KEYS)

# Training pool: features, label and the per-row sample weights.
cb_train = cb.Pool(
    train_sorted.drop(TO_DROP + ['sample_weight'], axis=1),
    label=train_sorted.target.values,
    cat_features=CAT,
    weight=train_sorted.sample_weight.values
)

# Validation pool: unweighted, used as the eval set during fitting.
cb_val = cb.Pool(
    val_sorted.drop(TO_DROP, axis=1),
    label=val_sorted.target.values,
    cat_features=CAT
)

# The test split stays a plain feature frame plus a label array.
cb_test = test_sorted.drop(TO_DROP, axis=1)
cb_test_y = test_sorted.target.values

### Model

In [15]:
import optuna

In [16]:
def check_loss(model: cb.CatBoostClassifier, x: pd.DataFrame, y: np.ndarray) -> float:
    """Return the log loss of ``model`` on the labelled sample ``(x, y)``.

    Args:
        model: Fitted classifier exposing ``predict_proba``.
        x: Feature frame to score.
        y: True labels, row-aligned with ``x``.

    Returns:
        ``log_loss`` between ``y`` and the second column of
        ``model.predict_proba(x)``.
    """
    probabilities = model.predict_proba(x)
    return log_loss(y_true=y, y_pred=probabilities[:, 1])

In [17]:
def objective(trial):
    """Fit one CatBoost classifier with trial-sampled hyperparameters and score it.

    The model is fitted on ``cb_train`` with ``cb_val`` as the eval set (early
    stopping after 50 rounds without improvement, best iteration kept) and the
    value returned to Optuna is the log loss on ``cb_test`` / ``cb_test_y``.

    NOTE(review): because the returned score is computed on the test split,
    the hyperparameters are selected on test data and the reported best log
    loss is optimistically biased. Consider scoring on the validation split
    (or a separate tuning split) and keeping the test split untouched.

    Args:
        trial: Optuna trial used to sample the hyperparameters.

    Returns:
        Log loss of the fitted model on the test split (lower is better).
    """
    # Searched hyperparameters. `suggest_float(..., step=...)` replaces the
    # deprecated `suggest_discrete_uniform` and samples the same grid.
    param = {
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.05, step=0.01),
        'depth': trial.suggest_int('depth', 4, 10),
        'l2_leaf_reg': trial.suggest_float('l2_leaf_reg', 1.0, 10.0, step=0.5),
        'min_child_samples': trial.suggest_categorical('min_child_samples', [1, 4, 8, 16, 32]),
        'langevin': trial.suggest_categorical('langevin', [True, False]),
        'grow_policy': trial.suggest_categorical('grow_policy', ['Depthwise', 'SymmetricTree']),
        'has_time': trial.suggest_categorical('has_time', [True, False]),
        'subsample': trial.suggest_float('subsample', 0.85, 1.0, step=0.05),
    }

    # Settings held constant across all trials.
    param.update({
        'iterations': 500,
        'use_best_model': True,
        'od_type': 'Iter',
        'od_wait': 50,
        'random_state': 42,
        'logging_level': 'Silent',
        'bootstrap_type': 'Bernoulli',
        'score_function': 'L2',
    })

    clf = cb.CatBoostClassifier(**param)
    clf.fit(cb_train, eval_set=cb_val, early_stopping_rounds=50)
    return check_loss(clf, cb_test, cb_test_y)

In [18]:
# Hyperparameter search: minimise `objective` with up to 9 trials running
# concurrently; no trial count is set, so the search is bounded only by the
# 6-hour timeout.
N_PARALLEL_TRIALS = 9
SEARCH_TIMEOUT_SECONDS = 6 * 60 * 60  # 21600

study = optuna.create_study(direction='minimize', study_name='catboost-seed')
study.optimize(objective, timeout=SEARCH_TIMEOUT_SECONDS, n_jobs=N_PARALLEL_TRIALS)

[32m[I 2023-09-12 00:37:35,563][0m A new study created in memory with name: catboost-seed[0m
[32m[I 2023-09-12 00:55:04,606][0m Trial 3 finished with value: 0.43329403255145665 and parameters: {'learning_rate': 0.02, 'depth': 5, 'l2_leaf_reg': 3.5, 'min_child_samples': 1, 'langevin': False, 'grow_policy': 'SymmetricTree', 'has_time': True, 'subsample': 0.95}. Best is trial 3 with value: 0.43329403255145665.[0m
[32m[I 2023-09-12 00:56:25,566][0m Trial 2 finished with value: 0.4265772071515987 and parameters: {'learning_rate': 0.04, 'depth': 5, 'l2_leaf_reg': 9.5, 'min_child_samples': 32, 'langevin': True, 'grow_policy': 'SymmetricTree', 'has_time': True, 'subsample': 0.9}. Best is trial 2 with value: 0.4265772071515987.[0m
[32m[I 2023-09-12 00:59:59,570][0m Trial 0 finished with value: 0.4269753908281412 and parameters: {'learning_rate': 0.02, 'depth': 7, 'l2_leaf_reg': 2.5, 'min_child_samples': 32, 'langevin': True, 'grow_policy': 'SymmetricTree', 'has_time': True, 'subsampl

In [19]:
# Lowest objective value found by the study (the objective returns a log loss).
print(f'Best LogLoss: {study.best_value}')

Best LogLoss: 0.4113831965454151


In [20]:
# Hyperparameters sampled by the best trial (only the searched ones; the fixed
# settings inside `objective` are not part of `best_params`).
print(f'Best params: {study.best_params}')

Best params: {'learning_rate': 0.05, 'depth': 10, 'l2_leaf_reg': 4.0, 'min_child_samples': 1, 'langevin': True, 'grow_policy': 'Depthwise', 'has_time': False, 'subsample': 0.85}


In [None]:
# NOTE(review): unexecuted cell holding only a bare literal — looks like
# leftover scratch input; consider removing it.
3