# IMPORT

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm

pd.set_option('display.max_rows', 200)
pd.set_option('display.max_columns', 50)

from catboost import CatBoostRegressor
from catboost import Pool
from catboost import cv
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, mean_absolute_percentage_error

import neptune.new as neptune
import optuna
import neptune.new.integrations.optuna as optuna_utils
from joblib import dump

# DEFINE

In [2]:
# Directory (relative to the notebook) that holds the input CSV files.
DATA_PATH = '../data/'
# Single seed reused for the train/test split and the CatBoost model.
RANDOM_SEED = 42

In [3]:
# Load the prepared dataset and split it by the `sample` flag:
# rows with sample == 1 go to `train`, rows with sample == 0 go to `test`.
data = pd.read_csv(f'{DATA_PATH}prep_data.csv')
train = data.loc[data['sample'].eq(1)].drop(columns='sample')
test = data.loc[data['sample'].eq(0)].drop(columns='sample')

In [4]:
def rating(prediction):
    """Snap a raw model prediction onto the 0..5 rating grid with 0.5 steps.

    Each grid value `v` owns the half-open interval (v - 0.25, v + 0.25];
    anything at or below 0.25 maps to 0 and anything above 4.75 maps to 5.

    The previous if/elif chain used `prediction < 0.25` for the first bucket
    and `0.25 < prediction` for the second, so a prediction of exactly 0.25
    matched neither and fell through to the final `else`, returning 5.

    Parameters
    ----------
    prediction : float
        Raw regression output.

    Returns
    -------
    int or float
        One of 0, 0.5, 1, ..., 4.5, 5.
    """
    # (inclusive upper bound, rating) pairs in ascending order.
    buckets = (
        (0.25, 0),
        (0.75, 0.5),
        (1.25, 1),
        (1.75, 1.5),
        (2.25, 2),
        (2.75, 2.5),
        (3.25, 3),
        (3.75, 3.5),
        (4.25, 4),
        (4.75, 4.5),
    )
    for upper_bound, value in buckets:
        if prediction <= upper_bound:
            return value
    # Above the last bound (or not comparable, e.g. NaN): top rating, as before.
    return 5

In [5]:
def eval_metrics(actual, pred):
    """Score predictions against the ground truth.

    Parameters
    ----------
    actual : array-like
        True target values.
    pred : array-like
        Predicted target values.

    Returns
    -------
    tuple
        (rmse, mae, r2, mape), in that order.
    """
    scorers = (
        mean_squared_error,
        mean_absolute_error,
        r2_score,
        mean_absolute_percentage_error,
    )
    mse, mae, r2, mape = (score(actual, pred) for score in scorers)
    # RMSE is the square root of the mean squared error.
    return np.sqrt(mse), mae, r2, mape

In [8]:
def nt_catboost(data, lr, depth, loss_function):
    """Train a CatBoost regressor on a hold-out split and log the run to Neptune.

    The frame is split 75/25 (seeded with RANDOM_SEED), a CatBoostRegressor is
    fitted with early stopping evaluated on the hold-out part, the hold-out
    predictions are snapped to the rating grid with `rating`, and the metrics,
    hyper-parameters and the pickled model are attached to a Neptune run.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the target column 'Rating' and the categorical feature
        columns 'City' and 'County'.
    lr : float
        Passed to CatBoost as `learning_rate`.
    depth : int
        Passed to CatBoost as `depth`.
    loss_function : str
        Passed to CatBoost as `loss_function`.

    Returns
    -------
    None
        Results are printed, logged to Neptune and the model is written to
        'catboost_model.pkl' in the working directory.
    """
    cat_features = ['City', 'County']
    model_path = 'catboost_model.pkl'

    # NOTE(review): reset_index() without drop=True keeps the previous index as
    # an 'index' column, which then becomes a model feature — confirm intended.
    df = data.reset_index()

    # The API token is deliberately NOT passed here: credentials must not live
    # in the notebook. Neptune falls back to the NEPTUNE_API_TOKEN environment
    # variable when `api_token` is omitted. The token that used to be committed
    # in this cell should be revoked.
    run = neptune.init(project='alxkzncoff/trip-rating',
                       name='catboost')

    try:
        X = df.drop(['Rating'], axis=1)
        y = df['Rating']

        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.25, shuffle=True, random_state=RANDOM_SEED)

        train_data = Pool(data=X_train, label=y_train, cat_features=cat_features)
        test_data = Pool(data=X_test, label=y_test, cat_features=cat_features)

        ctb = CatBoostRegressor(loss_function=loss_function,
                                eval_metric='MAE',
                                learning_rate=lr,
                                iterations=25000,  # train until the overfitting detector stops it
                                depth=depth,
                                one_hot_max_size=5,
                                random_seed=RANDOM_SEED,
                                od_type='Iter',
                                od_wait=500,
                                task_type='CPU')

        # NOTE(review): the same hold-out set drives early stopping / best-model
        # selection and the metrics below, so those metrics may be optimistic.
        model = ctb.fit(train_data,
                        eval_set=test_data,
                        verbose_eval=500,
                        use_best_model=True)

        print(model)

        # Snap every raw prediction onto the 0.5-step rating grid.
        predict = np.array([rating(value) for value in model.predict(X_test)])

        (rmse, mae, r2, mape) = eval_metrics(y_test, predict)

        # Persist the model first so the file handed to Neptune is the one
        # produced by this run (previously a different, never-written file name
        # was uploaded before the model was dumped).
        dump(ctb, model_path)

        run["sys/tags"].add(['catboost'])
        run['catboost/parameters'] = {'loss_function': loss_function,
                                      'learning_rate': lr,
                                      'depth': depth}
        run['catboost/rmse'] = np.round(rmse, 2)
        run['catboost/mae'] = np.round(mae, 2)
        run['catboost/r2'] = np.round(r2, 2)
        run['catboost/mape'] = np.round(mape, 2)
        run["catboost/model"].upload(model_path)

        print(f'RMSE: {np.round(rmse, 2)} | MAE: {np.round(mae, 2)} | R2: {np.round(r2, 2)} | MAPE: {np.round(mape, 2)}')
    finally:
        # Always close the run so queued data is flushed and the connection is
        # released, even when training or logging fails.
        run.stop()

    return

In [9]:
nt_catboost(train, 0.1, 5, 'MAE')

https://app.neptune.ai/alxkzncoff/trip-rating/e/TRIP-19
0:	learn: 0.4692842	test: 0.4606242	best: 0.4606242 (0)	total: 31.7ms	remaining: 13m 12s
500:	learn: 0.2166216	test: 0.2228300	best: 0.2228300 (500)	total: 13.7s	remaining: 11m 8s
1000:	learn: 0.2048091	test: 0.2161815	best: 0.2161532 (993)	total: 30.2s	remaining: 12m 4s
1500:	learn: 0.1985659	test: 0.2146191	best: 0.2146169 (1498)	total: 44.3s	remaining: 11m 33s
2000:	learn: 0.1943442	test: 0.2140768	best: 0.2140699 (1969)	total: 58.1s	remaining: 11m 7s
2500:	learn: 0.1909094	test: 0.2134029	best: 0.2134029 (2500)	total: 1m 11s	remaining: 10m 44s
3000:	learn: 0.1883650	test: 0.2132448	best: 0.2131776 (2873)	total: 1m 24s	remaining: 10m 21s
3500:	learn: 0.1862784	test: 0.2131300	best: 0.2131027 (3456)	total: 1m 36s	remaining: 9m 55s
4000:	learn: 0.1843467	test: 0.2130808	best: 0.2130304 (3898)	total: 1m 50s	remaining: 9m 37s
Stopped by overfitting detector  (500 iterations wait)

bestTest = 0.2130303548
bestIteration = 3898

Shrin