# CatBoost

In [91]:
from catboost import CatBoostRegressor
import pandas as pd
import os
import numpy as np
from sklearn.model_selection import train_test_split

In [92]:
# Inserting features from TRAIN into one dataset

folder_train = '../features_train_csv'

features_train = pd.read_csv(os.path.join(folder_train, os.listdir(folder_train)[0]))
for i in range(1,len(os.listdir(folder_train))):
    f = os.path.join(folder_train,os.listdir(folder_train)[i])
    if os.path.isfile(f):
        features_train = pd.concat([features_train, pd.read_csv(f)], axis=1)
print(features_train)

       numberOfBusstop  distance_to_competitor
0                 10.0                0.000000
1                 30.0               34.547236
2                 14.0                0.000000
3                 11.0                0.000000
4                 29.0                0.000000
...                ...                     ...
12854             27.0               27.846659
12855             12.0                0.000000
12856             13.0               42.202804
12857             35.0                0.000000
12858             10.0             2542.498404

[12859 rows x 2 columns]


In [93]:
# Inserting features from TEST into one dataset

folder_test = '../features_test_csv'

features_test = pd.read_csv(os.path.join(folder_test, os.listdir(folder_test)[0]))
for i in range(1,len(os.listdir(folder_test))):
    f = os.path.join(folder_test,os.listdir(folder_test)[i])
    if os.path.isfile(f):
        features_test = pd.concat([features_test, pd.read_csv(f)], axis=1)
print(features_test)

      numberOfBusstop  distance_to_competitor
0                21.0               70.085760
1                11.0              177.710439
2                11.0                0.000000
3                22.0              740.150349
4                21.0              472.323401
...               ...                     ...
8572              9.0             3313.883303
8573             28.0              302.293655
8574              6.0               85.988411
8575              8.0                0.000000
8576              4.0            18175.501583

[8577 rows x 2 columns]


In [94]:
# Target TRAIN data

target_train = pd.read_csv("../data/stores_train.csv")
target_train = target_train['revenue'].values

In [95]:
catboost_model = CatBoostRegressor(
    n_estimators=2000,
    learning_rate=0.05,
    thread_count=-1,
    max_depth=6,
    silent=True,
    loss_function='RMSE',
    random_seed=42,
    bagging_temperature=0.3,
    od_type="Iter"
)

In [96]:
# Fit and predict

catboost_model.fit(features_train, target_train)

catboost_predictions = catboost_model.predict(features_test)




In [97]:
print(catboost_predictions)

[ 8.89389056 11.32500729  7.72549815 ...  5.51985939  7.12639976
  5.53186858]


In [98]:
# Get storeid as first column

stores_test = pd.read_csv("../data/stores_test.csv")
stores_test_id = stores_test['store_id'].values

# insert predictions as second column

predictions = pd.DataFrame()

predictions['id'] = stores_test_id
predictions['predicted'] = catboost_predictions
predictions.to_csv("../predictions/catboost1.csv", index=False)
print(predictions)

                              id  predicted
0     914206820-914239427-717245   8.893891
1     916789157-916823770-824309  11.325007
2       913341082-977479363-2948   7.725498
3      889682582-889697172-28720   9.377205
4     997991699-998006945-417222  12.442158
...                          ...        ...
8572  917323003-917383529-844309   9.490758
8573  917353379-917411824-845904   8.385034
8574  917072302-917089248-833647   5.519859
8575  916960557-916993161-829908   7.126400
8576   987280891-972040746-45320   5.531869

[8577 rows x 2 columns]


In [99]:
def rmsle(y_true, y_pred):
    """
    Computes the Root Mean Squared Logarithmic Error 
    
    Args:
        y_true (np.array): n-dimensional vector of ground-truth values 
        y_pred (np.array): n-dimensional vecotr of predicted values 
    
    Returns:
        A scalar float with the rmsle value 
    
    Note: You can alternatively use sklearn and just do: 
        sklearn.metrics.mean_squared_log_error(y_true, y_pred) ** 0.5
    """
    assert (y_true >= 0).all(), 'Received negative y_true values'
    assert (y_pred >= 0).all(), 'Received negative y_pred values'
    assert y_true.shape == y_pred.shape, 'y_true and y_pred have different shapes'
    y_true_log1p = np.log1p(y_true)  # log(1 + y_true)
    y_pred_log1p = np.log1p(y_pred)  # log(1 + y_pred)
    return np.sqrt(np.mean(np.square(y_pred_log1p - y_true_log1p)))


X_train, X_test, y_train, y_test = train_test_split(features_train, target_train, test_size=0.2, random_state=42)

catboost_model2 = CatBoostRegressor(
    n_estimators=2000,
    learning_rate=0.05,
    thread_count=-1,
    max_depth=6,
    silent=True,
    loss_function='RMSE',
    random_seed=42,
    bagging_temperature=0.3,
    od_type="Iter"
)

catboost_model2.fit(X_train, y_train)
y_pred = catboost_model2.predict(X_test)


# Calculate rmsle for a few example predictions 
y_true = y_test
n = len(features_train)
print('A couple of RMSLE scores computed over the train set')
print(f'Perfect prediction: {rmsle(y_true, y_true):.4f}')
print(f'All zeros prediciton: {rmsle(y_true, y_pred):.4f}')
print(f'All ones prediction: {rmsle(y_true, y_pred):.4f}')

A couple of RMSLE scores computed over the train set
Perfect prediction: 0.0000
All zeros prediciton: 1.2088
All ones prediction: 1.2088
