# CatBoost

In [68]:
from catboost import CatBoostRegressor
import pandas as pd
import os
import numpy as np
from sklearn.model_selection import train_test_split
from catboost import CatBoostRegressor, Pool

In [69]:
# Inserting features from TRAIN into one dataset

folder_train = '../features_train_in_use'

features_train = pd.read_csv(os.path.join(folder_train, os.listdir(folder_train)[0]))
for i in range(1,len(os.listdir(folder_train))):
    f = os.path.join(folder_train,os.listdir(folder_train)[i])
    if os.path.isfile(f):
        features_train = pd.concat([features_train, pd.read_csv(f)], axis=1)
print(features_train)

      categorical_chain_name categorical_mall_name categorical_plaace  \
0                  MCDONALDS     Magasinet Drammen            1.1.1.0   
1                  MCDONALDS                   NaN            1.1.1.0   
2                BURGER KING        Kuben Hønefoss            1.1.1.0   
3                BURGER KING     Glasshuspassasjen            1.1.1.0   
4                BURGER KING          Tillertorget            1.1.1.0   
...                      ...                   ...                ...   
12854                    NaN                   NaN           2.8.11.2   
12855       GULATING GRUPPEN             CC Gjøvik           2.8.11.2   
12856       GULATING GRUPPEN   Strømmen Storsenter           2.8.11.2   
12857                    NaN           Bystasjonen           2.8.11.2   
12858                    NaN                   NaN           2.8.11.2   

      categorical_sales_channel_name categorical_street  
0              Hamburger restaurants    BRAGERNES TORG   
1      

In [70]:
# Inserting features from TEST into one dataset

folder_test = '../features_test_in_use'

features_test = pd.read_csv(os.path.join(folder_test, os.listdir(folder_test)[0]))
for i in range(1,len(os.listdir(folder_test))):
    f = os.path.join(folder_test,os.listdir(folder_test)[i])
    if os.path.isfile(f):
        features_test = pd.concat([features_test, pd.read_csv(f)], axis=1)
print(features_test)

     categorical_chain_name categorical_mall_name categorical_plaace  \
0                       NaN                   NaN            1.1.1.0   
1               BURGER KING                   NaN            1.1.1.0   
2               BURGER KING        Stovner Senter            1.1.1.0   
3               BURGER KING                   NaN            1.1.1.0   
4          VULKAN BURGERBAR                   NaN            1.1.1.0   
...                     ...                   ...                ...   
8572       GULATING GRUPPEN                   NaN           2.8.11.2   
8573                    NaN                   NaN           2.8.11.2   
8574       GULATING GRUPPEN                   NaN           2.8.11.2   
8575                    NaN                   NaN           2.8.11.2   
8576                    NaN                   NaN           2.8.11.2   

     categorical_sales_channel_name    categorical_street  
0             Hamburger restaurants          STRØMSVEIEN   
1             H

In [71]:
# Target TRAIN data

target_train = pd.read_csv("../data/stores_train.csv")
target_train = target_train['revenue'].values

In [72]:
catboost_model = CatBoostRegressor(
    n_estimators=2000,
    learning_rate=0.05,
    thread_count=-1,
    max_depth=6,
    silent=True,
    loss_function='RMSE',
    random_seed=42,
    bagging_temperature=0.3,
    od_type="Iter"
)

In [73]:
for column in features_train.columns:
    values=features_train[column].values
    for i in range(len(values)):
        if type(values[i])==float:
            values[i]="nan"

In [74]:
for column in features_test.columns:
    values=features_test[column].values
    for i in range(len(values)):
        if type(values[i])==float:
            values[i]="nan"

In [75]:
Pool_train= Pool(features_train,target_train,cat_features=['categorical_chain_name','categorical_mall_name','categorical_plaace','categorical_street','categorical_sales_channel_name'])

Pool_test= Pool(features_test,cat_features=['categorical_chain_name','categorical_mall_name','categorical_plaace','categorical_street','categorical_sales_channel_name'])

In [76]:
# Fit and predict

catboost_model.fit(Pool_train)

catboost_predictions = catboost_model.predict(Pool_test)




In [77]:
print(catboost_predictions)

[8.03126196 8.81382067 7.45847436 ... 5.57720274 5.36124002 2.84236442]


In [78]:
# Get storeid as first column

stores_test = pd.read_csv("../data/stores_test.csv")
stores_test_id = stores_test['store_id'].values

# insert predictions as second column

predictions = pd.DataFrame()

predictions['id'] = stores_test_id
predictions['predicted'] = catboost_predictions
predictions.to_csv("../predictions/catboost2.csv", index=False)
print(predictions)

                              id  predicted
0     914206820-914239427-717245   8.031262
1     916789157-916823770-824309   8.813821
2       913341082-977479363-2948   7.458474
3      889682582-889697172-28720  13.099146
4     997991699-998006945-417222  19.175931
...                          ...        ...
8572  917323003-917383529-844309   5.861089
8573  917353379-917411824-845904   5.301303
8574  917072302-917089248-833647   5.577203
8575  916960557-916993161-829908   5.361240
8576   987280891-972040746-45320   2.842364

[8577 rows x 2 columns]


In [79]:
def rmsle(y_true, y_pred):
    """
    Computes the Root Mean Squared Logarithmic Error 
    
    Args:
        y_true (np.array): n-dimensional vector of ground-truth values 
        y_pred (np.array): n-dimensional vecotr of predicted values 
    
    Returns:
        A scalar float with the rmsle value 
    
    Note: You can alternatively use sklearn and just do: 
        sklearn.metrics.mean_squared_log_error(y_true, y_pred) ** 0.5
    """
    assert (y_true >= 0).all(), 'Received negative y_true values'
    assert (y_pred >= 0).all(), 'Received negative y_pred values'
    assert y_true.shape == y_pred.shape, 'y_true and y_pred have different shapes'
    y_true_log1p = np.log1p(y_true)  # log(1 + y_true)
    y_pred_log1p = np.log1p(y_pred)  # log(1 + y_pred)
    return np.sqrt(np.mean(np.square(y_pred_log1p - y_true_log1p)))


X_train, X_test, y_train, y_test = train_test_split(features_train, target_train, test_size=0.2, random_state=42)

catboost_model2 = CatBoostRegressor(
    n_estimators=2000,
    learning_rate=0.05,
    thread_count=-1,
    max_depth=6,
    silent=True,
    loss_function='RMSE',
    random_seed=42,
    bagging_temperature=0.3,
    od_type="Iter"
)

catboost_model2.fit(X_train, y_train)
y_pred = catboost_model2.predict(X_test)


# Calculate rmsle for a few example predictions 
y_true = y_test
n = len(features_train)
print('A couple of RMSLE scores computed over the train set')
print(f'Perfect prediction: {rmsle(y_true, y_true):.4f}')
print(f'All zeros prediciton: {rmsle(y_true, y_pred):.4f}')
print(f'All ones prediction: {rmsle(y_true, y_pred):.4f}')

CatBoostError: Bad value for num_feature[non_default_doc_idx=2,feature_idx=0]="B-YOUNG": Cannot convert 'b'B-YOUNG'' to float