# Notebook to support tests for Catboost engine based on House Prices dataset

Source: https://www.kaggle.com/c/house-prices-advanced-regression-techniques

In [1]:
import catboost
import numbers
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import os

In [2]:
ARTIFACTS_PATH = '../../artifacts/catboost/'
os.makedirs(ARTIFACTS_PATH, exist_ok=True) # Create path if not exists

## Load dataset

In [3]:
def load_data(df, data_columns, target_column, cat_columns):
    df = df[data_columns]
    
    df_columns = [c for c in df.columns if c != target_column]
    cat_columns_idx = [i for i, c in enumerate(df_columns) if c in cat_columns]
    cat_columns_names = [c for i, c in enumerate(df_columns) if i in cat_columns_idx]

    for col_name in df.columns:
        if col_name in cat_columns:
            df[col_name].replace(np.nan, '', inplace=True)
        else:
            df[col_name].replace(np.nan, None, inplace=True)
    
    X_data = df.loc[:, df.columns != target_column].to_numpy()
    y_data = df[[target_column]].to_numpy().reshape(-1)
    return df, X_data, y_data

In [4]:
df = pd.read_csv('../../data/house_prices/train.csv')
df_train, df_test = train_test_split(df, test_size=0.2, random_state=42)

In [8]:
data_columns = df_train.columns
target_column = 'SalePrice'

print(df_train.select_dtypes)

# Get categorical columns
cat_columns = df_train.select_dtypes(exclude=["number","bool_","object_"]).columns

<bound method DataFrame.select_dtypes of         Id  MSSubClass MSZoning  LotFrontage  LotArea Street Alley LotShape  \
254    255          20       RL         70.0     8400   Pave   NaN      Reg   
1066  1067          60       RL         59.0     7837   Pave   NaN      IR1   
638    639          30       RL         67.0     8777   Pave   NaN      Reg   
799    800          50       RL         60.0     7200   Pave   NaN      Reg   
380    381          50       RL         50.0     5000   Pave  Pave      Reg   
303    304          20       RL         70.0     9800   Pave   NaN      Reg   
86      87          60       RL        122.0    11911   Pave   NaN      IR2   
1385  1386          50       RM         40.0     5436   Pave   NaN      Reg   
265    266          20       RL         78.0    12090   Pave   NaN      IR1   
793    794          20       RL         76.0     9158   Pave   NaN      Reg   
1445  1446          85       RL         70.0     8400   Pave   NaN      Reg   
808    809 

TypeError: Cannot interpret '<attribute 'dtype' of 'numpy.generic' objects>' as a data type

In [None]:
df_train, X_train, y_train = load_data(df_train, data_columns, target_column, cat_columns)
df_train.head()

In [None]:
df_test, X_test, y_test = load_data(df_test, data_columns, target_column, cat_columns)
df_test.head()

## Train model

In [None]:
cat_columns_idx = [i for i, x in enumerate(df_train.loc[:, df_train.columns != target_column].dtypes) if x == np.object]

In [None]:
X_train[0]

In [None]:
train_data = catboost.Pool(data=X_train, label=y_train, cat_features=cat_columns_idx)

In [None]:
model = catboost.CatBoostRegressor(
    random_seed=42,
    eval_metric='RMSE',
    verbose=False,)

In [None]:
model.fit(train_data)

## Save & load

See: https://catboost.ai/docs/concepts/python-reference_catboost_save_model.html

In [None]:
output_file = os.path.join(ARTIFACTS_PATH, 'house_prices.cbm')

In [None]:
model.save_model(output_file, pool=train_data)

In [None]:
model = catboost.CatBoostRegressor() # Params not required
model.load_model(output_file)

## Predictions

According to the Kaggle's evaluation:

> Submissions are evaluated on Root-Mean-Squared-Error (RMSE) between the logarithm of the predicted value and the logarithm of the observed sales price. (Taking logs means that errors in predicting expensive houses and cheap houses will affect the result equally.)

In [None]:
y_pred = model.predict(X_test)

In [None]:
y_pred = y_pred.reshape(-1)

In [None]:
rmse = np.sqrt(np.mean((np.log(y_pred) - np.log(y_test)) ** 2))
print('RMSE: %.4f' % rmse)

**Note**: I run the notebook in Kaggle just to see the score in the leaderboard. This basic approach achieves the 724th position out of 5345 teams (i.e. the top 14%)!!