### Goal
The goal of this notebook is to experiment with the CatBoost library as a method of model training. We will use the model with different levels of feature engineering.

In [23]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
plt.style.use("bmh")
import warnings
warnings.filterwarnings("ignore")
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from catboost import CatBoostRegressor

### Attempt 1
Only basic label encoding

In [24]:
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")
train.head()

Unnamed: 0,Year_Factor,State_Factor,building_class,facility_type,floor_area,year_built,energy_star_rating,ELEVATION,january_min_temp,january_avg_temp,...,days_above_80F,days_above_90F,days_above_100F,days_above_110F,direction_max_wind_speed,direction_peak_wind_speed,max_wind_speed,days_with_fog,site_eui,id
0,1,State_1,Commercial,Grocery_store_or_food_market,61242.0,1942.0,11.0,2.4,36,50.5,...,14,0,0,0,1.0,1.0,1.0,,248.682615,0
1,1,State_1,Commercial,Warehouse_Distribution_or_Shipping_center,274000.0,1955.0,45.0,1.8,36,50.5,...,14,0,0,0,1.0,,1.0,12.0,26.50015,1
2,1,State_1,Commercial,Retail_Enclosed_mall,280025.0,1951.0,97.0,1.8,36,50.5,...,14,0,0,0,1.0,,1.0,12.0,24.693619,2
3,1,State_1,Commercial,Education_Other_classroom,55325.0,1980.0,46.0,1.8,36,50.5,...,14,0,0,0,1.0,,1.0,12.0,48.406926,3
4,1,State_1,Commercial,Warehouse_Nonrefrigerated,66000.0,1985.0,100.0,2.4,36,50.5,...,14,0,0,0,1.0,1.0,1.0,,3.899395,4


In [25]:
for col in ["State_Factor", "building_class", "facility_type"]:
    encoder = LabelEncoder()
    encoder.fit(train[col])
    train[col] = encoder.transform(train[col])
    for label in np.unique(test[col]):
        if label not in encoder.classes_:
            encoder.classes_ = np.append(encoder.classes_, label)
    test[col] = encoder.transform(test[col])

In [26]:
train.head(-10)

Unnamed: 0,Year_Factor,State_Factor,building_class,facility_type,floor_area,year_built,energy_star_rating,ELEVATION,january_min_temp,january_avg_temp,...,days_above_80F,days_above_90F,days_above_100F,days_above_110F,direction_max_wind_speed,direction_peak_wind_speed,max_wind_speed,days_with_fog,site_eui,id
0,1,0,0,13,61242.0,1942.0,11.0,2.4,36,50.500000,...,14,0,0,0,1.0,1.0,1.0,,248.682615,0
1,1,0,0,55,274000.0,1955.0,45.0,1.8,36,50.500000,...,14,0,0,0,1.0,,1.0,12.0,26.500150,1
2,1,0,0,48,280025.0,1951.0,97.0,1.8,36,50.500000,...,14,0,0,0,1.0,,1.0,12.0,24.693619,2
3,1,0,0,6,55325.0,1980.0,46.0,1.8,36,50.500000,...,14,0,0,0,1.0,,1.0,12.0,48.406926,3
4,1,0,0,56,66000.0,1985.0,100.0,2.4,36,50.500000,...,14,0,0,0,1.0,1.0,1.0,,3.899395,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
75742,6,2,1,0,49970.0,2013.0,61.0,63.1,28,43.451613,...,25,3,0,0,,,,,24.693271,75742
75743,6,2,1,1,36353.0,2015.0,98.0,63.1,28,43.451613,...,25,3,0,0,,,,,31.613979,75743
75744,6,2,0,20,29084.0,1950.0,66.0,63.1,28,43.451613,...,25,3,0,0,,,,,55.977899,75744
75745,6,2,0,6,42128.0,1954.0,93.0,63.1,28,43.451613,...,25,3,0,0,,,,,40.464147,75745


In [27]:
X = train.drop(["id", "site_eui"], axis=1)
y = train["site_eui"]
Xtrain, Xval, ytrain, yval = train_test_split(X, y, test_size = 0.2, random_state=0)

In [28]:
model = CatBoostRegressor(verbose=2000,
                          iterations=28000,
                          early_stopping_rounds=10,
                          random_seed=42,
                          max_depth=12,
                          task_type='GPU',
                          learning_rate=0.025,
                          eval_metric='RMSE',
                          loss_function='RMSE'
                        )

model.fit(X, y)

0:	learn: 57.7635307	total: 108ms	remaining: 50m 34s
2000:	learn: 28.3387417	total: 5m 2s	remaining: 1h 5m 35s
4000:	learn: 23.1246166	total: 19m 22s	remaining: 1h 56m 12s
6000:	learn: 20.0388544	total: 25m 57s	remaining: 1h 35m 10s
8000:	learn: 17.8944262	total: 31m 19s	remaining: 1h 18m 16s
10000:	learn: 16.3837064	total: 37m 57s	remaining: 1h 8m 18s
12000:	learn: 15.2307740	total: 45m 4s	remaining: 1h 4s
14000:	learn: 14.3138092	total: 53m 2s	remaining: 53m 1s
16000:	learn: 13.5610531	total: 1h 1m 14s	remaining: 45m 55s
18000:	learn: 12.9359685	total: 1h 11m 23s	remaining: 39m 39s
20000:	learn: 12.4180378	total: 1h 17m 11s	remaining: 30m 52s


KeyboardInterrupt: 