# CatBoost

In [1]:
from sklearn.ensemble import BaggingRegressor, RandomForestRegressor, GradientBoostingRegressor
from sklearn.model_selection import train_test_split, GridSearchCV,cross_val_score
from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import scale, StandardScaler
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neural_network import MLPRegressor
from catboost import CatBoostRegressor
from sklearn import model_selection
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor
from skompiler import skompile
from sklearn.svm import SVR

import matplotlib.pyplot as plt
import xgboost as xgb
import pandas as pd
import numpy as np

from warnings import filterwarnings
filterwarnings('ignore')

In [2]:
#!pip install catboost

In [3]:
# Read the raw data, then work on a copy restricted to rows with no missing values.
hit = pd.read_csv("Hitters.csv")
df = hit.copy().dropna()

# One-hot encode the three categorical columns.
categorical_cols = ['League', 'Division', 'NewLeague']
dms = pd.get_dummies(df[categorical_cols])

# Target is Salary; every remaining non-categorical column becomes a float64 predictor.
y = df["Salary"]
X_ = df.drop(columns=['Salary'] + categorical_cols).astype('float64')

# Append one indicator column per categorical variable to the numeric predictors.
dummy_cols = ['League_N', 'Division_W', 'NewLeague_N']
X = pd.concat([X_, dms[dummy_cols]], axis=1)

# Hold out 25% of the rows for testing; the fixed random_state keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42
)


In [4]:
# Baseline: a CatBoost regressor built with no explicit hyperparameters,
# fitted on the training split. `catb_model` holds whatever fit() returns.
catb = CatBoostRegressor()
catb_model = catb.fit(X=X_train, y=y_train)

Learning rate set to 0.029229
0:	learn: 438.1942308	total: 59.3ms	remaining: 59.3s
1:	learn: 432.4080489	total: 63.5ms	remaining: 31.7s
2:	learn: 426.4284454	total: 66.5ms	remaining: 22.1s
3:	learn: 420.2654774	total: 68.6ms	remaining: 17.1s
4:	learn: 414.0515276	total: 70.8ms	remaining: 14.1s
5:	learn: 409.2442808	total: 72.4ms	remaining: 12s
6:	learn: 403.3717310	total: 73.9ms	remaining: 10.5s
7:	learn: 397.8646959	total: 75.6ms	remaining: 9.37s
8:	learn: 392.2419888	total: 76.9ms	remaining: 8.47s
9:	learn: 387.2019582	total: 79ms	remaining: 7.83s
10:	learn: 381.9852190	total: 81.4ms	remaining: 7.32s
11:	learn: 377.3994777	total: 83.6ms	remaining: 6.88s
12:	learn: 372.5056201	total: 85.3ms	remaining: 6.47s
13:	learn: 368.4425005	total: 86.8ms	remaining: 6.12s
14:	learn: 364.0366203	total: 88.4ms	remaining: 5.8s
15:	learn: 359.7061133	total: 90.1ms	remaining: 5.54s
16:	learn: 355.3448847	total: 91.7ms	remaining: 5.3s
17:	learn: 350.4851173	total: 93.4ms	remaining: 5.09s
18:	learn: 346

## Predict

In [5]:
# Predict on the held-out rows with the baseline model and report the test RMSE
# (square root of the mean squared error) as the cell output.
y_pred = catb_model.predict(X_test)
baseline_mse = mean_squared_error(y_test, y_pred)
np.sqrt(baseline_mse)

354.1424860950047

## Model Tuning

In [6]:
# Candidate hyperparameter values; this is the grid passed to the
# (currently commented-out) GridSearchCV call below.
catb_grid = dict(
    iterations=[200, 500, 1000, 2000],
    learning_rate=[0.01, 0.03, 0.05, 0.1],
    depth=list(range(3, 9)),  # 3 through 8 inclusive
)

In [7]:
# catb = CatBoostRegressor()
# catb_cv_model = GridSearchCV(catb, catb_grid, cv=5, n_jobs = -1, verbose = 2)

In [8]:
# catb_cv_model.fit(X_train, y_train)  # left commented out: the grid search takes too long to run

In [9]:
# catb_cv_model.best_params_

In [10]:
# Hyperparameters are entered by hand; each value is a member of the matching
# catb_grid list. Presumably they came from an earlier grid-search run -- TODO confirm.
tuned_params = {
    'iterations': 200,
    'learning_rate': 0.01,
    'depth': 8,
}

# Build the regressor and fit it on the training split in one step;
# `catb_tuned` ends up bound to the value returned by fit().
catb_tuned = CatBoostRegressor(**tuned_params).fit(X_train, y_train)

0:	learn: 442.8337952	total: 7.64ms	remaining: 1.52s
1:	learn: 440.8040851	total: 11.9ms	remaining: 1.18s
2:	learn: 438.8533292	total: 15.8ms	remaining: 1.04s
3:	learn: 436.5659664	total: 20.9ms	remaining: 1.02s
4:	learn: 434.6453514	total: 24.8ms	remaining: 968ms
5:	learn: 432.4015036	total: 28.9ms	remaining: 935ms
6:	learn: 430.6290601	total: 32.8ms	remaining: 906ms
7:	learn: 428.6551156	total: 37.7ms	remaining: 905ms
8:	learn: 426.6587859	total: 42.9ms	remaining: 911ms
9:	learn: 424.5431478	total: 47ms	remaining: 893ms
10:	learn: 422.6170436	total: 51.1ms	remaining: 878ms
11:	learn: 420.6388059	total: 55.2ms	remaining: 864ms
12:	learn: 418.8071181	total: 59.2ms	remaining: 852ms
13:	learn: 417.1117792	total: 63.3ms	remaining: 841ms
14:	learn: 415.6398598	total: 68.4ms	remaining: 843ms
15:	learn: 413.8209241	total: 72.5ms	remaining: 834ms
16:	learn: 411.8433843	total: 76.8ms	remaining: 827ms
17:	learn: 409.9231341	total: 80.8ms	remaining: 817ms
18:	learn: 407.9543764	total: 85.1ms	rem

In [11]:
# Predict on the held-out rows with the tuned model and report the test RMSE
# as the cell output. Note that this rebinds `y_pred` from the baseline cell.
y_pred = catb_tuned.predict(X_test)
tuned_mse = mean_squared_error(y_test, y_pred)
np.sqrt(tuned_mse)

370.3327259578433