# CATEGORICAL BOOSTING (CATBOOST)

In [1]:
import numpy as np 
import pandas as pd 
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_error, r2_score
import matplotlib.pyplot as plt
from sklearn import model_selection

In [3]:
# Load the Hitters data set.
# A forward slash is used as the path separator: the previous literal
# "verisetleri\Hitters.csv" depended on "\H" being an unrecognised escape
# sequence (which triggers a warning on recent Python versions) and on the
# backslash being a path separator, which is only true on Windows.
df = pd.read_csv("verisetleri/Hitters.csv")
# Drop every row that contains a missing value.
df = df.dropna()
# One-hot encode the three categorical columns.
dms = pd.get_dummies(df[['League', 'Division', 'NewLeague']])
# Target variable.
y = df["Salary"]
# Numeric predictors: everything except the target and the raw categorical columns.
X_ = df.drop(['Salary', 'League', 'Division', 'NewLeague'], axis=1).astype('float64')
# Keep a single dummy column per categorical variable alongside the numeric predictors.
X = pd.concat([X_, dms[['League_N', 'Division_W', 'NewLeague_N']]], axis=1)
# 75/25 train/test split with a fixed seed so the split is reproducible.
X_train, X_test, y_train, y_test =  train_test_split(X, y, test_size=0.25, random_state=42)

In [5]:
!pip install catboost

Collecting catboost
  Downloading catboost-0.24.2-cp38-none-win_amd64.whl (65.3 MB)
Collecting graphviz
  Downloading graphviz-0.14.2-py2.py3-none-any.whl (18 kB)
Collecting plotly
  Downloading plotly-4.12.0-py2.py3-none-any.whl (13.1 MB)
Collecting retrying>=1.3.3
  Downloading retrying-1.3.3.tar.gz (10 kB)
Using legacy 'setup.py install' for retrying, since package 'wheel' is not installed.
Installing collected packages: graphviz, retrying, plotly, catboost
    Running setup.py install for retrying: started
    Running setup.py install for retrying: finished with status 'done'
Successfully installed catboost-0.24.2 graphviz-0.14.2 plotly-4.12.0 retrying-1.3.3


In [5]:
from catboost import CatBoostRegressor

In [6]:
# Baseline: CatBoost regressor with default hyper-parameters, trained on the training split.
catb_model = CatBoostRegressor()
catb_model.fit(X_train, y_train);

maining: 681ms
623:	learn: 38.7450983	total: 1.13s	remaining: 679ms
624:	learn: 38.6788090	total: 1.13s	remaining: 677ms
625:	learn: 38.5788475	total: 1.13s	remaining: 675ms
626:	learn: 38.4675366	total: 1.13s	remaining: 673ms
627:	learn: 38.2350724	total: 1.13s	remaining: 671ms
628:	learn: 38.2074221	total: 1.14s	remaining: 670ms
629:	learn: 38.0622880	total: 1.14s	remaining: 668ms
630:	learn: 37.8520281	total: 1.14s	remaining: 666ms
631:	learn: 37.6631204	total: 1.14s	remaining: 664ms
632:	learn: 37.5153877	total: 1.14s	remaining: 662ms
633:	learn: 37.3834892	total: 1.14s	remaining: 660ms
634:	learn: 37.2552166	total: 1.14s	remaining: 658ms
635:	learn: 37.0779047	total: 1.15s	remaining: 656ms
636:	learn: 37.0329263	total: 1.15s	remaining: 655ms
637:	learn: 36.8927982	total: 1.15s	remaining: 653ms
638:	learn: 36.8105498	total: 1.15s	remaining: 652ms
639:	learn: 36.7512624	total: 1.16s	remaining: 650ms
640:	learn: 36.5891302	total: 1.16s	remaining: 648ms
641:	learn: 36.4551656	total: 1

In [7]:
# Predict on the held-out test split with the default-parameter model.
y_pred = catb_model.predict(X_test)

In [8]:
# Test-set RMSE (square root of the mean squared error) of the current predictions.
np.sqrt(mean_squared_error(y_true=y_test, y_pred=y_pred))

350.2683163098795

In [None]:
## MODEL TUNING

In [9]:
# iterations --> number of trees, i.e. the number of models that will be fitted.
# CatBoost is the slowest of the algorithms to run on a personal computer, so the
# grid should be kept as small as possible. Adding just a few more parameter values
# pushes the run time to roughly 45 minutes.
catb_params = {
    "iterations": [200, 500, 1000],
    "learning_rate": [0.01, 0.1],
    "depth": [3, 6, 8],
}

In [10]:
# Fresh, unfitted regressor with default settings, used as the base estimator for the grid search.
# Note: this rebinds `catb_model`, so the previously fitted baseline model is no longer reachable by this name.
catb_model = CatBoostRegressor()

In [11]:
# 5-fold cross-validated grid search over `catb_params`, parallelised across all cores (n_jobs=-1).
catb_cv_model = GridSearchCV(
    estimator=catb_model,
    param_grid=catb_params,
    cv=5,
    n_jobs=-1,
    verbose=2,
)
catb_cv_model.fit(X_train, y_train);

Fitting 5 folds for each of 18 candidates, totalling 90 fits
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:   15.4s
[Parallel(n_jobs=-1)]: Done  90 out of  90 | elapsed:  2.0min finished
0:	learn: 425.7900818	total: 2.08ms	remaining: 414ms
1:	learn: 404.8723520	total: 2.84ms	remaining: 282ms
2:	learn: 387.4057666	total: 3.47ms	remaining: 228ms
3:	learn: 372.2801584	total: 4.08ms	remaining: 200ms
4:	learn: 358.9204229	total: 4.66ms	remaining: 182ms
5:	learn: 347.0083933	total: 5.23ms	remaining: 169ms
6:	learn: 336.0130818	total: 5.81ms	remaining: 160ms
7:	learn: 324.3923300	total: 6.37ms	remaining: 153ms
8:	learn: 314.8690957	total: 6.94ms	remaining: 147ms
9:	learn: 308.5075563	total: 7.51ms	remaining: 143ms
10:	learn: 298.8587285	total: 8.08ms	remaining: 139ms
11:	learn: 294.7655438	total: 8.78ms	remaining: 138ms
12:	learn: 288.0697862	total: 9.32ms	remaining: 134ms
13:	learn: 282.6697154	total: 9.87ms	r

In [12]:
# Hyper-parameter combination that scored best in the cross-validated grid search.
catb_cv_model.best_params_

{'depth': 3, 'iterations': 200, 'learning_rate': 0.1}

In [20]:
# Final model: refit on the training split with the hyper-parameters selected by the
# grid search. They are unpacked from `best_params_` instead of being re-typed by hand,
# so the tuned model cannot silently drift from the search result if the grid changes.
catb_tuned = CatBoostRegressor(**catb_cv_model.best_params_).fit(X_train, y_train)

0:	learn: 425.7900818	total: 931us	remaining: 185ms
1:	learn: 404.8723520	total: 1.76ms	remaining: 174ms
2:	learn: 387.4057666	total: 2.52ms	remaining: 166ms
3:	learn: 372.2801584	total: 3.32ms	remaining: 163ms
4:	learn: 358.9204229	total: 4.24ms	remaining: 165ms
5:	learn: 347.0083933	total: 5.05ms	remaining: 163ms
6:	learn: 336.0130818	total: 5.7ms	remaining: 157ms
7:	learn: 324.3923300	total: 6.32ms	remaining: 152ms
8:	learn: 314.8690957	total: 6.93ms	remaining: 147ms
9:	learn: 308.5075563	total: 7.51ms	remaining: 143ms
10:	learn: 298.8587285	total: 8.1ms	remaining: 139ms
11:	learn: 294.7655438	total: 8.67ms	remaining: 136ms
12:	learn: 288.0697862	total: 9.26ms	remaining: 133ms
13:	learn: 282.6697154	total: 9.96ms	remaining: 132ms
14:	learn: 277.6121667	total: 10.5ms	remaining: 130ms
15:	learn: 273.4383979	total: 11.2ms	remaining: 129ms
16:	learn: 269.1556201	total: 11.8ms	remaining: 127ms
17:	learn: 264.8098704	total: 12.4ms	remaining: 125ms
18:	learn: 261.6700768	total: 12.9ms	rema

In [21]:
# Predict on the held-out test split with the tuned model (overwrites the earlier `y_pred`).
y_pred = catb_tuned.predict(X_test)

In [22]:
# Test-set RMSE (square root of the mean squared error) of the current predictions.
np.sqrt(mean_squared_error(y_true=y_test, y_pred=y_pred))

344.3125832615482

In [None]:
# It achieved the best result compared to the previous algorithms.