In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split, GridSearchCV
from catboost import CatBoostRegressor
import warnings
warnings.filterwarnings("ignore")

In [2]:
df = pd.read_csv("Hitters.csv")
df.head()

Unnamed: 0,AtBat,Hits,HmRun,Runs,RBI,Walks,Years,CAtBat,CHits,CHmRun,CRuns,CRBI,CWalks,League,Division,PutOuts,Assists,Errors,Salary,NewLeague
0,293,66,1,30,29,14,1,293,66,1,30,29,14,A,E,446,33,20,,A
1,315,81,7,24,38,39,14,3449,835,69,321,414,375,N,W,632,43,10,475.0,N
2,479,130,18,66,72,76,3,1624,457,63,224,266,263,A,W,880,82,14,480.0,A
3,496,141,20,65,78,37,11,5628,1575,225,828,838,354,N,E,200,11,3,500.0,N
4,321,87,10,39,42,30,2,396,101,12,48,46,33,N,E,805,40,4,91.5,N


In [3]:
df.dropna(inplace=True)

df = pd.get_dummies(df, columns=["League","Division","NewLeague"])

df.drop(["League_N", "Division_W", "NewLeague_N"], axis=1, inplace=True)

df.replace(to_replace=False, value=0, inplace=True)
df.replace(to_replace=True, value=1, inplace=True)

df.head()

Unnamed: 0,AtBat,Hits,HmRun,Runs,RBI,Walks,Years,CAtBat,CHits,CHmRun,CRuns,CRBI,CWalks,PutOuts,Assists,Errors,Salary,League_A,Division_E,NewLeague_A
1,315,81,7,24,38,39,14,3449,835,69,321,414,375,632,43,10,475.0,0,0,0
2,479,130,18,66,72,76,3,1624,457,63,224,266,263,880,82,14,480.0,1,0,1
3,496,141,20,65,78,37,11,5628,1575,225,828,838,354,200,11,3,500.0,0,1,0
4,321,87,10,39,42,30,2,396,101,12,48,46,33,805,40,4,91.5,0,1,0
5,594,169,4,74,51,35,11,4408,1133,19,501,336,194,282,421,25,750.0,1,0,1


In [4]:
X = df.drop("Salary", axis=1)
y = df[["Salary"]]
X.shape, y.shape

((263, 19), (263, 1))

In [5]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.25, random_state=16)

In [6]:
catb_model = CatBoostRegressor().fit(X_train, y_train)

Learning rate set to 0.031674
0:	learn: 437.6454375	total: 133ms	remaining: 2m 12s
1:	learn: 431.0814417	total: 134ms	remaining: 1m 6s
2:	learn: 425.6859114	total: 135ms	remaining: 44.8s
3:	learn: 420.1858503	total: 136ms	remaining: 33.8s
4:	learn: 414.4368374	total: 137ms	remaining: 27.2s
5:	learn: 409.0554904	total: 138ms	remaining: 22.8s
6:	learn: 403.8732170	total: 139ms	remaining: 19.7s
7:	learn: 398.7672920	total: 140ms	remaining: 17.3s
8:	learn: 393.6567417	total: 141ms	remaining: 15.5s
9:	learn: 389.6108815	total: 142ms	remaining: 14s
10:	learn: 384.3451655	total: 143ms	remaining: 12.8s
11:	learn: 380.8900698	total: 143ms	remaining: 11.8s
12:	learn: 376.1725901	total: 144ms	remaining: 10.9s
13:	learn: 372.3768038	total: 145ms	remaining: 10.2s
14:	learn: 368.1690881	total: 146ms	remaining: 9.58s
15:	learn: 363.7631693	total: 147ms	remaining: 9.03s
16:	learn: 359.8002720	total: 148ms	remaining: 8.54s
17:	learn: 356.3939405	total: 149ms	remaining: 8.1s
18:	learn: 352.4822648	total

In [10]:
catb_model.get_all_params()

{'nan_mode': 'Min',
 'eval_metric': 'RMSE',
 'iterations': 1000,
 'sampling_frequency': 'PerTree',
 'leaf_estimation_method': 'Newton',
 'random_score_type': 'NormalWithModelSizeDecrease',
 'grow_policy': 'SymmetricTree',
 'penalties_coefficient': 1,
 'boosting_type': 'Plain',
 'model_shrink_mode': 'Constant',
 'feature_border_type': 'GreedyLogSum',
 'bayesian_matrix_reg': 0.10000000149011612,
 'eval_fraction': 0,
 'force_unit_auto_pair_weights': False,
 'l2_leaf_reg': 3,
 'random_strength': 1,
 'rsm': 1,
 'boost_from_average': True,
 'model_size_reg': 0.5,
 'pool_metainfo_options': {'tags': {}},
 'subsample': 0.800000011920929,
 'use_best_model': False,
 'random_seed': 0,
 'depth': 6,
 'posterior_sampling': False,
 'border_count': 254,
 'classes_count': 0,
 'auto_class_weights': 'None',
 'sparse_features_conflict_fraction': 0,
 'leaf_estimation_backtracking': 'AnyImprovement',
 'best_model_min_trees': 1,
 'model_shrink_rate': 0,
 'min_data_in_leaf': 1,
 'loss_function': 'RMSE',
 'lear

In [12]:
y_pred = catb_model.predict(X_test)

In [13]:
r2_score(y_test, y_pred)

0.7082024449832243

In [14]:
np.sqrt(mean_squared_error(y_test, y_pred))

252.7793184881049

In [15]:
# Model Tuning

In [17]:
catb_params = {
    "iterations": [500, 750, 1000],
    "learning_rate": [0.01, 0.1, 0.3],
    "depth": [2, 3, 5]
}

In [19]:
catb_cv = GridSearchCV(CatBoostRegressor(), catb_params, cv=5, n_jobs=-1, verbose=2)
catb_cv.fit(X_train, y_train)

Fitting 5 folds for each of 27 candidates, totalling 135 fits
0:	learn: 442.2369517	total: 487us	remaining: 487ms
1:	learn: 440.4950102	total: 859us	remaining: 429ms
2:	learn: 438.6756394	total: 1.17ms	remaining: 389ms
3:	learn: 436.9419066	total: 1.46ms	remaining: 363ms
4:	learn: 435.3530008	total: 1.76ms	remaining: 350ms
5:	learn: 433.6608487	total: 2.07ms	remaining: 344ms
6:	learn: 432.1139175	total: 2.45ms	remaining: 348ms
7:	learn: 430.4152147	total: 2.77ms	remaining: 344ms
8:	learn: 428.7131376	total: 3.06ms	remaining: 337ms
9:	learn: 427.2770012	total: 3.42ms	remaining: 338ms
10:	learn: 425.7291045	total: 3.8ms	remaining: 342ms
11:	learn: 424.5795237	total: 4.12ms	remaining: 339ms
12:	learn: 422.8131083	total: 4.4ms	remaining: 334ms
13:	learn: 421.1795514	total: 4.73ms	remaining: 333ms
14:	learn: 419.7946106	total: 5.01ms	remaining: 329ms
15:	learn: 418.4165555	total: 5.32ms	remaining: 327ms
16:	learn: 417.2823706	total: 5.61ms	remaining: 325ms
17:	learn: 415.8923598	total: 5.89

In [20]:
catb_cv.best_params_

{'depth': 3, 'iterations': 1000, 'learning_rate': 0.01}

In [21]:
catb_model = CatBoostRegressor(depth=3, iterations=1000, learning_rate=0.01)
catb_model.fit(X_train, y_train)

0:	learn: 442.2369517	total: 410us	remaining: 411ms
1:	learn: 440.4950102	total: 781us	remaining: 390ms
2:	learn: 438.6756394	total: 1.26ms	remaining: 418ms
3:	learn: 436.9419066	total: 1.65ms	remaining: 412ms
4:	learn: 435.3530008	total: 2.24ms	remaining: 446ms
5:	learn: 433.6608487	total: 2.66ms	remaining: 441ms
6:	learn: 432.1139175	total: 3.03ms	remaining: 430ms
7:	learn: 430.4152147	total: 3.48ms	remaining: 432ms
8:	learn: 428.7131376	total: 4.02ms	remaining: 443ms
9:	learn: 427.2770012	total: 4.36ms	remaining: 432ms
10:	learn: 425.7291045	total: 4.94ms	remaining: 444ms
11:	learn: 424.5795237	total: 5.78ms	remaining: 476ms
12:	learn: 422.8131083	total: 6.28ms	remaining: 477ms
13:	learn: 421.1795514	total: 6.7ms	remaining: 472ms
14:	learn: 419.7946106	total: 7.14ms	remaining: 469ms
15:	learn: 418.4165555	total: 7.63ms	remaining: 469ms
16:	learn: 417.2823706	total: 8.03ms	remaining: 464ms
17:	learn: 415.8923598	total: 8.46ms	remaining: 461ms
18:	learn: 414.4970081	total: 8.84ms	rema

<catboost.core.CatBoostRegressor at 0x2755758df10>

In [22]:
y_pred = catb_model.predict(X_test)

In [23]:
r2_score(y_test, y_pred)

0.7225644290496579

In [24]:
np.sqrt(mean_squared_error(y_test, y_pred))

246.48005695027922