In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import scale, StandardScaler
from sklearn import model_selection

In [2]:
# Install a global "ignore" filter so that warnings raised by later cells are
# not shown.
from warnings import filterwarnings

filterwarnings(action="ignore")

In [3]:
# Data
# Read Hitters.csv and discard every row that contains a missing value.
df = pd.read_csv("Hitters.csv").dropna()

# One-hot encode the three categorical columns.
dms = pd.get_dummies(df[["League", "Division", "NewLeague"]])

# Target vector.
y = df["Salary"]

# Numeric predictors: everything except the target and the raw categorical
# columns, cast to float64.
X_ = df.drop(columns=["Salary", "League", "Division", "NewLeague"]).astype("float64")

# Append one indicator column per categorical variable to the numeric predictors.
X = pd.concat([X_, dms[["League_N", "Division_W", "NewLeague_N"]]], axis=1)

# Hold out 25% of the rows as a test set; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42
)

In [4]:
!pip install xgboost

Collecting xgboost
  Downloading https://files.pythonhosted.org/packages/0d/40/bf9d114c94a1e58afd93ec7f35a839d0f766c10f22e38dc2b3df5b883cd2/xgboost-1.3.1-py3-none-win_amd64.whl (95.2MB)
Installing collected packages: xgboost
Successfully installed xgboost-1.3.1


In [6]:
from xgboost import XGBRegressor

In [7]:
# Baseline model: an XGBRegressor with default constructor arguments, fitted on
# the training split. fit() returns the estimator, so the name is simply rebound.
xgb = XGBRegressor()
xgb = xgb.fit(X_train, y_train)

In [9]:
# Display the fitted estimator; its repr lists the parameter values in effect.
xgb

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
       importance_type='gain', interaction_constraints='',
       learning_rate=0.300000012, max_delta_step=0, max_depth=6,
       min_child_weight=1, missing=nan, monotone_constraints='()',
       n_estimators=100, n_jobs=4, num_parallel_tree=1,
       objective='reg:squarederror', random_state=0, reg_alpha=0,
       reg_lambda=1, scale_pos_weight=1, subsample=1, tree_method='exact',
       validate_parameters=1, verbosity=None)

In [10]:
# Predict with the fitted baseline model on the held-out test features.
y_pred = xgb.predict(X_test)

In [11]:
# Test-set RMSE of the baseline model. scikit-learn's signature is
# mean_squared_error(y_true, y_pred), so the observed targets go first and the
# predictions second.
np.sqrt(mean_squared_error(y_test, y_pred))

355.46515176059927

In [12]:
# Model Tuning

# Fresh, unfitted XGBRegressor with default constructor arguments; this rebinds
# `xgb`, replacing the fitted model from the earlier cell.
xgb = XGBRegressor()

In [13]:
# Hyper-parameter grid for the search: two candidate values for each of four
# XGBRegressor settings (2 * 2 * 2 * 2 = 16 combinations).
xgb_params = dict(
    learning_rate=[0.1, 0.5],
    max_depth=[2, 3],
    n_estimators=[100, 200],
    colsample_bynode=[0.4, 0.7],
)

In [14]:
# Exhaustive search over xgb_params with 10-fold cross-validation on the
# training split, run in parallel (n_jobs=-1) with per-fit progress logging.
# NOTE(review): no `scoring` is passed, so candidates are presumably ranked by
# the estimator's own `score` rather than by the RMSE reported in this
# notebook -- confirm that this is intended.
xgb_cv = GridSearchCV(
    estimator=xgb,
    param_grid=xgb_params,
    cv=10,
    verbose=2,
    n_jobs=-1,
)
xgb_cv = xgb_cv.fit(X_train, y_train)

Fitting 10 folds for each of 16 candidates, totalling 160 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:   17.1s
[Parallel(n_jobs=-1)]: Done 160 out of 160 | elapsed:   24.4s finished


In [15]:
# Parameter combination that scored best in the cross-validated grid search.
xgb_cv.best_params_

{'colsample_bynode': 0.4,
 'learning_rate': 0.5,
 'max_depth': 2,
 'n_estimators': 200}

In [16]:
# Refit on the training split using the grid search's winning parameters
# directly, instead of a hand-copied set of values, so this cell cannot drift
# out of sync with the search when the grid or the data changes.
xgb_tuned = XGBRegressor(**xgb_cv.best_params_).fit(X_train, y_train)

In [17]:
# Predict on the held-out test features with the tuned model (rebinds y_pred).
y_pred = xgb_tuned.predict(X_test)

In [18]:
# Test-set RMSE of the tuned model, with the arguments in scikit-learn's
# (y_true, y_pred) order.
# NOTE(review): in the recorded run this value is higher than the RMSE of the
# default-parameter model evaluated earlier -- worth revisiting the tuning grid
# and the selection metric before preferring the tuned model.
np.sqrt(mean_squared_error(y_test, y_pred))

380.0885533785223