In [8]:
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.tree import DecisionTreeRegressor
import warnings
warnings.filterwarnings("ignore")

In [9]:
df = pd.read_csv("Hitters.csv")
df.head()

Unnamed: 0,AtBat,Hits,HmRun,Runs,RBI,Walks,Years,CAtBat,CHits,CHmRun,CRuns,CRBI,CWalks,League,Division,PutOuts,Assists,Errors,Salary,NewLeague
0,293,66,1,30,29,14,1,293,66,1,30,29,14,A,E,446,33,20,,A
1,315,81,7,24,38,39,14,3449,835,69,321,414,375,N,W,632,43,10,475.0,N
2,479,130,18,66,72,76,3,1624,457,63,224,266,263,A,W,880,82,14,480.0,A
3,496,141,20,65,78,37,11,5628,1575,225,828,838,354,N,E,200,11,3,500.0,N
4,321,87,10,39,42,30,2,396,101,12,48,46,33,N,E,805,40,4,91.5,N


In [10]:
df.dropna(inplace=True)

df = pd.get_dummies(df, columns=["League","Division","NewLeague"])

df.drop(["League_N", "Division_W", "NewLeague_N"], axis=1, inplace=True)

df.replace(to_replace=False, value=0, inplace=True)
df.replace(to_replace=True, value=1, inplace=True)

df.head()

Unnamed: 0,AtBat,Hits,HmRun,Runs,RBI,Walks,Years,CAtBat,CHits,CHmRun,CRuns,CRBI,CWalks,PutOuts,Assists,Errors,Salary,League_A,Division_E,NewLeague_A
1,315,81,7,24,38,39,14,3449,835,69,321,414,375,632,43,10,475.0,0,0,0
2,479,130,18,66,72,76,3,1624,457,63,224,266,263,880,82,14,480.0,1,0,1
3,496,141,20,65,78,37,11,5628,1575,225,828,838,354,200,11,3,500.0,0,1,0
4,321,87,10,39,42,30,2,396,101,12,48,46,33,805,40,4,91.5,0,1,0
5,594,169,4,74,51,35,11,4408,1133,19,501,336,194,282,421,25,750.0,1,0,1


In [11]:
X = df.drop("Salary", axis=1)
y = df[["Salary"]]
X.shape, y.shape

((263, 19), (263, 1))

In [12]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.25, random_state=16)

In [166]:
tree_reg = DecisionTreeRegressor().fit(X_train, y_train)

In [167]:
tree_reg.get_params()

{'ccp_alpha': 0.0,
 'criterion': 'squared_error',
 'max_depth': None,
 'max_features': None,
 'max_leaf_nodes': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'random_state': None,
 'splitter': 'best'}

In [168]:
tree_reg.get_depth()

15

In [169]:
tree_reg.get_n_leaves()

178

In [170]:
y_pred = tree_reg.predict(X_test)

In [171]:
r2_score(y_true=y_test, y_pred=y_pred)

0.36511386689282854

In [222]:
np.sqrt(mean_squared_error(y_test, y_pred))

313.9157198394744

In [223]:
# Model Tuning

In [224]:
tree_params = {
    "max_depth": [2,3,4,5,10],
    "min_samples_split": [2, 10, 5, 50, 100],
    "max_leaf_nodes": [10, 15, 20, 30]
}

In [225]:
tree_cv = GridSearchCV(DecisionTreeRegressor(), tree_params, cv=5, n_jobs=-1, verbose=2)
tree_cv.fit(X_train, y_train)

Fitting 5 folds for each of 100 candidates, totalling 500 fits


In [226]:
tree_cv.best_params_

{'max_depth': 10, 'max_leaf_nodes': 30, 'min_samples_split': 50}

In [227]:
y_pred = tree_cv.predict(X_test)

In [228]:
r2_score(y_test, y_pred)

0.7039680307237872

In [229]:
np.sqrt(mean_squared_error(y_test, y_pred))

254.6068134123163