# Notebook Contents

- [Imports](#Imports)
- [Data](#Data)
- [Data Cleaning](#Data-Cleaning)
- [Preprocessing](#Preprocessing)
    - [Multicolinearity - VIF](#Multicolinearity---VIF)
- [Features](#Features)
- [Random Forest Modeling](#Random-Forest-Modeling)
    - [4-Seam](#Linear-Regression---4-Seam)
    - [Cutter](#Linear-Regression---Cutter)
    - [Sinker](#Linear-Regression---Sinker)
    - [Slider](#Linear-Regression---Slider)
    - [Curveball](#Linear-Regression---Curveball)
    - [Changeup](#Linear-Regression---Changeup)

# Imports

In [1]:
from pprint import pprint
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans, k_means
import statsmodels.api as sm
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn import metrics
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, RandomizedSearchCV

import warnings
warnings.filterwarnings('ignore')

# Data

In [2]:
data = pd.read_csv('../data/model-pitches.csv', index_col = [0])
data.dropna(subset = ['pitch_type', 'velo', 'spin_rate', 'pfx_-x', 
                      'release_extension', 'delta_run_exp'], inplace = True)

pd.set_option('max_columns', None)
print(data.shape)
data.head(5)

(705403, 46)


Unnamed: 0,player_name,p_throws,pitch_type,velo,spin_rate,spin_axis,pfx_-x,pfx_z,bauer_units,effective_speed,release_pos_x,release_pos_z,release_extension,release_pos_y,plate_-x,plate_x,plate_z,type,balls,strikes,delta_run_exp,stand,description,events,hit_distance_sc,exit_velo,launch_angle,launch_speed_angle,xba,xwobacon,woba_value,woba_denom,babip_value,iso_value,at_bat_number,pitch_number,inning,inning_topbot,home_score,away_score,post_home_score,post_away_score,on_1b,on_2b,on_3b,outs_when_up
0,"Smith, Will",L,FF,92.3,2330.0,148.0,-8.28,16.56,25.24377,92.8,1.4,6.8,6.5,54.03,0.69,-0.69,2.83,X,1,2,-0.073,R,hit_into_play,field_out,13.0,95.2,-13.0,2.0,0.174,0.158,0.0,1.0,0.0,0.0,61,4,9,Top,5,0,5,0,,,,2
1,"Smith, Will",L,SL,80.6,2254.0,315.0,9.24,5.76,27.965261,81.2,1.6,6.64,6.4,54.15,0.71,-0.71,2.62,S,1,1,-0.027,R,foul,,108.0,75.3,75.0,,,,,,,,61,3,9,Top,5,0,5,0,,,,2
2,"Smith, Will",L,CU,75.5,1940.0,328.0,7.8,-6.12,25.695364,75.2,1.46,6.88,6.2,54.34,0.04,-0.04,2.46,S,1,0,-0.02,R,foul,,157.0,83.5,65.0,,,,,,,,61,2,9,Top,5,0,5,0,,,,2
3,"Smith, Will",L,CU,75.0,2017.0,330.0,8.28,-8.28,26.893333,74.5,1.53,6.83,5.9,54.61,-2.1,2.1,3.89,B,0,0,0.016,R,ball,,,,,,,,,,,,61,1,9,Top,5,0,5,0,,,,2
4,"Smith, Will",L,FF,91.2,2281.0,143.0,-7.56,15.36,25.010965,90.9,1.49,6.66,6.3,54.15,0.31,-0.31,2.8,X,1,0,-0.189,L,hit_into_play,field_out,9.0,93.3,-18.0,2.0,0.1,0.09,0.0,1.0,0.0,0.0,60,2,9,Top,5,0,5,0,,,,1


In [3]:
data.loc[19:25]

Unnamed: 0,player_name,p_throws,pitch_type,velo,spin_rate,spin_axis,pfx_-x,pfx_z,bauer_units,effective_speed,release_pos_x,release_pos_z,release_extension,release_pos_y,plate_-x,plate_x,plate_z,type,balls,strikes,delta_run_exp,stand,description,events,hit_distance_sc,exit_velo,launch_angle,launch_speed_angle,xba,xwobacon,woba_value,woba_denom,babip_value,iso_value,at_bat_number,pitch_number,inning,inning_topbot,home_score,away_score,post_home_score,post_away_score,on_1b,on_2b,on_3b,outs_when_up
19,"Gsellman, Robert",R,FF,93.4,1999.0,241.0,8.16,10.68,21.40257,93.8,-1.71,5.85,6.3,54.22,1.61,-1.61,4.07,B,0,2,0.022,L,ball,,,,,,,,,,,,56,3,8,Bot,5,0,5,0,,,,0
20,"Gsellman, Robert",R,CU,80.6,2660.0,62.0,-8.4,-5.64,33.002481,80.9,-1.64,5.72,6.2,54.31,0.92,-0.92,2.61,S,0,1,-0.052,L,called_strike,,,,,,,,,,,,56,2,8,Bot,5,0,5,0,,,,0
21,"Gsellman, Robert",R,SI,93.7,2028.0,234.0,16.44,6.24,21.643543,93.4,-1.69,5.76,6.2,54.31,0.68,-0.68,2.42,S,0,0,-0.038,L,foul,,228.0,81.6,60.0,,,,,,,,56,1,8,Bot,5,0,5,0,,,,0
22,"Martin, Chris",R,CH,87.9,1160.0,234.0,7.68,1.8,13.196815,88.8,-2.98,6.23,6.7,53.81,0.36,-0.36,1.26,S,1,2,-0.073,L,swinging_strike,strikeout,,,,,,,0.0,1.0,0.0,0.0,55,5,8,Top,5,0,5,0,,,,2
23,"Martin, Chris",R,SL,84.9,2593.0,45.0,-6.6,-10.8,30.541814,84.9,-3.02,6.35,6.3,54.2,1.18,-1.18,1.25,S,1,2,0.0,L,foul,,1.0,57.9,-52.0,,,,,,,,55,4,8,Top,5,0,5,0,,,,2
24,"Martin, Chris",R,FF,95.3,2252.0,205.0,3.24,13.56,23.63064,96.4,-2.58,6.42,6.7,53.82,0.37,-0.37,2.75,S,1,1,-0.027,L,foul,,200.0,72.1,24.0,,,,,,,,55,3,8,Top,5,0,5,0,,,,2
25,"Martin, Chris",R,CH,89.2,1682.0,231.0,12.24,6.6,18.856502,89.9,-2.98,6.22,6.6,53.86,0.25,-0.25,0.7,B,0,1,0.014,L,ball,,,,,,,,,,,,55,2,8,Top,5,0,5,0,,,,2


# Data Cleaning

In [4]:
data['inning_topbot'] = data.inning_topbot.map({'Top': 0, 'Bot': 1})
data['on_1b'] = [1 if x > 1 else 0 for x in data['on_1b']]
data['on_2b'] = [1 if x > 1 else 0 for x in data['on_2b']]
data['on_3b'] = [1 if x > 1 else 0 for x in data['on_3b']]

data['home_runs'] = data['post_home_score'] - data['home_score']
data['away_runs'] = data['post_away_score'] - data['away_score']
data['runs'] = data['home_runs'] + data['away_runs']

In [5]:
zero_outs = data.loc[data['outs_when_up'] == 0]
print('0 outs:', zero_outs.shape)
one_out = data.loc[data['outs_when_up'] == 1]
print('1 out:', one_out.shape)
two_outs = data.loc[data['outs_when_up'] == 2]
print('2 outs:', two_outs.shape)

0 outs: (244451, 49)
1 out: (232568, 49)
2 outs: (228384, 49)


In [6]:
re = pd.read_csv('../data/run_expectancy_table.csv', index_col = [0])
data = pd.merge(data, re, how = 'left', on = ['on_1b', 'on_2b', 'on_3b', 'outs_when_up'])

data['re_end_state'] = data['delta_run_exp'] + data['re']
data['re24'] = data['delta_run_exp'] + data['runs']

print(data.shape)
data.head()

(705403, 52)


Unnamed: 0,player_name,p_throws,pitch_type,velo,spin_rate,spin_axis,pfx_-x,pfx_z,bauer_units,effective_speed,release_pos_x,release_pos_z,release_extension,release_pos_y,plate_-x,plate_x,plate_z,type,balls,strikes,delta_run_exp,stand,description,events,hit_distance_sc,exit_velo,launch_angle,launch_speed_angle,xba,xwobacon,woba_value,woba_denom,babip_value,iso_value,at_bat_number,pitch_number,inning,inning_topbot,home_score,away_score,post_home_score,post_away_score,on_1b,on_2b,on_3b,outs_when_up,home_runs,away_runs,runs,re,re_end_state,re24
0,"Smith, Will",L,FF,92.3,2330.0,148.0,-8.28,16.56,25.24377,92.8,1.4,6.8,6.5,54.03,0.69,-0.69,2.83,X,1,2,-0.073,R,hit_into_play,field_out,13.0,95.2,-13.0,2.0,0.174,0.158,0.0,1.0,0.0,0.0,61,4,9,0,5,0,5,0,0,0,0,2,0,0,0,0.098,0.025,-0.073
1,"Smith, Will",L,SL,80.6,2254.0,315.0,9.24,5.76,27.965261,81.2,1.6,6.64,6.4,54.15,0.71,-0.71,2.62,S,1,1,-0.027,R,foul,,108.0,75.3,75.0,,,,,,,,61,3,9,0,5,0,5,0,0,0,0,2,0,0,0,0.098,0.071,-0.027
2,"Smith, Will",L,CU,75.5,1940.0,328.0,7.8,-6.12,25.695364,75.2,1.46,6.88,6.2,54.34,0.04,-0.04,2.46,S,1,0,-0.02,R,foul,,157.0,83.5,65.0,,,,,,,,61,2,9,0,5,0,5,0,0,0,0,2,0,0,0,0.098,0.078,-0.02
3,"Smith, Will",L,CU,75.0,2017.0,330.0,8.28,-8.28,26.893333,74.5,1.53,6.83,5.9,54.61,-2.1,2.1,3.89,B,0,0,0.016,R,ball,,,,,,,,,,,,61,1,9,0,5,0,5,0,0,0,0,2,0,0,0,0.098,0.114,0.016
4,"Smith, Will",L,FF,91.2,2281.0,143.0,-7.56,15.36,25.010965,90.9,1.49,6.66,6.3,54.15,0.31,-0.31,2.8,X,1,0,-0.189,L,hit_into_play,field_out,9.0,93.3,-18.0,2.0,0.1,0.09,0.0,1.0,0.0,0.0,60,2,9,0,5,0,5,0,0,0,0,1,0,0,0,0.254,0.065,-0.189


# Preprocessing

### Multicolinearity - VIF
**Independent Variables:** Velocity, Spin Rate, VB, HB, Release Extension, Horizontal Release Position, Vertical Release Position, Horizontal Plate Coords, Vertical Plate Coords

**Dependent Variable:** re24


In [7]:
features = data[['velo', 'spin_rate', 'pfx_-x', 'pfx_z', 'release_extension', 
                 'release_pos_x', 'release_pos_z', 'plate_x', 'plate_z', 're24',
                 'pitch_type', 'p_throws', 'stand']]
features_vif = features.select_dtypes([np.number])
vif_data = pd.DataFrame()
vif_data["feature"] = features_vif.columns

vif_data["VIF"] = [variance_inflation_factor(features_vif.values, i)
                   for i in range(len(features_vif.columns))]

vif_data.sort_values(by = 'VIF').head(10)

Unnamed: 0,feature,VIF
9,re24,1.005373
7,plate_x,1.110173
5,release_pos_x,1.454753
2,pfx_-x,1.466129
3,pfx_z,3.113887
8,plate_z,7.448817
1,spin_rate,50.829646
6,release_pos_z,103.363589
4,release_extension,152.713637
0,velo,277.718572


# Features

In [8]:
ff = features.loc[features['pitch_type'] == 'FF']
fc = features.loc[features['pitch_type'] == 'FC']
fastball = ff.append(fc)
si = features.loc[features['pitch_type'] == 'SI']
fastball = fastball.append(si)
print('Fastball shape:', fastball.shape)
sl = features.loc[features['pitch_type'] == 'SL']
cu = features.loc[features['pitch_type'] == 'CU']
breaking_ball = sl.append(cu)
kc = features.loc[features['pitch_type'] == 'KC']
breaking_ball = breaking_ball.append(kc)
print('Breaking Ball:', breaking_ball.shape)
ch = features.loc[features['pitch_type'] == 'CH']
fs = features.loc[features['pitch_type'] == 'FS']
offspeed = ch.append(fs)
print('Off speed shape:', offspeed.shape)
rhp = features.loc[features['p_throws'] == 'R']
print('RHP shape:', rhp.shape)
lhp = features.loc[features['p_throws'] == 'L']
print('LHP shape:', lhp.shape)
rhp_rhh = features.loc[(features['p_throws'] == 'R') & (features['stand'] == 'R')]
print('RHP & RHH shape:', rhp_rhh.shape)
rhp_lhh = features.loc[(features['p_throws'] == 'R') & (features['stand'] == 'L')]
print('RHP & LHH shape:', rhp_lhh.shape)
lhp_rhh = features.loc[(features['p_throws'] == 'L') & (features['stand'] == 'R')]
print('LHP & RHH shape:', lhp_rhh.shape)
lhp_lhh = features.loc[(features['p_throws'] == 'L') & (features['stand'] == 'L')]
print('LHP & LHH shape:', lhp_lhh.shape)
rhp_fastball = fastball.loc[fastball['p_throws'] == 'R']
print('RHP Fastball shape:', rhp_fastball.shape)
lhp_fastball = fastball.loc[fastball['p_throws'] == 'L']
print('LHP Fastball shape:', lhp_fastball.shape)
rhp_breaking_ball = breaking_ball.loc[breaking_ball['p_throws'] == 'R']
print('RHP Breaking Ball shape:', rhp_breaking_ball.shape)
lhp_breaking_ball = breaking_ball.loc[breaking_ball['p_throws'] == 'L']
print('LHP Breaking Ball shape:', lhp_breaking_ball.shape)
rhp_offspeed = offspeed.loc[offspeed['p_throws'] == 'R']
print('RHP Offspeed shape:', rhp_offspeed.shape)
lhp_offspeed = offspeed.loc[offspeed['p_throws'] == 'L']
print('LHP Offspeed shape:', lhp_offspeed.shape)

Fastball shape: (406259, 13)
Breaking Ball: (207982, 13)
Off speed shape: (91162, 13)
RHP shape: (496498, 13)
LHP shape: (208905, 13)
RHP & RHH shape: (267548, 13)
RHP & LHH shape: (228950, 13)
LHP & RHH shape: (149824, 13)
LHP & LHH shape: (59081, 13)
RHP Fastball shape: (283224, 13)
LHP Fastball shape: (123035, 13)
RHP Breaking Ball shape: (152383, 13)
LHP Breaking Ball shape: (55599, 13)
RHP Offspeed shape: (60891, 13)
LHP Offspeed shape: (30271, 13)


# Linear Regression Model

## Fastball RHP

#### 4-Seam, Cutter, Sinker

In [9]:
features_fastball_r = rhp_fastball.select_dtypes([np.number])
X = features_fastball_r.drop(columns = ['re24'])
X = sm.add_constant(X)
y = features_fastball_r['re24']

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 10)

ols_fastball_r = sm.OLS(y_train, X_train).fit()
pred_fastball_r = ols_fastball_r.predict(X_test)
fitted_vals_fastball_r = ols_fastball_r.fittedvalues
residuals_fastball_r = ols_fastball_r.resid

print('MSE:', round(metrics.mean_squared_error(y_test, pred_fastball_r), 4))
print('RMSE:', round(np.sqrt(metrics.mean_squared_error(y_test, pred_fastball_r)), 4))
print('MAE:', round(metrics.mean_absolute_error(y_test, pred_fastball_r), 4))
print(ols_fastball_r.summary())

MSE: 0.1833
RMSE: 0.4281
MAE: 0.163
                            OLS Regression Results                            
Dep. Variable:                   re24   R-squared:                       0.000
Model:                            OLS   Adj. R-squared:                  0.000
Method:                 Least Squares   F-statistic:                     10.42
Date:                Sun, 06 Mar 2022   Prob (F-statistic):           2.84e-16
Time:                        16:10:07   Log-Likelihood:            -1.2339e+05
No. Observations:              212418   AIC:                         2.468e+05
Df Residuals:                  212408   BIC:                         2.469e+05
Df Model:                           9                                         
Covariance Type:            nonrobust                                         
                        coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------------------
co

## Fastball LHP

#### 4-Seam, Cutter, Sinker

In [10]:
features_fastball_l = lhp_fastball.select_dtypes([np.number])
X = features_fastball_l.drop(columns = ['re24'])
X = sm.add_constant(X)
y = features_fastball_l['re24']

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 10)

ols_fastball_l = sm.OLS(y_train, X_train).fit()
pred_fastball_l = ols_fastball_l.predict(X_test)
fitted_vals_fastball_l = ols_fastball_l.fittedvalues
residuals_fastball_l = ols_fastball_l.resid

print('MSE:', round(metrics.mean_squared_error(y_test, pred_fastball_l), 4))
print('RMSE:', round(np.sqrt(metrics.mean_squared_error(y_test, pred_fastball_l)), 4))
print('MAE:', round(metrics.mean_absolute_error(y_test, pred_fastball_l), 4))
print(ols_fastball_l.summary())

MSE: 0.1866
RMSE: 0.4319
MAE: 0.1644
                            OLS Regression Results                            
Dep. Variable:                   re24   R-squared:                       0.000
Model:                            OLS   Adj. R-squared:                  0.000
Method:                 Least Squares   F-statistic:                     1.903
Date:                Sun, 06 Mar 2022   Prob (F-statistic):             0.0467
Time:                        16:10:07   Log-Likelihood:                -50066.
No. Observations:               92276   AIC:                         1.002e+05
Df Residuals:                   92266   BIC:                         1.002e+05
Df Model:                           9                                         
Covariance Type:            nonrobust                                         
                        coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------------------
c

## Breaking Ball RHP

#### Slider, Curveball, Knuckle Curve

## Breaking Ball LHP

#### Slider, Curveball, Knuckle Curve

## Offspeed RHP

#### Changeup, Splitter

## Offspeed LHP

#### Changeup, Splitter

# Random Forest Model

## Fastball RHP

#### 4-Seam, Cutter, Sinker

In [11]:
features_fastball_r = rhp_fastball.select_dtypes([np.number])
X = features_fastball_r.drop(columns = ['re24'])
y = features_fastball_r['re24']

X_train, X_test, y_train, y_test = train_test_split(X, y)

params = {
    'n_estimators': [25, 75],
    'max_depth': [None, 5, 6, 7, 8],
    'min_samples_split': [1, 2, 4],
    'min_samples_leaf': [2, 5, 10],
    'n_jobs': [-1]
}
    
gs = GridSearchCV(RandomForestRegressor(random_state = 10), param_grid = params)
gs.fit(X_train, y_train)
fastball_pred_r = gs.predict(X_test)

print('Cross val score:', round((gs.best_score_), 4))
print('MSE:', round(metrics.mean_squared_error(y_test, fastball_pred_r), 4))
print('RMSE:', round(np.sqrt(metrics.mean_squared_error(y_test, fastball_pred_r)), 4))
print('MAE:', round(metrics.mean_absolute_error(y_test, fastball_pred_r), 4))
gs.best_params_

Cross val score: 0.0038
MSE: 0.187
RMSE: 0.4324
MAE: 0.1603


{'max_depth': 8,
 'min_samples_leaf': 10,
 'min_samples_split': 2,
 'n_estimators': 75,
 'n_jobs': -1}

## Fastball LHP

#### 4-Seam, Cutter, Sinker

In [12]:
features_fastball_l = lhp_fastball.select_dtypes([np.number])
X = features_fastball_l.drop(columns = ['re24'])
y = features_fastball_l['re24']

X_train, X_test, y_train, y_test = train_test_split(X, y)

params = {
    'n_estimators': [25, 75],
    'max_depth': [None, 5, 6, 7, 8],
    'min_samples_split': [1, 2, 4],
    'min_samples_leaf': [2, 5, 10],
    'n_jobs': [-1]
}
    
gs = GridSearchCV(RandomForestRegressor(random_state = 10), param_grid = params)
gs.fit(X_train, y_train)
fastball_pred_l = gs.predict(X_test)

print('Cross val score:', round((gs.best_score_), 4))
print('MSE:', round(metrics.mean_squared_error(y_test, fastball_pred_l), 4))
print('RMSE:', round(np.sqrt(metrics.mean_squared_error(y_test, fastball_pred_l)), 4))
print('MAE:', round(metrics.mean_absolute_error(y_test, fastball_pred_l), 4))
gs.best_params_

Cross val score: 0.0027
MSE: 0.1666
RMSE: 0.4081
MAE: 0.1581


{'max_depth': 8,
 'min_samples_leaf': 10,
 'min_samples_split': 2,
 'n_estimators': 75,
 'n_jobs': -1}

## Breaking Ball RHP

#### Slider, Curveball, Knuckle Curve

In [13]:
features_bb_r = rhp_breaking_ball.select_dtypes([np.number])
X = features_bb_r.drop(columns = ['re24'])
y = features_bb_r['re24']

X_train, X_test, y_train, y_test = train_test_split(X, y)

params = {
    'n_estimators': [25, 75],
    'max_depth': [None, 5, 6, 7, 8],
    'min_samples_split': [1, 2, 4],
    'min_samples_leaf': [2, 5, 10],
    'n_jobs': [-1]
}
    
gs = GridSearchCV(RandomForestRegressor(random_state = 10), param_grid = params)
gs.fit(X_train, y_train)
bb_pred_r = gs.predict(X_test)

print('Cross val score:', round((gs.best_score_), 4))
print('MSE:', round(metrics.mean_squared_error(y_test, bb_pred_r), 4))
print('RMSE:', round(np.sqrt(metrics.mean_squared_error(y_test, bb_pred_r)), 4))
print('MAE:', round(metrics.mean_absolute_error(y_test, bb_pred_r), 4))
gs.best_params_

Cross val score: 0.0048
MSE: 0.1724
RMSE: 0.4152
MAE: 0.1563


{'max_depth': 5,
 'min_samples_leaf': 10,
 'min_samples_split': 2,
 'n_estimators': 75,
 'n_jobs': -1}

## Breaking Ball LHP

#### Slider, Curveball, Knuckle Curve

In [14]:
features_bb_l = lhp_breaking_ball.select_dtypes([np.number])
X = features_bb_l.drop(columns = ['re24'])
y = features_bb_l['re24']

X_train, X_test, y_train, y_test = train_test_split(X, y)

params = {
    'n_estimators': [25, 75],
    'max_depth': [None, 5, 6, 7, 8],
    'min_samples_split': [1, 2, 4],
    'min_samples_leaf': [2, 5, 10],
    'n_jobs': [-1]
}
    
gs = GridSearchCV(RandomForestRegressor(random_state = 10), param_grid = params)
gs.fit(X_train, y_train)
bb_pred_l = gs.predict(X_test)

print('Cross val score:', round((gs.best_score_), 4))
print('MSE:', round(metrics.mean_squared_error(y_test, bb_pred_l), 4))
print('RMSE:', round(np.sqrt(metrics.mean_squared_error(y_test, bb_pred_l)), 4))
print('MAE:', round(metrics.mean_absolute_error(y_test, bb_pred_l), 4))
gs.best_params_

Cross val score: 0.0059
MSE: 0.1487
RMSE: 0.3856
MAE: 0.1509


{'max_depth': 5,
 'min_samples_leaf': 10,
 'min_samples_split': 2,
 'n_estimators': 75,
 'n_jobs': -1}

## Offspeed RHP

#### Changeup, Splitter

In [15]:
features_os_r = rhp_offspeed.select_dtypes([np.number])
X = features_os_r.drop(columns = ['re24'])
y = features_os_r['re24']

X_train, X_test, y_train, y_test = train_test_split(X, y)

params = {
    'n_estimators': [25, 75],
    'max_depth': [None, 5, 6, 7, 8],
    'min_samples_split': [1, 2, 4],
    'min_samples_leaf': [2, 5, 10],
    'n_jobs': [-1]
}
    
gs = GridSearchCV(RandomForestRegressor(random_state = 10), param_grid = params)
gs.fit(X_train, y_train)
os_pred_r = gs.predict(X_test)

print('Cross val score:', round((gs.best_score_), 4))
print('MSE:', round(metrics.mean_squared_error(y_test, os_pred_r), 4))
print('RMSE:', round(np.sqrt(metrics.mean_squared_error(y_test, os_pred_r)), 4))
print('MAE:', round(metrics.mean_absolute_error(y_test, os_pred_r), 4))
gs.best_params_

Cross val score: 0.0041
MSE: 0.1849
RMSE: 0.43
MAE: 0.1659


{'max_depth': 5,
 'min_samples_leaf': 10,
 'min_samples_split': 2,
 'n_estimators': 75,
 'n_jobs': -1}

## Offspeed LHP

#### Changeup, Splitter

In [16]:
features_os_l = lhp_offspeed.select_dtypes([np.number])
X = features_os_l.drop(columns = ['re24'])
y = features_os_l['re24']

X_train, X_test, y_train, y_test = train_test_split(X, y)

params = {
    'n_estimators': [25, 75],
    'max_depth': [None, 5, 6, 7, 8],
    'min_samples_split': [1, 2, 4],
    'min_samples_leaf': [2, 5, 10],
    'n_jobs': [-1]
}
    
gs = GridSearchCV(RandomForestRegressor(random_state = 10), param_grid = params)
gs.fit(X_train, y_train)
os_pred_l = gs.predict(X_test)

print('Cross val score:', round((gs.best_score_), 4))
print('MSE:', round(metrics.mean_squared_error(y_test, os_pred_l), 4))
print('RMSE:', round(np.sqrt(metrics.mean_squared_error(y_test, os_pred_l)), 4))
print('MAE:', round(metrics.mean_absolute_error(y_test, os_pred_l), 4))
gs.best_params_

Cross val score: 0.0002
MSE: 0.1821
RMSE: 0.4268
MAE: 0.1721


{'max_depth': 5,
 'min_samples_leaf': 10,
 'min_samples_split': 2,
 'n_estimators': 75,
 'n_jobs': -1}