In [3]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split

In [4]:
# Load the pitcher CSV. index_col=False tells pandas not to use the first
# column of the file as the row index.
df = pd.read_csv(
    filepath_or_buffer='../data/comb_clean_pitcher.csv',
    index_col=False,
)

In [5]:
# Preview the first five rows of the loaded frame.
df.head(n=5)

Unnamed: 0,player_name,pitcher,batter,stand,pitch_type,pitch_number,outs_when_up,times_faced,XBH,large_score_dif,recent_pitch,second_recent_pitch,third_recent_pitch,pitch_count
0,"Rodriguez, Grayson",680570,543760,R,FB,4,0,1,0,0,14.0 - SL - ball,2.0 - FB - foul,14.0 - FB - ball,2-1
1,"Rodriguez, Grayson",680570,543760,R,SL,5,0,1,0,0,9.0 - FB - called,14.0 - SL - ball,2.0 - FB - foul,2-2
2,"Rodriguez, Grayson",680570,543760,R,FB,6,0,1,0,0,14.0 - SL - ball,9.0 - FB - called,14.0 - SL - ball,3-2
3,"Rodriguez, Grayson",680570,608369,L,FB,4,0,1,0,0,14.0 - CH - ball,14.0 - CH - ball,4.0 - FB - called,2-1
4,"Rodriguez, Grayson",680570,608369,L,CH,5,0,1,0,0,3.0 - FB - foul,14.0 - CH - ball,14.0 - CH - ball,2-2


In [6]:
# Discard every row that contains at least one missing value. Rebinding `df`
# yields the same frame as the in-place call, without mutating the object.
df = df.dropna(axis='index')

In [9]:
# Target: the pitch type for each row.
y = df['pitch_type']

# Features: every column except the player/pitcher/batter identifiers, the
# target itself, and the pitch number.
X = df.drop(
    columns=[
        'player_name',
        'pitcher',
        'batter',
        'pitch_type',
        'pitch_number',
    ]
)

# Names of the feature columns that are treated as categorical.
categorical_cols = [
    'stand',
    'outs_when_up',
    'times_faced',
    'XBH',
    'large_score_dif',
    'recent_pitch',
    'second_recent_pitch',
    'third_recent_pitch',
    'pitch_count',
]

In [15]:
# Show which feature columns remain in X after the identifier/target drop.
X.columns

Index(['stand', 'outs_when_up', 'times_faced', 'XBH', 'large_score_dif',
       'recent_pitch', 'second_recent_pitch', 'third_recent_pitch',
       'pitch_count'],
      dtype='object')

In [16]:
from sklearn.compose import ColumnTransformer
from sklearn.tree import DecisionTreeClassifier
from sklearn.dummy import DummyClassifier
from sklearn.pipeline import Pipeline

In [17]:
# One-hot encoder producing a dense array. `sparse_output` replaces the `sparse`
# keyword, which was deprecated in scikit-learn 1.2 and removed in 1.4.
# NOTE(review): with drop='first' and handle_unknown='ignore', a category that
# was not seen during fit is encoded as all zeros -- the same encoding as the
# dropped first category -- so the model cannot tell the two apart.
ohe = OneHotEncoder(sparse_output=False, handle_unknown='ignore', drop='first')

# Split the raw (not yet encoded) features and target into train and test sets.
# Encoding is applied afterwards, so the encoder is fit on training rows only.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [27]:
# Baseline classifier: the 'most_frequent' strategy always predicts the most
# common label seen in the training target, ignoring the features.
dc = DummyClassifier(strategy='most_frequent')

In [28]:
# Fit the baseline on the training split.
dc.fit(X=X_train, y=y_train)

In [31]:
# Baseline accuracy on the held-out split; later models are compared to this.
dc.score(X=X_test, y=y_test)

0.5474821397456002

In [None]:
# Column transformer that one-hot encodes the categorical columns. Columns are
# selected by name through `categorical_cols` instead of hard-coded positions,
# so the selection stays correct if feature columns are reordered or added.
preprocessor = ColumnTransformer(
    transformers=[('cat', ohe, categorical_cols)],
    # Any column not listed in `categorical_cols` is passed through unchanged.
    remainder='passthrough',
)

In [19]:
# Fit the transformer on the training features and, in the same call, report
# the shape of the encoded training matrix.
preprocessor.fit_transform(X_train).shape



(68867, 628)

In [20]:
# Encode the held-out features with the already-fitted transformer and check
# that the encoded matrix has the same number of columns as the training one.
preprocessor.transform(X_test).shape

(22956, 628)

In [21]:
# Pipeline: one-hot encode, then fit a decision tree. The tree is seeded because
# scikit-learn randomly permutes features at each split, so an unseeded tree can
# differ from run to run; 42 matches the seed used for the train/test split.
pipe = Pipeline([
    ('ct', preprocessor),
    ('dt', DecisionTreeClassifier(random_state=42)),
])

In [23]:
# Train the pipeline on the training split, then score it on that same split.
# This is training accuracy, so it is an optimistic figure.
pipe.fit(X_train, y_train).score(X_train, y_train)



0.9953969245066577

In [24]:
from sklearn.model_selection import cross_val_score

In [26]:
# Cross-validated accuracy of the decision-tree pipeline on the training split,
# using the default fold count and the estimator's own scorer.
cross_val_score(estimator=pipe, X=X_train, y=y_train)



array([0.44010454, 0.44271816, 0.44478327, 0.44187904, 0.44289552])

In [32]:
# Search space for the tree depth of the 'dt' pipeline step: unlimited (None)
# plus 20 through 100 in steps of 20.
grid = {'dt__max_depth': [None] + list(range(20, 101, 20))}

In [33]:
from sklearn.model_selection import GridSearchCV

In [35]:
# Exhaustive cross-validated search over `grid` for the decision-tree pipeline.
# verbose=2 prints one log line per individual fit.
gs_pipe = GridSearchCV(pipe, grid, verbose=2)
gs_pipe.fit(X_train, y_train)

Fitting 5 folds for each of 6 candidates, totalling 30 fits




[CV] END .................................dt__max_depth=None; total time=   8.1s




[CV] END .................................dt__max_depth=None; total time=   8.6s




[CV] END .................................dt__max_depth=None; total time=   8.7s




[CV] END .................................dt__max_depth=None; total time=   8.1s




[CV] END .................................dt__max_depth=None; total time=   8.6s
[CV] END ...................................dt__max_depth=20; total time=   3.2s




[CV] END ...................................dt__max_depth=20; total time=   3.4s




[CV] END ...................................dt__max_depth=20; total time=   3.4s




[CV] END ...................................dt__max_depth=20; total time=   3.2s




[CV] END ...................................dt__max_depth=20; total time=   3.7s
[CV] END ...................................dt__max_depth=40; total time=   5.0s




[CV] END ...................................dt__max_depth=40; total time=   5.5s




[CV] END ...................................dt__max_depth=40; total time=   5.4s




[CV] END ...................................dt__max_depth=40; total time=   4.9s




[CV] END ...................................dt__max_depth=40; total time=   5.1s
[CV] END ...................................dt__max_depth=60; total time=   6.1s




[CV] END ...................................dt__max_depth=60; total time=   6.3s




[CV] END ...................................dt__max_depth=60; total time=   7.0s




[CV] END ...................................dt__max_depth=60; total time=   6.1s




[CV] END ...................................dt__max_depth=60; total time=   6.5s
[CV] END ...................................dt__max_depth=80; total time=   6.9s




[CV] END ...................................dt__max_depth=80; total time=   7.0s




[CV] END ...................................dt__max_depth=80; total time=   7.2s




[CV] END ...................................dt__max_depth=80; total time=   6.6s




[CV] END ...................................dt__max_depth=80; total time=   7.0s
[CV] END ..................................dt__max_depth=100; total time=   7.2s




[CV] END ..................................dt__max_depth=100; total time=   7.6s




[CV] END ..................................dt__max_depth=100; total time=   7.7s




[CV] END ..................................dt__max_depth=100; total time=   7.2s




[CV] END ..................................dt__max_depth=100; total time=   7.5s


In [36]:
# Parameter setting from `grid` that achieved the best cross-validated score.
gs_pipe.best_params_

{'dt__max_depth': 20}

In [37]:
# Mean cross-validated score of the best parameter setting.
gs_pipe.best_score_

0.5360913140674567

In [40]:
# Training-split accuracy of the refit best pipeline, for comparison with its
# cross-validated score.
gs_pipe.best_estimator_.score(X=X_train, y=y_train)

0.5745712750664324

# Wider search space for the decision-tree step. Every key must use the
# `<step>__<param>` double-underscore form so the pipeline can route it to the
# 'dt' step; a single underscore is rejected as an invalid parameter.
grid = {
    'dt__max_depth': [None, 20, 40, 60, 80, 100],
    'dt__min_samples_split': [2, 3, 4, 5],
    'dt__min_samples_leaf': [1, 2, 3, 4],
    # The weighted impurity decrease of a split is always below 1 for Gini (the
    # tree's default criterion), so thresholds of 1 or more would block every
    # split. Search small fractions instead.
    'dt__min_impurity_decrease': [0.0, 0.0001, 0.001, 0.01],
}

In [41]:
from sklearn.ensemble import RandomForestClassifier

In [42]:
# Pipeline: one-hot encode, then fit a random forest. The forest is seeded
# because its bootstrap sampling and per-split feature sampling are random, so
# an unseeded forest gives different scores on every run; 42 matches the seed
# used for the train/test split.
rf_pipe = Pipeline([
    ('ct', preprocessor),
    ('rf', RandomForestClassifier(random_state=42)),
])

In [43]:
# Train the random-forest pipeline on the training split, then score it on that
# same split. This is training accuracy, so it is an optimistic figure.
rf_pipe.fit(X_train, y_train).score(X_train, y_train)



0.9953969245066577

In [44]:
# Cross-validated accuracy of the random-forest pipeline on the training split,
# using the default fold count and the estimator's own scorer.
cross_val_score(estimator=rf_pipe, X=X_train, y=y_train)



array([0.52468419, 0.52410338, 0.52646482, 0.52719088, 0.52109199])