In [51]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split

In [69]:
df = pd.read_csv('../data/comb_clean_pitcher.csv', index_col=False)

In [53]:
df.head()

Unnamed: 0,player_name,pitcher,batter,stand,pitch_type,pitch_number,outs_when_up,times_faced,XBH,large_score_dif,recent_pitch,second_recent_pitch,third_recent_pitch,pitch_count
0,"Rodriguez, Grayson",680570,543760,R,FB,4,0,1,0,0,14.0 - SL - ball,2.0 - FB - foul,14.0 - FB - ball,2-1
1,"Rodriguez, Grayson",680570,543760,R,SL,5,0,1,0,0,9.0 - FB - called,14.0 - SL - ball,2.0 - FB - foul,2-2
2,"Rodriguez, Grayson",680570,543760,R,FB,6,0,1,0,0,14.0 - SL - ball,9.0 - FB - called,14.0 - SL - ball,3-2
3,"Rodriguez, Grayson",680570,608369,L,FB,4,0,1,0,0,14.0 - CH - ball,14.0 - CH - ball,4.0 - FB - called,2-1
4,"Rodriguez, Grayson",680570,608369,L,CH,5,0,1,0,0,3.0 - FB - foul,14.0 - CH - ball,14.0 - CH - ball,2-2


In [54]:
df.dropna(axis=0, inplace=True)

In [56]:
X = df.drop(columns=['player_name', 'pitcher', 'batter', 'pitch_type', 'pitch_number'])
y = df['pitch_type']

categorical_cols = ['stand', 'outs_when_up', 'times_faced', 'XBH',
       'large_score_dif', 'recent_pitch', 'second_recent_pitch',
       'third_recent_pitch', 'pitch_count']

In [57]:
X.columns

Index(['stand', 'outs_when_up', 'times_faced', 'XBH', 'large_score_dif',
       'recent_pitch', 'second_recent_pitch', 'third_recent_pitch',
       'pitch_count'],
      dtype='object')

In [58]:
from sklearn.compose import ColumnTransformer
from sklearn.tree import DecisionTreeClassifier
from sklearn.dummy import DummyClassifier
from sklearn.pipeline import Pipeline

In [59]:
# Initialize OneHotEncoder
ohe = OneHotEncoder(sparse=False, handle_unknown='ignore', drop='first')

# Split the encoded data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [60]:
dc = DummyClassifier(strategy='most_frequent')

In [61]:
dc.fit(X_train, y_train)

DummyClassifier(strategy='most_frequent')

In [62]:
dc.score(X_test, y_test)

0.5474821397456002

In [66]:
# Create a column transformer to apply encoding only to the categorical columns
preprocessor = ColumnTransformer(transformers=
                                 [('cat', ohe, [0, 1, 2, 3, 4, 5, 6, 7, 8])],
                                 remainder='passthrough')

In [70]:
preprocessor.fit(X_train)

preprocessor.transform(X_train).shape

ValueError: `handle_unknown` must be 'error' when the drop parameter is specified, as both would create categories that are all zero.

In [68]:
preprocessor.transform(X_test).shape

AttributeError: 'ColumnTransformer' object has no attribute 'transformers_'

In [32]:
pipe = Pipeline([('ct', preprocessor), ('dt', DecisionTreeClassifier())])

In [33]:
pipe.fit(X_train, y_train)
pipe.score(X_train, y_train)

ValueError: `handle_unknown` must be 'error' when the drop parameter is specified, as both would create categories that are all zero.

In [None]:
from sklearn.model_selection import cross_val_score

In [None]:
cross_val_score(pipe, X_train, y_train)

In [None]:
grid = {'dt__max_depth': [None, 20, 40, 60, 80, 100]}

In [None]:
from sklearn.model_selection import GridSearchCV

In [None]:
gs_pipe = GridSearchCV(estimator=pipe, param_grid=grid, verbose=2)
gs_pipe.fit(X_train, y_train)

In [None]:
gs_pipe.best_params_

In [None]:
gs_pipe.best_score_

In [None]:
gs_pipe.best_estimator_.score(X_train, y_train)

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
rf_pipe = Pipeline([('ct', preprocessor), ('rf', RandomForestClassifier())])

In [None]:
rf_pipe.fit(X_train, y_train)
rf_pipe.score(X_train, y_train)

In [None]:
cross_val_score(rf_pipe, X_train, y_train)