In [101]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split

In [102]:
df = pd.read_csv('../data/comb_clean_pitcher.csv', index_col=False)

In [103]:
df.head()

Unnamed: 0.1,Unnamed: 0,player_name,pitcher,batter,stand,pitch_type,pitch_number,outs_when_up,times_faced,XBH,large_score_dif,recent_pitch,second_recent_pitch,third_recent_pitch,pitch_count
0,3,"Rodriguez, Grayson",680570,543760,R,FB,4,0,1,0,0,14.0 - SL - ball,2.0 - FB - foul,14.0 - FB - ball,2-1
1,4,"Rodriguez, Grayson",680570,543760,R,SL,5,0,1,0,0,9.0 - FB - called,14.0 - SL - ball,2.0 - FB - foul,2-2
2,5,"Rodriguez, Grayson",680570,543760,R,FB,6,0,1,0,0,14.0 - SL - ball,9.0 - FB - called,14.0 - SL - ball,3-2
3,9,"Rodriguez, Grayson",680570,608369,L,FB,4,0,1,0,0,14.0 - CH - ball,14.0 - CH - ball,4.0 - FB - called,2-1
4,10,"Rodriguez, Grayson",680570,608369,L,CH,5,0,1,0,0,3.0 - FB - foul,14.0 - CH - ball,14.0 - CH - ball,2-2


In [104]:
df_clean = df.drop(df[df['pitch_count']== '4-2'].index) 



In [105]:
df_clean.dropna(axis=0, inplace=True)

In [106]:
X = df_clean.drop(columns=['player_name', 'pitcher', 'batter', 'pitch_type', 'pitch_number'])
y = df_clean['pitch_type']

categorical_cols = ['stand', 'outs_when_up', 'times_faced', 'XBH',
       'large_score_dif', 'recent_pitch', 'second_recent_pitch',
       'third_recent_pitch', 'pitch_count']

In [107]:
X.columns

Index(['Unnamed: 0', 'stand', 'outs_when_up', 'times_faced', 'XBH',
       'large_score_dif', 'recent_pitch', 'second_recent_pitch',
       'third_recent_pitch', 'pitch_count'],
      dtype='object')

In [108]:
from sklearn.compose import ColumnTransformer
from sklearn.tree import DecisionTreeClassifier
from sklearn.dummy import DummyClassifier
from sklearn.pipeline import Pipeline

In [109]:
# Initialize OneHotEncoder
ohe = OneHotEncoder(sparse=False, handle_unknown='error', drop='first')

# Split the encoded data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [110]:
dc = DummyClassifier(strategy='most_frequent')

In [111]:
dc.fit(X_train, y_train)

DummyClassifier(strategy='most_frequent')

In [112]:
dc.score(X_test, y_test)

0.540991461927165

In [113]:
# Create a column transformer to apply encoding only to the categorical columns
preprocessor = ColumnTransformer(transformers=
                                 [('cat', ohe, categorical_cols)],
                                 remainder='passthrough')

In [114]:
preprocessor.fit(X_train)

preprocessor.transform(X_train).shape

(68866, 628)

In [115]:
preprocessor.transform(X_test).shape

(22956, 628)

In [116]:
pipe = Pipeline([('ct', preprocessor), ('dt', DecisionTreeClassifier())])

In [17]:
pipe.fit(X_train, y_train)
pipe.score(X_train, y_train)

1.0

In [117]:
from sklearn.model_selection import cross_val_score

In [19]:
cross_val_score(pipe, X_train, y_train)

array([0.49266734, 0.49067015, 0.49037973, 0.49175924, 0.48841937])

In [20]:
grid = {'dt__max_depth': [None, 20, 40, 60, 80, 100]}

In [21]:
from sklearn.model_selection import GridSearchCV

In [37]:
gs_pipe = GridSearchCV(estimator=pipe, param_grid=grid, verbose=2, n_jobs=4)
gs_pipe.fit(X_train, y_train)

Fitting 5 folds for each of 6 candidates, totalling 30 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  30 out of  30 | elapsed:  1.1min finished


GridSearchCV(estimator=Pipeline(steps=[('ct',
                                        ColumnTransformer(remainder='passthrough',
                                                          transformers=[('cat',
                                                                         OneHotEncoder(drop='first',
                                                                                       sparse=False),
                                                                         ['stand',
                                                                          'outs_when_up',
                                                                          'times_faced',
                                                                          'XBH',
                                                                          'large_score_dif',
                                                                          'recent_pitch',
                                                              

In [23]:
gs_pipe.best_params_

{'dt__max_depth': 20}

In [24]:
gs_pipe.best_score_

0.5332530452302229

In [25]:
gs_pipe.best_estimator_.score(X_train, y_train)

0.7307089129614033

grid = {'dt**max_depth': [None, 20, 40, 60, 80, 100],
'dt**min_samples_split': [2, 3, 4, 5],
'dt\_\_min_samples_leaf': [1, 2, 3, 4],
'dt_min_impurity_decrease': [0, 1, 2, 3, 4]}


In [118]:
from sklearn.ensemble import RandomForestClassifier

In [119]:
rf_pipe = Pipeline([('ct', preprocessor), ('rf', RandomForestClassifier())])

In [28]:
rf_pipe.fit(X_train, y_train)
rf_pipe.score(X_train, y_train)

1.0

In [29]:
cross_val_score(rf_pipe, X_train, y_train)

array([0.54065631, 0.54352719, 0.546141  , 0.54476149, 0.5424381 ])

In [31]:
from sklearn.ensemble import GradientBoostingClassifier


In [120]:
gb = GradientBoostingClassifier(n_estimators=100, learning_rate=.1, max_depth=3)

In [35]:
gb_pipe=Pipeline([('ct', preprocessor), ('gb', GradientBoostingClassifier())])

In [36]:
gb_pipe.fit(X_train,y_train)
gb_pipe.score(X_train, y_train)

0.5578369587314495

In [44]:
grid_gb = {'gb__learning_rate': [0.001,]}

In [45]:
grid_gb = GridSearchCV(estimator=gb_pipe, param_grid=grid_gb, verbose=2, n_jobs=4)
gb_pipe.fit(X_train, y_train)

Pipeline(steps=[('ct',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('cat',
                                                  OneHotEncoder(drop='first',
                                                                sparse=False),
                                                  ['stand', 'outs_when_up',
                                                   'times_faced', 'XBH',
                                                   'large_score_dif',
                                                   'recent_pitch',
                                                   'second_recent_pitch',
                                                   'third_recent_pitch',
                                                   'pitch_count'])])),
                ('gb', GradientBoostingClassifier())])

In [46]:
cross_val_score(gb_pipe, X_train, y_train)

array([0.55089299, 0.55013432, 0.55165904, 0.55173165, 0.55194947])

In [47]:
from sklearn.model_selection import RandomizedSearchCV

In [48]:
param_dist = {
    'gb__n_estimators': np.arange(50, 150),  # number of trees, e.g., 50 to 150
    'gb__learning_rate': [0.001, 0.01, 0.1, 1],  # learning rate
    'gb__max_depth': np.arange(3, 10),  # depth of trees
    'gb__min_samples_split': np.arange(2, 10),  # minimum number of samples required to split a node
    'gb__min_samples_leaf': np.arange(1, 10),  # minimum number of samples required at a leaf node
}


In [49]:

random_search = RandomizedSearchCV(estimator=gb_pipe,
                                   param_distributions=param_dist,
                                   n_iter=10,  
                                   cv=5,  
                                   verbose=2,  
                                   random_state=42, 
                                   n_jobs=4)  


In [50]:
random_search.fit(X_train, y_train)


Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  33 tasks      | elapsed: 206.1min
[Parallel(n_jobs=4)]: Done  50 out of  50 | elapsed: 333.7min finished


KeyboardInterrupt: 

In [51]:
print("Best Parameters: ", random_search.best_params_)
print("Best Score: ", random_search.best_score_)


Best Parameters:  {'gb__n_estimators': 117, 'gb__min_samples_split': 6, 'gb__min_samples_leaf': 4, 'gb__max_depth': 9, 'gb__learning_rate': 0.1}
Best Score:  0.5617430672956669


In [122]:
def transform_pitch_types_in_string(column):
    """
    Transform specific substrings in a column to 'OS'.
    Substrings 'SL', 'CH', 'CB' are changed to 'OS', other parts of the string are left unchanged.

    Args:
    column (pd.Series): A pandas Series representing the column to be transformed.

    Returns:
    pd.Series: The transformed column.
    """
    def replace_substring(pitch):
        # Replace specific substrings with 'OS'
        for sub in ['SL', 'CH', 'CB']:
            pitch = pitch.replace(sub, 'OS')
        return pitch

    return column.apply(replace_substring)

In [123]:
df_bi = df_clean
df_bi['pitch_type'] = transform_pitch_types(df_bi['pitch_type'])
df_bi['recent_pitch'] = transform_pitch_types_in_string(df_bi['recent_pitch'])
df_bi['second_recent_pitch'] = transform_pitch_types_in_string(df_bi['second_recent_pitch'])
df_bi['third_recent_pitch'] = transform_pitch_types_in_string(df_bi['third_recent_pitch'])

In [124]:
X_bi = df_bi.drop(columns=['player_name', 'pitcher', 'batter', 'pitch_type', 'pitch_number'])
y_bi = df_bi['pitch_type']

In [125]:
# Split the encoded data into training and testing sets
X_train_bi, X_test_bi, y_train_bi, y_test_bi = train_test_split(X_bi, y_bi, random_state=42)

In [126]:
ohe_bi = OneHotEncoder(sparse=False, handle_unknown='error', drop='first')

In [127]:
preprocessor_bi = ColumnTransformer(transformers=
                                 [('cat', ohe_bi, [0, 1, 2, 3, 4, 5, 6, 7, 8])],
                                 remainder='passthrough')

X_train_transformed = preprocessor_bi.fit_transform(X_train_bi)

gb_pipe=Pipeline([('ct', preprocessor), ('gb', GradientBoostingClassifier())])

param_dist = {
    'gb__n_estimators': sp_randint(100, 1000),  # number of trees in the forest
    'gb__max_depth': [None] + list(range(5, 50)),  # maximum depth of the tree
    'gb__min_samples_split': sp_randint(2, 11),  # minimum number of samples required to split an internal node
    'gb__min_samples_leaf': sp_randint(1, 11)  # minimum number of samples required to be at a leaf node
}


gb_random_search = RandomizedSearchCV(estimator=gb_pipe,
                                     param_distributions=param_dist,
                                     n_liter=10,
                                     cv=5,
                                     verbose=2,
                                     random_search=42,
                                     n_jobs=4)
gb_random_search.fit(X_train_bi,y_train_bi)



MemoryError: Unable to allocate 35.5 GiB for an array with shape (68866, 69192) and data type float64

In [92]:
gb_pipe.fit(X_train_transformed, y_train_bi)


(91822, 15)

In [60]:
random_search.fit(X_train_transformed, y_train_bi)

KeyboardInterrupt: 

In [None]:
#this was taking to long and we had to switch another in gradientboostbi notebook