In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split

In [2]:
df = pd.read_csv('../data/comb_clean_pitcher.csv', index_col=False)

In [3]:
df.head()

Unnamed: 0,player_name,pitcher,batter,stand,pitch_type,pitch_number,outs_when_up,times_faced,XBH,large_score_dif,recent_pitch,second_recent_pitch,third_recent_pitch,pitch_count
0,"Rodriguez, Grayson",680570,543760,R,FB,4,0,1,0,0,14.0 - SL - ball,2.0 - FB - foul,14.0 - FB - ball,2-1
1,"Rodriguez, Grayson",680570,543760,R,SL,5,0,1,0,0,9.0 - FB - called,14.0 - SL - ball,2.0 - FB - foul,2-2
2,"Rodriguez, Grayson",680570,543760,R,FB,6,0,1,0,0,14.0 - SL - ball,9.0 - FB - called,14.0 - SL - ball,3-2
3,"Rodriguez, Grayson",680570,608369,L,FB,4,0,1,0,0,14.0 - CH - ball,14.0 - CH - ball,4.0 - FB - called,2-1
4,"Rodriguez, Grayson",680570,608369,L,CH,5,0,1,0,0,3.0 - FB - foul,14.0 - CH - ball,14.0 - CH - ball,2-2


In [4]:
df_clean = df.drop(df[df['pitch_count']== '4-2'].index)

In [5]:
df_clean.dropna(axis=0, inplace=True)

In [6]:
X = df_clean.drop(columns=['player_name', 'pitcher', 'batter', 'pitch_type', 'pitch_number'])
y = df_clean['pitch_type']

categorical_cols = ['stand', 'outs_when_up', 'times_faced', 'XBH',
       'large_score_dif', 'recent_pitch', 'second_recent_pitch',
       'third_recent_pitch', 'pitch_count']

In [7]:
X.columns

Index(['stand', 'outs_when_up', 'times_faced', 'XBH', 'large_score_dif',
       'recent_pitch', 'second_recent_pitch', 'third_recent_pitch',
       'pitch_count'],
      dtype='object')

In [8]:
from sklearn.compose import ColumnTransformer
from sklearn.tree import DecisionTreeClassifier
from sklearn.dummy import DummyClassifier
from sklearn.pipeline import Pipeline

In [9]:
# Initialize OneHotEncoder
ohe = OneHotEncoder(sparse=False, handle_unknown='error', drop='first')

# Split the encoded data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [10]:
dc = DummyClassifier(strategy='most_frequent')

In [11]:
dc.fit(X_train, y_train)

DummyClassifier(strategy='most_frequent')

In [12]:
dc.score(X_test, y_test)

0.540991461927165

In [13]:
# Create a column transformer to apply encoding only to the categorical columns
preprocessor = ColumnTransformer(transformers=
                                 [('cat', ohe, [0, 1, 2, 3, 4, 5, 6, 7, 8])],
                                 remainder='passthrough')

In [14]:
preprocessor.fit(X_train)

preprocessor.transform(X_train).shape

(68866, 627)

In [15]:
preprocessor.transform(X_test).shape

(22956, 627)

In [16]:
pipe = Pipeline([('ct', preprocessor), ('dt', DecisionTreeClassifier())])

In [17]:
pipe.fit(X_train, y_train)
pipe.score(X_train, y_train)

0.9954694624342927

In [18]:
from sklearn.model_selection import cross_val_score

In [19]:
cross_val_score(pipe, X_train, y_train)

array([0.44090315, 0.44442024, 0.44601757, 0.44507369, 0.43948305])

In [20]:
grid = {'dt__max_depth': [None, 20, 40, 60, 80, 100]}

In [21]:
from sklearn.model_selection import GridSearchCV

In [22]:
gs_pipe = GridSearchCV(estimator=pipe, param_grid=grid, verbose=2, n_jobs=6)
gs_pipe.fit(X_train, y_train)

Fitting 5 folds for each of 6 candidates, totalling 30 fits


[Parallel(n_jobs=6)]: Using backend LokyBackend with 6 concurrent workers.
[Parallel(n_jobs=6)]: Done  30 out of  30 | elapsed:   26.1s finished


GridSearchCV(estimator=Pipeline(steps=[('ct',
                                        ColumnTransformer(remainder='passthrough',
                                                          transformers=[('cat',
                                                                         OneHotEncoder(drop='first',
                                                                                       sparse=False),
                                                                         [0, 1,
                                                                          2, 3,
                                                                          4, 5,
                                                                          6, 7,
                                                                          8])])),
                                       ('dt', DecisionTreeClassifier())]),
             n_jobs=6,
             param_grid={'dt__max_depth': [None, 20, 40, 60, 80, 100]},
             ver

In [23]:
gs_pipe.best_params_

{'dt__max_depth': 20}

In [24]:
gs_pipe.best_score_

0.5375076515752507

In [25]:
gs_pipe.best_estimator_.score(X_train, y_train)

0.5807945865884472

grid = {'dt__max_depth': [None, 20, 40, 60, 80, 100], 
        'dt__min_samples_split': [2, 3, 4, 5], 
        'dt__min_samples_leaf': [1, 2, 3, 4],
        'dt_min_impurity_decrease': [0, 1, 2, 3, 4]}

In [26]:
from sklearn.ensemble import RandomForestClassifier

In [27]:
rf_pipe = Pipeline([('ct', preprocessor), ('rf', RandomForestClassifier())])

In [28]:
rf_pipe.fit(X_train, y_train)
rf_pipe.score(X_train, y_train)

0.9954694624342927

In [29]:
cross_val_score(rf_pipe, X_train, y_train)

array([0.52896762, 0.52552095, 0.52893342, 0.53256371, 0.53053075])

In [30]:
from sklearn.ensemble import GradientBoostingClassifier

In [37]:
gb_pipe = Pipeline([('ct', preprocessor), ('gbc', GradientBoostingClassifier())])

In [38]:
grid_gb = {
    'gbc__n_estimators': [100, 200, 300],
    'gbc__learning_rate': [0.01, 0.1, 0.2],
    'gbc__max_depth': [3, 4, 5]
}

In [39]:
grid_search_pipe = GridSearchCV(estimator=gb_pipe, param_grid=grid_gb, verbose=2, n_jobs=6)
grid_search_pipe.fit(X_train, y_train)

Fitting 5 folds for each of 27 candidates, totalling 135 fits


[Parallel(n_jobs=6)]: Using backend LokyBackend with 6 concurrent workers.
[Parallel(n_jobs=6)]: Done  29 tasks      | elapsed: 54.1min
[Parallel(n_jobs=6)]: Done 135 out of 135 | elapsed: 267.3min finished


GridSearchCV(estimator=Pipeline(steps=[('ct',
                                        ColumnTransformer(remainder='passthrough',
                                                          transformers=[('cat',
                                                                         OneHotEncoder(drop='first',
                                                                                       sparse=False),
                                                                         [0, 1,
                                                                          2, 3,
                                                                          4, 5,
                                                                          6, 7,
                                                                          8])])),
                                       ('gbc', GradientBoostingClassifier())]),
             n_jobs=6,
             param_grid={'gbc__learning_rate': [0.01, 0.1, 0.2],
                  

In [43]:
results = grid_search_pipe.cv_results_

In [44]:
results

{'mean_fit_time': array([ 262.50231962,  526.40781307,  789.8827508 ,  344.54541221,
         690.97784553, 1034.29715147,  421.51268373,  849.33186588,
        1307.11343713,  265.86264687,  537.16967072,  805.4545989 ,
         354.55788903,  716.26918664, 1065.55926161,  441.76349034,
         878.55363035, 1324.76487474,  267.69245038,  533.38308501,
         812.33494215,  357.61126924,  710.48937249, 1061.68079839,
         437.93479967,  882.16453881, 1220.09946322]),
 'std_fit_time': array([ 2.37804454,  5.49145341,  8.8925241 ,  5.32628709,  8.22750433,
        12.66732666,  6.82551891,  8.86458489, 11.42547127,  1.59403068,
         5.62579845,  9.94947851,  4.1897297 ,  4.78701948,  9.02397188,
         3.79517679,  7.87163372, 16.89576473,  3.04501727,  6.30818803,
         5.82376864,  4.62673436,  6.98698975, 11.29517565,  4.63953782,
        10.34233445, 25.82420514]),
 'mean_score_time': array([0.11126685, 0.19634809, 0.30457501, 0.14538279, 0.26709614,
        0.415579

In [45]:
grid_search_pipe.best_score_

0.546873631952955

In [46]:
grid_search_pipe.best_params_

{'gbc__learning_rate': 0.1, 'gbc__max_depth': 3, 'gbc__n_estimators': 100}

In [47]:
grid_search_pipe.best_estimator_.score(X_train, y_train)

0.554017947898818

In [35]:
gbc = GradientBoostingClassifier()
parameters = gbc.get_params()

In [36]:
parameters

{'ccp_alpha': 0.0,
 'criterion': 'friedman_mse',
 'init': None,
 'learning_rate': 0.1,
 'loss': 'deviance',
 'max_depth': 3,
 'max_features': None,
 'max_leaf_nodes': None,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_iter_no_change': None,
 'presort': 'deprecated',
 'random_state': None,
 'subsample': 1.0,
 'tol': 0.0001,
 'validation_fraction': 0.1,
 'verbose': 0,
 'warm_start': False}

In [42]:
gb_pipe.fit(X_train, y_train)
gb_pipe.score(X_train, y_train)

KeyboardInterrupt: 