In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split

In [2]:
df = pd.read_csv('../data/comb_clean_pitcher.csv', index_col=False)

In [3]:
df.head()

Unnamed: 0,player_name,pitcher,batter,stand,pitch_type,pitch_number,outs_when_up,times_faced,XBH,large_score_dif,recent_pitch,second_recent_pitch,third_recent_pitch,pitch_count
0,"Rodriguez, Grayson",680570,543760,R,FB,4,0,1,0,0,14.0 - SL - ball,2.0 - FB - foul,14.0 - FB - ball,2-1
1,"Rodriguez, Grayson",680570,543760,R,SL,5,0,1,0,0,9.0 - FB - called,14.0 - SL - ball,2.0 - FB - foul,2-2
2,"Rodriguez, Grayson",680570,543760,R,FB,6,0,1,0,0,14.0 - SL - ball,9.0 - FB - called,14.0 - SL - ball,3-2
3,"Rodriguez, Grayson",680570,608369,L,FB,4,0,1,0,0,14.0 - CH - ball,14.0 - CH - ball,4.0 - FB - called,2-1
4,"Rodriguez, Grayson",680570,608369,L,CH,5,0,1,0,0,3.0 - FB - foul,14.0 - CH - ball,14.0 - CH - ball,2-2


In [4]:
df_clean = df.drop(df[df['pitch_count']== '4-2'].index)

In [5]:
df_clean.dropna(axis=0, inplace=True)

In [6]:
X = df_clean.drop(columns=['player_name', 'pitcher', 'batter', 'pitch_type', 'pitch_number'])
y = df_clean['pitch_type']

categorical_cols = ['stand', 'outs_when_up', 'times_faced', 'XBH',
       'large_score_dif', 'recent_pitch', 'second_recent_pitch',
       'third_recent_pitch', 'pitch_count']

In [7]:
X.columns

Index(['stand', 'outs_when_up', 'times_faced', 'XBH', 'large_score_dif',
       'recent_pitch', 'second_recent_pitch', 'third_recent_pitch',
       'pitch_count'],
      dtype='object')

In [8]:
from sklearn.compose import ColumnTransformer
from sklearn.tree import DecisionTreeClassifier
from sklearn.dummy import DummyClassifier
from sklearn.pipeline import Pipeline

In [9]:
# Initialize OneHotEncoder
ohe = OneHotEncoder(sparse=False, handle_unknown='error', drop='first')

# Split the encoded data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [10]:
dc = DummyClassifier(strategy='most_frequent')

In [11]:
dc.fit(X_train, y_train)

DummyClassifier(strategy='most_frequent')

In [12]:
dc.score(X_test, y_test)

0.540991461927165

In [13]:
# Create a column transformer to apply encoding only to the categorical columns
preprocessor = ColumnTransformer(transformers=
                                 [('cat', ohe, [0, 1, 2, 3, 4, 5, 6, 7, 8])],
                                 remainder='passthrough')

In [14]:
preprocessor.fit(X_train)

preprocessor.transform(X_train).shape

(68866, 627)

In [15]:
preprocessor.transform(X_test).shape

(22956, 627)

In [16]:
from sklearn.metrics import classification_report
from sklearn.utils.class_weight import compute_class_weight

# Compute class weights for the original unbalanced training data
class_weights = compute_class_weight(class_weight='balanced', classes=np.unique(y_train), y=y_train)
class_weight_dict = dict(zip(np.unique(y_train), class_weights))

In [17]:
class_weight_dict

{'CB': 2.2058295964125563,
 'CH': 1.7690608302507194,
 'FB': 0.4591311536615286,
 'SL': 1.2447762273154508}

In [None]:
# Initialize the Decision Tree Classifier with class weights
dt_classifier = DecisionTreeClassifier(class_weight=class_weight_dict, random_state=42)

# Fit the classifier to the training data
dt_classifier.fit(X_train, y_train)

# Predict on the test data
y_pred_dt = dt_classifier.predict(X_test)

# Generate classification report
report_dt = classification_report(y_test, y_pred_dt)

report_dt

In [18]:
pipe = Pipeline([('ct', preprocessor), ('dt', DecisionTreeClassifier())])

In [19]:
pipe.fit(X_train, y_train)
pipe.score(X_train, y_train)

0.9954694624342927

In [20]:
from sklearn.model_selection import cross_val_score

In [21]:
cross_val_score(pipe, X_train, y_train)

array([0.43937854, 0.44347637, 0.44398461, 0.44202425, 0.4426051 ])

In [29]:
grid = {'dt__max_depth': [None, 20, 40, 60, 80],
       'dt__class_weight': [None, 'balanced', {'CB': 1.7515736766809729,
 'CH': 1.41,
 'FB': 0.368087185268696,
 'SL': 0.9970276872964169}]}

In [30]:
from sklearn.model_selection import GridSearchCV

In [31]:
gs_pipe = GridSearchCV(estimator=pipe, param_grid=grid, verbose=2, n_jobs=6)
gs_pipe.fit(X_train, y_train)

Fitting 5 folds for each of 15 candidates, totalling 75 fits


[Parallel(n_jobs=6)]: Using backend LokyBackend with 6 concurrent workers.
[Parallel(n_jobs=6)]: Done  29 tasks      | elapsed:   27.5s
[Parallel(n_jobs=6)]: Done  75 out of  75 | elapsed:  1.1min finished


GridSearchCV(estimator=Pipeline(steps=[('ct',
                                        ColumnTransformer(remainder='passthrough',
                                                          transformers=[('cat',
                                                                         OneHotEncoder(drop='first',
                                                                                       sparse=False),
                                                                         [0, 1,
                                                                          2, 3,
                                                                          4, 5,
                                                                          6, 7,
                                                                          8])])),
                                       ('dt', DecisionTreeClassifier())]),
             n_jobs=6,
             param_grid={'dt__class_weight': [None, 'balanced',
                        

In [32]:
gs_pipe.best_params_

{'dt__class_weight': None, 'dt__max_depth': 20}

In [33]:
gs_pipe.best_score_

0.5374495764050622

In [34]:
gs_pipe.best_estimator_.score(X_train, y_train)

0.5806929399122934

grid = {'dt__max_depth': [None, 20, 40, 60, 80, 100], 
        'dt__min_samples_split': [2, 3, 4, 5], 
        'dt__min_samples_leaf': [1, 2, 3, 4],
        'dt_min_impurity_decrease': [0, 1, 2, 3, 4]}

In [35]:
from sklearn.ensemble import RandomForestClassifier

In [36]:
rf_pipe = Pipeline([('ct', preprocessor), ('rf', RandomForestClassifier())])

In [38]:
grid_search_rf = {'rf__max_depth': [None, 25, 35, 45, 60],
       'rf__class_weight': ['balanced', 'balanced_subsample', {'CB': 1.7515736766809729,
 'CH': 1.41,
 'FB': 0.368087185268696,
 'SL': 0.9970276872964169}]}

In [40]:
rf_pipe = GridSearchCV(estimator=rf_pipe, param_grid=grid_search_rf, verbose=3, n_jobs=6)
rf_pipe.fit(X_train, y_train)

Fitting 5 folds for each of 15 candidates, totalling 75 fits


[Parallel(n_jobs=6)]: Using backend LokyBackend with 6 concurrent workers.
[Parallel(n_jobs=6)]: Done  20 tasks      | elapsed:  1.0min
[Parallel(n_jobs=6)]: Done  75 out of  75 | elapsed:  3.6min finished


GridSearchCV(estimator=Pipeline(steps=[('ct',
                                        ColumnTransformer(remainder='passthrough',
                                                          transformers=[('cat',
                                                                         OneHotEncoder(drop='first',
                                                                                       sparse=False),
                                                                         [0, 1,
                                                                          2, 3,
                                                                          4, 5,
                                                                          6, 7,
                                                                          8])])),
                                       ('rf', RandomForestClassifier())]),
             n_jobs=6,
             param_grid={'rf__class_weight': ['balanced', 'balanced_subsample',
        

In [41]:
print(rf_pipe.best_params_)
print(rf_pipe.best_score_)
print(rf_pipe.best_estimator_.score(X_train, y_train))

{'rf__class_weight': 'balanced', 'rf__max_depth': None}
0.5278511635660333
0.9953968576656115


In [42]:
cross_val_score(rf_pipe, X_train, y_train)

Fitting 5 folds for each of 15 candidates, totalling 75 fits


[Parallel(n_jobs=6)]: Using backend LokyBackend with 6 concurrent workers.
[Parallel(n_jobs=6)]: Done  20 tasks      | elapsed:   45.0s
[Parallel(n_jobs=6)]: Done  75 out of  75 | elapsed:  2.7min finished


Fitting 5 folds for each of 15 candidates, totalling 75 fits


[Parallel(n_jobs=6)]: Using backend LokyBackend with 6 concurrent workers.


Fitting 5 folds for each of 15 candidates, totalling 75 fits


joblib.externals.loky.process_executor._RemoteTraceback: 
"""
Traceback (most recent call last):
  File "/home/dreampy/anaconda3/envs/learn-env/lib/python3.8/site-packages/joblib/externals/loky/process_executor.py", line 431, in _process_worker
    r = call_item()
  File "/home/dreampy/anaconda3/envs/learn-env/lib/python3.8/site-packages/joblib/externals/loky/process_executor.py", line 285, in __call__
    return self.fn(*self.args, **self.kwargs)
  File "/home/dreampy/anaconda3/envs/learn-env/lib/python3.8/site-packages/joblib/_parallel_backends.py", line 595, in __call__
    return self.func(*args, **kwargs)
  File "/home/dreampy/anaconda3/envs/learn-env/lib/python3.8/site-packages/joblib/parallel.py", line 262, in __call__
    return [func(*args, **kwargs)
  File "/home/dreampy/anaconda3/envs/learn-env/lib/python3.8/site-packages/joblib/parallel.py", line 262, in <listcomp>
    return [func(*args, **kwargs)
  File "/home/dreampy/anaconda3/envs/learn-env/lib/python3.8/site-packages/s

Fitting 5 folds for each of 15 candidates, totalling 75 fits


[Parallel(n_jobs=6)]: Using backend LokyBackend with 6 concurrent workers.


Fitting 5 folds for each of 15 candidates, totalling 75 fits


joblib.externals.loky.process_executor._RemoteTraceback: 
"""
Traceback (most recent call last):
  File "/home/dreampy/anaconda3/envs/learn-env/lib/python3.8/site-packages/joblib/externals/loky/process_executor.py", line 431, in _process_worker
    r = call_item()
  File "/home/dreampy/anaconda3/envs/learn-env/lib/python3.8/site-packages/joblib/externals/loky/process_executor.py", line 285, in __call__
    return self.fn(*self.args, **self.kwargs)
  File "/home/dreampy/anaconda3/envs/learn-env/lib/python3.8/site-packages/joblib/_parallel_backends.py", line 595, in __call__
    return self.func(*args, **kwargs)
  File "/home/dreampy/anaconda3/envs/learn-env/lib/python3.8/site-packages/joblib/parallel.py", line 262, in __call__
    return [func(*args, **kwargs)
  File "/home/dreampy/anaconda3/envs/learn-env/lib/python3.8/site-packages/joblib/parallel.py", line 262, in <listcomp>
    return [func(*args, **kwargs)
  File "/home/dreampy/anaconda3/envs/learn-env/lib/python3.8/site-packages/s

array([0.5262088 ,        nan, 0.52719088,        nan, 0.52610179])

In [None]:
from sklearn.ensemble import GradientBoostingClassifier

In [None]:
gb_pipe = Pipeline([('ct', preprocessor), ('gbc', GradientBoostingClassifier())])

In [None]:
grid_gb = {
    'gbc__n_estimators': [50, 100],
    'gbc__learning_rate': [.001, .01, 0.1, 0.2],
    'gbc__max_depth': [None, 3, 4, 5, 6, 7]
}

In [None]:
grid_search_pipe = GridSearchCV(estimator=gb_pipe, param_grid=grid_gb, verbose=3, n_jobs=8)
grid_search_pipe.fit(X_train, y_train)

In [None]:
results = grid_search_pipe.cv_results_

In [None]:
results

In [None]:
grid_search_pipe.best_score_

In [None]:
grid_search_pipe.best_params_

In [None]:
grid_search_pipe.best_estimator_.score(X_train, y_train)

In [None]:
gbc = GradientBoostingClassifier()
parameters = gbc.get_params()

In [None]:
parameters

In [None]:
from sklearn.preprocessing import LabelEncoder

# Create a label encoder object
le = LabelEncoder()

# Loop over all columns in the dataframe to encode categorical columns
for col in X_train.columns:
    if X_train[col].dtype == 'object':
        # Fit the label encoder on the training data
        le.fit(pd.concat([X_train[col], X_test[col]], axis=0, ignore_index=True))
        # Transform both training and test data
        X_train[col] = le.transform(X_train[col])
        X_test[col] = le.transform(X_test[col])

# Verify the transformation
X_train.head()


In [None]:
from sklearn.preprocessing import LabelEncoder

# Re-import LabelEncoder and continue with the process
le = LabelEncoder()

# Encode the target variable
y_encoded = le.fit_transform(y)

# Split the data into training and test sets again
X_train, X_test, y_train_encoded, y_test_encoded = train_test_split(X, y_encoded, test_size=0.2, random_state=42)

# Compute class weights for the original unbalanced training data
class_weights = compute_class_weight(class_weight='balanced', classes=np.unique(y_train_encoded), y=y_train_encoded)
class_weight_dict = dict(zip(le.classes_, class_weights))

class_weight_dict