In [1]:
import numpy as np
import pandas as pd

from sklearn.compose import ColumnTransformer
from sklearn.datasets import make_classification
from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler

In [2]:
# Read the combined pitcher CSV; index_col=False stops pandas from using the
# first column as the index.
# NOTE(review): the following cell rebinds `df` to clean_pitcher_data.csv, so on
# a top-to-bottom run this frame is replaced immediately — confirm which file is
# the intended input.
comb_csv_path = '../data/comb_clean_pitcher.csv'
df = pd.read_csv(comb_csv_path, index_col=False)

In [3]:
# Read the cleaned pitcher CSV into `df`; index_col=False stops pandas from
# using the first column as the index.
clean_csv_path = '../data/clean_pitcher_data.csv'
df = pd.read_csv(clean_csv_path, index_col=False)

In [4]:
# Preview the first five rows of the loaded frame.
df.head(n=5)

Unnamed: 0.1,Unnamed: 0,player_name,pitcher,batter,stand,pitch_type,pitch_number,outs_when_up,times_faced,XBH,large_score_dif,recent_pitch,second_recent_pitch,third_recent_pitch,pitch_count
0,3,"Rodriguez, Grayson",680570,543760,R,FB,4,0,1,0,0,14.0 - SL - ball,2.0 - FB - foul,14.0 - FB - ball,2-1
1,4,"Rodriguez, Grayson",680570,543760,R,SL,5,0,1,0,0,9.0 - FB - called,14.0 - SL - ball,2.0 - FB - foul,2-2
2,5,"Rodriguez, Grayson",680570,543760,R,FB,6,0,1,0,0,14.0 - SL - ball,9.0 - FB - called,14.0 - SL - ball,3-2
3,9,"Rodriguez, Grayson",680570,608369,L,FB,4,0,1,0,0,14.0 - CH - ball,14.0 - CH - ball,4.0 - FB - called,2-1
4,10,"Rodriguez, Grayson",680570,608369,L,CH,5,0,1,0,0,3.0 - FB - foul,14.0 - CH - ball,14.0 - CH - ball,2-2


In [5]:
# Keep only the rows that have a pitch_type value (used as the target below).
has_pitch_type = df['pitch_type'].notna()
df_cleaned = df[has_pitch_type]

In [6]:
# Per-column count of missing values remaining after the filter above.
df_cleaned.isnull().sum()

Unnamed: 0             0
player_name            0
pitcher                0
batter                 0
stand                  0
pitch_type             0
pitch_number           0
outs_when_up           0
times_faced            0
XBH                    0
large_score_dif        0
recent_pitch           0
second_recent_pitch    0
third_recent_pitch     0
pitch_count            0
dtype: int64

In [20]:
# Majority-class baseline: a classifier that always predicts the most frequent
# pitch type in the training half. Its scores are the floor a real model must beat.

y = df_cleaned['pitch_type']
# Remove the target, the leftover 'Unnamed: 0' column and the player name.
# The remaining object-dtype columns are kept and one-hot encoded just below.
X = df_cleaned.drop(['pitch_type', 'Unnamed: 0', 'player_name'], axis=1)

# Converting categorical variables to dummy/indicator variables
X = pd.get_dummies(X)

# 50/50 train/test split with a fixed seed so the baseline is reproducible
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=42)

# Fit the baseline and predict on the held-out half
dummy_clf = DummyClassifier(strategy="most_frequent", random_state=42)
dummy_clf.fit(X_train, y_train)
y_pred = dummy_clf.predict(X_test)

# The baseline never predicts the minority classes, so their precision is
# undefined. zero_division=0 reports those entries as 0 explicitly instead of
# emitting an undefined-metric warning.
report = classification_report(y_test, y_pred, zero_division=0)

  _warn_prf(average, modifier, msg_start, len(result))


In [21]:
# `report` is the text table returned by classification_report in the previous
# cell; print() renders its line breaks instead of showing the raw string repr.
print(report)

              precision    recall  f1-score   support

          CB       0.00      0.00      0.00      5237
          CH       0.00      0.00      0.00      6429
          FB       0.54      1.00      0.70     24933
          SL       0.00      0.00      0.00      9313

    accuracy                           0.54     45912
   macro avg       0.14      0.25      0.18     45912
weighted avg       0.29      0.54      0.38     45912



In [12]:
# Target is the pitch type; features are everything except the target, the
# name/ID columns and the leftover 'Unnamed: 0' column.
# NOTE(review): a later cell re-splits the one-hot-encoded features with the same
# seed, replacing these four variables before any model is fit — confirm this
# un-encoded split is still needed.
non_feature_cols = ['player_name', 'pitcher', 'batter', 'pitch_type', 'Unnamed: 0']
y = df_cleaned['pitch_type']
X = df_cleaned.drop(columns=non_feature_cols)

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
)

In [13]:
# Feature matrix (X) and target (y): drop the name/ID columns, the leftover
# 'Unnamed: 0' column and the target itself from the features.
# (train_test_split is already imported in the first cell and is not used here,
# so the duplicate import that used to open this cell has been removed.)
X = df_cleaned.drop(columns=['player_name', 'pitcher', 'batter', 'pitch_type', 'Unnamed: 0'])
y = df_cleaned['pitch_type']

# Treat every object-dtype column as categorical; these are the columns that
# get one-hot encoded further down.
categorical_cols = X.select_dtypes(include=['object']).columns.tolist()

# Display categorical columns
categorical_cols


['stand',
 'recent_pitch',
 'second_recent_pitch',
 'third_recent_pitch',
 'pitch_count']

In [14]:
# One-hot encoder producing a dense array. The keyword for dense output is
# `sparse_output` in newer scikit-learn releases and `sparse` in older ones;
# an unsupported keyword raises TypeError, so try the new name first.
try:
    onehot_encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
except TypeError:
    onehot_encoder = OneHotEncoder(sparse=False, handle_unknown='ignore')

# Encode only the categorical columns; every other column is passed through
# unchanged and appended after the encoded ones.
preprocessor = ColumnTransformer(
    transformers=[('cat', onehot_encoder, categorical_cols)],
    remainder='passthrough',
)

# Split BEFORE fitting so the encoder learns its categories from training rows
# only (no information from the test rows leaks into preprocessing). The same
# seed on the same number of rows gives the same row partition as splitting the
# encoded matrix would.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(X, y, random_state=42)

X_train = preprocessor.fit_transform(X_train_raw)
# Categories unseen during fit are encoded as all zeros (handle_unknown='ignore')
X_test = preprocessor.transform(X_test_raw)

# Encoded view of the full feature frame, kept for inspection in the next cell
X_encoded = preprocessor.transform(X)

# Output the shapes of the resulting datasets as a check
X_train.shape, X_test.shape, y_train.shape, y_test.shape


((68867, 627), (22956, 627), (68867,), (22956,))

In [15]:
# Wrap the encoded array in a DataFrame with readable column names for inspection.
cat_encoder = preprocessor.named_transformers_['cat']

# `get_feature_names` was replaced by `get_feature_names_out` in newer
# scikit-learn releases; use whichever this installation provides.
if hasattr(cat_encoder, 'get_feature_names_out'):
    cat_feature_names = cat_encoder.get_feature_names_out(categorical_cols)
else:
    cat_feature_names = cat_encoder.get_feature_names(categorical_cols)

# remainder='passthrough' appends the non-encoded columns after the encoded
# ones, in their original order, so the names are concatenated the same way.
passthrough_cols = X.drop(columns=categorical_cols).columns.tolist()
encoded_feature_names = cat_feature_names.tolist() + passthrough_cols

X_encoded_df = pd.DataFrame(X_encoded, columns=encoded_feature_names)

# Display the first few rows of the transformed dataframe
X_encoded_df.head()

Unnamed: 0,stand_L,stand_R,recent_pitch_1.0 - CB - ball,recent_pitch_1.0 - CB - called,recent_pitch_1.0 - CB - foul,recent_pitch_1.0 - CH - ball,recent_pitch_1.0 - CH - called,recent_pitch_1.0 - CH - foul,recent_pitch_1.0 - CH - swinging,recent_pitch_1.0 - FB - ball,...,pitch_count_2-2,pitch_count_3-0,pitch_count_3-1,pitch_count_3-2,pitch_count_4-2,pitch_number,outs_when_up,times_faced,XBH,large_score_dif
0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,4.0,0.0,1.0,0.0,0.0
1,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,5.0,0.0,1.0,0.0,0.0
2,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,6.0,0.0,1.0,0.0,0.0
3,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,4.0,0.0,1.0,0.0,0.0
4,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,5.0,0.0,1.0,0.0,0.0


In [16]:
# Fit a decision tree with default settings (seeded for reproducibility) on the
# training split; fit() returns the estimator itself, so it can be chained.
dt_classifier = DecisionTreeClassifier(random_state=42).fit(X_train, y_train)

# Mean accuracy on the held-out test split.
# NOTE(review): the stored output (~0.44) is below the most-frequent-class
# baseline's accuracy (~0.54, measured on a different split) — possibly an
# overfit tree; consider constraining it.
accuracy = dt_classifier.score(X_test, y_test)
accuracy

0.4419759539989545