In [15]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
import csv

In [16]:
def clean_data(df):
    """Build features for one pitcher and score two random-forest pitch predictors.

    Two targets are modelled on the same engineered feature matrix:

    * the pitch type with every fastball variation collapsed to "FF";
    * a binary target: 0 for "FF" (after that collapse), 1 for anything else.

    Each model is trained on a random 80/20 split. The split itself is not
    seeded, so repeated calls return different accuracies.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain "pitch_type" plus the columns listed in
        ``feature_columns`` below. It is not modified.

    Returns
    -------
    tuple
        ``(accuracy_normal, accuracy_offspeed_grouped)`` as returned by
        ``RandomForestClassifier.score`` on each held-out test set.
    """
    feature_columns = ["stand", "balls", "strikes", "on_3b", "on_2b", "on_1b", "outs_when_up", "inning",
                       "pitch_number", "bat_score", "fld_score",
                       "if_fielding_alignment", "of_fielding_alignment", "p_throws"]
    # .copy() gives an independent frame, so the column edits below neither
    # touch the caller's df nor trigger SettingWithCopyWarning.
    factors = df[feature_columns].copy()

    # A base counts as occupied whenever its column is non-null. Assigning the
    # mask directly yields a real bool column instead of writing True/False
    # into a numeric column through .loc.
    for base in ("on_1b", "on_2b", "on_3b"):
        factors[base] = factors[base].notna()

    # Derived features, added in the same order as before so the final column
    # layout of the feature matrix is unchanged.
    factors["same_stance"] = factors["stand"] == factors["p_throws"]
    factors["score_differential"] = factors["fld_score"] - factors["bat_score"]
    factors["infieldShift"] = factors["if_fielding_alignment"] != "Standard"
    factors["outfieldShift"] = factors["of_fielding_alignment"] != "Standard"
    factors = factors.drop(columns=["stand", "p_throws", "fld_score", "bat_score",
                                    "if_fielding_alignment", "of_fielding_alignment"])

    # Map all fastball variations in the "pitch_type" column to "FF".
    fastball_variations = ["SI", "4F", "2F", "FA", "FT", "FC", "FS", "SF"]
    pitch_type = df["pitch_type"].replace(fastball_variations, "FF")

    # Integer-encode the pitch types in order of first appearance.
    # factorize assigns code -1 to a missing pitch_type.
    pitch_codes, _ = pd.factorize(pitch_type)

    # Binary target: 0 = fastball ("FF"), 1 = everything else.
    pitch_offspeed_grouped = (pitch_type != "FF").astype(int)

    # Both targets are 1-D, which is the shape scikit-learn expects for y.
    accuracy_normal = _holdout_accuracy(factors, pitch_codes)
    accuracy_offspeed_grouped = _holdout_accuracy(factors, pitch_offspeed_grouped)

    return accuracy_normal, accuracy_offspeed_grouped


def _holdout_accuracy(features, target):
    """Fit a random forest on a random 80/20 split and return its test-set accuracy."""
    X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2)
    model = RandomForestClassifier(random_state=42)
    model.fit(X_train, y_train)
    return model.score(X_test, y_test)



In [17]:
def most_frequent_pitch(df, name):
    """Chart a pitcher's pitch mix and return the share of the most used pitch.

    The bar chart shows the raw "pitch_type" values as percentages. The
    returned share is computed after all fastball variations have been
    grouped under "FF".

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a "pitch_type" column. It is not modified.
    name : str
        Pitcher name, used only in the chart title.

    Returns
    -------
    float
        Fraction (0-1) of pitches belonging to the most frequent pitch type
        after fastball grouping.
    """
    # Create bar chart for pitches thrown
    pitch_counts = df.groupby("pitch_type")["pitch_type"].count()
    pitch_percentages = (100 * pitch_counts / pitch_counts.sum()).round(1)

    # Series.plot draws on the current axes when one already exists, so each
    # call gets its own figure; otherwise charts for successive pitchers
    # would be drawn on top of each other.
    _, ax = plt.subplots()
    pitch_percentages.plot(kind="bar", ax=ax)
    ax.set_title(f"Pitch Percentages for {name}")
    ax.set_xlabel("Pitch Type")
    ax.set_ylabel("Percentage")

    # Combine all fastball variations into one pitch. This works on a new
    # Series so the caller's DataFrame is left untouched.
    grouped_pitch_type = df["pitch_type"].replace(["SI", "4F", "2F", "FA", "FT", "FC", "FS", "SF"], "FF")

    # The largest normalized count is the frequency of the most used pitch.
    frequency = grouped_pitch_type.value_counts(normalize=True).max()

    return frequency

In [18]:
# Names of the pitchers to evaluate; each one is read from "<name>.csv".
pitchers = []
# Number of train/test repetitions averaged per pitcher.
n_iterations = 100
master_columns = ["Pitcher Name", "Highest Pitch Frequency", "Model Accuracy Base", "Model Accuracy Offspeed Grouped"]

# Rows are collected in a plain list and turned into a DataFrame once at the
# end: DataFrame.append was removed in pandas 2.0, and growing a frame row by
# row copies it on every iteration.
records = []
for pitcher in pitchers:
    name = pitcher
    df = pd.read_csv(f"{name}.csv")
    frequency = most_frequent_pitch(df, name)
    total_normal = 0
    total_grouped = 0
    # Run the model n_iterations times for each pitcher and average the
    # accuracies to account for the randomness of the train/test split.
    for i in range(n_iterations):
        accuracy_normal, accuracy_grouped = clean_data(df)
        total_normal = total_normal + accuracy_normal
        total_grouped = total_grouped + accuracy_grouped
    total_normal = total_normal / n_iterations
    total_grouped = total_grouped / n_iterations
    # Everything is stored as a percentage.
    records.append({"Pitcher Name": name,
                    "Highest Pitch Frequency": frequency * 100,
                    "Model Accuracy Base": total_normal * 100,
                    "Model Accuracy Offspeed Grouped": total_grouped * 100})

# Passing columns= keeps the expected header even when no pitcher was processed.
df_master = pd.DataFrame(records, columns=master_columns)
df_master.to_csv("Pitch_Predictor.csv", index=False, quoting=csv.QUOTE_NONE, encoding='utf-8')

In [23]:
plt.ioff()
fig, axes = plt.subplots(1, 2, figsize=(15, 6))

# One entry per subplot: (df_master columns to overlay, bar colors, title).
panels = [
    (['Model Accuracy Base', 'Highest Pitch Frequency'],
     ['blue', 'orange'],
     'Comparison of Model Accuracy Base and Highest Pitch Frequency'),
    (['Highest Pitch Frequency', 'Model Accuracy Offspeed Grouped'],
     ['orange', 'green'],
     'Comparison of Highest Pitch Frequency and Model Accuracy Offspeed Grouped'),
]

# Left panel: base model accuracy vs. share of the most used pitch.
# Right panel: share of the most used pitch vs. grouped-offspeed model accuracy.
for ax, (columns, colors, title) in zip(axes, panels):
    ax.hist([df_master[column] for column in columns], bins=10, alpha=0.7, color=colors)
    ax.set_xlabel('Value')
    ax.set_ylabel('Frequency')
    ax.set_title(title)

plt.tight_layout()




