<a href="https://colab.research.google.com/github/a-chow3/STAT4996/blob/main/UVA_Baseball_Prediction_Models.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# UVA Prediction Models

In [1]:
# Show the current working directory. os.getcwd() is used instead of the bare
# `pwd` automagic so the cell keeps working when it has more than one line or
# when automagic is disabled.
import os

os.getcwd()

'/content'

In [2]:
# Change the working directory to the Google Drive root so that the relative
# data path read later in the notebook ("filtered_uva_games.csv") resolves.
import os

DRIVE_ROOT = '/content/drive/MyDrive'

# Fail early with an actionable message; os.chdir would raise the same
# exception type, but without saying what to do about it.
if not os.path.isdir(DRIVE_ROOT):
    raise FileNotFoundError(
        f"{DRIVE_ROOT} does not exist - mount Google Drive before running this cell."
    )

os.chdir(DRIVE_ROOT)

In [3]:
# Confirm the directory change took effect (explicit call instead of the
# `pwd` automagic, which only works in a single-line cell).
os.getcwd()

'/content/drive/MyDrive'

In [4]:
import pandas as pd

# Load the games file and unify the two spellings of the changeup label
# ('ChangeUp' -> 'Changeup') in a single chained expression.
uva_df = (
    pd.read_csv("filtered_uva_games.csv")
    .assign(
        TaggedPitchType=lambda games: games['TaggedPitchType'].str.replace('ChangeUp', 'Changeup')
    )
)

# Preview the first 5 rows
uva_df.head()

Unnamed: 0,PitchNo,Date,Time,PAofInning,PitchofPA,Pitcher,PitcherId,PitcherThrows,PitcherTeam,Batter,...,ThrowTrajectoryZc1,ThrowTrajectoryZc2,PitchReleaseConfidence,PitchLocationConfidence,PitchMovementConfidence,HitLaunchConfidence,HitLandingConfidence,CatcherThrowCatchConfidence,CatcherThrowReleaseConfidence,CatcherThrowLocationConfidence
0,1,2023-02-17,14:08:14,1,1,"Edgington, Brian",1000027000.0,Right,VIR_CAV,"Burch, Nick",...,,,High,High,High,,,,,
1,2,2023-02-17,14:08:35,1,2,"Edgington, Brian",1000027000.0,Right,VIR_CAV,"Burch, Nick",...,,,High,High,High,,,,,
2,3,2023-02-17,14:08:55,1,3,"Edgington, Brian",1000027000.0,Right,VIR_CAV,"Burch, Nick",...,,,High,High,High,,,,,
3,4,2023-02-17,14:09:14,1,4,"Edgington, Brian",1000027000.0,Right,VIR_CAV,"Burch, Nick",...,,,High,High,High,,,,,
4,5,2023-02-17,14:09:35,1,5,"Edgington, Brian",1000027000.0,Right,VIR_CAV,"Burch, Nick",...,,,High,High,High,High,High,,,


## Predict Pitch Thrown per Count
Decision Tree

In [5]:
import numpy as np

# On-base indicator per pitch: 1 when the pitch results in a hit, a
# hit-by-pitch, or a called ball on a three-ball count; 0 otherwise.
# NOTE(review): the last clause presumably identifies ball four (a walk),
# which assumes `Balls` holds the count before the pitch - confirm.
# The `&` clause is parenthesised explicitly; `&` already binds tighter than
# `|`, so this only makes the existing grouping visible.
uva_df['OBP'] = np.where(
    (uva_df['PlayResult'].isin(['Single', 'Double', 'Triple', 'HomeRun'])) |
    (uva_df['PitchCall'] == 'HitByPitch') |
    ((uva_df['PitchCall'] == 'BallCalled') & (uva_df['Balls'] == 3)),
    1, 0
)


# Total bases per pitch: 1-4 for single through home run, 0 for everything else.
# 'Error' and 'FieldersChoice' are listed as separate labels; they were
# previously fused into the single string 'Error, FieldersChoice', which can
# never match a PlayResult value. Both still map to 0, as does the default.
uva_df['TB'] = np.select(
    [
        uva_df['PlayResult'].isin(['Undefined', 'Out', 'Error', 'FieldersChoice',
                                   'Sacrifice', 'StolenBase', 'CaughtStealing']),
        uva_df['PlayResult'] == 'Single',
        uva_df['PlayResult'] == 'Double',
        uva_df['PlayResult'] == 'Triple',
        uva_df['PlayResult'] == 'HomeRun'
    ],
    [0, 1, 2, 3, 4],
    default=0
)

# RC is the per-pitch product of the on-base flag and total bases.
# NOTE(review): TB is non-zero only for hits, and OBP is 1 for every hit, so
# RC equals TB on every row as written - confirm this is the intended metric.
uva_df['RC'] = uva_df['OBP'] * uva_df['TB']

### Predicting RC

In [6]:
# Create different subsets based on pitcher/batter handedness matchups and pitch types

# Unique values for each categorical column. Missing values are dropped first:
# a NaN cannot be indexed with [0] when the key is built below (TypeError), and
# it would never satisfy the equality filters anyway.
pitcher_throws_values = uva_df['PitcherThrows'].dropna().unique()
batter_side_values = uva_df['BatterSide'].dropna().unique()
tagged_pitch_type_values = uva_df['TaggedPitchType'].dropna().unique()

# Dictionary of subset DataFrames keyed by matchup and pitch type
subset_dfs = {}

# Iterate through all combinations of categorical values
for pitcher_throws in pitcher_throws_values:
    for batter_side in batter_side_values:
        for tagged_pitch_type in tagged_pitch_type_values:
            # Key format: <first letter of pitcher hand><first letter of batter side>_<pitch type>
            df_name = f"{pitcher_throws[0]}{batter_side[0]}_{tagged_pitch_type}"

            # Rows of the original DataFrame matching this combination
            # (may be empty when the combination never occurs)
            subset_dfs[df_name] = uva_df[
                (uva_df['PitcherThrows'] == pitcher_throws) &
                (uva_df['BatterSide'] == batter_side) &
                (uva_df['TaggedPitchType'] == tagged_pitch_type)
            ]


# Access the subset DataFrames using their names
# Example: rr_fastball = subset_dfs['RR_Fastball']

In [7]:
# Display the keys of the subset dictionary
subset_dfs.keys()

dict_keys(['RR_Fastball', 'RR_Slider', 'RR_Splitter', 'RR_Changeup', 'RR_Curveball', 'RR_Undefined', 'RR_Cutter', 'RR_Sinker', 'RL_Fastball', 'RL_Slider', 'RL_Splitter', 'RL_Changeup', 'RL_Curveball', 'RL_Undefined', 'RL_Cutter', 'RL_Sinker', 'LR_Fastball', 'LR_Slider', 'LR_Splitter', 'LR_Changeup', 'LR_Curveball', 'LR_Undefined', 'LR_Cutter', 'LR_Sinker', 'LL_Fastball', 'LL_Slider', 'LL_Splitter', 'LL_Changeup', 'LL_Curveball', 'LL_Undefined', 'LL_Cutter', 'LL_Sinker'])

In [8]:
# Report the (rows, columns) shape of every expected matchup/pitch-type subset
# held in subset_dfs, flagging any expected key that is absent.

pitch_types = ['Fastball', 'Slider', 'Splitter', 'Changeup', 'Curveball', 'Undefined', 'Cutter', 'Sinker']
handedness = ['RR', 'RL', 'LR', 'LL']

# Walk the keys in handedness-major order, pitch types varying fastest
for key in (f"{hand}_{pitch}" for hand in handedness for pitch in pitch_types):
    subset = subset_dfs.get(key)
    if subset is None:
        print(f"{key} not found in the dictionary")
    else:
        print(f"Shape of {key}: {subset.shape}")

Shape of RR_Fastball: (435, 170)
Shape of RR_Slider: (262, 170)
Shape of RR_Splitter: (26, 170)
Shape of RR_Changeup: (56, 170)
Shape of RR_Curveball: (87, 170)
Shape of RR_Undefined: (73, 170)
Shape of RR_Cutter: (32, 170)
Shape of RR_Sinker: (53, 170)
Shape of RL_Fastball: (359, 170)
Shape of RL_Slider: (58, 170)
Shape of RL_Splitter: (19, 170)
Shape of RL_Changeup: (177, 170)
Shape of RL_Curveball: (30, 170)
Shape of RL_Undefined: (58, 170)
Shape of RL_Cutter: (0, 170)
Shape of RL_Sinker: (62, 170)
Shape of LR_Fastball: (230, 170)
Shape of LR_Slider: (19, 170)
Shape of LR_Splitter: (0, 170)
Shape of LR_Changeup: (108, 170)
Shape of LR_Curveball: (20, 170)
Shape of LR_Undefined: (32, 170)
Shape of LR_Cutter: (0, 170)
Shape of LR_Sinker: (0, 170)
Shape of LL_Fastball: (65, 170)
Shape of LL_Slider: (54, 170)
Shape of LL_Splitter: (0, 170)
Shape of LL_Changeup: (1, 170)
Shape of LL_Curveball: (18, 170)
Shape of LL_Undefined: (15, 170)
Shape of LL_Cutter: (0, 170)
Shape of LL_Sinker: (0, 170)

In [10]:
# Keep only the subsets that contain at least 15 rows
subset_dfs = {name: frame for name, frame in subset_dfs.items() if len(frame) >= 15}

# Remove the subsets whose pitch type is 'Undefined' for each handedness matchup
for key in ('RR_Undefined', 'RL_Undefined', 'LR_Undefined', 'LL_Undefined'):
    subset_dfs.pop(key, None)

In [11]:
# Reduce every subset to its int64/float64 columns for analysis
for df_name in list(subset_dfs):
    subset_dfs[df_name] = subset_dfs[df_name].select_dtypes(include=['int64', 'float64'])

In [12]:
# Restrict every subset to the selected feature columns plus the RC column
desired_columns = [
    'Balls', 'Strikes', 'RelSpeed', 'VertRelAngle', 'SpinRate', 'SpinAxis',
    'RelHeight', 'RelSide', 'Extension', 'InducedVertBreak', 'HorzBreak',
    'PlateLocHeight', 'PlateLocSide', 'VertApprAngle', 'HorzApprAngle',
    'ZoneTime', 'SpeedDrop', 'RC'
]

for df_name in list(subset_dfs):
    frame = subset_dfs[df_name]

    # Columns absent from this subset are skipped rather than raising
    present = [col for col in desired_columns if col in frame.columns]
    subset_dfs[df_name] = frame.loc[:, present]

In [13]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score

# Fit one Random Forest regressor per subset and print its hold-out error
# together with the feature importances ranked from highest to lowest.
for df_name, df in subset_dfs.items():
    # Nothing to fit on an empty subset
    if df.empty:
        print(f"DataFrame {df_name} is empty. Skipping...")
        continue

    # RC is the target; every remaining column is a predictor
    y = df['RC']
    X = df.drop(columns='RC')

    # Hold out 20% of the rows for evaluation, with a fixed seed
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Train the forest and predict on the held-out rows
    model = RandomForestRegressor(random_state=42).fit(X_train, y_train)
    y_pred = model.predict(X_test)

    # Hold-out metrics
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    # Importance table, most important feature first
    feature_importance = pd.DataFrame(
        {'Feature': X_train.columns, 'Importance': model.feature_importances_}
    ).sort_values(by='Importance', ascending=False)

    print(f"Results for {df_name}:")
    print(f"Mean Squared Error: {mse}")
    print(f"R-squared: {r2}")
    print(f"Important Features: {feature_importance}")
    print("-" * 20)  # Separator between subsets

Results for RR_Fastball:
Mean Squared Error: 0.38867126436781607
R-squared: -0.2835308900523559
Important Features:              Feature  Importance
2           RelSpeed    0.132766
15          ZoneTime    0.116706
14     HorzApprAngle    0.102391
13     VertApprAngle    0.079319
16         SpeedDrop    0.072241
9   InducedVertBreak    0.065809
6          RelHeight    0.059911
3       VertRelAngle    0.059870
12      PlateLocSide    0.058442
8          Extension    0.055816
5           SpinAxis    0.038714
7            RelSide    0.030886
4           SpinRate    0.030244
11    PlateLocHeight    0.030106
10         HorzBreak    0.024983
0              Balls    0.022382
1            Strikes    0.019412
--------------------
Results for RR_Slider:
Mean Squared Error: 0.664732075471698
R-squared: -0.07683529411764667
Important Features:              Feature  Importance
9   InducedVertBreak    0.227658
6          RelHeight    0.188262
4           SpinRate    0.159912
14     HorzApprAngle    