In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
import matplotlib.pyplot as plt
import pybaseball
import seaborn as sns



In [3]:
data = pd.read_csv('savant_data_2024.csv')
data = data.iloc[::-1].copy().reset_index(drop=True)

In [4]:
data[['game_date', 'player_name', 'pitch_type', 'inning', 'balls', 'strikes', 'home_team', 'away_team']]

Unnamed: 0,game_date,player_name,pitch_type,inning,balls,strikes,home_team,away_team
0,2024-03-20,"Darvish, Yu",FF,1,0,0,SD,LAD
1,2024-03-20,"Darvish, Yu",FF,1,1,0,SD,LAD
2,2024-03-20,"Darvish, Yu",SI,1,1,1,SD,LAD
3,2024-03-20,"Darvish, Yu",SI,1,2,1,SD,LAD
4,2024-03-20,"Darvish, Yu",FF,1,0,0,SD,LAD
...,...,...,...,...,...,...,...,...
730053,2024-09-30,"Díaz, Edwin",SL,9,0,0,ATL,NYM
730054,2024-09-30,"Díaz, Edwin",SL,9,0,1,ATL,NYM
730055,2024-09-30,"Díaz, Edwin",SL,9,1,1,ATL,NYM
730056,2024-09-30,"Díaz, Edwin",SL,9,1,2,ATL,NYM


In [6]:
non_numeric_cols = data.select_dtypes(exclude=['number']).columns
for col in non_numeric_cols:
    print(f"{col}: {data[col].dtype}")

pitch_type: object
game_date: object
player_name: object
events: object
description: object
des: object
game_type: object
stand: object
p_throws: object
home_team: object
away_team: object
type: object
bb_type: object
inning_topbot: object
pitch_name: object
if_fielding_alignment: object
of_fielding_alignment: object


In [7]:
filtered_data = data.drop(columns=['events', 'des', 'if_fielding_alignment', 'of_fielding_alignment', 'umpire', 'sv_id',
    'spin_rate_deprecated', 'break_angle_deprecated', 'break_length_deprecated', 'tfs_deprecated', 'tfs_zulu_deprecated',
    'fielder_2', 'fielder_3', 'fielder_4', 'fielder_5', 'fielder_6', 'fielder_7', 'fielder_8', 'fielder_9',
    'estimated_ba_using_speedangle', 'estimated_woba_using_speedangle', 'age_pit_legacy', 'age_bat_legacy',
    'api_break_z_with_gravity', 'api_break_x_arm', 'api_break_x_batter_in', 'arm_angle', 'attack_angle', 'attack_direction', 
    'swing_path_tilt', 'intercept_ball_minus_batter_pos_x_inches', 'intercept_ball_minus_batter_pos_y_inches',
    'estimated_slg_using_speedangle', 'delta_pitcher_run_exp', 'hyper_speed', 'post_fld_score', 'game_type', 
    
    # following have the potential to be included to improve accuracy but requires further feature engineering
    
    'release_pos_x', 'release_pos_z', 'spin_dir', 'pfx_x', 'pfx_z', 'plate_x', 'plate_z', 'hc_x', 'hc_y', 
    'vx0', 'vy0', 'vz0', 'ax', 'ay', 'az', 'effective_speed', 'release_spin_rate', 'release_extension',
    'babip_value', 'iso_value', 'launch_speed_angle', 'post_away_score', 'post_home_score', 'post_bat_score',
    'delta_home_win_exp', 'delta_run_exp', 'bat_speed', 'swing_length', 'home_score_diff', 'bat_score_diff', 'home_win_exp', 
    'bat_win_exp', 'sz_top', 'sz_bot', 'woba_value', 'woba_denom', 'age_pit', 'age_bat', 'hit_distance_sc', 'spin_axis',

    'game_year'
    
])

In [9]:
data_more_filtered = filtered_data.copy()#.drop(columns=['pitch_name', 'home_team', 'away_team', 'player_name', 'game_date'])

In [10]:
print(data_more_filtered.select_dtypes(exclude=['number']).columns)

Index(['pitch_type', 'game_date', 'player_name', 'description', 'stand',
       'p_throws', 'home_team', 'away_team', 'type', 'bb_type',
       'inning_topbot', 'pitch_name'],
      dtype='object')


### Feature Engineering -- Previous Pitch Data

treat zone, hit_location, on_Xb as categorical data

In [11]:
# idea from https://seanjhannon.medium.com/baseball-pitch-prediction-with-deep-learning-df68094fcc65
data_more_filtered.loc[:,'plate_app_id'] =  data_more_filtered['game_pk'].astype(str) + data_more_filtered['batter'].astype(str) + data_more_filtered['at_bat_number'].astype(str)


In [12]:
# modifies data to boolean values for on_3b, on_2b, on_1b
for base_col in ['on_3b', 'on_2b', 'on_1b']:
    data_more_filtered[base_col] = data_more_filtered[base_col].apply(
        lambda x: 0 if (isinstance(x, (int, float)) and not pd.isnull(x) and x == 0)
        else (1 if isinstance(x, (int, float)) and not pd.isnull(x) else 0)
    )

data_more_filtered[['plate_app_id', 'on_3b', 'on_2b', 'on_1b']]

Unnamed: 0,plate_app_id,on_3b,on_2b,on_1b
0,7454446051411,0,0,0
1,7454446051411,0,0,0
2,7454446051411,0,0,0
3,7454446051411,0,0,0
4,7454446602712,0,0,1
...,...,...,...,...
730053,74713951859582,0,1,0
730054,74713951859582,0,1,0
730055,74713951859582,0,1,0
730056,74713951859582,0,1,0


In [13]:
# Check if 'stand' or 'p_throws' can change within the same plate_app_id
stand_varies_mask = data_more_filtered.groupby('plate_app_id')['stand'].nunique() > 1
p_throws_varies_mask = data_more_filtered.groupby('plate_app_id')['p_throws'].nunique() > 1

stand_varies_count = stand_varies_mask.sum()
p_throws_varies_count = p_throws_varies_mask.sum()

print(f"Number of plate_app_id where 'stand' varies: {stand_varies_count}")
print(f"Number of plate_app_id where 'p_throws' varies: {p_throws_varies_count}")

# Optionally, show some example plate_app_id values if they do vary
if stand_varies_count > 0:
    print("Example plate_app_id(s) where 'stand' varies:")
    print(stand_varies_mask[stand_varies_mask].index.tolist()[:5])
if p_throws_varies_count > 0:
    print("Example plate_app_id(s) where 'p_throws' varies:")
    print(p_throws_varies_mask[p_throws_varies_mask].index.tolist()[:5])

# varies too little to be useful as pre-pitch data for the following pitch


Number of plate_app_id where 'stand' varies: 6
Number of plate_app_id where 'p_throws' varies: 31
Example plate_app_id(s) where 'stand' varies:
['74479568308323', '74507867076430', '74570760807034', '74657259387156', '74783466613425']
Example plate_app_id(s) where 'p_throws' varies:
['74479568308323', '74489267639155', '7450075437604', '74502167127742', '74505168299875']


In [14]:
# Calculate the difference in away_score and home_score within each plate_app_id
away_score_diff = data_more_filtered.groupby('plate_app_id')['away_score'].diff().fillna(0)
home_score_diff = data_more_filtered.groupby('plate_app_id')['home_score'].diff().fillna(0)

# Count the number of times both away_score_diff and home_score_diff are greater than 0 at the same time
both_scores_increased = ((away_score_diff > 0) & (home_score_diff > 0)).sum()

print(f"Number of times both away_score and home_score increased: {both_scores_increased}")

# Create a new column 'prev_runs_scored' that is simply the sum of away_score_diff and home_score_diff
# since there is no data weirdness where both away and home score increase, we can add the two series
data_more_filtered['prev_runs_scored'] = away_score_diff + home_score_diff




Number of times both away_score and home_score increased: 0


In [16]:
# data to shift from previous pitch(es) in the same plate appearance
# i.e. not known pre-pitch
memory_features = ['pitch_type', 'description', 'zone', 'release_speed', 'type', 'hit_location', 'bb_type',
        'launch_speed', 'balls', 'strikes', 'launch_angle', 'release_pos_y', ]

# known pre-pitch
context_features = ['on_3b', 'on_2b', 'on_1b', 'outs_when_up', 'inning', 'inning_topbot', 'at_bat_number', 'pitch_number',
                    'home_score', 'away_score', 'bat_score', 'fld_score', 'n_thruorder_pitcher', 'n_priorpa_thisgame_player_at_bat', 
                    'pitcher_days_since_prev_game', 'batter_days_since_prev_game', 'pitcher_days_until_next_game', 
                    'batter_days_until_next_game', 'prev_runs_scored']

In [17]:
data_more_filtered['n_priorpa_thisgame_player_at_bat'].value_counts()

n_priorpa_thisgame_player_at_bat
0    202067
1    179495
2    166912
3    139261
4     39596
5      2653
6        74
Name: count, dtype: int64

### Encoding Categorical Variables

In [18]:
for col in data_more_filtered.select_dtypes(exclude=['number']).columns:
    print(f"{col}: {len(data_more_filtered[col].unique())} unique values")

pitch_type: 18 unique values
game_date: 190 unique values
player_name: 1082 unique values
description: 13 unique values
stand: 2 unique values
p_throws: 2 unique values
home_team: 30 unique values
away_team: 30 unique values
type: 3 unique values
bb_type: 5 unique values
inning_topbot: 2 unique values
pitch_name: 18 unique values
plate_app_id: 189151 unique values


In [19]:
data_more_filtered['description'].value_counts()

description
ball                       242247
foul                       131539
hit_into_play              128483
called_strike              119469
swinging_strike             77876
blocked_ball                15265
foul_tip                     7508
swinging_strike_blocked      4036
hit_by_pitch                 2103
foul_bunt                    1255
missed_bunt                   206
pitchout                       55
bunt_foul_tip                  16
Name: count, dtype: int64

In [20]:
data_more_filtered.loc[
    data_more_filtered['description'].isin(['missed_bunt', 'bunt_foul_tip']),
    'description'
] = 'foul_bunt'

In [None]:
from sklearn.preprocessing import OneHotEncoder

# Only encode the specified categorical columns, zone and hit_location are numbers but categorical
cols_to_encode = ['description', 'stand', 'p_throws', 'type', 'bb_type', 'inning_topbot', 'zone', 'hit_location']

encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore', drop='if_binary')
encoded = encoder.fit_transform(data_more_filtered[cols_to_encode])

encoded_df = pd.DataFrame(encoded, columns=encoder.get_feature_names_out(cols_to_encode), index=data_more_filtered.index)
filtered_data_encoded = pd.concat([data_more_filtered.drop(columns=cols_to_encode), encoded_df], axis=1)

In [32]:
filtered_data_encoded

Unnamed: 0,pitch_type,game_date,release_speed,player_name,batter,pitcher,home_team,away_team,balls,strikes,...,hit_location_1.0,hit_location_2.0,hit_location_3.0,hit_location_4.0,hit_location_5.0,hit_location_6.0,hit_location_7.0,hit_location_8.0,hit_location_9.0,hit_location_nan
0,FF,2024-03-20,94.5,"Darvish, Yu",605141,506433,SD,LAD,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,FF,2024-03-20,92.6,"Darvish, Yu",605141,506433,SD,LAD,1,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,SI,2024-03-20,93.4,"Darvish, Yu",605141,506433,SD,LAD,1,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,SI,2024-03-20,93.9,"Darvish, Yu",605141,506433,SD,LAD,2,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,FF,2024-03-20,95.7,"Darvish, Yu",660271,506433,SD,LAD,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
730053,SL,2024-09-30,89.1,"Díaz, Edwin",518595,621242,ATL,NYM,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
730054,SL,2024-09-30,91.3,"Díaz, Edwin",518595,621242,ATL,NYM,0,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
730055,SL,2024-09-30,91.1,"Díaz, Edwin",518595,621242,ATL,NYM,1,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
730056,SL,2024-09-30,90.7,"Díaz, Edwin",518595,621242,ATL,NYM,1,2,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [None]:
# Compute the correlation matrix
corr = filtered_data_encoded.corr()

# Set up the matplotlib figure
plt.figure(figsize=(16, 12))

# Draw the heatmap
sns.heatmap(corr, cmap='coolwarm', center=0, linewidths=0.5)

plt.title('Correlation Heatmap of Encoded Filtered Data')
plt.tight_layout()
plt.show()

In [34]:
# Find pairs of features with perfect or near-perfect correlation (>|0.98|, excluding self-pairs)
threshold = 0.98

# corr = filtered_data_encoded.corr()
corr = filtered_data_encoded.select_dtypes(exclude=['object']).corr()

# Select upper triangle of correlation matrix
upper = corr.where(np.triu(np.ones(corr.shape), k=1).astype(bool))

# Find pairs with correlation above threshold or below -threshold
perfect_corr_pairs = [
    (col, row, corr_value)
    for col in upper.columns
    for row, corr_value in upper[col].items()
    if abs(corr_value) >= threshold
]

# Display the pairs
for col1, col2, corr_val in perfect_corr_pairs:
    print(f"{col1} <--> {col2}: correlation = {corr_val:.3f}")


type_X <--> description_hit_into_play: correlation = 1.000
bb_type_nan <--> description_hit_into_play: correlation = -1.000
bb_type_nan <--> type_X: correlation = -1.000


In [35]:
filtered_data_encoded = filtered_data_encoded.drop(columns=['type_X', 'bb_type_nan'], errors='ignore')


In [40]:
cols_to_list = ['description', 'stand', 'p_throws', 'type', 'bb_type', 'inning_topbot', 'zone', 'hit_location']
matching_columns = [col for col in filtered_data_encoded.columns if any(key in col for key in cols_to_list)]
print(matching_columns)


['pitch_type', 'description_ball', 'description_blocked_ball', 'description_called_strike', 'description_foul', 'description_foul_bunt', 'description_foul_tip', 'description_hit_by_pitch', 'description_hit_into_play', 'description_pitchout', 'description_swinging_strike', 'description_swinging_strike_blocked', 'stand_R', 'p_throws_R', 'type_B', 'type_S', 'bb_type_fly_ball', 'bb_type_ground_ball', 'bb_type_line_drive', 'bb_type_popup', 'inning_topbot_Top', 'zone_1.0', 'zone_2.0', 'zone_3.0', 'zone_4.0', 'zone_5.0', 'zone_6.0', 'zone_7.0', 'zone_8.0', 'zone_9.0', 'zone_11.0', 'zone_12.0', 'zone_13.0', 'zone_14.0', 'zone_nan', 'hit_location_1.0', 'hit_location_2.0', 'hit_location_3.0', 'hit_location_4.0', 'hit_location_5.0', 'hit_location_6.0', 'hit_location_7.0', 'hit_location_8.0', 'hit_location_9.0', 'hit_location_nan']


In [45]:
# Features known BEFORE pitch is thrown
context_features = [
    'batter', 'pitcher', 'stand_R', 'p_throws_R', 'balls', 'strikes',

    'on_3b', 'on_2b', 'on_1b', 'outs_when_up', 'inning', 'inning_topbot_Top', 'at_bat_number', 'pitch_number',
            'home_score', 'away_score', 'bat_score', 'fld_score', 'n_thruorder_pitcher', 'n_priorpa_thisgame_player_at_bat', 
            'pitcher_days_since_prev_game', 'batter_days_since_prev_game', 'pitcher_days_until_next_game', 
            'batter_days_until_next_game', 'prev_runs_scored'
]
        
# Features from PREVIOUS pitches (memory)
memory_features = [
    'pitch_type', 'release_speed', 'launch_speed', 'launch_angle', 'release_pos_y',

    'description_ball', 'description_blocked_ball', 'description_called_strike', 'description_foul', 
    'description_foul_bunt', 'description_foul_tip', 'description_hit_by_pitch', 'description_hit_into_play', 
    'description_pitchout', 'description_swinging_strike', 'description_swinging_strike_blocked', 
    'type_B', 'type_S', 'bb_type_fly_ball', 'bb_type_ground_ball', 'bb_type_line_drive', 'bb_type_popup', 
    'zone_1.0', 'zone_2.0', 'zone_3.0', 'zone_4.0', 'zone_5.0', 'zone_6.0', 'zone_7.0', 'zone_8.0', 
    'zone_9.0', 'zone_11.0', 'zone_12.0', 'zone_13.0', 'zone_14.0', 'zone_nan', 'hit_location_1.0', 'hit_location_2.0', 
    'hit_location_3.0', 'hit_location_4.0', 'hit_location_5.0', 'hit_location_6.0', 'hit_location_7.0', 'hit_location_8.0', 
    'hit_location_9.0', 'hit_location_nan'
]
len(memory_features) + len(context_features)

71

In [47]:
# List columns that aren't in memory_features or context_features
all_features = set(filtered_data_encoded.columns)
used_features = set(context_features) | set(memory_features)
unused_features = [col for col in filtered_data_encoded.columns if col not in used_features]
print("Columns not in memory_features or context_features:", unused_features)


Columns not in memory_features or context_features: ['game_date', 'player_name', 'home_team', 'away_team', 'game_pk', 'pitch_name', 'plate_app_id']


In [43]:
object_columns = filtered_data_encoded.select_dtypes(include='object').columns.tolist()
print("Object columns:", object_columns)


Object columns: ['pitch_type', 'game_date', 'player_name', 'home_team', 'away_team', 'pitch_name', 'plate_app_id']


In [49]:
# Check for any null or NaN values in the dataset
null_counts = filtered_data_encoded.isnull().sum()
print("Null values per column:")
print(null_counts[null_counts > 0])

# Also check if there are any NaNs at all
total_nulls = filtered_data_encoded.isnull().sum().sum()
print(f"Total null/NaN values in DataFrame: {total_nulls}")


Null values per column:
pitch_type                        6867
release_speed                     8070
launch_speed                    488495
launch_angle                    488178
release_pos_y                     8069
pitch_name                        6867
pitcher_days_since_prev_game     55457
batter_days_since_prev_game      28899
pitcher_days_until_next_game     48365
batter_days_until_next_game      26741
dtype: int64
Total null/NaN values in DataFrame: 1166008


In [50]:
# Drop columns with too many missing values
cols_to_drop = [
    'pitcher_days_since_prev_game',
    'batter_days_since_prev_game',
    'pitcher_days_until_next_game',
    'batter_days_until_next_game',
    'launch_speed', 'launch_angle'
]
filtered_data_encoded = filtered_data_encoded.drop(columns=cols_to_drop)

# Drop rows where pitch_type, release_speed, or release_pos_y are null
filtered_data_encoded = filtered_data_encoded.dropna(subset=['pitch_type', 'release_speed', 'release_pos_y'])


In [52]:
filtered_data_encoded.to_csv('encoded_savant_2024.csv')

In [None]:
 # Features known BEFORE pitch is thrown
context_features = [
    'batter', 'pitcher', 'stand_R', 'p_throws_R', 'balls', 'strikes',

    'on_3b', 'on_2b', 'on_1b', 'outs_when_up', 'inning', 'inning_topbot_Top', 'at_bat_number', 'pitch_number',
    'home_score', 'away_score', 'bat_score', 'fld_score', 'n_thruorder_pitcher', 'n_priorpa_thisgame_player_at_bat',
    'prev_runs_scored'
]
        
# Features from PREVIOUS pitches (memory)
memory_features = [
    'pitch_type', 'release_speed', 'release_pos_y',

    'description_ball', 'description_blocked_ball', 'description_called_strike', 'description_foul', 
    'description_foul_bunt', 'description_foul_tip', 'description_hit_by_pitch', 'description_hit_into_play', 
    'description_pitchout', 'description_swinging_strike', 'description_swinging_strike_blocked', 
    'type_B', 'type_S', 'bb_type_fly_ball', 'bb_type_ground_ball', 'bb_type_line_drive', 'bb_type_popup', 
    'zone_1.0', 'zone_2.0', 'zone_3.0', 'zone_4.0', 'zone_5.0', 'zone_6.0', 'zone_7.0', 'zone_8.0', 
    'zone_9.0', 'zone_11.0', 'zone_12.0', 'zone_13.0', 'zone_14.0', 'zone_nan', 'hit_location_1.0', 'hit_location_2.0', 
    'hit_location_3.0', 'hit_location_4.0', 'hit_location_5.0', 'hit_location_6.0', 'hit_location_7.0', 'hit_location_8.0', 
    'hit_location_9.0', 'hit_location_nan'
]

In [53]:
# Check for any null or NaN values in the dataset
null_counts = filtered_data_encoded.isnull().sum()
print("Null values per column:")
print(null_counts[null_counts > 0])

# Also check if there are any NaNs at all
total_nulls = filtered_data_encoded.isnull().sum().sum()
print(f"Total null/NaN values in DataFrame: {total_nulls}")

Null values per column:
Series([], dtype: int64)
Total null/NaN values in DataFrame: 0


In [55]:
num_unique_batters = filtered_data_encoded['batter'].nunique()
num_unique_pitchers = filtered_data_encoded['pitcher'].nunique()
print(f"Number of unique batters: {num_unique_batters}")
print(f"Number of unique pitchers: {num_unique_pitchers}")
num_unique_pitch_types = filtered_data_encoded['pitch_type'].nunique()
print(f"Number of unique pitch types: {num_unique_pitch_types}")



Number of unique batters: 971
Number of unique pitchers: 974
Number of unique pitch types: 17


In [58]:
# Map pitch_type to pitch_type_group
pitch_type_map = {
    'FF': 'FAST', 'SI': 'FAST', 'FC': 'FAST',
    'CH': 'OFF', 'FS': 'OFF', 'FO': 'OFF',
    'CU': 'BREAK', 'KC': 'BREAK', 'CS': 'BREAK', 'KN': 'BREAK', 'SL': 'BREAK', 'ST': 'BREAK', 'SV': 'BREAK',
    'EP': 'OTH', 'FA': 'OTH', 'IN': 'OTH', 'PO': 'OTH'
}

filtered_data_encoded['pitch_type_group'] = filtered_data_encoded['pitch_type'].map(pitch_type_map)
print(filtered_data_encoded['pitch_type_group'].value_counts(dropna=False))


pitch_type_group
FAST     403091
BREAK    221339
OFF       96141
OTH        1262
NaN         155
Name: count, dtype: int64


In [None]:
filtered_data_encoded = filtered_data_encoded.dropna(subset=['pitch_type_group'])

In [65]:
from sklearn.preprocessing import StandardScaler

# Select columns to normalize
cols_to_normalize = ['release_speed', 'release_pos_y']

# Initialize the scaler
scaler = StandardScaler()

# Fit and transform the selected columns
filtered_data_encoded[cols_to_normalize] = scaler.fit_transform(filtered_data_encoded[cols_to_normalize])

# Check the result
print(filtered_data_encoded[cols_to_normalize].describe())

       release_speed  release_pos_y
count   7.218330e+05   7.218330e+05
mean   -1.614349e-15  -1.831278e-14
std     1.000001e+00   1.000001e+00
min    -9.580467e+00  -5.000071e+00
25%    -7.111089e-01  -6.420656e-01
50%     1.423577e-01  -2.887896e-02
75%     8.117433e-01   6.281067e-01
max     2.736227e+00   7.154164e+00


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_data_encoded[cols_to_normalize] = scaler.fit_transform(filtered_data_encoded[cols_to_normalize])


In [66]:
filtered_data_encoded.to_csv('final_encoded_savant_2024.csv')

In [78]:
import numpy as np

# List of context features from model.py
context_features = [
    'stand_R', 'p_throws_R', 'balls', 'strikes',
    'on_3b', 'on_2b', 'on_1b', 'outs_when_up', 'inning', 'inning_topbot_Top', 'at_bat_number', 'pitch_number',
    'home_score', 'away_score', 'bat_score', 'fld_score', 'n_thruorder_pitcher', 'n_priorpa_thisgame_player_at_bat',
    'prev_runs_scored'
]

# Sample a random row with these features from filtered_data_encoded
list(filtered_data_encoded.iloc[100][context_features].values)


[np.float64(0.0),
 np.float64(1.0),
 np.int64(0),
 np.int64(2),
 np.int64(0),
 np.int64(0),
 np.int64(1),
 np.int64(0),
 np.int64(3),
 np.float64(0.0),
 np.int64(23),
 np.int64(3),
 np.int64(0),
 np.int64(0),
 np.int64(0),
 np.int64(0),
 np.int64(1),
 np.int64(0),
 np.float64(0.0)]

In [None]:
df = pd.read_csv('final_encoded_savant_2024.csv')
from sklearn.preprocessing import OneHotEncoder

# One-hot encode 'pitch_type_group' column
encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore', drop='if_binary')
pitch_type_group_encoded = encoder.fit_transform(df[['pitch_type_group']])

# Get the new column names
encoded_col_names = [f'pitch_type_group_{cat}' for cat in encoder.categories_[0]]

# Add the new columns to df
for i, col in enumerate(encoded_col_names):
    df[col] = pitch_type_group_encoded[:, i]

# Optionally, drop the original column if you don't need it
df = df.drop(columns=['pitch_type_group_OTH', 'Unnamed: 0'])
df.to_csv('final_final_encoded_savant_2024.csv')

In [104]:
print("Data types of context_features elements:")
for i, val in enumerate(filtered_data_encoded.iloc[100][context_features].values):
    print(f"Index {i}: {type(val)}")


Data types of context_features elements:
Index 0: <class 'numpy.float64'>
Index 1: <class 'numpy.float64'>
Index 2: <class 'numpy.int64'>
Index 3: <class 'numpy.int64'>
Index 4: <class 'numpy.int64'>
Index 5: <class 'numpy.int64'>
Index 6: <class 'numpy.int64'>
Index 7: <class 'numpy.int64'>
Index 8: <class 'numpy.int64'>
Index 9: <class 'numpy.float64'>
Index 10: <class 'numpy.int64'>
Index 11: <class 'numpy.int64'>
Index 12: <class 'numpy.int64'>
Index 13: <class 'numpy.int64'>
Index 14: <class 'numpy.int64'>
Index 15: <class 'numpy.int64'>
Index 16: <class 'numpy.int64'>
Index 17: <class 'numpy.int64'>
Index 18: <class 'numpy.float64'>


### Pybaseball Pitcher/Batter Stats

In [38]:
from pybaseball import pitching_stats_bref

# retrieve data on the 2009 season
dat = pitching_stats_bref(2023)

In [70]:
pitching_stats_23 = dat[['mlbID', '#days', 'Age', 'G', 'GS', 'W', 'L', 'SV', 'IP',
       'H', 'R', 'ER', 'BB', 'SO', 'HR', 'HBP', 'ERA', 'AB', '2B', '3B', 'IBB',
       'GDP', 'SF', 'SB', 'CS', 'PO', 'BF', 'Pit', 'Str', 'StL', 'StS',
       'GB/FB', 'LD', 'PU', 'WHIP', 'BAbip', 'SO9', 'SO/W']].fillna(0)

In [42]:
from pybaseball import batting_stats_bref

# retrieve data on the 2009 season
dat_b = batting_stats_bref(2024)

In [71]:
batting_stats_23 = dat_b[['mlbID', 'Age', '#days', 'G', 'PA', 'AB', 'R', 'H', '2B',
       '3B', 'HR', 'RBI', 'BB', 'IBB', 'SO', 'HBP', 'SH', 'SF', 'GDP', 'SB',
       'CS', 'BA', 'OBP', 'SLG', 'OPS']].fillna(0)

In [72]:
from sklearn.preprocessing import StandardScaler

pitcher_cols_to_normalize = ['#days', 'Age', 'G', 'GS', 'W', 'L', 'SV', 'IP',
       'H', 'R', 'ER', 'BB', 'SO', 'HR', 'HBP', 'ERA', 'AB', '2B', '3B', 'IBB',
       'GDP', 'SF', 'SB', 'CS', 'PO', 'BF', 'Pit', 'Str', 'StL', 'StS',
       'GB/FB', 'LD', 'PU', 'WHIP', 'BAbip', 'SO9', 'SO/W']

batter_cols_to_normalize = ['Age', '#days', 'G', 'PA', 'AB', 'R', 'H', '2B',
       '3B', 'HR', 'RBI', 'BB', 'IBB', 'SO', 'HBP', 'SH', 'SF', 'GDP', 'SB',
       'CS', 'BA', 'OBP', 'SLG', 'OPS']

scaler = StandardScaler()

# Fit and transform the selected columns
pitching_stats_23[pitcher_cols_to_normalize] = scaler.fit_transform(pitching_stats_23[pitcher_cols_to_normalize])
batting_stats_23[batter_cols_to_normalize] = scaler.fit_transform(batting_stats_23[batter_cols_to_normalize])

In [75]:
pitching_stats_23.to_csv('pitching_stats_23.csv', index=False)
batting_stats_23.to_csv('batting_stats_23.csv', index=False)

In [None]:
df = pd.read_csv('./data/final_final_encoded_savant_2024.csv')
df.head(50)

In [6]:
df.loc[:,'player_game_id'] =  df['game_pk'].astype(str) + df['pitcher'].astype(str)
df.to_csv('player_game_savant.csv')

### Pitch Prediction Accuracy Validation (Plate Memory -- Live Data Only)

In [12]:
import pickle
import torch
import model

# Load model config/specs
with open("./models/pitch_predictor_lstm_meta1.pkl", "rb") as f:
    meta = pickle.load(f)

# Instantiate the model using the loaded specs
loaded_mod = model.PitchPredictorLSTM(
    context_dim=meta["lstm_init_args"]['context_dim'],
    memory_dim=meta["lstm_init_args"]['memory_dim'],
    num_pitchers=len(meta["pitcher_to_id"]),
    num_batters=len(meta["batter_to_id"]),
    pitcher_embed_dim=16,
    batter_embed_dim=16,
    lstm_hidden_dim=128,
    num_pitch_types=4
)

# Load the trained weights
loaded_mod.load_state_dict(torch.load("./models/pitch_predictor_lstm1.pth", map_location="cpu"))
loaded_mod.eval()


PitchPredictorLSTM(
  (pitcher_feat_embed): Sequential(
    (0): Linear(in_features=37, out_features=32, bias=True)
    (1): ReLU()
    (2): Linear(in_features=32, out_features=16, bias=True)
  )
  (batter_feat_embed): Sequential(
    (0): Linear(in_features=24, out_features=32, bias=True)
    (1): ReLU()
    (2): Linear(in_features=32, out_features=16, bias=True)
  )
  (lstm): LSTM(18, 128, num_layers=2, batch_first=True, dropout=0.2)
  (context_fc): Sequential(
    (0): Linear(in_features=47, out_features=128, bias=True)
    (1): ReLU()
    (2): Dropout(p=0.3, inplace=False)
    (3): Linear(in_features=128, out_features=64, bias=True)
  )
  (combined_fc): Sequential(
    (0): Linear(in_features=192, out_features=128, bias=True)
    (1): ReLU()
    (2): Dropout(p=0.3, inplace=False)
    (3): Linear(in_features=128, out_features=64, bias=True)
    (4): ReLU()
    (5): Linear(in_features=64, out_features=4, bias=True)
  )
)

In [30]:
import model
batter_df = pd.read_csv('./data/batting_stats_23.csv')
pitcher_df = pd.read_csv('./data/pitching_stats_23.csv')
with open('./data/val_dataset_emb.pkl', 'rb') as f:
            val_dataset = pickle.load(f)
            
pitcher_stat_cols = ['#days', 'Age', 'G', 'GS', 'W', 'L', 'SV', 'IP', 'H', 'R', 'ER', 'BB', 
                                  'SO', 'HR', 'HBP', 'ERA', 'AB', '2B', '3B', 'IBB', 'GDP', 'SF', 'SB', 'CS', 'PO', 'BF', 'Pit', 
                                  'Str', 'StL', 'StS', 'GB/FB', 'LD', 'PU', 'WHIP', 'BAbip', 'SO9', 'SO/W']
        
batter_stat_cols = ['#days', 'Age', 'G', 'PA', 'AB', 'R', 'H', '2B', '3B', 'HR', 'RBI', 'BB', 'IBB', 'SO', 'HBP', 
                            'SH', 'SF', 'GDP', 'SB', 'CS', 'BA', 'OBP', 'SLG', 'OPS']
count = 0
correct = 0
non_fast_pred_count = 0
for context, pitcher_id, batter_id, memory_seq, pitcher_stats, batter_stats, target in val_dataset:

    pitcher_row = pitcher_df[pitcher_df['mlbID'] == pitcher_id]
    if not pitcher_row.empty:
        pitcher_stats_vec = torch.FloatTensor(pitcher_row[pitcher_stat_cols].values[0])
    else:
        pitcher_stats_vec = torch.zeros(len(pitcher_stat_cols))

    batter_row = batter_df[batter_df['mlbID'] == batter_id]
    if not batter_row.empty:
        batter_stats_vec = torch.FloatTensor(batter_row[batter_stat_cols].values[0])
    else:
        batter_stats_vec = torch.zeros(len(batter_stat_cols))

    probs = model.predict_next_pitch(loaded_mod, current_context=context, pitcher_id=[pitcher_id.item()], 
                    batter_id=[batter_id.item()], previous_pitches_in_pa=memory_seq,
                    pitcher_stats=pitcher_stats_vec.unsqueeze(0), batter_stats=batter_stats_vec.unsqueeze(0))
    

    idx_to_pitch_type = {v: k for k, v in meta['pitch_type_to_idx'].items()}
    probs = {idx_to_pitch_type[i]: float(probs[0, i]) for i in range(probs.shape[1])}
    # pitch_probs

    highest_key = ''
    highest_prob = 0
    #print(probs)
    for key in probs:
        if probs[key] > highest_prob:
            highest_prob = probs[key]
            highest_key = key
    
    pitch_types = ['FAST', 'OFF', 'BREAK', 'OTH']
    print(f'Actual: {pitch_types[target]} Predicted: {highest_key} with prob {highest_prob: 0.4f}')
    count+=1
    if (highest_key != 'FAST'):
        non_fast_pred_count += 1
    if (pitch_types[target] == highest_key):
        correct += 1

Actual: FAST Predicted: FAST with prob  0.7308
Actual: FAST Predicted: FAST with prob  0.6766
Actual: FAST Predicted: FAST with prob  0.5463
Actual: BREAK Predicted: FAST with prob  0.5706
Actual: FAST Predicted: FAST with prob  0.4845
Actual: OFF Predicted: FAST with prob  0.5091
Actual: OFF Predicted: FAST with prob  0.5975
Actual: BREAK Predicted: FAST with prob  0.5433
Actual: BREAK Predicted: FAST with prob  0.4740
Actual: OFF Predicted: FAST with prob  0.5326
Actual: BREAK Predicted: FAST with prob  0.5262
Actual: FAST Predicted: FAST with prob  0.5802
Actual: BREAK Predicted: FAST with prob  0.5143
Actual: FAST Predicted: FAST with prob  0.5889
Actual: BREAK Predicted: FAST with prob  0.4866
Actual: FAST Predicted: FAST with prob  0.7032
Actual: FAST Predicted: FAST with prob  0.4741
Actual: FAST Predicted: FAST with prob  0.4731
Actual: FAST Predicted: FAST with prob  0.6974
Actual: BREAK Predicted: FAST with prob  0.6117
Actual: BREAK Predicted: FAST with prob  0.5356
Actual: 

In [31]:
correct/count

0.5657145481263777

In [32]:
non_fast_pred_count/count

0.023902461425422482

### More Potential Batting Data

In [None]:
from pybaseball import statcast_pitcher_pitch_arsenal

# get average pitch speed data for all qualified pitchers in 2023
data = statcast_pitcher_pitch_arsenal(2023)
data['pitcher'].unique()

In [54]:
pitch_arsenal_df = pd.read_csv('./data/pitch-arsenal-stats.csv')

In [52]:
pitch_arsenal_df['player_id'].unique()

array([642547, 669203, 669022, 592332, 656302, 668881, 607074, 684007,
       650911, 666142, 664285, 571578, 680730, 669194, 605135, 663903,
       657006, 690986, 571760, 657277, 671096, 450203, 661563, 605400,
       579328, 542881, 622491, 678394, 519242, 676979, 663559, 543135,
       656427, 601713, 676440, 656557, 554430, 625643, 641154, 663623,
       623167, 621244, 676664, 641482, 668678, 682243, 669923, 544150,
       669302, 686752, 641927, 668964, 572020, 676710, 656288, 665871,
       663372, 663855, 657746, 622663, 593423, 681293, 667755, 683155,
       608331, 683003, 543294, 608379, 500779, 596295, 669467, 607192,
       687765, 671737, 570632, 615698, 686613, 650633, 669373, 624133,
       641793, 669854, 670102, 573186, 607200, 694297, 640455, 680573,
       650644, 527048, 663362, 680570, 458681, 657612, 607067, 594902,
       434378, 694973, 656629, 621107, 693433, 686799, 671922, 680767,
       656222, 571945, 592351, 661403, 502043, 663474, 656605, 668933,
      

In [57]:
for pitcher_id, pitcher_data in pitch_arsenal_df.groupby('player_id'):
    print(pitcher_id)
    #print(pitcher_data.columns)
    print(pitcher_data[['last_name, first_name', 'pitch_type',
                    'run_value', 'pitch_usage']])

425794
     last_name, first_name pitch_type  run_value  pitch_usage
255       Wainwright, Adam         CU         -7         30.8
264       Wainwright, Adam         SI        -24         31.4
595       Wainwright, Adam         FC         -2         22.6
1188      Wainwright, Adam         FF         -4         10.0
2024      Wainwright, Adam         CH         -3          5.0
425844
     last_name, first_name pitch_type  run_value  pitch_usage
339          Greinke, Zack         FF          2         26.7
409          Greinke, Zack         SL          7         20.0
496          Greinke, Zack         CH          1         16.5
663          Greinke, Zack         SI         -4         15.6
863          Greinke, Zack         CU         -9         14.1
1200         Greinke, Zack         FC         -1          7.0
434378
     last_name, first_name pitch_type  run_value  pitch_usage
21       Verlander, Justin         FF          9         50.0
174      Verlander, Justin         SL          8 

In [56]:
# Get unique values from both columns
pitchers_unique = set(data['pitcher'].unique())
pitch_arsenal_unique = set(pitch_arsenal_df['player_id'].unique())
big_df_unique = set(pd.read_csv('./data/final_final_encoded_savant_2024.csv')['pitcher'].unique())

# Compare if they have the same elements
same_contents = big_df_unique == pitch_arsenal_unique

print(f"Do they have the same unique values? {same_contents}")

# If you want to see what is different:
only_in_pitchers = big_df_unique - pitch_arsenal_unique
only_in_arsenal = pitch_arsenal_unique - big_df_unique

print(f"Values only in data['pitcher']: {only_in_pitchers}")
print(f"Values only in pitch_arsenal_df['player_id']: {only_in_arsenal}")


Do they have the same unique values? False
Values only in data['pitcher']: {np.int64(669697), np.int64(663554), np.int64(692230), np.int64(808967), np.int64(808970), np.int64(595978), np.int64(593934), np.int64(663568), np.int64(663574), np.int64(669724), np.int64(661536), np.int64(661563), np.int64(692285), np.int64(694335), np.int64(677955), np.int64(677958), np.int64(571466), np.int64(667725), np.int64(694350), np.int64(675921), np.int64(694357), np.int64(677976), np.int64(694361), np.int64(682073), np.int64(688230), np.int64(622694), np.int64(694381), np.int64(641149), np.int64(678020), np.int64(571527), np.int64(675976), np.int64(673929), np.int64(596117), np.int64(680089), np.int64(542882), np.int64(694436), np.int64(688297), np.int64(671922), np.int64(547001), np.int64(694462), np.int64(622786), np.int64(696522), np.int64(702674), np.int64(663767), np.int64(657624), np.int64(665828), np.int64(657649), np.int64(663795), np.int64(700669), np.int64(676105), np.int64(571657), np.int