In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pybaseball import statcast_batter, statcast_pitcher
from pybaseball import statcast_pitcher_percentile_ranks
from pybaseball import pitching_stats
from pybaseball import playerid_lookup, playerid_reverse_lookup
from surprise.model_selection import cross_validate
from surprise.prediction_algorithms import NormalPredictor, BaselineOnly
from surprise.prediction_algorithms import SVD, SVDpp, NMF
from surprise.prediction_algorithms import KNNWithMeans, KNNBasic, KNNBaseline
from surprise.model_selection import GridSearchCV, cross_validate

In [16]:
# Reading in data from clustering
## CSV all 2023 pitches from pitchers with at least 10 innings pitched.
df = pd.read_csv('../data/pitcher_data.zip')
df.shape

(610225, 92)

In [17]:
df.head(10)

Unnamed: 0,pitch_type,game_date,release_speed,release_pos_x,release_pos_z,player_name,batter,pitcher,events,description,...,fld_score,post_away_score,post_home_score,post_bat_score,post_fld_score,if_fielding_alignment,of_fielding_alignment,spin_axis,delta_home_win_exp,delta_run_exp
0,SI,2023-06-18,85.8,-3.79,2.01,"Cimber, Adam",543760,643256,double,hit_into_play,...,7,7,10,10,7,Standard,Standard,283.0,0.01,0.38
1,SI,2023-06-18,84.9,-3.68,2.0,"Cimber, Adam",543760,643256,,foul,...,7,7,10,10,7,Standard,Standard,283.0,0.0,0.0
2,SL,2023-06-18,75.5,-3.99,1.91,"Cimber, Adam",543760,643256,,ball,...,7,7,10,10,7,Standard,Standard,115.0,0.0,0.059
3,FF,2023-06-18,87.0,-3.82,2.09,"Cimber, Adam",543760,643256,,foul,...,7,7,10,10,7,Standard,Standard,286.0,0.0,0.0
4,FF,2023-06-18,86.4,-3.75,2.02,"Cimber, Adam",543760,643256,,foul,...,7,7,10,10,7,Standard,Standard,275.0,0.0,0.0
5,FF,2023-06-18,86.8,-3.8,2.16,"Cimber, Adam",543760,643256,,ball,...,7,7,10,10,7,Standard,Standard,284.0,0.0,0.029
6,SL,2023-06-18,74.6,-3.91,1.82,"Cimber, Adam",543760,643256,,foul,...,7,7,10,10,7,Standard,Standard,109.0,0.0,-0.043
7,SL,2023-06-18,74.1,-3.9,1.82,"Cimber, Adam",543760,643256,,ball,...,7,7,10,10,7,Standard,Standard,115.0,0.0,0.019
8,SI,2023-06-18,85.9,-3.77,2.0,"Cimber, Adam",543760,643256,,foul,...,7,7,10,10,7,Infield shade,Standard,283.0,0.0,-0.026
9,FF,2023-06-18,86.0,-3.9,2.02,"Cimber, Adam",665750,643256,home_run,hit_into_play,...,7,7,10,10,7,Standard,Standard,282.0,0.145,1.638


In [18]:
df['events'].value_counts()

events
field_out                       61883
strikeout                       35370
single                          22028
walk                            12865
double                           7005
home_run                         4982
force_out                        3049
grounded_into_double_play        2940
hit_by_pitch                     1747
sac_fly                          1008
field_error                       906
triple                            607
sac_bunt                          348
double_play                       330
fielders_choice                   309
fielders_choice_out               242
caught_stealing_2b                166
strikeout_double_play              90
catcher_interf                     80
sac_fly_double_play                21
other_out                          18
pickoff_1b                         12
caught_stealing_home                9
caught_stealing_3b                  9
pickoff_caught_stealing_2b          7
wild_pitch                          5
picko

In [19]:
df['events'].unique()

array(['double', nan, 'home_run', 'single', 'field_out',
       'caught_stealing_2b', 'field_error', 'strikeout',
       'grounded_into_double_play', 'hit_by_pitch', 'sac_fly', 'walk',
       'force_out', 'fielders_choice', 'double_play',
       'sac_fly_double_play', 'triple', 'fielders_choice_out', 'sac_bunt',
       'caught_stealing_home', 'catcher_interf', 'other_out',
       'strikeout_double_play', 'caught_stealing_3b', 'pickoff_1b',
       'stolen_base_2b', 'wild_pitch', 'pickoff_2b', 'triple_play',
       'pickoff_caught_stealing_2b', 'pickoff_error_3b', 'pickoff_3b',
       'stolen_base_3b', 'pickoff_caught_stealing_home'], dtype=object)

In [20]:
df.columns

Index(['pitch_type', 'game_date', 'release_speed', 'release_pos_x',
       'release_pos_z', 'player_name', 'batter', 'pitcher', 'events',
       'description', 'spin_dir', 'spin_rate_deprecated',
       'break_angle_deprecated', 'break_length_deprecated', 'zone', 'des',
       'game_type', 'stand', 'p_throws', 'home_team', 'away_team', 'type',
       'hit_location', 'bb_type', 'balls', 'strikes', 'game_year', 'pfx_x',
       'pfx_z', 'plate_x', 'plate_z', 'on_3b', 'on_2b', 'on_1b',
       'outs_when_up', 'inning', 'inning_topbot', 'hc_x', 'hc_y',
       'tfs_deprecated', 'tfs_zulu_deprecated', 'fielder_2', 'umpire', 'sv_id',
       'vx0', 'vy0', 'vz0', 'ax', 'ay', 'az', 'sz_top', 'sz_bot',
       'hit_distance_sc', 'launch_speed', 'launch_angle', 'effective_speed',
       'release_spin_rate', 'release_extension', 'game_pk', 'pitcher.1',
       'fielder_2.1', 'fielder_3', 'fielder_4', 'fielder_5', 'fielder_6',
       'fielder_7', 'fielder_8', 'fielder_9', 'release_pos_y',
       'estima

In [21]:
df['woba_value'].value_counts()

woba_value
0.00    105098
0.90     23272
0.70     14760
1.25      7005
2.00      4982
1.60       607
0.20       322
Name: count, dtype: int64

In [22]:
df['woba_denom'].value_counts()

woba_denom
1.0    155172
0.0       579
Name: count, dtype: int64

In [23]:
df.loc[df['woba_denom'] == 0]['events'].value_counts()

events
sac_bunt                        344
caught_stealing_2b              166
other_out                        18
pickoff_1b                       12
caught_stealing_home              9
caught_stealing_3b                9
pickoff_caught_stealing_2b        7
wild_pitch                        5
pickoff_3b                        3
stolen_base_2b                    2
pickoff_2b                        1
pickoff_error_3b                  1
stolen_base_3b                    1
pickoff_caught_stealing_home      1
Name: count, dtype: int64

In [24]:
df.loc[df['woba_value'] == 2]['events'].value_counts()

events
home_run    4982
Name: count, dtype: int64

In [25]:
df.loc[df['woba_value'] == 0.7]['events'].value_counts()

events
walk              12865
hit_by_pitch       1747
catcher_interf       80
strikeout            68
Name: count, dtype: int64

In [26]:
df.loc[(df['woba_value'] == 0.7) & (df['events'] == 'strikeout')]

Unnamed: 0,pitch_type,game_date,release_speed,release_pos_x,release_pos_z,player_name,batter,pitcher,events,description,...,fld_score,post_away_score,post_home_score,post_bat_score,post_fld_score,if_fielding_alignment,of_fielding_alignment,spin_axis,delta_home_win_exp,delta_run_exp
6906,SL,2023-08-20,84.4,-1.74,6.61,"Gallegos, Giovanny",624413,606149,strikeout,swinging_strike,...,7,2,7,2,7,Standard,Standard,295.0,-0.005,0.391
12107,ST,2023-08-03,85.2,-1.53,5.52,"Weaver, Luke",663538,596133,strikeout,swinging_strike,...,1,1,1,1,1,Standard,Standard,47.0,0.014,0.179
24378,SL,2023-09-29,86.5,-1.62,6.12,"Cease, Dylan",676946,656302,strikeout,swinging_strike,...,0,0,0,0,0,Standard,Standard,106.0,-0.083,0.506
30496,CU,2023-04-30,81.2,-0.82,6.54,"Staumont, Josh",596146,622251,strikeout,swinging_strike,...,0,0,8,8,0,Infield shade,Standard,10.0,0.001,0.363
31945,SL,2023-07-18,83.3,-0.83,6.47,"Elder, Bryce",682998,693821,strikeout,swinging_strike,...,5,6,5,6,5,Standard,Standard,13.0,-0.136,0.642
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
554171,SL,2023-04-14,79.6,-1.45,5.79,"Clevinger, Mike",602104,605182,strikeout,swinging_strike,...,1,0,1,0,1,Standard,Standard,50.0,-0.029,0.306
561789,KC,2023-05-16,86.1,-1.77,5.67,"Cobb, Alex",607208,502171,strikeout,swinging_strike,...,2,2,2,2,2,Strategic,Standard,32.0,-0.016,0.741
568342,SL,2023-04-09,85.2,-2.30,5.16,"Bickford, Phil",677950,641360,strikeout,swinging_strike,...,5,5,11,11,5,Infield shade,Standard,80.0,0.000,0.124
581858,SL,2023-08-03,90.1,0.29,6.13,"Chapman, Aroldis",572204,547973,strikeout,swinging_strike,...,4,3,4,3,4,Standard,Strategic,269.0,-0.036,0.539


In [27]:
df.loc[df['woba_value'] == 0.2]['events'].value_counts()

events
sac_bunt    322
Name: count, dtype: int64

## Redoing pitch level dataframe to have pitchers with over 100

In [28]:
pitching_stats_df = pitching_stats(2023, qual=10)
pitching_stats_df.shape

(659, 393)

In [29]:
pitching_stats_df1 = pitching_stats(2023, qual=75)
pitching_stats_df1.shape

(184, 393)

In [30]:
""" pitching_stats_df1 = pitching_stats(2023, qual=75)

# Iterate through the names and print them
selected_pitchers = []
for name in pitching_stats_df1['Name']:
    selected_pitchers.append(name)

last_selected_pitchers = [name.split()[-1] for name in selected_pitchers]
first_selected_pitchers = [name.split()[0] for name in selected_pitchers]

list = zip(first_selected_pitchers, last_selected_pitchers)
result = set(list)

player_ids = []

# Iterate through the list of player name tuples
for first_name, last_name in result:
    try:
        # Use playerid_lookup function with first and last names
        player_id_df = playerid_lookup(last_name, first_name)

        # Check if the DataFrame is not empty and add to the list
        if player_id_df is not None and not player_id_df.empty:
            player_ids.append(player_id_df)
    except Exception as e:
        print(f"Error fetching player ID for {first_name} {last_name}: {e}")

# Optional: Combine all the individual DataFrames into one
combined_player_ids = pd.concat(player_ids, ignore_index=True)

combined_player_ids = combined_player_ids['key_mlbam']

combined_df = pd.DataFrame()

# Iterate through each pitcher's name and fetch their data
for id in combined_player_ids:
    try:
        # Fetch data for the pitcher
        pitcher_df = statcast_pitcher('2023-04-01', '2023-09-30', id)

        # Check if the DataFrame is not empty
        if pitcher_df is not None and not pitcher_df.empty:
            # Append the data to the combined DataFrame
            combined_df = pd.concat([combined_df, pitcher_df])
    except Exception as e:
        print(f"Error fetching data for {name}: {e}") """

' pitching_stats_df1 = pitching_stats(2023, qual=75)\n\n# Iterate through the names and print them\nselected_pitchers = []\nfor name in pitching_stats_df1[\'Name\']:\n    selected_pitchers.append(name)\n\nlast_selected_pitchers = [name.split()[-1] for name in selected_pitchers]\nfirst_selected_pitchers = [name.split()[0] for name in selected_pitchers]\n\nlist = zip(first_selected_pitchers, last_selected_pitchers)\nresult = set(list)\n\nplayer_ids = []\n\n# Iterate through the list of player name tuples\nfor first_name, last_name in result:\n    try:\n        # Use playerid_lookup function with first and last names\n        player_id_df = playerid_lookup(last_name, first_name)\n\n        # Check if the DataFrame is not empty and add to the list\n        if player_id_df is not None and not player_id_df.empty:\n            player_ids.append(player_id_df)\n    except Exception as e:\n        print(f"Error fetching player ID for {first_name} {last_name}: {e}")\n\n# Optional: Combine all the

In [31]:
def fetch_pitcher_data_for_years(years):
    combined_df = pd.DataFrame()

    for year in years:
        # Fetch pitching stats for the given year
        pitching_stats_df = pitching_stats(year, qual=75)

        # Extract first and last names
        last_names = [name.split()[-1] for name in pitching_stats_df['Name']]
        first_names = [name.split()[0] for name in pitching_stats_df['Name']]
        name_tuples = zip(first_names, last_names)
        result = set(name_tuples)

        player_ids = []

        # Fetch player IDs
        for first_name, last_name in result:
            try:
                player_id_df = playerid_lookup(last_name, first_name)
                if player_id_df is not None and not player_id_df.empty:
                    player_ids.append(player_id_df)
            except Exception as e:
                print(f"Error fetching player ID for {first_name} {last_name}: {e}")

        # Combine individual DataFrames
        combined_player_ids = pd.concat(player_ids, ignore_index=True)['key_mlbam']

        # Fetch data for each pitcher
        for id in combined_player_ids:
            try:
                pitcher_df = statcast_pitcher(f'{year}-04-01', f'{year}-09-30', id)
                if pitcher_df is not None and not pitcher_df.empty:
                    combined_df = pd.concat([combined_df, pitcher_df])
            except Exception as e:
                print(f"Error fetching data for {id}: {e}")

    return combined_df

# Example usage
years = [2021, 2022, 2023] 
combined_data = fetch_pitcher_data_for_years(years)


Gathering player lookup table. This may take a moment.
Gathering Player Data
Gathering Player Data
Gathering Player Data
Gathering Player Data
Gathering Player Data
Gathering Player Data
Gathering Player Data
Gathering Player Data
Gathering Player Data
Gathering Player Data
Gathering Player Data
Gathering Player Data
Gathering Player Data
Gathering Player Data
Gathering Player Data
Gathering Player Data
Gathering Player Data
Gathering Player Data
Gathering Player Data
Gathering Player Data
Gathering Player Data
Gathering Player Data
Gathering Player Data
Gathering Player Data
Gathering Player Data
Gathering Player Data
Gathering Player Data
Gathering Player Data
Gathering Player Data
Gathering Player Data
Gathering Player Data
Gathering Player Data
Gathering Player Data
Gathering Player Data
Gathering Player Data
Gathering Player Data
Gathering Player Data
Gathering Player Data
Gathering Player Data
Gathering Player Data
Gathering Player Data
Gathering Player Data
Gathering Player Data

In [32]:
combined_data.to_csv('../data/pitch_data_21_22_23.zip', index=False)

In [4]:
combined_data = pd.read_csv('../data/pitch_data_21_22_23.zip')

In [5]:
combined_data.shape

(978564, 92)

In [6]:
combined_data['events'].info()

<class 'pandas.core.series.Series'>
RangeIndex: 978564 entries, 0 to 978563
Series name: events
Non-Null Count   Dtype 
--------------   ----- 
252509 non-null  object
dtypes: object(1)
memory usage: 7.5+ MB


In [7]:
combined_data_filtered = combined_data.dropna(subset=['events'])
combined_data_filtered.shape

(252509, 92)

In [8]:
combined_data_filtered['events'].unique()

array(['strikeout', 'field_out', 'home_run', 'single', 'hit_by_pitch',
       'walk', 'grounded_into_double_play', 'double', 'field_error',
       'force_out', 'double_play', 'sac_fly', 'sac_bunt',
       'strikeout_double_play', 'triple', 'caught_stealing_2b',
       'fielders_choice', 'fielders_choice_out', 'other_out',
       'catcher_interf', 'pickoff_1b', 'sac_bunt_double_play',
       'pickoff_caught_stealing_2b', 'sac_fly_double_play', 'pickoff_2b',
       'caught_stealing_3b', 'caught_stealing_home', 'pickoff_3b',
       'triple_play', 'stolen_base_2b', 'pickoff_caught_stealing_3b',
       'pickoff_caught_stealing_home'], dtype=object)

In [9]:
events_to_keep = ['home_run', 'walk', 'single', 'double', 'hit_by_pitch', 'sac_fly', 'field_out', 'strikeout', 'triple', 
                  'double_play', 'force_out', 'grounded_into_double_play', 'strikeout_double_play', 'fielders_choice', 
                  'fielders_choice_out', 'sac_fly_double_play', 'triple_play', 'field_error'  
                  ]
combined_data_filtered = combined_data_filtered[combined_data_filtered['events'].isin(events_to_keep)]


In [10]:
combined_data_filtered['events'].unique()

array(['strikeout', 'field_out', 'home_run', 'single', 'hit_by_pitch',
       'walk', 'grounded_into_double_play', 'double', 'field_error',
       'force_out', 'double_play', 'sac_fly', 'strikeout_double_play',
       'triple', 'fielders_choice', 'fielders_choice_out',
       'sac_fly_double_play', 'triple_play'], dtype=object)

In [11]:
combined_data_filtered.head()

Unnamed: 0,pitch_type,game_date,release_speed,release_pos_x,release_pos_z,player_name,batter,pitcher,events,description,...,fld_score,post_away_score,post_home_score,post_bat_score,post_fld_score,if_fielding_alignment,of_fielding_alignment,spin_axis,delta_home_win_exp,delta_run_exp
0,FF,2021-09-26,99.2,-2.15,5.88,"Ohtani, Shohei",641343,660271,strikeout,foul_tip,...,1,1,1,1,1,Infield shift,Standard,214.0,0.019,-0.073
5,ST,2021-09-26,85.2,-2.27,5.79,"Ohtani, Shohei",608596,660271,field_out,hit_into_play,...,1,1,1,1,1,Infield shift,Standard,70.0,0.028,-0.189
7,ST,2021-09-26,85.3,-2.3,5.71,"Ohtani, Shohei",672284,660271,home_run,hit_into_play,...,1,1,1,1,1,Infield shift,Standard,76.0,-0.204,1.006
10,FS,2021-09-26,90.6,-2.15,5.9,"Ohtani, Shohei",641584,660271,strikeout,called_strike,...,1,0,1,0,1,Infield shift,Standard,240.0,0.043,-0.174
14,ST,2021-09-26,85.0,-2.29,5.8,"Ohtani, Shohei",647351,660271,field_out,hit_into_play,...,1,0,1,0,1,Infield shift,Standard,78.0,0.071,-0.524


In [12]:
combined_data_filtered['batter'].value_counts()

batter
543760    1078
518692    1021
502671    1015
665489    1008
621566     973
          ... 
691191       1
691284       1
695508       1
692036       1
680672       1
Name: count, Length: 1300, dtype: int64

Now we need to drop rows where the batters have seen the pitcher fewer than 10 times

In [13]:
batter_counts = pd.DataFrame(combined_data_filtered['batter'].value_counts())
batter_counts = batter_counts.loc[batter_counts['count'] > 10]
combined_data_filtered = combined_data_filtered[combined_data_filtered['batter'].isin(batter_counts.index)]


In [14]:
combined_data_filtered.shape

(250237, 92)

In [15]:
cols = ['batter', 'pitcher', 'events']
combined_data_filtered = combined_data_filtered[cols]

In [16]:
combined_data_filtered

Unnamed: 0,batter,pitcher,events
0,641343,660271,strikeout
5,608596,660271,field_out
7,672284,660271,home_run
10,641584,660271,strikeout
14,647351,660271,field_out
...,...,...,...
978539,630105,669194,walk
978545,593428,669194,field_out
978550,592518,669194,field_out
978556,665742,669194,home_run


In [17]:
def woba_value(events):
    if events == 'home_run':
        return 2.014
    elif events == 'triple':
        return 1.575
    elif events == 'double':
        return 1.248
    elif events == 'single':
        return 0.855
    elif events == 'hit_by_pitch':
        return 0.727
    elif events == 'walk':
        return 0.697
    else:
        return 0    

combined_data_filtered['woba_value'] = combined_data_filtered['events'].apply(woba_value)

In [18]:
combined_data_filtered.groupby(['batter', 'pitcher']).sum()

Unnamed: 0_level_0,Unnamed: 1_level_0,events,woba_value
batter,pitcher,Unnamed: 2_level_1,Unnamed: 3_level_1
405395,425794,field_outfield_outfield_out,0.000
405395,425844,strikeoutfield_outfield_outstrikeoutfield_outf...,0.000
405395,448179,field_outhome_run,2.014
405395,453286,singlesingle,1.710
405395,456501,strikeoutgrounded_into_double_play,0.000
...,...,...,...
807799,676664,strikeoutfield_out,0.000
807799,680694,field_outfield_outsinglegrounded_into_double_play,0.855
807799,682243,field_outdoublefield_out,1.248
807799,686610,doublestrikeoutfield_outhome_run,3.262


In [19]:
woba_denom_df = combined_data_filtered.groupby(['batter', 'pitcher', 'events']).count()

In [20]:
# Define a function to determine if an event counts as an AB, BB, SF, or HBP
def categorize_event(event):
    if event in ['single', 'double', 'triple', 'home_run', 'field_out', 'strikeout', 'etc...']:  # add all events that count as AB
        return 'AB'
    elif event == 'walk':
        return 'BB'
    elif event == 'sac_fly':
        return 'SF'
    elif event == 'hit_by_pitch':
        return 'HBP'
    else:
        return 'Other'

# Apply the function to create a new column
combined_data_filtered['event_category'] = combined_data_filtered['events'].apply(categorize_event)

# Calculate the denominator
# Sum up AB, BB, SF, and HBP for each batter-pitcher pair
woba_denominator = combined_data_filtered[combined_data_filtered['event_category'].isin(['AB', 'BB', 'SF', 'HBP'])].groupby(['batter', 'pitcher'])['event_category'].value_counts().unstack(fill_value=0)
woba_denominator['denominator'] = woba_denominator.sum(axis=1)

# Reset the index to make 'batter' and 'pitcher' columns
woba_denominator.reset_index(inplace=True)



In [21]:
woba_denominator['denominator'].describe()

count    64177.000000
mean         3.702386
std          3.152337
min          1.000000
25%          2.000000
50%          3.000000
75%          5.000000
max         41.000000
Name: denominator, dtype: float64

In [23]:
woba_numerator = combined_data_filtered.groupby(['batter', 'pitcher'])['woba_value'].sum().reset_index(name='numerator')

# Merge the numerator and denominator dataframes
woba_df = pd.merge(woba_numerator, woba_denominator, on=['batter', 'pitcher'])

# Calculate wOBA
woba_df['wOBA'] = woba_df['numerator'] / woba_df['denominator']

# Handling cases where the denominator is 0 to avoid division by zero errors
woba_df['wOBA'] = woba_df['wOBA'].fillna(0)

# Display the dataframe
woba_df


Unnamed: 0,batter,pitcher,numerator,AB,BB,HBP,SF,denominator,wOBA
0,405395,425794,0.000,3,0,0,0,3,0.0000
1,405395,425844,0.000,9,0,0,0,9,0.0000
2,405395,448179,2.014,2,0,0,0,2,1.0070
3,405395,453286,1.710,2,0,0,0,2,0.8550
4,405395,456501,0.000,1,0,0,0,1,0.0000
...,...,...,...,...,...,...,...,...,...
64172,807799,676664,0.000,2,0,0,0,2,0.0000
64173,807799,680694,0.855,3,0,0,0,3,0.2850
64174,807799,682243,1.248,3,0,0,0,3,0.4160
64175,807799,686610,3.262,4,0,0,0,4,0.8155


In [24]:
batter_vs_pitcher_woba_df = woba_df.drop(columns = ['numerator', 'AB', 'BB', 'SF', 'HBP', 'denominator'])

In [25]:
batter_vs_pitcher_woba_df

Unnamed: 0,batter,pitcher,wOBA
0,405395,425794,0.0000
1,405395,425844,0.0000
2,405395,448179,1.0070
3,405395,453286,0.8550
4,405395,456501,0.0000
...,...,...,...
64172,807799,676664,0.0000
64173,807799,680694,0.2850
64174,807799,682243,0.4160
64175,807799,686610,0.8155


In [26]:
batter_vs_pitcher_woba_df.to_csv('../data/batter_vs_pitcher_woba.zip', index=False)

In [27]:
woba_df.isna().sum()

batter         0
pitcher        0
numerator      0
AB             0
BB             0
HBP            0
SF             0
denominator    0
wOBA           0
dtype: int64

In [28]:
# Filter the DataFrame to include only rows with 5 or more ABs
filtered_woba_df = woba_df[woba_df['AB'] >= 5]

# Calculate the average wOBA for each batter
average_woba = filtered_woba_df.groupby('batter')['wOBA'].mean().reset_index()

# Find the pitcher against whom each batter had the highest wOBA
max_woba_pitcher = filtered_woba_df.loc[filtered_woba_df.groupby('batter')['wOBA'].idxmax()][['batter', 'pitcher', 'wOBA']]

# Merge the two dataframes
woba_eda_comb = pd.merge(average_woba, max_woba_pitcher, on='batter', suffixes=('_avg', '_max'))

# Sort the dataframe by average wOBA in descending order and select the top 25
top_25_batters = woba_eda_comb.sort_values(by='wOBA_avg', ascending=False).head(25)

# Rename the columns for clarity
top_25_batters.rename(columns={'pitcher': 'fav_pitcher', 'wOBA_avg': 'average_wOBA', 'wOBA_max': 'max_wOBA'}, inplace=True)

# Display the dataframe
top_25_batters


Unnamed: 0,batter,average_wOBA,fav_pitcher,max_wOBA
516,666181,0.662564,607067,0.883167
122,571718,0.609,641927,0.609
521,666310,0.600857,663903,0.600857
441,661531,0.5738,596001,0.5738
663,682848,0.5646,605400,0.5646
388,650968,0.513,608344,0.513
105,546990,0.506667,621219,0.671333
42,502210,0.4972,543101,0.5738
601,672515,0.492596,656756,0.9944
103,545361,0.490355,425844,0.982444


In [29]:
# Extract unique player IDs for batters and pitchers
unique_batter_ids = top_25_batters['batter'].unique()
unique_pitcher_ids = top_25_batters['fav_pitcher'].unique()

# Combine and find unique IDs
all_unique_ids = set(list(unique_batter_ids) + list(unique_pitcher_ids))

# Find the names of the players
player_data = playerid_reverse_lookup(list(all_unique_ids), key_type='mlbam')

# Create a mapping from player ID to player name
player_name_mapping = dict(zip(player_data['key_mlbam'], player_data['name_first'] + ' ' + player_data['name_last']))

# Replace the IDs in the DataFrame with names
top_25_batters['batter'] = top_25_batters['batter'].map(player_name_mapping)
top_25_batters['fav_pitcher'] = top_25_batters['fav_pitcher'].map(player_name_mapping)

# Display the updated dataframe
top_25_batters

Gathering player lookup table. This may take a moment.


Unnamed: 0,batter,average_wOBA,fav_pitcher,max_wOBA
516,will benson,0.662564,colin rea,0.883167
122,brian goodwin,0.609,bailey ober,0.609
521,bo naylor,0.600857,brady singer,0.600857
441,brian serven,0.5738,jakob junis,0.5738
663,endy rodríguez,0.5646,aaron nola,0.5646
388,yohel pozo,0.513,cole irvin,0.513
105,anthony alford,0.506667,alec mills,0.671333
42,josh reddick,0.4972,anthony desclafani,0.5738
601,gabriel moreno,0.492596,jordan montgomery,0.9944
103,mike trout,0.490355,zack greinke,0.982444


In [30]:
batter_vs_pitcher_woba_df

Unnamed: 0,batter,pitcher,wOBA
0,405395,425794,0.0000
1,405395,425844,0.0000
2,405395,448179,1.0070
3,405395,453286,0.8550
4,405395,456501,0.0000
...,...,...,...
64172,807799,676664,0.0000
64173,807799,680694,0.2850
64174,807799,682243,0.4160
64175,807799,686610,0.8155


In [31]:
batter_vs_pitcher_woba_df['wOBA'].describe()

count    64177.000000
mean         0.318605
std          0.338300
min          0.000000
25%          0.000000
50%          0.285000
75%          0.474667
max          2.014000
Name: wOBA, dtype: float64

### Modeling!
Starting with a dummy model and grid iterating new models to improve RMSE

In [32]:
from surprise import Reader, Dataset

In [33]:
reader = Reader(rating_scale=(0, 2.014000))
data = Dataset.load_from_df(batter_vs_pitcher_woba_df,reader)
modeling_data = data.build_full_trainset()
print('Number of users: ', modeling_data.n_users, '\n')
print('Number of items: ', modeling_data.n_items)

Number of users:  918 

Number of items:  257


In [34]:
dummy_model = NormalPredictor()

In [35]:
# Run 5-fold cross-validation and print results
cross_validate(dummy_model, data, measures=["RMSE", "MAE"], cv=5, verbose=True)

Evaluating RMSE, MAE of algorithm NormalPredictor on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.4468  0.4436  0.4437  0.4459  0.4517  0.4464  0.0030  
MAE (testset)     0.3419  0.3423  0.3423  0.3433  0.3473  0.3434  0.0020  
Fit time          0.03    0.02    0.02    0.02    0.02    0.02    0.00    
Test time         0.10    0.02    0.02    0.02    0.02    0.03    0.03    


{'test_rmse': array([0.44683063, 0.44357905, 0.44368403, 0.44593734, 0.45174779]),
 'test_mae': array([0.34193482, 0.34229302, 0.34229289, 0.3433308 , 0.34728996]),
 'fit_time': (0.026107072830200195,
  0.023853778839111328,
  0.02427387237548828,
  0.020730972290039062,
  0.021393775939941406),
 'test_time': (0.09616684913635254,
  0.018305063247680664,
  0.01868605613708496,
  0.018889904022216797,
  0.01864910125732422)}

In [36]:
svd_params = {'n_factors': [20, 50, 100],
              'reg_all': [0.02, 0.05, 0.1]}
g_s_svd = GridSearchCV(SVD, param_grid=svd_params,n_jobs=-2)
g_s_svd.fit(data)

In [37]:
print(g_s_svd.best_score)
print(g_s_svd.best_params)

{'rmse': 0.33800456269935525, 'mae': 0.26129231334177705}
{'rmse': {'n_factors': 20, 'reg_all': 0.1}, 'mae': {'n_factors': 20, 'reg_all': 0.1}}


In [38]:
# cross validating with KNNBasic
knn_basic = KNNBasic(sim_options={'name':'pearson', 'user_based':True})
cv_knn_basic = cross_validate(knn_basic, data, n_jobs=-1)
for i in cv_knn_basic.items():
    print(i)
print('-----------------------')
print(np.mean(cv_knn_basic['test_rmse']))

Computing the pearson similarity matrix...
Computing the pearson similarity matrix...
Computing the pearson similarity matrix...
Computing the pearson similarity matrix...
Computing the pearson similarity matrix...
Done computing similarity matrix.
Done computing similarity matrix.
Done computing similarity matrix.
Done computing similarity matrix.
Done computing similarity matrix.
('test_rmse', array([0.34961655, 0.34420742, 0.34735929, 0.33791477, 0.34087085]))
('test_mae', array([0.26779414, 0.26635854, 0.26844825, 0.26179114, 0.26238029]))
('fit_time', (0.4343438148498535, 0.4526998996734619, 0.4468989372253418, 0.4742410182952881, 0.4594449996948242))
('test_time', (1.1939237117767334, 1.203782081604004, 1.1231229305267334, 1.180737018585205, 1.1165130138397217))
-----------------------
0.34399377490156396


In [39]:
# cross validating with KNNBaseline
knn_baseline = KNNBaseline(sim_options={'name':'pearson', 'user_based':True})
cv_knn_baseline = cross_validate(knn_baseline,data)

Estimating biases using als...
Computing the pearson similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson similarity matrix...
Done computing similarity matrix.


In [40]:
for i in cv_knn_baseline.items():
    print(i)

np.mean(cv_knn_baseline['test_rmse'])

('test_rmse', array([0.34218442, 0.3408963 , 0.34212713, 0.34706104, 0.34119568]))
('test_mae', array([0.2646887 , 0.26258931, 0.26466404, 0.2661258 , 0.26432764]))
('fit_time', (0.2300701141357422, 0.23146820068359375, 0.2495708465576172, 0.21898627281188965, 0.21941494941711426))
('test_time', (1.24229097366333, 1.209810733795166, 1.2051172256469727, 1.207317590713501, 1.296694040298462))


0.3426929143529489

In [44]:
svdpp = SVDpp()
cv_SVDpp = cross_validate(svdpp, data)
for i in cv_SVDpp.items():
    print(i)    

np.mean(cv_SVDpp['test_rmse'])

('test_rmse', array([0.33448208, 0.33898774, 0.34218427, 0.3417132 , 0.34152218]))
('test_mae', array([0.25823481, 0.26134577, 0.26315624, 0.26464575, 0.26427613]))
('fit_time', (2.155273199081421, 2.1164770126342773, 2.1277451515197754, 2.1425092220306396, 2.1746110916137695))
('test_time', (0.4870331287384033, 0.4787101745605469, 0.4775848388671875, 0.48217296600341797, 0.5008771419525146))


0.33977789365897165

In [45]:
NMF = NMF()
cv_NMF = cross_validate(NMF,data)
for i in cv_NMF.items():
    print(i)    

np.mean(cv_NMF['test_rmse'])

('test_rmse', array([0.34254465, 0.34090963, 0.34231331, 0.34582987, 0.3438303 ]))
('test_mae', array([0.2560463 , 0.25628624, 0.25649026, 0.25971637, 0.25986729]))
('fit_time', (0.6151978969573975, 0.4033682346343994, 0.4089481830596924, 0.431380033493042, 0.5725328922271729))
('test_time', (0.15158414840698242, 0.026921987533569336, 0.028068065643310547, 0.03718900680541992, 0.03399920463562012))


0.34308555172914684

In [46]:
svd_params1 = {'n_factors': [20, 50, 100],
              'reg_all': [0.1, .3, .5],
              'lr_all': [.005, .01, .02],
              'n_epochs': [10, 20, 30],
              'verbose': [True]
              }
g_s_svd1 = GridSearchCV(SVD, param_grid=svd_params1,n_jobs=-2)
g_s_svd1.fit(data)
print(g_s_svd1.best_score)
print(g_s_svd1.best_params)

Processing epoch 0
Processing epoch 1
Processing epoch 2
Processing epoch 3
Processing epoch 4
Processing epoch 5
Processing epoch 6
Processing epoch 0
Processing epoch 1
Processing epoch 7
Processing epoch 2
Processing epoch 8
Processing epoch 9
Processing epoch 3
Processing epoch 4
Processing epoch 5
Processing epoch 0
Processing epoch 6
Processing epoch 1
Processing epoch 7
Processing epoch 2
Processing epoch 8
Processing epoch 3
Processing epoch 9
Processing epoch 4
Processing epoch 5
Processing epoch 6
Processing epoch 7
Processing epoch 8
Processing epoch 9
Processing epoch 0
Processing epoch 1
Processing epoch 2
Processing epoch 3
Processing epoch 4
Processing epoch 5
Processing epoch 6
Processing epoch 7
Processing epoch 8
Processing epoch 9
Processing epoch 0
Processing epoch 1
Processing epoch 2
Processing epoch 3
Processing epoch 0
Processing epoch 4
Processing epoch 1
Processing epoch 5
Processing epoch 2
Processing epoch 6
Processing epoch 3
Processing epoch 7
Processing e

In [47]:
svd_params2 = {'n_factors': [100, 150, 200],
              'reg_all': [.5, .7, 1],
              'lr_all': [0.0001, 0.001, .005],
              'n_epochs': [10, 20, 30],
              'verbose': [True]
              }
g_s_svd2 = GridSearchCV(SVD, param_grid=svd_params2,n_jobs=-2)
g_s_svd2.fit(data)
print(g_s_svd2.best_score)
print(g_s_svd2.best_params)

Processing epoch 0
Processing epoch 1
Processing epoch 0
Processing epoch 2
Processing epoch 1
Processing epoch 3
Processing epoch 0
Processing epoch 4
Processing epoch 2
Processing epoch 1
Processing epoch 3
Processing epoch 5
Processing epoch 2
Processing epoch 0
Processing epoch 6
Processing epoch 4
Processing epoch 3
Processing epoch 7
Processing epoch 1
Processing epoch 0
Processing epoch 8
Processing epoch 4
Processing epoch 2
Processing epoch 5
Processing epoch 1
Processing epoch 9
Processing epoch 5
Processing epoch 3
Processing epoch 2
Processing epoch 6
Processing epoch 4
Processing epoch 3
Processing epoch 7
Processing epoch 6
Processing epoch 0
Processing epoch 5
Processing epoch 4
Processing epoch 8
Processing epoch 6
Processing epoch 1
Processing epoch 5
Processing epoch 7
Processing epoch 9
Processing epoch 7
Processing epoch 2
Processing epoch 6
Processing epoch 8
Processing epoch 8
Processing epoch 3
Processing epoch 0
Processing epoch 7
Processing epoch 9
Processing e

In [48]:
svd_params3 = {'n_factors': [50, 100, 150],
              'reg_all': [0, 0.05, 0.1, 0.5, 1],
              'lr_all': [0.0001, 0.001, .005],
              'n_epochs': [50, 100, 150]
              }
g_s_svd3 = GridSearchCV(SVD, param_grid=svd_params3,n_jobs=-2)
g_s_svd3.fit(data)
print(g_s_svd3.best_score)
print(g_s_svd3.best_params)

KeyboardInterrupt: 

In [None]:
svd_params4 = {'n_factors': [75, 100, 125, 150],
               'reg_all': [0.75, 1, 1.25],
               'lr_all': [0.002, 0.003, 0.004, 0.005],
               'n_epochs': [150, 175, 200, 225]
               }
g_s_svd4 = GridSearchCV(SVD, param_grid=svd_params4,n_jobs=-2)
g_s_svd4.fit(data)
print(g_s_svd4.best_score)
print(g_s_svd4.best_params)

{'rmse': 0.3536282096314266, 'mae': 0.2680649403631807}
{'rmse': {'n_factors': 100, 'reg_all': 1, 'lr_all': 0.002, 'n_epochs': 150}, 'mae': {'n_factors': 100, 'reg_all': 0.75, 'lr_all': 0.002, 'n_epochs': 150}}


In [None]:
svd_params5 = {'n_factors': [130, 140, 150, 160],
               'reg_all': [0.8, 1, 1.2, 1.5],
               'lr_all': [0.0005, 0.001, 0.0025, 0.005],
               'n_epochs': [150]  
               }
g_s_svd5 = GridSearchCV(SVD, param_grid=svd_params5,n_jobs=-1)
g_s_svd5.fit(data)
print(g_s_svd5.best_score)
print(g_s_svd5.best_params)

{'rmse': 0.3531651557037855, 'mae': 0.26774912542988527}
{'rmse': {'n_factors': 140, 'reg_all': 0.8, 'lr_all': 0.0005, 'n_epochs': 150}, 'mae': {'n_factors': 150, 'reg_all': 0.8, 'lr_all': 0.0005, 'n_epochs': 150}}


In [50]:
svdpp_params = {'n_factors': [125, 150, 175],
                'reg_all': [0.8, 1, 1.2, 1.5],
                'n_epochs': [150, 200],
                'lr_all': [0.0005, 0.001, 0.002]
}

svdpp_params = GridSearchCV(SVDpp, param_grid=svdpp_params,n_jobs=-1)
svdpp_params.fit(data)
print(svdpp_params.best_score)
print(svdpp_params.best_params)

{'rmse': 0.33691090599538503, 'mae': 0.2606994535659975}
{'rmse': {'n_factors': 150, 'reg_all': 1.2, 'n_epochs': 200, 'lr_all': 0.001}, 'mae': {'n_factors': 175, 'reg_all': 0.8, 'n_epochs': 200, 'lr_all': 0.002}}


In [51]:
svdpp_params1 = {'n_factors': [130, 140, 150],
                'reg_all': [1, 1.1, 1.2],
                'n_epochs': [200, 250, 300],
                'lr_all': [0.001]
}

svdpp_params1 = GridSearchCV(SVDpp, param_grid=svdpp_params1,n_jobs=-1)
svdpp_params1.fit(data)
print(svdpp_params1.best_score)
print(svdpp_params1.best_params)

{'rmse': 0.33667296038507094, 'mae': 0.26059682441668675}
{'rmse': {'n_factors': 150, 'reg_all': 1.2, 'n_epochs': 300, 'lr_all': 0.001}, 'mae': {'n_factors': 140, 'reg_all': 1, 'n_epochs': 300, 'lr_all': 0.001}}
