In [1]:
import pybaseball as pyb
import pandas as pd

In [2]:
data = pyb.statcast(start_dt="2024-03-20", end_dt="2024-11-02")

This is a large query, it may take a moment to complete


100%|██████████| 228/228 [00:44<00:00,  5.14it/s]
  final_data = pd.concat(dataframe_list, axis=0).convert_dtypes(convert_string=False)


In [4]:
data.to_csv("data_2024.csv")

In [48]:
pd.set_option('display.max_columns', None)

# Filter the DataFrame for times 1, 2, and 3 through the order
data = data[data['n_thruorder_pitcher'].isin([1, 2, 3])]

# Ensure the pitch_type and n_thruorder_pitcher columns are correctly typed
data['pitch_type'] = data['pitch_type'].astype(str)
data['n_thruorder_pitcher'] = pd.to_numeric(data['n_thruorder_pitcher'], errors='coerce')

In [49]:
# Calculate pitch usage per pitcher, time through the order, and pitch type 
pitch_usage_raw = (
    data
    .groupby(['player_name', 'n_thruorder_pitcher', 'pitch_type'])
    .size()
    .reset_index(name='raw_pitch_count')
)

# Calculate total number of pitches per pitcher and time through the order
total_pitches_per_situation = (
    data
    .groupby(['player_name', 'n_thruorder_pitcher'])
    .size()
    .reset_index(name='total_pitches')
)

# Merge to get the total pitch count for each situation
pitch_usage_raw = pitch_usage_raw.merge(total_pitches_per_situation, 
                                        on=['player_name', 'n_thruorder_pitcher'], 
                                        how='left')

# Calculate the pitch usage percentage
pitch_usage_raw['pitch_usage'] = (pitch_usage_raw['raw_pitch_count'] / pitch_usage_raw['total_pitches']) * 100

# Calculate averages for each pitcher and pitch type
pitcher_avg_by_type = (
    data
    .groupby(['player_name', 'pitch_type'])
    .agg({
        'release_speed': 'mean',
        'pfx_x': 'mean',  
        'pfx_z': 'mean'  
    })
    .rename(columns={
        'release_speed': 'avg_release_speed',
        'pfx_x': 'avg_pfx_x',
        'pfx_z': 'avg_pfx_z'
    })
    .reset_index()
)

# Group by pitcher, time through the order, and pitch type to calculate actual averages
pitch_characteristics = (
    data
    .groupby(['player_name', 'n_thruorder_pitcher', 'pitch_type'])
    .agg({
        'release_speed': 'mean',
        'pfx_x': 'mean',
        'pfx_z': 'mean'
    })
    .reset_index()
)

# Merge the pitch averages by pitch type with the grouped data
result = pitch_characteristics.merge(pitcher_avg_by_type, on=['player_name', 'pitch_type'])

# Add deviation columns
result['velocity_diff'] = result['release_speed'] - result['avg_release_speed']
result['horizontal_break_diff'] = result['pfx_x'] - result['avg_pfx_x']
result['vertical_break_diff'] = result['pfx_z'] - result['avg_pfx_z']

# Merge the pitch usage percentages and raw counts into the final result
final_result = result.merge(pitch_usage_raw[['player_name', 'n_thruorder_pitcher', 'pitch_type', 'pitch_usage', 'raw_pitch_count']], 
                            on=['player_name', 'n_thruorder_pitcher', 'pitch_type'], 
                            how='left')

# Sort values by player, time through the order, and pitch type
final_result = final_result.sort_values(by=['player_name', 'n_thruorder_pitcher', 'pitch_type'])

# Convert pitch movement from feet to inches
final_result['pfx_x'] = final_result['pfx_x'] * 12
final_result['pfx_z'] = final_result['pfx_z'] * 12

final_result['avg_pfx_x'] = final_result['avg_pfx_x'] * 12
final_result['avg_pfx_z'] = final_result['avg_pfx_z'] * 12

# Recalculate differences in inches
final_result['horizontal_break_diff'] = final_result['pfx_x'] - final_result['avg_pfx_x']
final_result['vertical_break_diff'] = final_result['pfx_z'] - final_result['avg_pfx_z']

final_result


Unnamed: 0,player_name,n_thruorder_pitcher,pitch_type,release_speed,pfx_x,pfx_z,avg_release_speed,avg_pfx_x,avg_pfx_z,velocity_diff,horizontal_break_diff,vertical_break_diff,pitch_usage,raw_pitch_count
0,"Abbott, Andrew",1,CH,84.780508,14.482373,12.712881,84.752895,14.774842,12.494842,0.027614,-0.292469,0.218039,13.169643,118
1,"Abbott, Andrew",1,CU,80.763725,-9.04,-4.154118,80.811567,-8.781045,-4.062985,-0.047842,-0.258955,-0.091133,11.383929,102
2,"Abbott, Andrew",1,FF,93.154475,9.065837,16.175253,92.786367,8.918685,16.262261,0.368107,0.147152,-0.087009,57.366071,514
3,"Abbott, Andrew",1,SL,85.833333,-4.08,11.4,85.1,-4.512,10.992,0.733333,0.432,0.408,0.334821,3
4,"Abbott, Andrew",1,ST,83.222642,-12.508679,4.289811,82.910773,-12.667447,4.423981,0.311869,0.158768,-0.13417,17.745536,159
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8453,"deGrom, Jacob",1,SL,90.211765,4.867059,3.338824,90.007692,5.018462,3.3,0.204072,-0.151403,0.038824,35.051546,34
8454,"deGrom, Jacob",2,CH,90.68,-14.688,4.992,90.51,-14.376,6.384,0.17,-0.312,-1.392,6.666667,5
8455,"deGrom, Jacob",2,CU,79.3,14.04,-8.4,81.08,8.856,-4.392,-1.78,5.184,-4.008,1.333333,1
8456,"deGrom, Jacob",2,FF,96.944,-7.9056,17.6064,97.322785,-8.703797,17.368101,-0.378785,0.798197,0.238299,33.333333,25


In [50]:
final_result[final_result['raw_pitch_count'] > 100].sort_values(by = ['velocity_diff'])

Unnamed: 0,player_name,n_thruorder_pitcher,pitch_type,release_speed,pfx_x,pfx_z,avg_release_speed,avg_pfx_x,avg_pfx_z,velocity_diff,horizontal_break_diff,vertical_break_diff,pitch_usage,raw_pitch_count
3684,"Jones, Jared",3,FF,96.286047,-8.010698,16.689767,97.309816,-8.245399,17.126871,-1.023769,0.234701,-0.437104,42.679901,172
2409,"Francis, Bowden",3,FF,91.876471,-5.652941,17.389412,92.886289,-6.020226,17.871849,-1.009819,0.367285,-0.482437,46.575342,102
8136,"Williams, Gavin",3,FF,95.649074,-10.874444,15.608889,96.65282,-11.257717,15.344154,-1.003746,0.383272,0.264735,45.188285,108
2700,"Gilbert, Logan",3,CU,82.361765,13.551765,-3.996471,83.212813,13.495153,-3.915877,-0.851049,0.056612,-0.080593,12.439024,102
2706,"Gilbert, Logan",3,SL,87.593031,0.880557,-0.651429,88.441882,0.981965,-0.047032,-0.848851,-0.101407,-0.604397,35.000000,287
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3676,"Jones, Jared",1,FF,98.102673,-8.653096,17.220401,97.309816,-8.245399,17.126871,0.792857,-0.407697,0.09353,55.295567,449
2689,"Gilbert, Logan",1,FS,85.291262,-6.24699,0.068738,84.495802,-5.89363,0.475259,0.79546,-0.353361,-0.406521,9.338169,103
8328,"Woods Richardson, Simeon",3,FF,93.890977,-5.197895,17.81594,93.072403,-5.808961,17.46026,0.818575,0.611066,0.35568,42.088608,133
4632,"Manaea, Sean",1,FF,93.204138,12.38731,12.934345,92.377401,12.176271,13.077288,0.826737,0.211039,-0.142943,10.154062,145


In [7]:
import pandas as pd

data = pd.read_csv("/Users/ajaypatel/Downloads/PythonFiles/baseball/PitchUsageApp/data_2024.csv")

In [None]:
data['delta_pitcher_run_exp']

In [21]:
data[(data['pitcher'] == 676664) & (data['pitch_type'] == 'FF')].groupby(['n_thruorder_pitcher']).agg({'delta_pitcher_run_exp':['mean', 'count']}).reset_index()

Unnamed: 0_level_0,n_thruorder_pitcher,delta_pitcher_run_exp,delta_pitcher_run_exp
Unnamed: 0_level_1,Unnamed: 1_level_1,mean,count
0,1,-0.022455,506
1,2,0.000911,327
2,3,0.010802,197
3,4,-0.010286,7


In [23]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns


# Fetch the data from PyBaseball
data = pd.read_csv('data_2024.csv')
# Filter the DataFrame for times 1, 2, and 3 through the order
data = data[data['n_thruorder_pitcher'].isin([1, 2, 3])]

# Remove rows where pitch_type is 'None'
data = data[data['pitch_type'] != 'None']

# Remove rows with NaN values in the 'pitch_type' column
data = data.dropna(subset=['pitch_type'])

# Remove pitchers who only have n_thruorder_pitcher == 1
# First, group by player_name and check if the pitcher has more than just n_thruorder_pitcher == 1
pitchers_with_multiple_times = data.groupby('player_name')['n_thruorder_pitcher'].nunique()
valid_pitchers = pitchers_with_multiple_times[pitchers_with_multiple_times > 1].index

# Filter the data to only include these valid pitchers
data = data[data['player_name'].isin(valid_pitchers)]

# Ensure the pitch_type and n_thruorder_pitcher columns are correctly typed
data['pitch_type'] = data['pitch_type'].astype(str)
data['n_thruorder_pitcher'] = pd.to_numeric(data['n_thruorder_pitcher'], errors='coerce')

# Calculate pitch usage per pitcher, time through the order, and pitch type (raw count)
pitch_usage_raw = (
    data
    .groupby(['player_name', 'n_thruorder_pitcher', 'pitch_type'])
    .size()
    .reset_index(name='raw_pitch_count')
)

# Calculate total number of pitches per pitcher and time through the order
total_pitches_per_situation = (
    data
    .groupby(['player_name', 'n_thruorder_pitcher'])
    .size()
    .reset_index(name='total_pitches')
)

# Merge to get the total pitch count for each situation
pitch_usage_raw = pitch_usage_raw.merge(total_pitches_per_situation, 
                                        on=['player_name', 'n_thruorder_pitcher'], 
                                        how='left')

# Calculate the pitch usage percentage
pitch_usage_raw['pitch_usage'] = (pitch_usage_raw['raw_pitch_count'] / pitch_usage_raw['total_pitches']) * 100

# Calculate averages for each pitcher *and* pitch type
pitcher_avg_by_type = (
    data
    .groupby(['player_name', 'pitch_type'])
    .agg({
        'release_speed': 'mean',
        'pfx_x': 'mean',  # Horizontal break
        'pfx_z': 'mean'   # Vertical break
    })
    .rename(columns={
        'release_speed': 'avg_release_speed',
        'pfx_x': 'avg_pfx_x',
        'pfx_z': 'avg_pfx_z'
    })
    .reset_index()
)

# Group by pitcher, time through the order, and pitch type to calculate actual averages
pitch_characteristics = (
    data
    .groupby(['player_name', 'n_thruorder_pitcher', 'pitch_type'])
    .agg({
        'delta_pitcher_run_exp': 'mean',
        'release_speed': 'mean',
        'pfx_x': 'mean',
        'pfx_z': 'mean'
    })
    .reset_index()
)

print(pitch_characteristics[pitch_characteristics['player_name'] == 'Sears, JP'])

     player_name  n_thruorder_pitcher pitch_type  delta_pitcher_run_exp  \
5105   Sears, JP                    1         CH               0.018104   
5106   Sears, JP                    1         FF              -0.022455   
5107   Sears, JP                    1         SI               0.010055   
5108   Sears, JP                    1         SL               0.005405   
5109   Sears, JP                    1         ST               0.008474   
5110   Sears, JP                    2         CH              -0.018244   
5111   Sears, JP                    2         FF               0.000911   
5112   Sears, JP                    2         SI               0.029200   
5113   Sears, JP                    2         SL               0.009687   
5114   Sears, JP                    2         ST               0.008681   
5115   Sears, JP                    3         CH              -0.000563   
5116   Sears, JP                    3         FF               0.010802   
5117   Sears, JP         