In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold, cross_val_predict
from sklearn.linear_model import LinearRegression

file_path = r"C:\Users\brend\OneDrive - Stonehill College\All Cape League Trackman.xlsx"
data = pd.read_excel(file_path)

In [2]:
pitch_type_map = {
    'Fastball': 'Four-Seam',
    'FourSeamFastBall': 'Four-Seam',
    'FourSeamFour-Seam': 'Four-Seam',

    'Changeup': 'ChangeUp',
    
    'TwoSeamFastBall': 'Two-Seam',
    'TwoSeamFour-Seam': 'Two-Seam',

    'ChangeUp': 'ChangeUp',
    'Curveball': 'Curveball',
    'Slider': 'Slider',
    'Sweeper': 'Slider',
    'Cutter': 'Cutter',
    'Sinker': 'Sinker',
    'Splitter': 'Splitter',
    'Knuckleball': 'Knuckleball',

    'Other': 'Other',
    'Undefined': 'Undefined'
}

data['TaggedPitchType'] = data['TaggedPitchType'].replace(pitch_type_map)

print(data['TaggedPitchType'].unique())

['ChangeUp' 'Curveball' 'Four-Seam' 'Slider' 'Cutter' 'Sinker' 'Other'
 'Undefined' 'Two-Seam' 'Knuckleball' 'Splitter']


In [8]:
def called_strike_percentage_vec(df):
    total = df.shape[0]
    return df['PitchCall'].eq('StrikeCalled').sum() / total if total > 0 else np.nan

def first_pitch_strike_vec(df):
    if 'PlayID' not in df.columns:
        pa_groups = df.groupby(['Pitcher','Batter','PAofInning'])
    else:
        pa_groups = df.groupby('PlayID')

    first_pitches = pa_groups.first()
    strike_calls = ['StrikeCalled', 'StrikeSwinging', 'FoulBallNotFieldable', 'FoulBallFieldable']
    strikes = first_pitches['PitchCall'].isin(strike_calls).sum()
    return strikes / first_pitches.shape[0] if first_pitches.shape[0] > 0 else np.nan

def get_hvz_zones(pitch_type, handedness):
    zones = []
    if pitch_type in ['FourSeam']:
        zones = [(-0.85, 0.69, 2.9, 3.9)]
    elif pitch_type in ['Slider', 'Curveball']:
        zones = [(-0.91, -0.15, .85, 1.8)] if handedness == 'Right' else [(0.15, 0.75, .85, 1.8)]
    elif pitch_type in ['ChangeUp', 'Sinker', 'TwoSeam', 'Splitter']:
        zones = [(0.15, 0.75, .85, 1.8)] if handedness == 'Right' else [(-0.91, -0.15, .85, 1.8)]
    elif pitch_type in ['Cutter']:
        zones = [(-0.89, -0.55, 2.75, 3.7)] if handedness == 'Right' else [(0.29, 0.73, 2.75, 3.7)]
    return zones

def high_value_zone_percentage_vec(df):
    if df.shape[0] == 0:
        return np.nan
    count_in_zone = []
    for _, row in df.iterrows():
        zones = get_hvz_zones(row['TaggedPitchType'], row['PitcherThrows'])
        in_zone = any(
            xmin <= row['PlateLocSide'] <= xmax and zmin <= row['PlateLocHeight'] <= zmax
            for xmin, xmax, zmin, zmax in zones
        )
        count_in_zone.append(in_zone)
    return np.mean(count_in_zone)

def hard_contact_percentage_vec(df, hard_ev_threshold=93.5):
    in_play = df['PitchCall'] == 'InPlay'
    if in_play.sum() == 0:
        return np.nan
    return (df.loc[in_play, 'ExitSpeed'] >= hard_ev_threshold).mean()

def controlled_count_percentage_vec(df):
    df = df.copy()
    if 'PlayID' in df.columns:
        pa_groups = df.groupby('PlayID')
    else:
        pa_groups = df.groupby(['GameUID','Inning','Top/Bottom','PAofInning'])
    def pa_controlled(pa):
        counts = pa['Balls'].astype(str) + '-' + pa['Strikes'].astype(str)
        return int(not any(counts.isin(['2-0','3-0','3-1'])))
    controlled = pa_groups.apply(pa_controlled, include_groups=False)
    return controlled.mean()

In [9]:
pitchers = data['Pitcher'].unique()
fip_list = []
for p in pitchers:
    df_p = data[data['Pitcher'] == p]
    HR = df_p[df_p['PlayResult'] == 'HomeRun'].shape[0]
    BB = df_p[df_p['KorBB'] == 'Walk'].shape[0]
    K  = df_p[df_p['KorBB'] == 'Strikeout'].shape[0]
    HBP = df_p[df_p['PitchCall'] == 'HitByPitch'].shape[0]
    total_outs = df_p['OutsOnPlay'].sum() + K
    IP = max(total_outs / 3, 1)
    FIP = ((6.3*HR) + (((4.47+0.983)/2)*(BB+HBP)) + (-2.1*K)) / IP + 5.7
    fip_list.append({'Pitcher': p, 'FIP': FIP})
fip_df = pd.DataFrame(fip_list)

In [14]:
feature_list = []
for p, df_p in data.groupby('Pitcher'):
    feature_list.append({
        'Pitcher': p,
        'CS': called_strike_percentage_vec(df_p),
        'CC': controlled_count_percentage_vec(df_p),
        'F_Strike': first_pitch_strike_vec(df_p),
        'HVZ': high_value_zone_percentage_vec(df_p),
        'Hard_Contact': hard_contact_percentage_vec(df_p)
    })
features_df = pd.DataFrame(feature_list).merge(fip_df, on='Pitcher').dropna()

In [15]:
features = ["CS", "CC", "F_Strike", "HVZ", "Hard_Contact"]
X = features_df[features].copy()
y = features_df["FIP"].copy()
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
xgb_model = XGBRegressor(n_estimators=200, max_depth=3, learning_rate=0.05, random_state=42)
xgb_model.fit(X_scaled, y)

importances = xgb_model.feature_importances_
weights = importances / importances.sum()
print("Feature Weights:", dict(zip(features, weights)))

Feature Weights: {'CS': np.float32(0.08521169), 'CC': np.float32(0.09395039), 'F_Strike': np.float32(0.4756854), 'HVZ': np.float32(0.17919612), 'Hard_Contact': np.float32(0.1659564)}


In [16]:
features_df["LocationRaw"] = np.dot(X_scaled, weights)
league_mean = features_df["LocationRaw"].mean()
league_std = features_df["LocationRaw"].std()
features_df["Location+"] = 100 + (features_df["LocationRaw"] - league_mean) / league_std * 15

pitcher_teams = data[['Pitcher', 'PitcherTeam']].drop_duplicates(subset='Pitcher')
features_df = features_df.merge(pitcher_teams, on='Pitcher', how='left')

export_cols = ["Pitcher", "PitcherTeam", "Location+", "LocationRaw", "CS", "CC", "F_Strike", "HVZ", "Hard_Contact", "FIP"]
features_df[export_cols].to_excel("LocationPlus_All.xlsx", index=False)

pitchtype_list = []
for (p, pitch_type), df_pt in data.groupby(["Pitcher", "TaggedPitchType"]):
    if df_pt.empty:
        continue
    row = {
        "Pitcher": p,
        "PitchType": pitch_type,
        "CS": called_strike_percentage_vec(df_pt),
        "CC": controlled_count_percentage_vec(df_pt),
        "F_Strike": first_pitch_strike_vec(df_pt),
        "HVZ": high_value_zone_percentage_vec(df_pt),
        "Hard_Contact": hard_contact_percentage_vec(df_pt)
    }
    pitchtype_list.append(row)

pitchtype_df = pd.DataFrame(pitchtype_list).dropna()
X_pitchtype = pitchtype_df[features].copy()
X_pitchtype_scaled = scaler.transform(X_pitchtype)
pitchtype_df["LocationRaw"] = np.dot(X_pitchtype_scaled, weights)

league_mean_pt = pitchtype_df["LocationRaw"].mean()
league_std_pt = pitchtype_df["LocationRaw"].std()
pitchtype_df["Location+"] = 100 + (pitchtype_df["LocationRaw"] - league_mean_pt) / league_std_pt * 15

pitchtype_df = pitchtype_df.merge(pitcher_teams, on="Pitcher", how="left")

In [17]:
export_cols_pt = ["Pitcher", "PitcherTeam", "PitchType", "Location+", "LocationRaw", "CS", "CC", "F_Strike", "HVZ", "Hard_Contact"]
pitchtype_df[pitchtype_df["PitcherTeam"] == "FAL_COM"][export_cols_pt].to_excel("LocationPlus_FALCOM_PitchType.xlsx", index=False)