In [23]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
import os

INPUT_XLSX = r"C:\Users\brend\OneDrive - Stonehill College\All Cape League Trackman.xlsx"
SHEET_NAME = 0
OUTPUT_XLSX = "catch_probability_results.xlsx"
MIN_OPPORTUNITY = 2
MAX_ASN = 80
MIN_HIT_LAND_CONF = 0.0  
RANDOM_STATE = 42
TEST_SIZE = 0.20

print("Loading:", INPUT_XLSX)
df = pd.read_excel(INPUT_XLSX, sheet_name=SHEET_NAME, engine="openpyxl")

print("Rows loaded:", len(df))

Loading: C:\Users\brend\OneDrive - Stonehill College\All Cape League Trackman.xlsx
Rows loaded: 55824


In [24]:
def col_exists(c):
    return c in df.columns

if not (col_exists("Distance") and col_exists("Bearing")):
    raise ValueError("Your file must contain 'Distance' and 'Bearing' columns")

pos_list = []
for pos in ("LF", "CF", "RF"):
    xcol = f"{pos}_PositionAtReleaseX"
    zcol = f"{pos}_PositionAtReleaseZ"
    namecol = f"{pos}_Name"
    if col_exists(xcol) and col_exists(zcol):
        pos_list.append({
            "pos": pos,
            "xcol": xcol,
            "zcol": zcol,
            "namecol": namecol if col_exists(namecol) else None
        })

if len(pos_list) == 0:
    raise ValueError(
        "No LF/CF/RF position columns found. Expected columns like "
        "'LF_PositionAtReleaseX' and 'LF_PositionAtReleaseZ'."
    )

df["Distance"] = pd.to_numeric(df["Distance"], errors="coerce")
df["Bearing"]  = pd.to_numeric(df["Bearing"],  errors="coerce")

df["BearingRadians"] = np.radians(df["Bearing"])
df["Ball_X"] = df["Distance"] * np.cos(df["BearingRadians"])
df["Ball_Z"] = df["Distance"] * np.sin(df["BearingRadians"])

In [25]:
fielder_x_cols = [p["xcol"] for p in pos_list]
fielder_z_cols = [p["zcol"] for p in pos_list]

for c in fielder_x_cols + fielder_z_cols:
    df[c] = pd.to_numeric(df[c], errors="coerce")

fielder_x = df[fielder_x_cols].to_numpy(dtype=float)
fielder_z = df[fielder_z_cols].to_numpy(dtype=float)

ball_x = df["Ball_X"].to_numpy(dtype=float)[:, None]
ball_z = df["Ball_Z"].to_numpy(dtype=float)[:, None]

distances = np.sqrt((ball_x - fielder_x)**2 + (ball_z - fielder_z)**2)

valid_mask = ~np.isnan(distances).all(axis=1)

n = len(df)
nearest_idx = np.full(n, np.nan)
fielder_X_vals = np.full(n, np.nan)
fielder_Z_vals = np.full(n, np.nan)
fielder_pos_vals = np.full(n, None, dtype=object)
fielder_name_vals = np.full(n, None, dtype=object)

nearest_idx[valid_mask] = np.nanargmin(distances[valid_mask], axis=1)

for i, p in enumerate(pos_list):
    mask = (nearest_idx == i)
    fielder_X_vals[mask] = df[p["xcol"]].to_numpy(dtype=float)[mask]
    fielder_Z_vals[mask] = df[p["zcol"]].to_numpy(dtype=float)[mask]
    fielder_pos_vals[mask] = p["pos"]
    if p["namecol"] is not None:
        names = df[p["namecol"]].astype(str)
        fielder_name_vals[mask] = names.to_numpy()[mask]

df["Fielder_X"] = fielder_X_vals
df["Fielder_Z"] = fielder_Z_vals
df["NearestFielderPos"] = fielder_pos_vals
df["NearestFielderName"] = fielder_name_vals
df["FielderDistance"] = np.nanmin(distances, axis=1)

  df["FielderDistance"] = np.nanmin(distances, axis=1)


In [26]:
if not (col_exists("ZoneTime") and col_exists("HangTime")):
    raise ValueError("Your file must contain 'ZoneTime' and 'HangTime' columns")

df["ZoneTime"] = pd.to_numeric(df["ZoneTime"], errors="coerce")
df["HangTime"] = pd.to_numeric(df["HangTime"], errors="coerce")
df["OpportunityTime"] = df["ZoneTime"] + df["HangTime"]

denom = df["OpportunityTime"] - 2.0
df["ASN"] = np.where(denom > 0, df["FielderDistance"] / denom, np.nan)

In [27]:
fielder_angles = np.degrees(np.arctan2(df["Fielder_Z"].to_numpy(dtype=float), df["Fielder_X"].to_numpy(dtype=float)))
ball_angles   = np.degrees(np.arctan2(df["Ball_Z"].to_numpy(dtype=float), df["Ball_X"].to_numpy(dtype=float)))

angle_diff = ball_angles - fielder_angles
angle_diff = (angle_diff + 180) % 360 - 180
angle_diff = np.abs(angle_diff)

df["IsStraightBack"] = angle_diff >= 150

df["DA_ASN"] = df["ASN"] + np.where(df["IsStraightBack"], 3.56, 0.0)

In [28]:
caught = None
if col_exists("TaggedHitType") and col_exists("PlayResult"):
    tht = df["TaggedHitType"].astype(str).str.lower()
    pr  = df["PlayResult"].astype(str).str.lower()
    caught = ((tht.str.contains("fly", na=False) | tht.str.contains("line", na=False) | tht.str.contains("pop", na=False))
              & pr.str.contains("out", na=False)
             ).astype(int)
elif col_exists("PlayResult"):
    pr = df["PlayResult"].astype(str).str.lower()
    caught = (pr.str.contains("fly", na=False) | pr.str.contains("line", na=False) | pr.str.contains("pop", na=False)).astype(int)
else:
    raise ValueError("No columns to determine 'caught' label. Expected 'TaggedHitType' and 'PlayResult' or at least 'PlayResult'.")

df["Caught"] = caught

In [29]:
if col_exists("HitLandingConfidence") and MIN_HIT_LAND_CONF > 0:
    df["HitLandingConfidence"] = pd.to_numeric(df["HitLandingConfidence"], errors="coerce")
    conf_mask = df["HitLandingConfidence"] >= MIN_HIT_LAND_CONF
else:
    conf_mask = pd.Series(True, index=df.index)

mask = (
    (df["OpportunityTime"] > MIN_OPPORTUNITY) &
    (df["DA_ASN"].notna()) &
    (df["DA_ASN"] >= 0) &
    (df["DA_ASN"] <= MAX_ASN) &
    (df["FielderDistance"].notna()) &
    (df["FielderDistance"] >= 0) &
    conf_mask
)

df_filtered = df.loc[mask].copy()
print(f"Rows before filtering: {len(df)}, after filtering: {len(df_filtered)}")

if len(df_filtered) < 50:
    print("Warning: fewer than 50 rows after basic filtering. Consider loosening filters for model training.")

Rows before filtering: 55824, after filtering: 2199


In [30]:
df_filtered["DA_ASN_sq"] = df_filtered["DA_ASN"] ** 2

df_model = df_filtered.dropna(subset=["DA_ASN", "DA_ASN_sq", "Caught"])
print("Rows used for modeling:", len(df_model))

X = df_model[["DA_ASN", "DA_ASN_sq"]].to_numpy(dtype=float)
y = df_model["Caught"].to_numpy(dtype=int)

Rows used for modeling: 2199


In [31]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=TEST_SIZE, random_state=RANDOM_STATE, stratify=y if len(np.unique(y))>1 else None)

model = LogisticRegression(solver="lbfgs", max_iter=2000)
model.fit(X_train, y_train)

intercept = float(model.intercept_[0])
coef1 = float(model.coef_[0][0])
coef2 = float(model.coef_[0][1])
print("\n✅ Trained logistic regression coefficients:")
print(f"Intercept: {intercept:.6f}")
print(f"DA_ASN:    {coef1:.6f}")
print(f"DA_ASN^2:  {coef2:.6f}")
print(f"\nModel equation (probability):")
print(f"    p = 1 / (1 + exp(-1 * ({intercept:.4f} + {coef1:.4f}*x + {coef2:.4f}*x^2)))")


✅ Trained logistic regression coefficients:
Intercept: 4.752943
DA_ASN:    -0.200912
DA_ASN^2:  0.001591

Model equation (probability):
    p = 1 / (1 + exp(-1 * (4.7529 + -0.2009*x + 0.0016*x^2)))


In [32]:
df_model["Predicted_CatchProb"] = model.predict_proba(df_model[["DA_ASN", "DA_ASN_sq"]])[:, 1]

cols_to_export = [
    "Date", "Time", "PlayID", "PitchUID", "Batter", "FielderDistance", "OpportunityTime",
    "DA_ASN", "DA_ASN_sq", "IsStraightBack", "NearestFielderPos", "NearestFielderName",
    "Caught", "Predicted_CatchProb"
]
cols_to_export = [c for c in cols_to_export if c in df_model.columns]

accuracy = model.score(X_test, y_test)
print(f"\nHoldout accuracy: {accuracy:.3f}")


Holdout accuracy: 0.816




In [33]:
out_df = df_model[cols_to_export].copy()
out_path = os.path.join(os.path.dirname(INPUT_XLSX), OUTPUT_XLSX)
out_df.to_excel(out_path, index=False)
print(f"\n📁 Results saved to: {out_path}")
print("Saved columns:", out_df.columns.tolist())


📁 Results saved to: C:\Users\brend\OneDrive - Stonehill College\catch_probability_results.xlsx
Saved columns: ['Date', 'Time', 'PlayID', 'PitchUID', 'Batter', 'FielderDistance', 'OpportunityTime', 'DA_ASN', 'DA_ASN_sq', 'IsStraightBack', 'NearestFielderPos', 'NearestFielderName', 'Caught', 'Predicted_CatchProb']
