In [32]:
import polars as pl
import pandas as pd
import numpy as np
import pybaseball as pb


In [None]:
# MLBAM player ids of the two pitchers being compared.
p1 = 605400  # Aaron Nola
p2 = 668873  # Caleb Kilian

# Fetch each pitcher's Statcast rows for the single date 2024-09-23
# (start and end of the query window are the same day).
p1_data = pb.statcast_pitcher("2024-09-23", "2024-09-23", player_id=p1)
p2_data = pb.statcast_pitcher("2024-09-23", "2024-09-23", player_id=p2)

# Stack both pandas frames, convert to polars, and add a "pitch_id" column built
# by joining game, home team, pitcher, at-bat number and pitch number with "-".
pitch_data = pl.from_pandas(pd.concat([p1_data, p2_data])).with_columns(
    pl.concat_str(
        ["game_pk", "home_team", "pitcher", "at_bat_number", "pitch_number"],
        separator="-",
    ).alias("pitch_id")
)


In [37]:
# Column-name groups used to slice `pitch_data` into thematic sub-frames.

# Columns identifying the game and the pitcher's name.
info_features = ["game_date", "game_pk", "home_team", "away_team", "player_name"]

# Columns describing the outcome and game situation of each pitch.
event_features = [
    "events", "description", "type", "bb_type", "balls", "strikes",
    "inning", "outs_when_up", "des", "batter", "pitcher",
]

# Release-point columns.
release_pitcher_features = ["release_pos_x", "release_pos_y", "release_pos_z", "release_extension", "arm_angle"]

# Ball columns at release. Statcast names the spin column "release_spin_rate";
# the previous "release_spin" entry does not exist in the data and would make a
# `select` on this list fail.
release_ball_features = ["release_speed", "release_spin_rate", "spin_axis"]

# Trajectory columns: movement (pfx_*), initial velocity (v*0) and acceleration (a*).
traj_ball_features = ["pfx_x", "pfx_z", "vx0", "vy0", "vz0", "ax", "ay", "az"]

# Ball location columns at the plate.
plate_ball_features = ["plate_x", "plate_z"]

# Name of the composite key column added to `pitch_data`.
pitch_id = ["pitch_id"]

In [38]:
# Keep only the release-point columns plus the keys needed to group and identify pitches.
release_frame = pitch_data.select(
    [*release_pitcher_features, "pitcher", "pitch_type", "pitch_id"]
)

In [42]:
# For every (pitcher, pitch_type) pair: mean and standard deviation of each
# release metric, followed by the number of rows in the group ("count").
release_frame_agg = release_frame.group_by(["pitcher", "pitch_type"]).agg(
    [
        summary
        for metric in (
            "release_pos_x",
            "release_pos_y",
            "release_pos_z",
            "release_extension",
            "arm_angle",
        )
        # Emit <metric>_mean then <metric>_std so the column order stays paired per metric.
        for summary in (
            pl.col(metric).mean().alias(f"{metric}_mean"),
            pl.col(metric).std().alias(f"{metric}_std"),
        )
    ]
    + [pl.len().alias("count")]
)


In [None]:
# Display the per-(pitcher, pitch_type) release summary table.
release_frame_agg

In [None]:
# release_pos_z values of the "KC" pitches thrown by pitcher 605400
# (the id assigned to `p1` in the data-loading cell).
vals = release_frame.filter(
    (pl.col("pitch_type") == "KC") & (pl.col("pitcher") == 605400)
).select("release_pos_z")

# NOTE(review): `plot_histogram` is not defined or imported in the visible
# cells — confirm it is available before running this cell.
plot_histogram(vals, bins=10)

In [None]:
# Same "KC" / pitcher 605400 subset as a pair of single-column frames:
# vals1 holds release_pos_y, vals2 holds release_pos_x.
vals1, vals2 = (
    release_frame.filter(
        (pl.col("pitch_type") == "KC") & (pl.col("pitcher") == 605400)
    ).select(axis)
    for axis in ("release_pos_y", "release_pos_x")
)

# NOTE(review): `plot_scatter` is not defined or imported in the visible
# cells — confirm it is available before running this cell.
plot_scatter(vals1, vals2)


In [71]:
import polars as pl

def distance_stats_from_centroid(df, x_col, y_col, z_col, group_cols):
    """Summarise how far each group's 3-D points lie from the group centroid.

    For every group defined by ``group_cols`` the centroid is the per-column
    mean of ``x_col``, ``y_col`` and ``z_col``. Each row's Euclidean distance
    to its own group's centroid is computed, and the distances are reduced to
    their mean and standard deviation.

    Parameters
    ----------
    df : polars frame
        Input data containing the coordinate and grouping columns.
    x_col, y_col, z_col : str
        Names of the three coordinate columns.
    group_cols : str or list of str
        Column name(s) to group by.

    Returns
    -------
    polars frame
        One row per group with the grouping column(s), ``mean_distance`` and
        ``std_dev_distance``. Row order is whatever ``group_by`` produces.

    Notes
    -----
    Everything is computed inside a single ``group_by().agg()``. The earlier
    implementation aggregated centroids and joined them back onto ``df``;
    because join keys do not match nulls by default, rows with a null group
    key were silently dropped, and the temporary ``centroid_*`` / ``distance``
    columns could collide with columns already present in ``df``. Groups with
    a null key are now reported like any other group.
    """
    # Inside an aggregation context `pl.col(c).mean()` is the mean of the
    # current group, so each term is the squared offset from the centroid.
    squared_offsets = [
        (pl.col(axis) - pl.col(axis).mean()) ** 2
        for axis in (x_col, y_col, z_col)
    ]
    distance = (squared_offsets[0] + squared_offsets[1] + squared_offsets[2]).sqrt()

    return df.group_by(group_cols).agg([
        distance.mean().alias("mean_distance"),
        distance.std().alias("std_dev_distance"),
    ])

# Example usage:
# data = pl.DataFrame({
#     'x': [1, 2, 3, 4, 5, 6],
#     'y': [7, 8, 9, 10, 11, 12],
#     'z': [13, 14, 15, 16, 17, 18],
#     'group1': ['A', 'A', 'A', 'B', 'B', 'B'],
#     'group2': [1, 1, 2, 2, 3, 3]
# })
# result = distance_stats_from_centroid(data, 'x', 'y', 'z', ['group1', 'group2'])

In [72]:
# Spread of each pitcher's release point around its centroid, per pitch type.
res = distance_stats_from_centroid(
    pitch_data,
    x_col="release_pos_x",
    y_col="release_pos_y",
    z_col="release_pos_z",
    group_cols=["pitcher", "pitch_type"],
)

In [73]:
# Display mean / std of release-point distance from centroid per (pitcher, pitch_type).
res

pitcher,pitch_type,mean_distance,std_dev_distance
i64,str,f64,f64
605400,"""FF""",0.107918,0.052181
668873,"""ST""",0.10518,0.043128
668873,"""FC""",0.175183,0.07962
605400,"""CH""",0.113059,0.047687
668873,"""SI""",0.069749,0.02074
…,…,…,…
668873,"""CH""",0.151658,4.0222e-16
668873,"""FF""",0.137219,0.053715
605400,"""FC""",0.098391,0.045379
668873,"""KC""",0.076831,0.045291
