In [15]:
import polars as pl
import pandas as pd
import numpy as np
import pybaseball as pb


In [16]:
p1 = 605400 #Aaron Nola
p2 = 668873 #Caleb Killian

p1_data = pb.statcast_pitcher("2024-09-23", "2024-09-23", player_id=p1)
p2_data = pb.statcast_pitcher("2024-09-23", "2024-09-23", player_id=p2)
pitch_data = (pl.from_pandas(pd.concat([p1_data, p2_data]))
                .with_columns(pl.concat_str([pl.col("game_pk"),
                                            pl.col("home_team"),
                                            pl.col("pitcher"),
                                            pl.col("at_bat_number"),
                                            pl.col("pitch_number")],
                              separator="-").alias("pitch_id")))


Gathering Player Data
Gathering Player Data


In [17]:
info_features = ["game_date", "game_pk", "home_team", "away_team", "player_name"]
event_features = ["events", "description", "type", "bb_type", "balls", "strikes", "inning", "outs_when_up", "des", "batter", "pitcher"]
release_pitcher_features = ["release_pos_x", "release_pos_y", "release_pos_z", "release_extension", "arm_angle"]
release_ball_features = ["release_speed", "release_spin", "spin_axis"]
traj_ball_features = ["pfx_x", "pfx_z", "vx0", "vy0", "vz0", "ax", "ay", "az"]
plate_ball_features = ["plate_x", "plate_z"]
pitch_id = ["pitch_id"]

In [18]:
release_frame = pitch_data.select(release_pitcher_features + ["pitcher", "pitch_type", "pitch_id"])

In [19]:
release_frame_agg

NameError: name 'release_frame_agg' is not defined

In [None]:
vals = release_frame.filter(pl.col("pitch_type") == "KC").filter(pl.col("pitcher") == 605400).select("release_pos_z")


plot_histogram(vals, bins=10)

In [None]:
vals1 = release_frame.filter(pl.col("pitch_type") == "KC").filter(pl.col("pitcher") == 605400).select("release_pos_y")
vals2 = release_frame.filter(pl.col("pitch_type") == "KC").filter(pl.col("pitcher") == 605400).select("release_pos_x")

plot_scatter(vals1, vals2)


In [71]:
import polars as pl

def distance_stats_from_centroid(df, x_col, y_col, z_col, group_cols):
    # Create expressions for the centroid calculations
    centroid_x = pl.col(x_col).mean().alias("centroid_x")
    centroid_y = pl.col(y_col).mean().alias("centroid_y")
    centroid_z = pl.col(z_col).mean().alias("centroid_z")
    
    # Expression for calculating Euclidean distance
    distance_expr = (
        ((pl.col(x_col) - pl.col("centroid_x")) ** 2 +
         (pl.col(y_col) - pl.col("centroid_y")) ** 2 +
         (pl.col(z_col) - pl.col("centroid_z")) ** 2)
        .sqrt()
        .alias("distance")
    )
    
    # Group, calculate centroids, distances, mean and standard deviation
    result = (
        df
        .group_by(group_cols)
        .agg(
            centroid_x,
            centroid_y,
            centroid_z
        )
        .join(df, on=group_cols)
        .with_columns([
            distance_expr
        ])
        .group_by(group_cols)
        .agg([
            pl.col("distance").mean().alias("mean_distance"),
            pl.col("distance").std().alias("std_dev_distance")
        ])
    )
    
    return result

# Example usage:
# data = pl.DataFrame({
#     'x': [1, 2, 3, 4, 5, 6],
#     'y': [7, 8, 9, 10, 11, 12],
#     'z': [13, 14, 15, 16, 17, 18],
#     'group1': ['A', 'A', 'A', 'B', 'B', 'B'],
#     'group2': [1, 1, 2, 2, 3, 3]
# })
# result = distance_stats_from_centroid(data, 'x', 'y', 'z', ['group1', 'group2'])

In [72]:
res = distance_stats_from_centroid(pitch_data, "release_pos_x", "release_pos_y", "release_pos_z", ["pitcher", "pitch_type"])

In [None]:
p1_data = pb.statcast_pitcher("2024-09-23", "2024-09-23", player_id=p1)


In [1]:
import pybaseball as pb
import polars as pl


In [None]:
seasons = [2021, 2022, 2023, 2024]
season_list = []

for season in seasons:
    print(season)
    season_frame = pl.from_pandas(pb.pitching_stats_bref(season))
    season_frame = (season_frame
                        .filter((pl.col("Lev") == "Maj-NL") | (pl.col("Lev") == "Maj-AL"))
                        .select(["mlbID", "G", "IP", "ERA", "WHIP", "SO9"])
                        .with_columns(pl.lit(season).alias("season"))
                        .with_columns(pl.col("mlbID").cast(pl.Int64)))
    season_list.append(season_frame)

frame = pl.concat(season_list)
frame = frame.rename({col: col.lower() for col in frame.columns})
players = frame["mlbid"].unique().to_list()
player_info = pb.playerid_reverse_lookup(player_ids=players)
player_info = player_info[["key_mlbam", "name_last", "name_first", "mlb_played_first", "mlb_played_last"]]
player_info = pl.from_pandas(player_info)
player_info = (player_info.with_columns((pl.col("name_first") + " " + pl.col("name_last")).alias("player_name"))
                            .with_columns(pl.col("player_name").str.to_titlecase().alias("player_name")))
player_info = player_info.rename({"key_mlbam": "mlbid"}).select(["mlbid", "player_name", "mlb_played_first", "mlb_played_last"])
frame = frame.join(player_info, on = "mlbid", how = "left")
frame = frame.select(["mlbid", "player_name", "season", "g", "ip", "era", "whip", "so9", "mlb_played_first", "mlb_played_last"])
frame.write_parquet("pitchers.parquet")

In [1]:
import polars as pl
import pybaseball as pb

def get_season_start_end(season: int = 2024) -> list:
    """Returns a list with the first and last day of the regular season. Includes Tokyo + Korea series.""" 

    if season < 2021:
        print("Season must be >= 2021")
        return
    
    if season == 2021:
        return ["2021-04-01", "2021-10-03"]

    if season == 2022:
        return ["2022-04-07", "2022-10-05"]

    if season == 2023:
            return ["2023-03-30", "2023-10-01"]

    if season == 2024:
            return ["2024-03-20", "2024-10-03"]
    
    if season == 2025:
            return ["2025-03-18", "2025-09-28"]
    

In [6]:
pitchers = pl.read_parquet("../src/ninjalytics/data/pitchers.parquet")

In [20]:
from tqdm import tqdm

seasons = [2021, 2022, 2023, 2024]
season_list = []

for season in seasons:
    print(seasons)
    dates = get_season_start_end(season = season)
    start = dates[0]
    stop = dates[1]
    season_pitchers = (pitchers
                        .filter(pl.col("season") == season)
                        .filter(pl.col("ip") >= 20)
                        .select(["mlbid"])
                        .to_series()
                        .to_list())
    season_pitcher_list = []
    
    for pitcher in tqdm(season_pitchers):
        p = pb.statcast_pitcher(start, stop, player_id=pitcher)
        p = pl.from_pandas(p)
        p = p.with_columns(pl.lit(season).alias("season"))
        season_pitcher_list.append(p)

    season_pitcher_frame = pl.concat(season_pitcher_list)
    season_pitcher_frame.write_parquet(f"../src/ninjalytics/data/pitches_{season}.parquet")


    




[2021, 2022, 2023, 2024]


  0%|          | 1/520 [00:00<01:18,  6.63it/s]

Gathering Player Data
Gathering Player Data


  1%|          | 3/520 [00:00<01:24,  6.12it/s]

Gathering Player Data
Gathering Player Data


  1%|          | 4/520 [00:06<21:17,  2.47s/it]

Gathering Player Data


  1%|          | 5/520 [00:10<25:48,  3.01s/it]

Gathering Player Data


  1%|          | 6/520 [00:18<41:01,  4.79s/it]

Gathering Player Data


  1%|▏         | 7/520 [00:24<44:22,  5.19s/it]

Gathering Player Data


  2%|▏         | 8/520 [00:28<39:47,  4.66s/it]

Gathering Player Data


  2%|▏         | 9/520 [00:34<42:43,  5.02s/it]

Gathering Player Data


  2%|▏         | 10/520 [00:38<40:22,  4.75s/it]

Gathering Player Data


  2%|▏         | 11/520 [00:41<37:33,  4.43s/it]

Gathering Player Data


  2%|▏         | 12/520 [00:45<36:09,  4.27s/it]

Gathering Player Data


  2%|▎         | 13/520 [00:49<33:46,  4.00s/it]

Gathering Player Data


  3%|▎         | 14/520 [00:55<38:50,  4.61s/it]

Gathering Player Data


  3%|▎         | 15/520 [01:00<40:57,  4.87s/it]

Gathering Player Data


  3%|▎         | 16/520 [01:04<37:14,  4.43s/it]

Gathering Player Data


  3%|▎         | 17/520 [01:06<32:26,  3.87s/it]

Gathering Player Data


  3%|▎         | 18/520 [01:12<38:07,  4.56s/it]

Gathering Player Data


  4%|▎         | 19/520 [01:17<37:43,  4.52s/it]

Gathering Player Data


  4%|▍         | 20/520 [01:21<36:48,  4.42s/it]

Gathering Player Data


  4%|▍         | 21/520 [01:25<36:32,  4.39s/it]

Gathering Player Data


  4%|▍         | 22/520 [01:29<34:05,  4.11s/it]

Gathering Player Data


  4%|▍         | 23/520 [01:39<49:36,  5.99s/it]

Gathering Player Data


  5%|▍         | 24/520 [01:43<43:22,  5.25s/it]

Gathering Player Data


  5%|▍         | 25/520 [01:46<38:15,  4.64s/it]

Gathering Player Data


  5%|▌         | 26/520 [01:49<34:16,  4.16s/it]

Gathering Player Data


  5%|▌         | 27/520 [01:53<34:42,  4.22s/it]

Gathering Player Data


  5%|▌         | 28/520 [01:57<32:55,  4.01s/it]

Gathering Player Data


  6%|▌         | 29/520 [02:01<33:37,  4.11s/it]

Gathering Player Data


  6%|▌         | 30/520 [02:04<30:28,  3.73s/it]

Gathering Player Data


  6%|▌         | 31/520 [02:07<29:18,  3.60s/it]

Gathering Player Data


  6%|▌         | 32/520 [02:11<29:27,  3.62s/it]

Gathering Player Data


  6%|▋         | 33/520 [02:15<30:20,  3.74s/it]

Gathering Player Data


  7%|▋         | 34/520 [02:21<37:13,  4.60s/it]

Gathering Player Data


  7%|▋         | 35/520 [02:27<38:50,  4.81s/it]

Gathering Player Data


  7%|▋         | 36/520 [02:30<35:44,  4.43s/it]

Gathering Player Data


  7%|▋         | 37/520 [02:35<36:07,  4.49s/it]

Gathering Player Data


  7%|▋         | 38/520 [02:38<32:32,  4.05s/it]

Gathering Player Data


  8%|▊         | 39/520 [02:45<39:10,  4.89s/it]

Gathering Player Data


  8%|▊         | 40/520 [02:48<34:17,  4.29s/it]

Gathering Player Data


  8%|▊         | 41/520 [02:52<33:39,  4.22s/it]

Gathering Player Data


  8%|▊         | 42/520 [02:54<29:56,  3.76s/it]

Gathering Player Data


  8%|▊         | 43/520 [02:58<28:20,  3.57s/it]

Gathering Player Data


  8%|▊         | 44/520 [03:00<26:22,  3.32s/it]

Gathering Player Data


  9%|▊         | 45/520 [03:03<24:56,  3.15s/it]

Gathering Player Data


  9%|▉         | 46/520 [03:06<24:14,  3.07s/it]

Gathering Player Data


  9%|▉         | 47/520 [03:11<30:01,  3.81s/it]

Gathering Player Data


  9%|▉         | 48/520 [03:15<29:48,  3.79s/it]

Gathering Player Data


  9%|▉         | 49/520 [03:19<28:38,  3.65s/it]

Gathering Player Data


 10%|▉         | 50/520 [03:22<28:28,  3.64s/it]

Gathering Player Data


 10%|▉         | 51/520 [03:26<29:49,  3.82s/it]

Gathering Player Data


 10%|█         | 52/520 [03:31<30:48,  3.95s/it]

Gathering Player Data


 10%|█         | 53/520 [03:35<30:41,  3.94s/it]

Gathering Player Data


 10%|█         | 54/520 [03:38<30:23,  3.91s/it]

Gathering Player Data


 11%|█         | 55/520 [03:44<34:24,  4.44s/it]

Gathering Player Data


 11%|█         | 56/520 [03:46<28:51,  3.73s/it]

Gathering Player Data


 11%|█         | 57/520 [03:53<35:17,  4.57s/it]

Gathering Player Data


 11%|█         | 58/520 [04:01<43:21,  5.63s/it]

Gathering Player Data


 11%|█▏        | 59/520 [04:07<43:30,  5.66s/it]

Gathering Player Data


 12%|█▏        | 60/520 [04:10<37:58,  4.95s/it]

Gathering Player Data


 12%|█▏        | 61/520 [04:15<38:09,  4.99s/it]

Gathering Player Data


 12%|█▏        | 62/520 [04:22<41:49,  5.48s/it]

Gathering Player Data


 12%|█▏        | 63/520 [04:25<36:45,  4.83s/it]

Gathering Player Data


 12%|█▏        | 64/520 [04:28<33:02,  4.35s/it]

Gathering Player Data


 12%|█▎        | 65/520 [04:32<32:41,  4.31s/it]

Gathering Player Data


 13%|█▎        | 66/520 [04:36<31:41,  4.19s/it]

Gathering Player Data


 13%|█▎        | 67/520 [04:40<31:10,  4.13s/it]

Gathering Player Data


 13%|█▎        | 68/520 [04:45<31:31,  4.19s/it]

Gathering Player Data


 13%|█▎        | 69/520 [04:49<31:47,  4.23s/it]

Gathering Player Data


 13%|█▎        | 70/520 [04:52<29:08,  3.88s/it]

Gathering Player Data


 14%|█▎        | 71/520 [04:56<30:13,  4.04s/it]

Gathering Player Data


 14%|█▍        | 72/520 [05:01<30:35,  4.10s/it]

Gathering Player Data


 14%|█▍        | 73/520 [05:07<36:35,  4.91s/it]

Gathering Player Data


 14%|█▍        | 74/520 [05:11<34:39,  4.66s/it]

Gathering Player Data


 14%|█▍        | 75/520 [05:16<34:37,  4.67s/it]

Gathering Player Data


 15%|█▍        | 76/520 [05:25<42:48,  5.78s/it]

Gathering Player Data


 15%|█▍        | 77/520 [05:29<39:33,  5.36s/it]

Gathering Player Data


 15%|█▌        | 78/520 [05:33<36:27,  4.95s/it]

Gathering Player Data


 15%|█▌        | 79/520 [05:37<34:45,  4.73s/it]

Gathering Player Data


 15%|█▌        | 80/520 [05:40<31:20,  4.27s/it]

Gathering Player Data


 16%|█▌        | 81/520 [05:45<31:56,  4.37s/it]

Gathering Player Data


 16%|█▌        | 82/520 [05:49<31:50,  4.36s/it]

Gathering Player Data


 16%|█▌        | 83/520 [05:55<35:49,  4.92s/it]

Gathering Player Data


 16%|█▌        | 84/520 [05:59<32:49,  4.52s/it]

Gathering Player Data


 16%|█▋        | 85/520 [06:03<30:40,  4.23s/it]

Gathering Player Data


 17%|█▋        | 86/520 [06:07<31:24,  4.34s/it]

Gathering Player Data


 17%|█▋        | 87/520 [06:12<31:36,  4.38s/it]

Gathering Player Data


 17%|█▋        | 88/520 [06:15<28:21,  3.94s/it]

Gathering Player Data


 17%|█▋        | 88/520 [06:18<30:58,  4.30s/it]


ParserError: Error tokenizing data. C error: Expected 1 fields in line 13, saw 2


In [9]:

season_pitchers = (pitchers
                    .filter(pl.col("season") == 2022)
                    .filter(pl.col("ip") >= 20)
                    .select(["mlbid"])
                    .to_series()
                    .to_list())

In [12]:
season_pitchers.to_series().to_list()

[676265,
 656061,
 650556,
 642758,
 592094,
 672851,
 669211,
 645261,
 669920,
 641302,
 663465,
 622075,
 621237,
 502624,
 666120,
 542881,
 644364,
 502042,
 685503,
 676879,
 668933,
 665871,
 641329,
 621383,
 453268,
 669618,
 605130,
 606930,
 598264,
 642545,
 605135,
 657508,
 642585,
 669358,
 670280,
 595881,
 656222,
 571479,
 678394,
 621244,
 641360,
 669456,
 656234,
 621389,
 621112,
 542947,
 621366,
 502202,
 680694,
 666374,
 518489,
 623211,
 605154,
 689225,
 656257,
 641401,
 686613,
 664141,
 677865,
 663460,
 621111,
 518516,
 607481,
 605164,
 656271,
 669203,
 665795,
 650893,
 471911,
 641447,
 528748,
 650895,
 666721,
 612434,
 656302,
 570666,
 468504,
 605177,
 547973,
 608638,
 643256,
 518553,
 542585,
 650644,
 664199,
 661403,
 605182,
 502171,
 543037,
 669395,
 517008,
 672710,
 571578,
 641482,
 676710,
 622503,
 640444,
 621016,
 456501,
 641501,
 506433,
 605200,
 656354,
 608328,
 660853,
 660787,
 594798,
 672282,
 446321,
 664747,
 621242,
 