In [1]:
from mlb_analytics import *
import json
import pandas as pd
import numpy as np
import plotly.express as px 
import plotly.graph_objects as go

In [2]:
# Season-level inputs, one CSV per data source.
power = pd.read_csv("power_rankings_2025.csv")
standings = pd.read_csv("standings_2025.csv")
odds = pd.read_csv("odds_2025.csv")
pitching = pd.read_csv("pitching_stats_2025.csv")
batting = pd.read_csv("batting_stats_2025.csv")
fielding = pd.read_csv("fielding_stats_2025.csv")

# Team lookup tables. Read through context managers so the file handles are
# closed deterministically, and decode explicitly as UTF-8 (the JSON default)
# rather than relying on the platform's locale encoding.
with open("teams_2025.json", encoding="utf-8") as teams_file:
    teams = json.load(teams_file)
with open("tms_2025.json", encoding="utf-8") as tms_file:
    tms = json.load(tms_file)

In [3]:
# Build the ranking table (power ranking, MLB ranking, ranking diff) for TOR and NYY
df1 = build_plot_table(
    power=power,
    standings=standings,
    team_names=teams,
    team_codes=tms,
    selected_codes=["TOR", "NYY"],
    mode="both",
)

# Draw every column after the first as its own line against the date axis
rank_columns = df1.columns[1:]
fig = px.line(
    df1,
    x="date",
    y=rank_columns,
    title="Power and MLB Rankings for TOR and NYY",
    labels={"value": "Rank", "variable": "Team"},
    color_discrete_sequence=px.colors.qualitative.Set1,
)

# Flip the y-axis so that smaller rank numbers sit at the top of the chart
fig.update_yaxes(autorange="reversed")
fig.show()

In [4]:
# Get rank change KDE and histogram data for TOR and NYY.
# The fourth return value is not needed here and is discarded.
kde_data, hist_data, peaks, _ = build_delta_kde_and_hist(
    power=power,
    standings=standings,
    team_names=teams,
    team_codes=tms,
    selected_codes=["TOR", "NYY"],
    source="power",
    grid=np.linspace(-15, 15, 300),
    bin_edges=np.linspace(-15, 15, 31)
)

# Plot the KDE curves, one line per team
fig = px.line(
    kde_data,
    x="x",
    y="density",
    color="label",
    line_group="team_code",
    title="KDE of ΔRank (Power Rankings)",
    labels={"x": "ΔRank", "density": "Density"}
)

# Overlay semi-transparent histogram bars for each team (kept out of the legend)
for team_code in hist_data["team_code"].unique():
    team_hist = hist_data[hist_data["team_code"] == team_code]
    fig.add_trace(go.Bar(
        x=team_hist["x"],
        y=team_hist["pdf"],
        name=f"{team_code} Histogram",
        opacity=0.6,
        showlegend=False
    ))

# Add one vertical marker per peak. The label carries the key of `peaks`
# (presumably the team code - confirm against build_delta_kde_and_hist) so that
# the markers can be told apart when several teams are plotted.
for team_code, peak in peaks.items():
    fig.add_vline(
        x=peak,
        line=dict(color="grey", dash="dash"),
        annotation_text=f"{team_code} peak: {peak:.2f}",
        annotation_position="top right"
    )
fig.show()

In [5]:
# Rank volatility over time for TOR and NYY, computed from the MLB rankings
volatility_data = build_rank_volatility(
    power=power,
    standings=standings,
    team_names=teams,
    team_codes=tms,
    source="mlb",
    selected_codes=["TOR", "NYY"],
)

# One line per team, sigma on the y-axis
volatility_labels = {"sigma": "Volatility", "team_code": "Team", "date": "Date"}
fig = px.line(
    volatility_data,
    x="date",
    y="sigma",
    color="team_code",
    title="Rank Volatility (MLB Rankings)",
    labels=volatility_labels,
)
fig.show()

In [6]:
# Stability and ACF time series for TOR on the MLB rankings, up to 4 lags.
# return_acf=True makes the helper hand back the ACF frame alongside the stability frame.
stab_df, acf_df = build_acf_stability_timeseries(
    power=power,
    standings=standings,
    team_names=teams,
    team_codes=tms,
    source="mlb",
    team_code="TOR",
    return_acf=True,
    max_lag=4,
)

# Stability chart: one line per lag
stability_labels = {"value": "Stability", "date": "Date", "lag": "Lookback (weeks)"}
fig = px.line(
    stab_df,
    x="date",
    y="value",
    color="lag",
    title="Stability (Δz) Time Series for TOR (MLB Rankings)",
    labels=stability_labels,
)
fig.show()

# ACF chart: same layout, different source frame and y-axis label
acf_labels = {"value": "Consistency", "date": "Date", "lag": "Lookback (weeks)"}
fig = px.line(
    acf_df,
    x="date",
    y="value",
    color="lag",
    title="ACF Time Series for TOR (MLB Rankings)",
    labels=acf_labels,
)
fig.show()


Degrees of freedom <= 0 for slice


divide by zero encountered in divide


invalid value encountered in multiply



In [26]:
# Team under test. The chart title is derived from this value so the label can
# never disagree with the team the test was actually run for.
granger_team_code = "NYY"

# Granger test of power rankings -> MLB rankings, for lags up to 6
granger_df, stats = granger_power_to_mlb_report(
    power=power,
    standings=standings,
    team_names=teams,
    team_codes=tms,
    team_code=granger_team_code,
    max_lag=6
)

# plot bar chart of p-values, one bar per lag
fig = px.bar(
    granger_df,
    x="lag",
    y="p_value",
    title=f"Granger Causality Test: Power Rankings causing MLB Rankings for {granger_team_code}",
    labels={"lag": "Lag (weeks)", "p_value": "p-value"}
)

# Reference line at the 0.05 significance level
fig.add_hline(
    y=0.05,
    line=dict(color="red", dash="dash"),
    annotation_text="Significance Threshold (0.05)",
    annotation_position="top right"
)

fig.show()

# Display the summary statistics as the cell output
stats


verbose is deprecated since functions should not print results



{'team_code': 'NYY',
 'team_id': 'new-york-yankees',
 'label': 'New York Yankees',
 'n_obs_raw': 22,
 'n_obs_used': 21,
 'max_lag_requested': 6,
 'max_lag_effective': 0,
 'best_lag': 3,
 'best_p': 0.014015521848057818,
 'is_significant': True,
 'alpha': 0.05,
 'direction': 'power_to_mlb',
 'diff': 'first_difference',
 'maxlag_effective': 6}

In [9]:
import pprint

# Compare the MLB-ranking trajectories of TOR and NYY
stats = compute_trajectory_similarity(
    power=power,
    standings=standings,
    team_names=teams,
    team_codes=tms,
    source="mlb",
    team_code_a="TOR",
    team_code_b="NYY",
)

# Dump the resulting stats through a 4-space-indented pretty printer
pp = pprint.PrettyPrinter(indent=4)
pp.pprint(stats)

{   'avg_abs_rank_gap': 7.2727272727272725,
    'corr_delta': -0.26160004033323414,
    'corr_levels': -0.33142279218780113,
    'dtw_raw': 90.0,
    'dtw_similarity_raw_0_100': 91.61230195712955,
    'dtw_similarity_z': 69.28960703760188,
    'dtw_z': 14.626190147234718,
    'overlap': 22,
    'source': 'mlb',
    'team_a': 'Toronto Blue Jays',
    'team_b': 'New York Yankees'}


In [18]:
# Cluster teams on their season stats (k=6) and summarise each cluster
clusters = cluster_and_summarize_season_stats(
    k=6,
    standings=standings,
    odds=odds,
    batting=batting,
    pitching=pitching,
    fielding=fielding,
)

# Show the summary as the cell output
clusters

Unnamed: 0,cluster,n_teams,avg_last_mlb_rank,median_last_mlb_rank,made_playoffs,pct_playoffs,teams
0,6,5,3.6,3.0,5,1.0,"los-angeles-dodgers, milwaukee-brewers, new-yo..."
1,5,3,12.666667,11.0,2,0.666667,"chicago-cubs, detroit-tigers, tampa-bay-rays"
2,3,9,14.888889,15.0,3,0.333333,"cincinnati-reds, cleveland-guardians, houston-..."
3,4,7,15.142857,18.0,2,0.285714,"arizona-diamondbacks, athletics, atlanta-brave..."
4,2,4,26.25,26.0,0,0.0,"baltimore-orioles, chicago-whitesox, los-angel..."
5,1,2,29.0,29.0,0,0.0,"colorado-rockies, washington-nationals"


In [24]:
# Derive the HMM input features from the power rankings, then fit the model for TOR
powerx = prepare_power_features_for_hmm(power)
states_df, meta_stats = fit_team_hmm(
    power=power,
    team_code="TOR",
    team_codes=tms,
    team_names=teams,
    power_features=powerx,
)

# Print each meta_stats entry under its own heading, in sorted key order.
# This keeps the cell self-contained (no reliance on a pretty-printer object
# created in another cell) and lets multi-line, table-like values print with
# their own alignment intact instead of being embedded inside a dict repr.
for stat_name in sorted(meta_stats):
    print(f"{stat_name}:")
    print(meta_stats[stat_name])
    print()

# Display the per-date state assignments as the cell output
states_df

{   'P':                   Good      Mediocre           Bad
Good      8.580794e-01  1.419202e-01  4.700286e-07
Mediocre  5.727542e-12  8.326306e-01  1.673694e-01
Bad       1.432880e-01  8.074034e-25  8.567120e-01,
    'init': 'kmeans',
    'means':           level_dev     chg_z    mom3_z
Good       0.230862  0.782281  1.622715
Mediocre   0.833429  0.003409  0.037149
Bad       -0.811669 -0.388157 -0.861374,
    'n_used': 21,
    'pi':        Good  Mediocre       Bad
0  0.352311   0.29874  0.348949}



KMeans is known to have a memory leak on Windows with MKL, when there are less chunks than available threads. You can avoid it by setting the environment variable OMP_NUM_THREADS=1.



Unnamed: 0,date,state,label
0,2025-04-13,2,Bad
1,2025-04-20,2,Bad
2,2025-04-27,2,Bad
3,2025-05-04,2,Bad
4,2025-05-11,2,Bad
5,2025-05-18,2,Bad
6,2025-05-25,2,Bad
7,2025-06-08,0,Good
8,2025-06-15,0,Good
9,2025-06-22,0,Good


In [25]:
# All checks passed!