# Association Rule Mining for 2021 Season

## Preprocess

In [1]:
import pandas as pd

# Load the hitter and pitcher cluster/archetype tables (one CSV each).
batters, pitchers = (
    pd.read_csv(csv_name)
    for csv_name in ("Batters_With_Clusters.csv", "Pitchers_With_Clusters.csv")
)

In [2]:
# List the columns of each cluster table, one table per output line.
print(batters.columns.tolist(), pitchers.columns.tolist(), sep="\n")

['year', 'player_id', 'last_name, first_name', 'HitterType2_Group', 'HitterType2', 'ClusteringStats_Group', 'ClusteringStats']
['player_id', 'year', 'last_name, first_name', 'kmeans_quality_cluster', 'quality_archetype', 'impact_cluster', 'impact_archetype']


### Read play data

In [3]:
import pandas as pd

# Read the season file in fixed-size chunks and stitch them into one frame.
# The reader is handed straight to `pd.concat` rather than being collected in
# a module-level list first: such a list stays alive after the concat and
# keeps a second full copy of the data in memory for the rest of the session.
# The `with` block also guarantees the underlying file handle is closed.
chunksize = 200_000
with pd.read_csv(
    "./pitch_level_playbyplay_data/statcast_pitch_by_pitch_2021.csv",
    chunksize=chunksize,
) as reader:
    df = pd.concat(reader, ignore_index=True)


In [4]:
# Show every column available in the pitch-level frame.
print(list(df.columns))


['pitch_type', 'game_date', 'release_speed', 'release_pos_x', 'release_pos_z', 'player_name', 'batter', 'pitcher', 'events', 'description', 'spin_dir', 'spin_rate_deprecated', 'break_angle_deprecated', 'break_length_deprecated', 'zone', 'des', 'game_type', 'stand', 'p_throws', 'home_team', 'away_team', 'type', 'hit_location', 'bb_type', 'balls', 'strikes', 'game_year', 'pfx_x', 'pfx_z', 'plate_x', 'plate_z', 'on_3b', 'on_2b', 'on_1b', 'outs_when_up', 'inning', 'inning_topbot', 'hc_x', 'hc_y', 'tfs_deprecated', 'tfs_zulu_deprecated', 'umpire', 'sv_id', 'vx0', 'vy0', 'vz0', 'ax', 'ay', 'az', 'sz_top', 'sz_bot', 'hit_distance_sc', 'launch_speed', 'launch_angle', 'effective_speed', 'release_spin_rate', 'release_extension', 'game_pk', 'fielder_2', 'fielder_3', 'fielder_4', 'fielder_5', 'fielder_6', 'fielder_7', 'fielder_8', 'fielder_9', 'release_pos_y', 'estimated_ba_using_speedangle', 'estimated_woba_using_speedangle', 'woba_value', 'woba_denom', 'babip_value', 'iso_value', 'launch_speed_a

In [5]:
# Inspect the distinct values of the events, description and bb_type columns.
print(pd.unique(df['events']))
print(pd.unique(df['description'])[:50])  # cap the sample at 50 values
print(pd.unique(df['bb_type']))


['field_out' nan 'single' 'strikeout' 'home_run' 'double' 'walk'
 'grounded_into_double_play' 'field_error' 'intent_walk' 'force_out'
 'sac_fly' 'hit_by_pitch' 'fielders_choice' 'triple' 'fielders_choice_out'
 'strikeout_double_play' 'truncated_pa' 'double_play' 'sac_bunt'
 'catcher_interf' 'sac_fly_double_play' 'sac_bunt_double_play'
 'triple_play']
['hit_into_play' 'foul' 'called_strike' 'ball' 'blocked_ball'
 'swinging_strike' 'swinging_strike_blocked' 'foul_tip' 'foul_bunt'
 'automatic_ball' 'hit_by_pitch' 'missed_bunt' 'pitchout' 'bunt_foul_tip'
 'foul_pitchout']
['ground_ball' nan 'fly_ball' 'line_drive' 'popup']


In [6]:
import pandas as pd

def make_matchup(stand, p_throws):
    """
    Build the handedness-matchup label for a plate appearance.

    stand:    batter handedness ('L' or 'R')
    p_throws: pitcher handedness ('L' or 'R')

    Returns the two values joined by 'vs', batter side first (e.g. 'LvsR').
    """
    return "{}vs{}".format(stand, p_throws)

def bucket_count(balls, strikes):
    """
    Collapse a ball/strike count into one of five coarse labels.

    Returns, in order of precedence:
      - "full"          : 3 balls and 2 strikes
      - "two_strikes"   : any other count with 2 strikes
      - "hitter_ahead"  : balls exceed strikes by at least 2
      - "pitcher_ahead" : more strikes than balls
      - "even"          : everything else (0-0, 1-0, 1-1, 2-1, ...)
    """
    # Every two-strike count is decided here; 3-2 is singled out as "full".
    if strikes == 2:
        return "full" if balls == 3 else "two_strikes"

    if balls - strikes >= 2:
        return "hitter_ahead"

    return "pitcher_ahead" if strikes > balls else "even"

def clean_outcome(events):
    """
    Turn a Statcast 'events' value into an outcome label.

    Missing values (anything `pd.isna` reports as missing) become `pd.NA` so
    the row can be dropped later; every other value is returned as `str(events)`.
    """
    return pd.NA if pd.isna(events) else str(events)


In [7]:
def build_plate_appearances(df_raw):
    """
    Reduce pitch-by-pitch data to one row per plate appearance (PA).

    Rows are ordered by game, at-bat and pitch number, and the last row of
    each (game_pk, at_bat_number) group is kept as the PA's final pitch.
    The result is a copy and carries an integer 'year' column, taken from
    'game_year' when 'year' is not already present.
    """
    pa_keys = ['game_pk', 'at_bat_number']

    # Order pitches inside each PA so that "last row" means "last pitch".
    ordered = df_raw.sort_values(pa_keys + ['pitch_number'])
    final_pitches = ordered.groupby(pa_keys, as_index=False).tail(1).copy()

    # Fall back to Statcast's 'game_year' when no 'year' column is present.
    has_year = 'year' in final_pitches.columns
    if not has_year and 'game_year' in final_pitches.columns:
        final_pitches['year'] = final_pitches['game_year']
    final_pitches['year'] = final_pitches['year'].astype(int)

    return final_pitches


In [8]:
def enrich_context(df_pa):
    """
    Add matchup (LvsR, etc.), count_bucket, and outcome from 'events'.

    Parameters
    ----------
    df_pa : DataFrame
        PA-level frame. Must contain 'stand', 'p_throws', 'balls', 'strikes'
        and 'events', plus the 'batter_arch' and 'pitcher_arch' columns that
        are used when dropping incomplete rows.

    Returns
    -------
    DataFrame
        A new frame with 'matchup', 'count_bucket' and 'outcome' added and
        with rows missing 'batter_arch', 'pitcher_arch' or 'outcome' removed.
        The input frame is left unmodified.
    """
    # Work on a copy so the caller's DataFrame is never mutated in place.
    enriched = df_pa.copy()

    # Matchup: pair the two columns directly instead of a row-wise
    # DataFrame.apply, which builds a Series per row and does not return a
    # Series on an empty frame.
    enriched['matchup'] = [
        make_matchup(stand, p_throws)
        for stand, p_throws in zip(enriched['stand'], enriched['p_throws'])
    ]

    # Count bucket
    enriched['count_bucket'] = [
        bucket_count(balls, strikes)
        for balls, strikes in zip(enriched['balls'], enriched['strikes'])
    ]

    # Outcome
    enriched['outcome'] = enriched['events'].apply(clean_outcome)

    # Drop rows with missing critical info
    return enriched.dropna(subset=['batter_arch', 'pitcher_arch', 'outcome'])


In [9]:
def add_archetypes(
    df_pa,
    hitters_df,
    pitchers_df,
    batter_cluster_col="HitterType2",      # or "ClusteringStats"
    pitcher_cluster_col="impact_archetype" # or "quality_archetype"
):
    """
    Merge batter & pitcher archetypes into the PA-level DataFrame.

    Parameters
    ----------
    df_pa : DataFrame
        PA-level frame with 'batter', 'pitcher' and 'year' columns.
    hitters_df, pitchers_df : DataFrame
        Lookup tables with 'player_id', 'year' and the chosen cluster column.
    batter_cluster_col, pitcher_cluster_col : str
        Names of the cluster columns to use from each lookup table.

    Returns
    -------
    DataFrame
        `df_pa` left-joined with two new columns, 'batter_arch' and
        'pitcher_arch'. Players without a lookup row get missing values.
        The number of rows always equals `len(df_pa)`.
    """

    def _archetype_lookup(players_df, cluster_col, id_name, arch_name):
        """Return a (id, year) -> archetype table ready to be merged."""
        return (
            players_df[['player_id', 'year', cluster_col]]
            # Rename to the final column name *before* merging. Renaming after
            # the merge silently fails when the two cluster columns share a
            # name (or clash with an existing column), because the merge then
            # suffixes them and the rename no longer matches anything.
            .rename(columns={'player_id': id_name, cluster_col: arch_name})
            # One row per player-season, so the left join cannot multiply
            # plate appearances; the first row wins if there are duplicates.
            .drop_duplicates(subset=[id_name, 'year'])
        )

    # --- Merge batters ---
    hitters_small = _archetype_lookup(
        hitters_df, batter_cluster_col, 'batter', 'batter_arch'
    )
    df_pa = df_pa.merge(hitters_small, on=['batter', 'year'], how='left')

    # --- Merge pitchers ---
    pitchers_small = _archetype_lookup(
        pitchers_df, pitcher_cluster_col, 'pitcher', 'pitcher_arch'
    )
    df_pa = df_pa.merge(pitchers_small, on=['pitcher', 'year'], how='left')

    return df_pa


In [10]:
def build_transactions_df(df_pa):
    """
    Project the PA-level frame onto the five columns used as transaction items.

    Returns an independent copy holding 'batter_arch', 'pitcher_arch',
    'matchup', 'count_bucket' and 'outcome', in that order.
    """
    item_fields = ['batter_arch', 'pitcher_arch', 'matchup', 'count_bucket', 'outcome']
    return df_pa.loc[:, item_fields].copy()


In [11]:
def encode_transactions(df_tx):
    """
    One-hot encode the five transaction fields into a boolean DataFrame.

    Each field is expanded separately with its own name as prefix, giving
    columns such as:
      - batter_arch_<value>
      - pitcher_arch_<value>
      - matchup_LvsR
      - count_bucket_two_strikes
      - outcome_strikeout
    The per-field blocks are placed side by side in the field order below.
    """
    fields = ('batter_arch', 'pitcher_arch', 'matchup', 'count_bucket', 'outcome')

    indicator_blocks = [pd.get_dummies(df_tx[name], prefix=name) for name in fields]

    # Cast to bool regardless of the dtype get_dummies produced.
    return pd.concat(indicator_blocks, axis=1).astype(bool)


In [12]:
def preprocess_statcast(
    df_raw,
    hitters_df,
    pitchers_df,
    batter_cluster_col="HitterType2",
    pitcher_cluster_col="impact_archetype"
):
    """
    Run the full preprocessing pipeline on pitch-level data.

    Steps: collapse pitches to plate appearances, attach batter/pitcher
    archetypes, add matchup / count bucket / outcome, select the transaction
    fields, and one-hot encode them.

    Returns a tuple (df_pa, df_tx, df_encoded): the enriched PA-level frame,
    the transaction-field frame, and its one-hot encoding.
    """
    # Pitch level -> one row per plate appearance.
    plate_appearances = build_plate_appearances(df_raw)

    # Attach the chosen archetype column for each side of the matchup.
    with_archetypes = add_archetypes(
        plate_appearances,
        hitters_df=hitters_df,
        pitchers_df=pitchers_df,
        batter_cluster_col=batter_cluster_col,
        pitcher_cluster_col=pitcher_cluster_col
    )

    # Context columns, then the transaction view and its encoding.
    df_pa = enrich_context(with_archetypes)
    df_tx = build_transactions_df(df_pa)

    return df_pa, df_tx, encode_transactions(df_tx)


In [13]:
# Run the preprocessing pipeline on the loaded pitch-level frame, choosing
# which cluster column to use as the archetype for each side.
df_pa_2021, df_tx_2021, df_encoded_2021 = preprocess_statcast(
    df_raw=df,
    hitters_df=batters,
    pitchers_df=pitchers,
    batter_cluster_col="ClusteringStats",        # or "HitterType2"
    pitcher_cluster_col="impact_archetype"   # or "quality_archetype"
)


In [14]:
# Persist the three pipeline outputs as CSV.
# `to_csv` does not create missing parent directories, so make sure the
# output folder exists first; otherwise a fresh checkout fails here.
import os

os.makedirs("./2021", exist_ok=True)
df_pa_2021.to_csv("./2021/pa_2021.csv", index=False)
df_tx_2021.to_csv("./2021/tx_2021.csv", index=False)
df_encoded_2021.to_csv("./2021/encoded_2021.csv", index=False)

In [15]:
# Distinct non-missing outcome labels, in order of first appearance.
unique_outcomes = pd.unique(df_tx_2021['outcome'].dropna())

In [16]:
# Print each outcome's share of all transactions, as a percentage.
# The share is computed outside the f-string: reusing the f-string's own
# quote character inside a replacement field is a SyntaxError before
# Python 3.12, so the inline form only runs on 3.12+.
for attr in unique_outcomes:
    share = (df_tx_2021["outcome"] == attr).sum() / len(df_tx_2021) * 100
    print(f"{attr}: {share}%")

single: 14.50752295342018%
field_out: 39.52955503212678%
walk: 8.41224823342787%
strikeout: 21.219695877197196%
double: 4.701245012738547%
home_run: 3.8552131904052303%
field_error: 0.7098335176016279%
truncated_pa: 0.10575397779166466%
grounded_into_double_play: 1.7785896264961785%
strikeout_double_play: 0.08973064782323062%
sac_bunt: 0.09934464580429105%
intent_walk: 0.40859491419506805%
fielders_choice: 0.21471262157701615%
sac_fly: 0.6313192007563011%
hit_by_pitch: 1.0367094489576825%
force_out: 1.9372205931836755%
triple: 0.3669342562771395%
double_play: 0.2067009565927991%
sac_fly_double_play: 0.006409331987373617%
fielders_choice_out: 0.13780063772853277%
catcher_interf: 0.0416606579179285%
triple_play: 0.0032046659936868084%


In [17]:
# Share of transactions whose outcome is a home run. Despite the name, the
# stored value is a fraction in [0, 1]; it is scaled by 100 only for printing.
home_run_percentage = df_tx_2021['outcome'].eq('home_run').sum() / len(df_tx_2021)
print(f'Home run percentage: {home_run_percentage * 100}%')

Home run percentage: 3.8552131904052303%


In [18]:
from mlxtend.frequent_patterns import fpgrowth, association_rules

# Minimum support is chosen as an absolute number of transactions and then
# converted to a fraction of all encoded transactions for `min_support`.
minsup_count = 40
minsup = minsup_count / len(df_encoded_2021)

# use_colnames=True keeps the one-hot column names as item labels.
frequent_itemsets = fpgrowth(df_encoded_2021, min_support=minsup, use_colnames=True)
# Rules are derived from these itemsets in the following cell, once `minconf` is defined.


In [19]:
# Use the overall home-run share (a fraction) as the minimum confidence
# when generating rules from the frequent itemsets.
minconf = home_run_percentage
rules = association_rules(frequent_itemsets, metric="confidence", min_threshold=minconf)


In [20]:
# One-hot column names of the outcomes treated as "good"; rules are kept
# only if their consequent contains at least one of these items.
# Commented-out entries are outcomes currently excluded from the list.
# NOTE(review): 'outcome_intent_walk' is included while 'outcome_walk' is
# excluded — confirm this is intentional.
good_outcomes = [
    'outcome_single',
    'outcome_double',
    'outcome_triple',
    'outcome_home_run',
    # 'outcome_walk',
    'outcome_intent_walk',
    # 'outcome_hit_by_pitch',
    # 'outcome_field_error'
]


In [21]:

# Keep only rules whose consequent mentions at least one "good" outcome.
# `.copy()` makes the result an independent frame, so adding columns to
# `rules` afterwards does not trigger pandas' SettingWithCopyWarning.
rules = rules[
    rules['consequents'].apply(
        lambda s: any(item in good_outcomes for item in s)
    )
].copy()


In [22]:
# Display the rules that survived the good-outcome filter.
rules

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,representativity,leverage,conviction,zhangs_metric,jaccard,certainty,kulczynski
587,(pitcher_arch_Balanced Positive Arsenal (Mild ...,(outcome_single),0.568860,0.145075,0.084411,0.148386,1.022821,1.0,0.001883,1.003888,0.051751,0.134087,0.003873,0.365114
588,(batter_arch_Selective / Moderate),(outcome_single),0.344405,0.145075,0.046756,0.135759,0.935782,1.0,-0.003209,0.989220,-0.094757,0.105610,-0.010897,0.229024
591,(count_bucket_two_strikes),(outcome_single),0.378840,0.145075,0.043215,0.114072,0.786294,1.0,-0.011745,0.965005,-0.304372,0.089900,-0.036264,0.205976
593,(matchup_LvsR),(outcome_single),0.281354,0.145075,0.038664,0.137422,0.947249,1.0,-0.002153,0.991128,-0.071918,0.099711,-0.008951,0.201967
595,(batter_arch_Aggressive Fastball),(outcome_single),0.655595,0.145075,0.098319,0.149969,1.033736,1.0,0.003209,1.005758,0.094757,0.139986,0.005725,0.413840
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15626,"(count_bucket_pitcher_ahead, pitcher_arch_Sink...",(outcome_single),0.005784,0.145075,0.001186,0.204986,1.412965,1.0,0.000347,1.075358,0.293969,0.007922,0.070077,0.106580
15715,"(count_bucket_pitcher_ahead, batter_arch_Aggre...",(outcome_single),0.003782,0.145075,0.000817,0.216102,1.489584,1.0,0.000269,1.090607,0.329919,0.005520,0.083079,0.110867
15718,"(count_bucket_pitcher_ahead, pitcher_arch_Sink...","(outcome_single, batter_arch_Aggressive Fastball)",0.005784,0.098319,0.000817,0.141274,1.436894,1.0,0.000248,1.050022,0.305824,0.007912,0.047639,0.074793
15742,(pitcher_arch_Sinker Liability Pitchers (Sinke...,(outcome_single),0.006874,0.145075,0.000657,0.095571,0.658769,1.0,-0.000340,0.945265,-0.342783,0.004342,-0.057905,0.050050


In [23]:
# Number of items on the left-hand side of each rule.
rules['ante_len'] = rules['antecedents'].apply(len)


In [24]:
# Largest antecedent size present among the filtered rules.
max_len = rules['ante_len'].max()
print(f"Max antecedent size: {max_len}")


Max antecedent size: 4


In [25]:
# Number of items on the right-hand side of each rule.
rules['cons_len'] = rules['consequents'].apply(len)

In [26]:
# Rank rules by lift, breaking ties by confidence (both descending).
rules_sorted = rules.sort_values(['lift', 'confidence'], ascending=False)


In [27]:
# rules_sorted = rules_sorted[rules_sorted['ante_len'] == max_len]


In [28]:
# Keep rules whose consequent is exactly one item, and that item is an outcome.
is_single_consequent = rules_sorted['cons_len'].eq(1)
is_outcome_consequent = rules_sorted['consequents'].apply(
    lambda s: tuple(s)[0].startswith('outcome_')
)
rules_sorted = rules_sorted[is_single_consequent & is_outcome_consequent]


In [29]:
# Keep rules whose antecedent has exactly two items and includes both a
# batter-archetype item and a pitcher-archetype item.
rules_sorted = rules_sorted[
    rules_sorted['ante_len'].eq(2)
    & rules_sorted['antecedents'].apply(
        lambda s: all(
            any(item.startswith(prefix) for item in s)
            for prefix in ('batter_arch_', 'pitcher_arch_')
        )
    )
]


In [30]:
# Export the final filtered rules, without the index column.
# NOTE(review): the file name says "by_conf_support", but rules_sorted is
# ordered by lift, then confidence — consider renaming the file.
rules_sorted.to_csv("./2021/rules_sorted_by_conf_support.csv", index=False)
