# Association Rule Mining for 2020 Season

## Preprocess

In [87]:
import os

import pandas as pd

# Load the per-player cluster assignments (one CSV for batters, one for pitchers)
batters, pitchers = (
    pd.read_csv(csv_name)
    for csv_name in ("Batters_With_Clusters.csv", "Pitchers_With_Clusters.csv")
)

In [88]:
# Show which columns each cluster table provides
for cluster_table in (batters, pitchers):
    print(cluster_table.columns.tolist())


['year', 'player_id', 'last_name, first_name', 'HitterType2_Group', 'HitterType2', 'ClusteringStats_Group', 'ClusteringStats']
['player_id', 'year', 'last_name, first_name', 'kmeans_quality_cluster', 'quality_archetype', 'impact_cluster', 'impact_archetype']


### Read play data

In [89]:
import pandas as pd

# Read the pitch-by-pitch file in fixed-size chunks, then stitch the chunks
# back together into one frame with a fresh 0..n-1 index.
chunksize = 200_000
reader = pd.read_csv(
    "./pitch_level_playbyplay_data/statcast_pitch_by_pitch_2020.csv",
    chunksize=chunksize,
)

dfs = list(reader)
df = pd.concat(dfs, ignore_index=True)


In [90]:
# List every column available in the raw pitch-level data
raw_columns = df.columns.tolist()
print(raw_columns)


['pitch_type', 'game_date', 'release_speed', 'release_pos_x', 'release_pos_z', 'player_name', 'batter', 'pitcher', 'events', 'description', 'spin_dir', 'spin_rate_deprecated', 'break_angle_deprecated', 'break_length_deprecated', 'zone', 'des', 'game_type', 'stand', 'p_throws', 'home_team', 'away_team', 'type', 'hit_location', 'bb_type', 'balls', 'strikes', 'game_year', 'pfx_x', 'pfx_z', 'plate_x', 'plate_z', 'on_3b', 'on_2b', 'on_1b', 'outs_when_up', 'inning', 'inning_topbot', 'hc_x', 'hc_y', 'tfs_deprecated', 'tfs_zulu_deprecated', 'umpire', 'sv_id', 'vx0', 'vy0', 'vz0', 'ax', 'ay', 'az', 'sz_top', 'sz_bot', 'hit_distance_sc', 'launch_speed', 'launch_angle', 'effective_speed', 'release_spin_rate', 'release_extension', 'game_pk', 'fielder_2', 'fielder_3', 'fielder_4', 'fielder_5', 'fielder_6', 'fielder_7', 'fielder_8', 'fielder_9', 'release_pos_y', 'estimated_ba_using_speedangle', 'estimated_woba_using_speedangle', 'woba_value', 'woba_denom', 'babip_value', 'iso_value', 'launch_speed_a

In [91]:
# Inspect the distinct values of the categorical columns used downstream
print(df['events'].unique())
description_values = df['description'].unique()
print(description_values[:50])  # sample first 50
print(df['bb_type'].unique())


['strikeout' nan 'field_out' 'grounded_into_double_play' 'walk' 'home_run'
 'intent_walk' 'double' 'single' 'fielders_choice' 'triple' 'truncated_pa'
 'force_out' 'sac_bunt' 'hit_by_pitch' 'sac_fly' 'fielders_choice_out'
 'double_play' 'strikeout_double_play' 'field_error' 'catcher_interf'
 'sac_fly_double_play' 'triple_play']
['called_strike' 'swinging_strike' 'ball' 'foul' 'hit_into_play'
 'blocked_ball' 'automatic_ball' 'swinging_strike_blocked' 'foul_bunt'
 'foul_tip' 'hit_by_pitch' 'missed_bunt' 'pitchout' 'bunt_foul_tip']
[nan 'fly_ball' 'ground_ball' 'line_drive' 'popup']


In [92]:
import pandas as pd

def make_matchup(stand, p_throws):
    """
    Build the handedness-matchup label for one plate appearance.

    stand: batter handedness ('L' or 'R')
    p_throws: pitcher handedness ('L' or 'R')

    Returns the two values joined by 'vs', batter side first (e.g. 'LvsR').
    """
    return "{}vs{}".format(stand, p_throws)

def bucket_count(balls, strikes):
    """
    Collapse a balls/strikes count into one of five coarse labels.

    Returns one of:
      - "full"          : 3 balls and 2 strikes
      - "two_strikes"   : any other count with 2 strikes
      - "hitter_ahead"  : at least two more balls than strikes (2-0, 3-0, 3-1)
      - "pitcher_ahead" : more strikes than balls (0-1)
      - "even"          : everything else (0-0, 1-0, 1-1, 2-1)
    """
    # All two-strike counts are handled together; 3-2 is singled out as "full"
    if strikes == 2:
        return "full" if balls == 3 else "two_strikes"

    # A lead of two or more balls counts as the hitter being clearly ahead
    if balls - strikes >= 2:
        return "hitter_ahead"

    # Fewer than two strikes from here on: either the pitcher leads or it's "even"
    return "pitcher_ahead" if strikes > balls else "even"

def clean_outcome(events):
    """
    Turn a Statcast 'events' value into the outcome label.

    Missing values (anything pd.isna() reports as missing) become pd.NA so
    the row can be dropped later; every other value is returned as a string.
    """
    return pd.NA if pd.isna(events) else str(events)


In [93]:
def build_plate_appearances(df_raw):
    """
    Reduce pitch-by-pitch Statcast rows to one row per plate appearance (PA).

    A PA is identified by (game_pk, at_bat_number); the row with the highest
    pitch_number inside each PA is kept. The returned frame also carries an
    integer 'year' column, copied from 'game_year' when 'year' is absent.
    """
    pa_keys = ['game_pk', 'at_bat_number']

    # Order pitches inside every PA so the final pitch comes last
    ordered = df_raw.sort_values(pa_keys + ['pitch_number'])

    # tail(1) per group picks that final pitch; copy() detaches it from df_raw
    last_pitches = ordered.groupby(pa_keys, as_index=False).tail(1).copy()

    # Fall back to Statcast's 'game_year' when no 'year' column is present
    year_missing = 'year' not in last_pitches.columns
    if year_missing and 'game_year' in last_pitches.columns:
        last_pitches['year'] = last_pitches['game_year']
    last_pitches['year'] = last_pitches['year'].astype(int)

    return last_pitches


In [94]:
def enrich_context(df_pa):
    """
    Add matchup (LvsR, etc.), count_bucket, and outcome from 'events'.

    df_pa: PA-level DataFrame. Must contain 'stand', 'p_throws', 'balls',
           'strikes', 'events', and the 'batter_arch' / 'pitcher_arch'
           columns (the final dropna references them).

    Returns a new DataFrame with the three added columns, restricted to rows
    where 'batter_arch', 'pitcher_arch' and 'outcome' are all present.
    The input frame is left unmodified.
    """
    # Work on a copy so the new columns are not written into the caller's frame.
    df_pa = df_pa.copy()

    # Build the labels by zipping the source columns instead of a row-wise
    # DataFrame.apply(axis=1): it avoids constructing a Series per row and
    # still yields a plain (possibly empty) list when the frame has no rows.
    df_pa['matchup'] = [
        make_matchup(stand, p_throws)
        for stand, p_throws in zip(df_pa['stand'], df_pa['p_throws'])
    ]

    df_pa['count_bucket'] = [
        bucket_count(balls, strikes)
        for balls, strikes in zip(df_pa['balls'], df_pa['strikes'])
    ]

    # Outcome label; missing events become pd.NA and are dropped below
    df_pa['outcome'] = df_pa['events'].apply(clean_outcome)

    # Drop rows with missing critical info
    df_pa = df_pa.dropna(subset=['batter_arch', 'pitcher_arch', 'outcome'])

    return df_pa


In [95]:
def add_archetypes(
    df_pa,
    hitters_df,
    pitchers_df,
    batter_cluster_col="HitterType2",      # or "ClusteringStats"
    pitcher_cluster_col="impact_archetype" # or "quality_archetype"
):
    """
    Merge batter & pitcher archetypes into the PA-level DataFrame.

    df_pa: PA-level frame with 'batter', 'pitcher' and 'year' columns.
    hitters_df / pitchers_df: cluster tables keyed by ('player_id', 'year').
    batter_cluster_col / pitcher_cluster_col: which column of each cluster
        table to use as the archetype label.

    Returns a new frame with two added columns, 'batter_arch' and
    'pitcher_arch'. Both merges are left joins, so PAs whose player has no
    archetype keep a missing value there. The number of rows always equals
    len(df_pa): if a cluster table lists the same (player_id, year) more
    than once, only its first row is used.
    """
    key_cols = ['player_id', 'year']

    # Replace stale archetype columns (e.g. from an earlier run) instead of
    # letting merge produce suffixed duplicates of them.
    df_pa = df_pa.drop(columns=['batter_arch', 'pitcher_arch'], errors='ignore')

    # --- Merge batters ---
    # Rename to the final column name *before* merging so the result does not
    # depend on what the source column is called (or on name clashes between
    # the batter and pitcher cluster columns).
    hitters_small = (
        hitters_df[key_cols + [batter_cluster_col]]
        .drop_duplicates(subset=key_cols)   # one row per player-season
        .rename(columns={'player_id': 'batter', batter_cluster_col: 'batter_arch'})
    )

    df_pa = df_pa.merge(
        hitters_small,
        on=['batter', 'year'],
        how='left'
    )

    # --- Merge pitchers ---
    pitchers_small = (
        pitchers_df[key_cols + [pitcher_cluster_col]]
        .drop_duplicates(subset=key_cols)   # one row per player-season
        .rename(columns={'player_id': 'pitcher', pitcher_cluster_col: 'pitcher_arch'})
    )

    df_pa = df_pa.merge(
        pitchers_small,
        on=['pitcher', 'year'],
        how='left'
    )

    return df_pa


In [96]:
def build_transactions_df(df_pa):
    """
    Project the PA-level frame onto the five columns that act as the
    "items" of each transaction, returning an independent copy.
    """
    item_cols = ['batter_arch', 'pitcher_arch', 'matchup', 'count_bucket', 'outcome']
    return df_pa.loc[:, item_cols].copy()


In [97]:
def encode_transactions(df_tx):
    """
    One-hot encode the transaction fields into a boolean DataFrame.

    Each of the five item columns is expanded with pd.get_dummies using the
    column name as prefix, giving columns such as:
      - batter_arch_<value>
      - pitcher_arch_<value>
      - matchup_LvsR
      - count_bucket_two_strikes
      - outcome_strikeout
    """
    item_cols = ('batter_arch', 'pitcher_arch', 'matchup', 'count_bucket', 'outcome')

    # One indicator block per field, laid side by side in field order
    indicator_blocks = [
        pd.get_dummies(df_tx[field], prefix=field) for field in item_cols
    ]

    # Cast explicitly so the result is boolean regardless of get_dummies' default dtype
    return pd.concat(indicator_blocks, axis=1).astype(bool)


In [98]:
def preprocess_statcast(
    df_raw,
    hitters_df,
    pitchers_df,
    batter_cluster_col="HitterType2",
    pitcher_cluster_col="impact_archetype"
):
    """
    Run the whole preprocessing pipeline on raw pitch-level data.

    Steps: reduce pitches to plate appearances, attach batter/pitcher
    archetypes, add matchup / count bucket / outcome, project onto the
    transaction columns, and one-hot encode them.

    Returns a tuple (df_pa, df_tx, df_encoded): the enriched PA-level frame,
    the five-column transaction frame, and its boolean one-hot encoding.
    """
    # Pitch level -> one row per plate appearance
    plate_appearances = build_plate_appearances(df_raw)

    # Attach the chosen archetype label for each batter and pitcher
    with_archetypes = add_archetypes(
        plate_appearances,
        hitters_df,
        pitchers_df,
        batter_cluster_col,
        pitcher_cluster_col,
    )

    # Matchup, count bucket and outcome; incomplete rows are dropped here
    df_pa = enrich_context(with_archetypes)

    # Transaction view and its one-hot encoding
    df_tx = build_transactions_df(df_pa)
    return df_pa, df_tx, encode_transactions(df_tx)


In [99]:
# Run the pipeline on the 2020 pitch data with the selected cluster columns
df_pa_2020, df_tx_2020, df_encoded_2020 = preprocess_statcast(
    df,
    batters,
    pitchers,
    batter_cluster_col="ClusteringStats",     # alternative: "HitterType2"
    pitcher_cluster_col="impact_archetype",   # alternative: "quality_archetype"
)


In [100]:
# Persist the three pipeline outputs for 2020.
# Create the output folder first so the cell also works on a fresh checkout,
# where ./2020 may not exist yet (to_csv does not create directories).
os.makedirs("./2020", exist_ok=True)

df_pa_2020.to_csv("./2020/pa_2020.csv", index=False)
df_tx_2020.to_csv("./2020/tx_2020.csv", index=False)
df_encoded_2020.to_csv("./2020/encoded_2020.csv", index=False)

In [101]:
# Distinct outcome labels in the transaction table, missing values excluded
outcome_labels = df_tx_2020['outcome'].dropna()
unique_outcomes = outcome_labels.unique()

In [102]:
# Print each outcome's share of all transactions, in percent.
# The share is computed outside the f-string: reusing the enclosing quote
# character inside a replacement field is a SyntaxError before Python 3.12.
for attr in unique_outcomes:
    share = (df_tx_2020['outcome'] == attr).sum() / len(df_tx_2020) * 100
    print(f'{attr}: {share}%')

walk: 8.875431883196493%
double: 4.4470037522755135%
field_out: 38.34751272430063%
strikeout: 22.37619348367203%
single: 14.132332726529704%
sac_fly: 0.5795593862614704%
intent_walk: 0.39751829698703417%
home_run: 3.707694022365048%
triple: 0.3789426756324999%
force_out: 2.073039343166029%
hit_by_pitch: 1.0699557900211762%
grounded_into_double_play: 1.850131886911617%
fielders_choice: 0.22662258052531858%
field_error: 0.7653155998068135%
truncated_pa: 0.14117472229446076%
catcher_interf: 0.04086636697997548%
fielders_choice_out: 0.18575621354534308%
strikeout_double_play: 0.08916298250176469%
double_play: 0.22662258052531858%
sac_bunt: 0.08173273395995095%
sac_fly_double_play: 0.007430248541813724%


In [103]:
# Baseline home-run rate over all transactions.
# NOTE: despite its name, home_run_percentage holds a fraction in [0, 1];
# it is only multiplied by 100 for display.
is_home_run = df_tx_2020['outcome'] == 'home_run'
home_run_percentage = is_home_run.sum() / len(df_tx_2020)
print(f'Home run percentage: {home_run_percentage * 100}%')

Home run percentage: 3.707694022365048%


In [104]:
from mlxtend.frequent_patterns import fpgrowth, association_rules

# Minimum support is chosen as an absolute number of plate appearances and
# then converted to a fraction of all transactions for fpgrowth.
minsup_count = 40
n_transactions = len(df_encoded_2020)
minsup = minsup_count / n_transactions

frequent_itemsets = fpgrowth(
    df_encoded_2020,
    min_support=minsup,
    use_colnames=True,
)
# rules = association_rules(frequent_itemsets, metric="confidence", min_threshold=minconf)


In [105]:
# Use the overall home-run rate as the minimum confidence for generated rules
minconf = home_run_percentage
rules = association_rules(
    frequent_itemsets,
    metric="confidence",
    min_threshold=minconf,
)


  cert_metric = np.where(certainty_denom == 0, 0, certainty_num / certainty_denom)


In [106]:
# Encoded outcome items treated as a good result for the batter.
# Currently excluded: outcome_walk, outcome_hit_by_pitch, outcome_field_error.
good_outcomes = [
    'outcome_single', 'outcome_double', 'outcome_triple', 'outcome_home_run',
    'outcome_intent_walk',
]


In [107]:

# Keep only rules whose consequent contains at least one of the good outcomes.
# .copy() detaches the result from the unfiltered frame: later cells add
# columns to `rules`, and assigning into a filtered slice would otherwise
# raise pandas' SettingWithCopyWarning.
rules = rules[
    rules['consequents'].apply(
        lambda s: any(item in good_outcomes for item in s)
    )
].copy()


In [108]:
# Display the filtered rule table (rendered because it is the cell's last expression)
rules

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,representativity,leverage,conviction,zhangs_metric,jaccard,certainty,kulczynski
1099,(count_bucket_hitter_ahead),(outcome_single),0.089646,0.141323,0.008656,0.096560,0.683258,1.0,-0.004013,0.950453,-0.337409,0.038937,-0.052130,0.078906
1388,"(count_bucket_hitter_ahead, batter_arch_Balanc...",(outcome_single),0.089646,0.141323,0.008656,0.096560,0.683258,1.0,-0.004013,0.950453,-0.337409,0.038937,-0.052130,0.078906
1391,(count_bucket_hitter_ahead),"(batter_arch_Balanced / Mixed, outcome_single)",0.089646,0.141323,0.008656,0.096560,0.683258,1.0,-0.004013,0.950453,-0.337409,0.038937,-0.052130,0.078906
1393,"(matchup_RvsR, count_bucket_hitter_ahead)",(outcome_single),0.029981,0.141323,0.003418,0.114002,0.806678,1.0,-0.000819,0.969164,-0.198113,0.020358,-0.031817,0.069094
1396,(count_bucket_hitter_ahead),"(matchup_RvsR, outcome_single)",0.089646,0.056730,0.003418,0.038127,0.672076,1.0,-0.001668,0.980659,-0.348948,0.023909,-0.019722,0.049188
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10401,"(count_bucket_even, pitcher_arch_Sinker Liabil...","(batter_arch_Balanced / Mixed, outcome_single,...",0.025560,0.026452,0.001486,0.058140,2.197952,1.0,0.000810,1.033644,0.559327,0.029412,0.032549,0.057160
10402,(pitcher_arch_Sinker Liability Pitchers (Sinke...,"(batter_arch_Balanced / Mixed, outcome_single,...",0.015046,0.059479,0.001486,0.098765,1.660505,1.0,0.000591,1.043592,0.403850,0.020346,0.041771,0.061875
10562,"(count_bucket_pitcher_ahead, pitcher_arch_Sink...",(outcome_single),0.006873,0.141323,0.001635,0.237838,1.682934,1.0,0.000663,1.126633,0.408608,0.011153,0.112399,0.124702
10584,"(count_bucket_pitcher_ahead, batter_arch_Balan...",(outcome_single),0.006873,0.141323,0.001635,0.237838,1.682934,1.0,0.000663,1.126633,0.408608,0.011153,0.112399,0.124702


In [109]:
# Number of items on the left-hand side of each rule
rules['ante_len'] = rules['antecedents'].map(len)


In [110]:
# Largest antecedent found among the remaining rules
max_len = rules.loc[:, 'ante_len'].max()
print("Max antecedent size:", max_len)


Max antecedent size: 4


In [111]:
# Number of items on the right-hand side of each rule
rules['cons_len'] = rules['consequents'].map(len)

In [112]:
# Strongest rules first: order by lift, breaking ties by confidence (both descending)
sort_keys = ['lift', 'confidence']
rules_sorted = rules.sort_values(by=sort_keys, ascending=False)


In [113]:
# rules_sorted = rules_sorted[rules_sorted['ante_len'] == max_len]


In [114]:
# Keep rules whose consequent is exactly one item, and that item is an outcome
single_consequent = rules_sorted['cons_len'] == 1
outcome_consequent = rules_sorted['consequents'].apply(
    lambda s: tuple(s)[0].startswith('outcome_')
)
rules_sorted = rules_sorted[single_consequent & outcome_consequent]


In [115]:
# Keep rules whose antecedent has exactly two items and includes both a
# batter archetype item and a pitcher archetype item
required_prefixes = ('batter_arch_', 'pitcher_arch_')
two_item_antecedent = rules_sorted['ante_len'] == 2
has_both_archetypes = rules_sorted['antecedents'].apply(
    lambda s: all(
        any(item.startswith(prefix) for item in s)
        for prefix in required_prefixes
    )
)
rules_sorted = rules_sorted[two_item_antecedent & has_both_archetypes]


In [116]:
# Save the final rule table. NOTE: the file name mentions conf/support, but
# the rows were sorted by lift and then confidence in the earlier sort cell.
rules_sorted.to_csv(
    "./2020/rules_sorted_by_conf_support.csv",
    index=False,
)
