In [19]:
import pandas as pd
import numpy as np

In [34]:
# CONFIG

OUT_PATH = "../Week3/HP"

In [20]:
# Load the dataset
df = pd.read_excel('../Week2/HP/hp.xlsx')

# Parse Score into numeric columns
df[['Score1', 'Score2']] = df['Score'].str.split('-', expand=True).astype(int)

# Compute points per hill for each team
df['Points1'] = df.groupby(['Date','Map','Team1','Team2'])['Score1'].diff().fillna(df['Score1'])
df['Points2'] = df.groupby(['Date','Map','Team1','Team2'])['Score2'].diff().fillna(df['Score2'])

# Keep the original row order in a column
df = df.reset_index().rename(columns={'index':'OrigRow'})

# Filter for relevant teams
relevant_teams = ['Q9', 'OUG', 'SPG', 'XROCK', 'GodL', 'Wolves']

df_masters = df[df['Team1'].isin(relevant_teams) | df['Team2'].isin(relevant_teams)].copy()

# Prepare long format for points
long_pts = pd.concat([
    df_masters[['Map','Hill','Team1','Points1']].rename(columns={'Team1':'Team','Points1':'Points'}),
    df_masters[['Map','Hill','Team2','Points2']].rename(columns={'Team2':'Team','Points2':'Points'})
])

long_pts.head()

Unnamed: 0,Map,Hill,Team,Points
0,Summit,P1,OUG,13.0
1,Summit,P2,OUG,18.0
2,Summit,P3,OUG,39.0
3,Summit,P4,OUG,19.0
4,Summit,P1,OUG,7.0


## Team-Level Stats

In [21]:
# List of all teams
teams = sorted(set(long_pts['Team']))

# Team-level stats
team_stats = []

for team in teams:
    played = df_masters[(df_masters['Team1']==team) | (df_masters['Team2']==team)]
    # Rotation-win %
    rf = played[played['RotateFirst']==team]
    rot_rate = (rf['RotationWin']=='Yes').mean() * 100 if not rf.empty else np.nan
    # Break-success %
    opp = played[played['RotateFirst']!=team]
    breaks = played[(played['BreakTeam']==team) & (played['BreakSuccess']=='Yes')]
    break_rate = len(breaks) / len(opp) * 100 if len(opp)>0 else np.nan
    # Avg durations
    avg_hold = rf['HoldDuration'].mean()
    avg_break_hold = played[played['BreakTeam']==team]['BreakDuration'].mean()
    # Scrap points
    scrap_pts = df_masters[df_masters['ScrapTeam']==team]['ScrapTime'].sum()
    # Control-Share%
    
    team_stats.append({
        'Team': team,
        'RotationWin': rot_rate,
        'BreakSuccess': break_rate,
        'AvgHoldDuration (s)': avg_hold,
        'AvgBreakDuration (s)': avg_break_hold,
        'ScrapPoints': scrap_pts,
    })

team_stats_df = pd.DataFrame(team_stats).set_index('Team')

In [22]:
# Most dominant hills
team_hill_means = long_pts.groupby(['Team', 'Map', 'Hill'])['Points'].mean().reset_index()
top_hills = (
    team_hill_means
    .sort_values(['Team', 'Points'], ascending=[True, False])
    .groupby('Team', as_index=False)
    .first()
)
top_hills['TopHill'] = top_hills['Map'] + ' ' + top_hills['Hill']
top_hills = top_hills[['Team', 'TopHill', 'Points']].rename(columns={'Points': 'TopHillAvgPts'})


In [23]:
# merge team stats with top hills
team_stats_df = team_stats_df.merge(top_hills, on='Team', how='left')

# Calculate distance from begin perfect at rotation-win and break-success
team_stats_df['RotBreakDist'] = np.sqrt((team_stats_df['RotationWin'] - 100)**2 + (team_stats_df['BreakSuccess'] - 100)**2)
team_stats_df = team_stats_df.sort_values('RotBreakDist')
team_stats_df = team_stats_df.drop(columns='RotBreakDist')

In [24]:
team_stats_df

Unnamed: 0,Team,RotationWin,BreakSuccess,AvgHoldDuration (s),AvgBreakDuration (s),ScrapPoints,TopHill,TopHillAvgPts
7,Wolves,47.169811,47.916667,15.490566,16.458333,494,Slums P2,44.0
2,OUG,47.297297,40.298507,17.837838,13.402985,493,Slums P3,36.5
1,GodL,48.148148,34.782609,16.055556,12.565217,394,Combine P4,32.5
8,XC,66.666667,14.285714,20.666667,6.714286,18,Apocalypse P2,17.333333
3,Q9,43.396226,25.0,13.264151,10.235294,448,Hacienda P3,38.0
10,XROCK,40.0,21.73913,14.218182,7.884058,366,Summit P4,30.5
5,Soul,16.666667,40.0,13.666667,16.2,19,Combine P2,34.666667
4,SPG,29.6875,18.0,11.65625,8.8,296,Slums P1,34.0
9,XLR8,20.0,16.666667,11.8,7.666667,27,Hacienda P2,33.0
6,WL,17.647059,17.647059,8.705882,7.470588,85,Slums P2,29.5


### Control Share

In [25]:
# Assume each hill is 60 seconds long
HILL_LENGTH = 60  

# Build a per-team tally of "seconds in control"
records = []

for _, row in df_masters.iterrows():
    # 1) Rotation-first holds
    records.append({
        'Team': row['RotateFirst'],
        'ControlSec': row['HoldDuration']
    })
    # 2) Break holds (only if break succeeded)
    records.append({
        'Team': row['BreakTeam'],
        'ControlSec': row['BreakDuration']
    })
    # 3) Scrap holds
    if (row['ScrapTeam'] != 'None') and (row['ScrapTime'] > 0):
        if row['ScrapTeam'] == 'Split':
            # Split means both teams control the scrap
            records.append({
                'Team': row['Team1'],
                'ControlSec': row['ScrapTime'] / 2
            })
            records.append({
                'Team': row['Team2'],
                'ControlSec': row['ScrapTime'] / 2
            })
        else:
            # Single team controls the scrap
            records.append({
                'Team': row['ScrapTeam'],
                'ControlSec': row['ScrapTime']
        })

control_df = pd.DataFrame(records)

# Sum total seconds each team was in control
total_control = control_df.groupby('Team')['ControlSec'].sum()

hills_per_team = pd.concat([df_masters['Team1'], df_masters['Team2']]).value_counts()
total_seconds_per_team = hills_per_team * HILL_LENGTH

# Align and compute control-share % per team
control_share = (total_control / total_seconds_per_team * 100).reset_index()
control_share.columns = ['Team', 'ControlSharePct']

# Sort by control share
control_share = control_share.sort_values('ControlSharePct', ascending=False)

In [26]:
team_stats_df = team_stats_df.merge(control_share, on='Team', how='left')
team_stats_df

Unnamed: 0,Team,RotationWin,BreakSuccess,AvgHoldDuration (s),AvgBreakDuration (s),ScrapPoints,TopHill,TopHillAvgPts,ControlSharePct
0,Wolves,47.169811,47.916667,15.490566,16.458333,494,Slums P2,44.0,34.892739
1,OUG,47.297297,40.298507,17.837838,13.402985,493,Slums P3,36.5,32.121749
2,GodL,48.148148,34.782609,16.055556,12.565217,394,Combine P4,32.5,30.733333
3,XC,66.666667,14.285714,20.666667,6.714286,18,Apocalypse P2,17.333333,21.416667
4,Q9,43.396226,25.0,13.264151,10.235294,448,Hacienda P3,38.0,25.688705
5,XROCK,40.0,21.73913,14.218182,7.884058,366,Summit P4,30.5,22.930108
6,Soul,16.666667,40.0,13.666667,16.2,19,Combine P2,34.666667,27.575758
7,SPG,29.6875,18.0,11.65625,8.8,296,Slums P1,34.0,21.849415
8,XLR8,20.0,16.666667,11.8,7.666667,27,Hacienda P2,33.0,20.0
9,WL,17.647059,17.647059,8.705882,7.470588,85,Slums P2,29.5,17.696078


In [27]:
team_stats_df = team_stats_df[['Team', 'RotationWin', 'BreakSuccess', 
                                 'ScrapPoints','ControlSharePct']]

display(team_stats_df)

Unnamed: 0,Team,RotationWin,BreakSuccess,ScrapPoints,ControlSharePct
0,Wolves,47.169811,47.916667,494,34.892739
1,OUG,47.297297,40.298507,493,32.121749
2,GodL,48.148148,34.782609,394,30.733333
3,XC,66.666667,14.285714,18,21.416667
4,Q9,43.396226,25.0,448,25.688705
5,XROCK,40.0,21.73913,366,22.930108
6,Soul,16.666667,40.0,19,27.575758
7,SPG,29.6875,18.0,296,21.849415
8,XLR8,20.0,16.666667,27,20.0
9,WL,17.647059,17.647059,85,17.696078


### Zero-Point Shutouts

In [30]:
rows = []
for _, r in df.iterrows():
    pts1 = r['Points1']
    pts2 = r['Points2']

    if pts1 == 0:
        rows.append({
            'DominantTeam':     r['Team2'],
            'ZeroScoreTeam':    r['Team1'],
            'Map':              r['Map'],
            'Hill':             r['Hill'],
            'ZeroScoreDuration': r['HoldDuration']
        })
    elif pts2 == 0:
        rows.append({
            'DominantTeam':  r['Team1'],
            'ZeroScoreTeam': r['Team2'],
            'Map':           r['Map'],
            'Hill':          r['Hill'],
            'ZeroScoreDuration': r['HoldDuration']
        })

shutouts = pd.DataFrame(rows)

# Filter for relevant teams
shutouts = shutouts[
    shutouts['ZeroScoreTeam'].isin(relevant_teams) &
    shutouts['DominantTeam'].isin(relevant_teams)
]
shutouts = shutouts.sort_values('ZeroScoreDuration', ascending=False, ignore_index=True)
shutouts[:3]


Unnamed: 0,DominantTeam,ZeroScoreTeam,Map,Hill,ZeroScoreDuration
0,Wolves,OUG,Apocalypse,P4,58
1,XROCK,Q9,Summit,P4,57
2,Wolves,Q9,Slums,P2,56


### Chained-Hills: Momentum Swings

In [31]:
# Build two mini-frames and concat
left = df_masters[['OrigRow','Map','Hill','Team1','Points1','Team2']].rename(
    columns={'Team1':'Team','Points1':'Points','Team2':'Opponent'}
)
right = df_masters[['OrigRow','Map','Hill','Team2','Points2','Team1']].rename(
    columns={'Team2':'Team','Points2':'Points','Team1':'Opponent'}
)
long = pd.concat([left, right], ignore_index=True)

# Sort by match and original play order
long = long.sort_values(['OrigRow'])

# Compute rolling sum of the last 3 hills *per team within each match*
long['Chain4'] = (
    long
      .groupby(['Map','Opponent','Team'])['Points']
      .rolling(window=3, min_periods=3)
      .sum()
      .reset_index(level=[0,1,2], drop=True)
)

# 5) For each team, find the row with its maximum Chain4
best_idx = long.groupby('Team')['Chain4'].idxmax()

# 6) Construct final table including which hills were chained
records = []
for team, i in best_idx.items():
    r = long.loc[i]
    grp = long[
        (long['Team']==team) &
        (long['Map']==r['Map']) &
        (long['Opponent']==r['Opponent'])
    ].sort_values('OrigRow')
    pos = grp.index.get_loc(i)
    hills = grp.iloc[pos-3+1:pos+1]['Hill'].tolist()
    records.append({
        'Team': team,
        'Map': r['Map'],
        'Opponent': r['Opponent'],
        f'Best{3}HillSum': r['Chain4'],
        'Hills': hills
    })

best_chains = pd.DataFrame(records)

# Filter for relevant teams
best_chains = best_chains[
    best_chains['Team'].isin(relevant_teams) &
    best_chains['Opponent'].isin(relevant_teams)
]

best_chains = best_chains.sort_values('Best3HillSum', ascending=False, ignore_index=True)
print(best_chains)

     Team       Map Opponent  Best3HillSum         Hills
0  Wolves     Slums       Q9         137.0  [P2, P3, P4]
1    GodL    Summit      SPG         121.0  [P2, P3, P4]
2     SPG  Hacienda    XROCK         112.0  [P2, P3, P4]


In [32]:
# Map-level stats
# 2. Mixiest hills (avg PossessionChanges)
mixiest = df_masters.groupby(['Map','Hill'])['PossessionChanges'].mean().reset_index()
mixiest = mixiest.sort_values(by='PossessionChanges', ascending=False, ignore_index=True).rename(columns={'PossessionChanges':'AvgPossessionChanges'})

In [33]:
mixiest[:10]

Unnamed: 0,Map,Hill,AvgPossessionChanges
0,Summit,P2,3.366667
1,Summit,P1,3.129032
2,Hacienda,P1,3.038462
3,Combine,P3,2.928571
4,Apocalypse,P1,2.875
5,Slums,P1,2.7
6,Hacienda,P2,2.636364
7,Summit,P4,2.590909
8,Apocalypse,P3,2.545455
9,Combine,P1,2.5


In [35]:
# Save mixiest hills
mixiest.to_csv(f'{OUT_PATH}/mixiest_hills.csv', index=False)

# Save best chains
best_chains.to_csv(f'{OUT_PATH}/best_chains.csv', index=False)

# Save shutouts
shutouts.to_csv(f'{OUT_PATH}/shutouts.csv', index=False)

## Win Predictor Model

In [36]:
rotation_rows = []
for (date, map_, t1, t2), group in df_masters.groupby(['Date', 'Map', 'Team1', 'Team2']):
    # ensure sequential order is preserved
    group = group.reset_index(drop=True)

    # Get scores after first set of hills
    last_row = group[group['Hill'] == 'P4'].head(1)
    if last_row.empty:
        continue

    score1_last = last_row.iloc[0]['Score1']
    score2_last = last_row.iloc[0]['Score2']

    # Determine map winner
    final_row = group[(group['Score1'] == 250) | (group['Score2'] == 250)].head(1)
    if final_row.empty:
        continue

    if final_row.iloc[0]['Score1'] == 250:
        winner = t1
        loser = t2
        target = 1 # from team1's perspective

    else:
        winner = t2
        loser = t1
        target = 0

    # Final Score difference
    if winner == t1:
        score_diff = final_row.iloc[0]['Score1'] - final_row.iloc[0]['Score2']
    else:
        score_diff = final_row.iloc[0]['Score2'] - final_row.iloc[0]['Score1']
    
    rotation_rows.append({
        'Map': map_,
        'Team1': t1,
        'Team2': t2,
        'Score1_P4': score1_last,
        'Score2_P4': score2_last,
        'ScoreDiff_P4': score1_last - score2_last,
        'Winner': winner,
        'Loser': loser,
        'Target_T1': target,
        'FinalScoreDiff': score_diff,
    })

hp_model_df = pd.DataFrame(rotation_rows)


In [37]:
# Calculate match winrate for each team
team_wins = hp_model_df.groupby('Winner').size()
team_total_matches = hp_model_df.groupby('Team1').size() + hp_model_df.groupby('Team2').size()
team_total_matches.dropna(inplace=True)

team_winrate = (team_wins / team_total_matches).reset_index()

display(team_winrate)

Unnamed: 0,index,0
0,GodL,0.6
1,OUG,0.8
2,Q9,0.538462
3,SPG,0.333333
4,Soul,
5,WL,
6,Wolves,0.818182
7,XROCK,0.153846


In [38]:
# Average score difference per winner
avg_score_diff_winner = hp_model_df.groupby('Winner')['FinalScoreDiff'].mean().reset_index()

avg_score_diff_winner = avg_score_diff_winner.rename(columns={'FinalScoreDiff': 'AvgScoreDiff_Winner', 'Winner': 'Team'})

# Average score difference per loser
avg_score_diff_loser = hp_model_df.groupby('Loser')['FinalScoreDiff'].mean().reset_index()
avg_score_diff_loser = avg_score_diff_loser.rename(columns={'FinalScoreDiff': 'AvgScoreDiff_Loser', 'Loser': 'Team'})

# Merge winner and loser averages
avg_score_diff = avg_score_diff_winner.merge(avg_score_diff_loser, on='Team', how='outer').fillna(0)

avg_score_diff['AvgScoreDiff'] = avg_score_diff['AvgScoreDiff_Winner'] - avg_score_diff['AvgScoreDiff_Loser']

avg_score_diff = avg_score_diff[['Team', 'AvgScoreDiff']]

avg_score_diff = avg_score_diff.sort_values('AvgScoreDiff', ascending=False, ignore_index=True)

# team_mask = avg_score_diff['Team'].isin(masters)
# avg_score_diff = avg_score_diff[team_mask]
avg_score_diff

Unnamed: 0,Team,AvgScoreDiff
0,GodL,72.333333
1,OUG,37.25
2,Wolves,31.444444
3,Soul,4.0
4,SPG,-10.25
5,Q9,-12.714286
6,XROCK,-58.590909
7,XLR8,-60.0
8,XC,-106.0
9,WL,-129.25


In [41]:
# team_stats_df = team_stats_df.merge(avg_score_diff, on='Team', how='left')

# Filter for relevant teams
team_mask = team_stats_df['Team'].isin(relevant_teams)
team_stats_df = team_stats_df[team_mask]
display(team_stats_df)

Unnamed: 0,Team,RotationWin,BreakSuccess,ScrapPoints,ControlSharePct,AvgScoreDiff
0,Wolves,47.169811,47.916667,494,34.892739,31.444444
1,OUG,47.297297,40.298507,493,32.121749,37.25
2,GodL,48.148148,34.782609,394,30.733333,72.333333
4,Q9,43.396226,25.0,448,25.688705,-12.714286
5,XROCK,40.0,21.73913,366,22.930108,-58.590909
7,SPG,29.6875,18.0,296,21.849415,-10.25


In [42]:
team_stats_df.to_csv(f'{OUT_PATH}/team_stats.csv', index=False)

In [45]:
import os
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score, brier_score_loss
from sklearn.calibration import calibration_curve
from sklearn.preprocessing import StandardScaler

# ---------- Config ----------
FEATURE = "ScoreDiff_P4"
TARGET  = "Target_T1"
OUT_DIR = "../Week3/HP/win_predictor"
CURVE_POINTS = 201
SEED = 42
N_BOOT = 1000
os.makedirs(OUT_DIR, exist_ok=True)

df = hp_model_df.dropna(subset=[FEATURE, TARGET]).copy()
df[FEATURE] = df[FEATURE].astype(float)
df[TARGET]  = df[TARGET].astype(int)

scaler = StandardScaler()
X = scaler.fit_transform(df[[FEATURE]].values)
y = df[TARGET].values

model = LogisticRegression(max_iter=1000, solver="lbfgs", random_state=SEED)
model.fit(X, y)

In [47]:
# ---------- Build prediction grid ----------
xmin, xmax = float(df[FEATURE].min()), float(df[FEATURE].max())
pad = max(5.0, 0.1 * (xmax - xmin))
grid = np.linspace(xmin - pad, xmax + pad, CURVE_POINTS).reshape(-1, 1)
grid_scaled = scaler.transform(grid)
p_base = model.predict_proba(grid_scaled)[:, 1]

# ---------- Bootstrap CIs ----------
rng = np.random.default_rng(42)

# Each row = one bootstrap model’s curve over the grid
boot = np.full((N_BOOT, CURVE_POINTS), np.nan, dtype=float)

for i in range(N_BOOT):
    idx = rng.integers(0, len(df), size=len(df))  # sample rows with replacement
    Xb_raw = df[[FEATURE]].values[idx]
    yb     = df[TARGET].values[idx]

    sc = StandardScaler()
    Xb = sc.fit_transform(Xb_raw)
    try:
        m = LogisticRegression(max_iter=1000, solver="lbfgs")
        m.fit(Xb, yb)
        preds = m.predict_proba(sc.transform(grid))[:, 1]  # length = CURVE_POINTS
        boot[i, :] = preds
    except Exception:
        # rare: perfect separation in tiny resamples; leave this row as NaNs
        pass

# 95% CIs across bootstrap runs, per grid point
ci_low  = np.nanpercentile(boot,  2.5, axis=0)   # length = CURVE_POINTS
ci_high = np.nanpercentile(boot, 97.5, axis=0)   # length = CURVE_POINTS

# Sanity check (optional)
assert len(grid.ravel()) == len(p_base) == len(ci_low) == len(ci_high)

curve = pd.DataFrame({
    "ScoreDiff_P4": grid.ravel(),
    "WinProb_Team1": p_base,
    "CI_low": ci_low,
    "CI_high": ci_high
})
curve.to_csv(os.path.join(OUT_DIR, "hp_curve.csv"), index=False)

print(f"Saved: {os.path.join(OUT_DIR, 'hp_curve.csv')}")

Saved: ../Week3/HP/win_predictor/hp_curve.csv


In [48]:
# ---------- Cross-validated metrics ----------
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=SEED)
aucs, briers = [], []
for tr, te in skf.split(X, y):
    m = LogisticRegression(max_iter=1000, solver="lbfgs", random_state=SEED)
    m.fit(X[tr], y[tr])
    p = m.predict_proba(X[te])[:, 1]
    aucs.append(roc_auc_score(y[te], p))
    briers.append(brier_score_loss(y[te], p))

metrics = {
    "n_matches": int(len(df)),
    "AUC_mean": float(np.mean(aucs)),
    "AUC_std": float(np.std(aucs)),
    "Brier_mean": float(np.mean(briers)),
    "Brier_std": float(np.std(briers)),
}
pd.DataFrame([metrics]).to_csv(os.path.join(OUT_DIR, "metrics_cv.csv"), index=False)