In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the dataset
df = pd.read_excel('../Week2/HP/hp.xlsx')

# Parse Score into numeric columns
df[['Score1', 'Score2']] = df['Score'].str.split('-', expand=True).astype(int)

# Compute points per hill for each team
df['Points1'] = df.groupby(['Date','Map','Team1','Team2'])['Score1'].diff().fillna(df['Score1'])
df['Points2'] = df.groupby(['Date','Map','Team1','Team2'])['Score2'].diff().fillna(df['Score2'])

# Keep the original row order in a column
df = df.reset_index().rename(columns={'index':'OrigRow'})

# # Separate Masters and Challenger teams
# masters = ['XROCK', 'SPG', 'OUG', 'Wolves', 'GodL', 'Q9']
# df_masters = df[df['Team1'].isin(masters) | df['Team2'].isin(masters)]
df_masters = df.copy()

# Prepare long format for points
long_pts = pd.concat([
    df_masters[['Map','Hill','Team1','Points1']].rename(columns={'Team1':'Team','Points1':'Points'}),
    df_masters[['Map','Hill','Team2','Points2']].rename(columns={'Team2':'Team','Points2':'Points'})
])

long_pts.head()

Unnamed: 0,Map,Hill,Team,Points
0,Summit,P1,OUG,13.0
1,Summit,P2,OUG,18.0
2,Summit,P3,OUG,39.0
3,Summit,P4,OUG,19.0
4,Summit,P1,OUG,7.0


## Team-Level Stats

In [3]:
# List of all teams
teams = sorted(set(long_pts['Team']))

# Team-level stats
team_stats = []

for team in teams:
    played = df_masters[(df_masters['Team1']==team) | (df_masters['Team2']==team)]
    # Rotation-win %
    rf = played[played['RotateFirst']==team]
    rot_rate = (rf['RotationWin']=='Yes').mean() * 100 if not rf.empty else np.nan
    # Break-success %
    opp = played[played['RotateFirst']!=team]
    breaks = played[(played['BreakTeam']==team) & (played['BreakSuccess']=='Yes')]
    break_rate = len(breaks) / len(opp) * 100 if len(opp)>0 else np.nan
    # Avg durations
    avg_hold = rf['HoldDuration'].mean()
    avg_break_hold = played[played['BreakTeam']==team]['BreakDuration'].mean()
    # Scrap points
    scrap_pts = df_masters[df_masters['ScrapTeam']==team]['ScrapTime'].sum()
    # Control-Share%
    
    team_stats.append({
        'Team': team,
        'RotationWin': rot_rate,
        'BreakSuccess': break_rate,
        'AvgHoldDuration (s)': avg_hold,
        'AvgBreakDuration (s)': avg_break_hold,
        'ScrapPoints': scrap_pts,
    })

team_stats_df = pd.DataFrame(team_stats).set_index('Team')

In [31]:
# Most dominant hills
team_hill_means = long_pts.groupby(['Team', 'Map', 'Hill'])['Points'].mean().reset_index()
top_hills = (
    team_hill_means
    .sort_values(['Team', 'Points'], ascending=[True, False])
    .groupby('Team', as_index=False)
    .first()
)
top_hills['TopHill'] = top_hills['Map'] + ' ' + top_hills['Hill']
top_hills = top_hills[['Team', 'TopHill', 'Points']].rename(columns={'Points': 'TopHillAvgPts'})


In [32]:
# merge team stats with top hills
team_stats_df = team_stats_df.merge(top_hills, on='Team', how='left')

# Calculate distance from begin perfect at rotation-win and break-success
team_stats_df['RotBreakDist'] = np.sqrt((team_stats_df['RotationWin'] - 100)**2 + (team_stats_df['BreakSuccess'] - 100)**2)
team_stats_df = team_stats_df.sort_values('RotBreakDist')
team_stats_df = team_stats_df.drop(columns='RotBreakDist')

In [33]:
team_stats_df

Unnamed: 0,Team,RotationWin,BreakSuccess,AvgHoldDuration (s),AvgBreakDuration (s),ScrapPoints,TopHill,TopHillAvgPts
8,Wolves,42.5,47.619048,14.6,16.214286,388,Slums P2,44.0
6,Soul,40.0,46.666667,14.6,16.733333,101,Summit P2,31.5
3,OUG,40.0,45.238095,15.418182,13.857143,366,Summit P3,35.5
2,GodL,45.238095,35.897436,15.571429,13.564103,315,Hacienda P3,35.4
4,Q9,42.222222,28.070175,12.577778,10.491228,378,Hacienda P3,38.0
10,XROCK,41.463415,20.967742,14.682927,7.83871,297,Summit P3,37.75
1,DVS,46.666667,7.692308,19.333333,4.692308,76,Combine P4,38.0
5,SPG,27.868852,19.047619,10.819672,9.5,254,Slums P1,34.0
7,WL,22.222222,22.222222,6.222222,9.111111,35,Summit P4,26.5
0,AG,50.0,0.0,11.833333,3.0,29,Combine P2,26.0


### Control Share

In [34]:
# Assume each hill is 60 seconds long
HILL_LENGTH = 60  

# Build a per-team tally of "seconds in control"
records = []

for _, row in df_masters.iterrows():
    # 1) Rotation-first holds
    records.append({
        'Team': row['RotateFirst'],
        'ControlSec': row['HoldDuration']
    })
    # 2) Break holds (only if break succeeded)
    records.append({
        'Team': row['BreakTeam'],
        'ControlSec': row['BreakDuration']
    })
    # 3) Scrap holds
    if (row['ScrapTeam'] != 'None') and (row['ScrapTime'] > 0):
        if row['ScrapTeam'] == 'Split':
            # Split means both teams control the scrap
            records.append({
                'Team': row['Team1'],
                'ControlSec': row['ScrapTime'] / 2
            })
            records.append({
                'Team': row['Team2'],
                'ControlSec': row['ScrapTime'] / 2
            })
        else:
            # Single team controls the scrap
            records.append({
                'Team': row['ScrapTeam'],
                'ControlSec': row['ScrapTime']
        })

control_df = pd.DataFrame(records)

# Sum total seconds each team was in control
total_control = control_df.groupby('Team')['ControlSec'].sum()

hills_per_team = pd.concat([df_masters['Team1'], df_masters['Team2']]).value_counts()
total_seconds_per_team = hills_per_team * HILL_LENGTH

# Align and compute control-share % per team
control_share = (total_control / total_seconds_per_team * 100).reset_index()
control_share.columns = ['Team', 'ControlSharePct']

# Sort by control share
control_share = control_share.sort_values('ControlSharePct', ascending=False)

In [35]:
team_stats_df = team_stats_df.merge(control_share, on='Team', how='left')
team_stats_df

Unnamed: 0,Team,RotationWin,BreakSuccess,AvgHoldDuration (s),AvgBreakDuration (s),ScrapPoints,TopHill,TopHillAvgPts,ControlSharePct
0,Wolves,42.5,47.619048,14.6,16.214286,388,Slums P2,44.0,33.79065
1,Soul,40.0,46.666667,14.6,16.733333,101,Summit P2,31.5,31.833333
2,OUG,40.0,45.238095,15.418182,13.857143,366,Summit P3,35.5,30.97079
3,GodL,45.238095,35.897436,15.571429,13.564103,315,Hacienda P3,35.4,30.895062
4,Q9,42.222222,28.070175,12.577778,10.491228,378,Hacienda P3,38.0,25.457516
5,XROCK,41.463415,20.967742,14.682927,7.83871,297,Summit P3,37.75,22.605178
6,DVS,46.666667,7.692308,19.333333,4.692308,76,Combine P4,38.0,25.416667
7,SPG,27.868852,19.047619,10.819672,9.5,254,Slums P1,34.0,21.44822
8,WL,22.222222,22.222222,6.222222,9.111111,35,Summit P4,26.5,16.111111
9,AG,50.0,0.0,11.833333,3.0,29,Combine P2,26.0,19.0


In [36]:
team_stats_df = team_stats_df[['Team', 'RotationWin', 'BreakSuccess', 
                                 'ScrapPoints','ControlSharePct']]

# team_mask = team_stats_df['Team'].isin(masters)

# team_stats_df = team_stats_df[team_mask]


display(team_stats_df)

Unnamed: 0,Team,RotationWin,BreakSuccess,ScrapPoints,ControlSharePct
0,Wolves,42.5,47.619048,388,33.79065
1,Soul,40.0,46.666667,101,31.833333
2,OUG,40.0,45.238095,366,30.97079
3,GodL,45.238095,35.897436,315,30.895062
4,Q9,42.222222,28.070175,378,25.457516
5,XROCK,41.463415,20.967742,297,22.605178
6,DVS,46.666667,7.692308,76,25.416667
7,SPG,27.868852,19.047619,254,21.44822
8,WL,22.222222,22.222222,35,16.111111
9,AG,50.0,0.0,29,19.0


In [55]:
team_stats_df.to_csv('team_stats.csv')

### Zero-Point Shutouts

In [37]:
rows = []
for _, r in df.iterrows():
    pts1 = r['Points1']
    pts2 = r['Points2']

    if pts1 == 0:
        rows.append({
            'DominantTeam':     r['Team2'],
            'ZeroScoreTeam':    r['Team1'],
            'Map':              r['Map'],
            'Hill':             r['Hill'],
            'ZeroScoreDuration': r['HoldDuration']
        })
    elif pts2 == 0:
        rows.append({
            'DominantTeam':  r['Team1'],
            'ZeroScoreTeam': r['Team2'],
            'Map':           r['Map'],
            'Hill':          r['Hill'],
            'ZeroScoreDuration': r['HoldDuration']
        })

shutouts = pd.DataFrame(rows)
shutouts = shutouts.sort_values('ZeroScoreDuration', ascending=False, ignore_index=True)
shutouts[:3]


Unnamed: 0,DominantTeam,ZeroScoreTeam,Map,Hill,ZeroScoreDuration
0,DVS,Soul,Summit,P4,58
1,XROCK,WL,Summit,P3,57
2,Wolves,Q9,Slums,P2,56


### Fastest Breaks

In [38]:
# Filter only hills where a break succeeded
breaks = df_masters[df_masters['BreakSuccess'] == 'Yes']

# The flip happens at the end of HoldDuration
fastest = breaks.nsmallest(10, 'HoldDuration')  \
    [['Map','Hill','RotateFirst','BreakTeam','HoldDuration']]

print(fastest)

            Map Hill RotateFirst BreakTeam  HoldDuration
0        Summit   P1         OUG    Wolves             1
43       Summit   P3         SPG    Wolves             1
51     Hacienda   P3      Wolves       SPG             1
59   Apocalypse   P4       XROCK      GodL             1
64   Apocalypse   P1       XROCK      GodL             1
127     Combine   P4         OUG        Q9             1
131  Apocalypse   P1         OUG        Q9             1
232  Apocalypse   P3          WL        Q9             1
248      Summit   P1          WL     XROCK             1
257  Apocalypse   P4      Wolves       SPG             1


### Chained-Hills: Momentum Swings

In [39]:
# Build two mini-frames and concat
left = df_masters[['OrigRow','Map','Hill','Team1','Points1','Team2']].rename(
    columns={'Team1':'Team','Points1':'Points','Team2':'Opponent'}
)
right = df_masters[['OrigRow','Map','Hill','Team2','Points2','Team1']].rename(
    columns={'Team2':'Team','Points2':'Points','Team1':'Opponent'}
)
long = pd.concat([left, right], ignore_index=True)

# Sort by match and original play order
long = long.sort_values(['OrigRow'])

# Compute rolling sum of the last 3 hills *per team within each match*
long['Chain4'] = (
    long
      .groupby(['Map','Opponent','Team'])['Points']
      .rolling(window=3, min_periods=3)
      .sum()
      .reset_index(level=[0,1,2], drop=True)
)

# 5) For each team, find the row with its maximum Chain4
best_idx = long.groupby('Team')['Chain4'].idxmax()

# 6) Construct final table including which hills were chained
records = []
for team, i in best_idx.items():
    r = long.loc[i]
    grp = long[
        (long['Team']==team) &
        (long['Map']==r['Map']) &
        (long['Opponent']==r['Opponent'])
    ].sort_values('OrigRow')
    pos = grp.index.get_loc(i)
    hills = grp.iloc[pos-3+1:pos+1]['Hill'].tolist()
    records.append({
        'Team': team,
        'Map': r['Map'],
        'Opponent': r['Opponent'],
        f'Best{3}HillSum': r['Chain4'],
        'Hills': hills
    })

best_chains = pd.DataFrame(records)
best_chains = best_chains.sort_values('Best3HillSum', ascending=False, ignore_index=True)
print(best_chains)

      Team         Map Opponent  Best3HillSum         Hills
0   Wolves       Slums       Q9         137.0  [P2, P3, P4]
1      OUG  Apocalypse       Q9         130.0  [P1, P2, P3]
2       Q9  Apocalypse       WL         122.0  [P2, P3, P4]
3     GodL      Summit      SPG         121.0  [P2, P3, P4]
4      SPG    Hacienda    XROCK         112.0  [P2, P3, P4]
5      DVS      Summit    Xceed         111.0  [P2, P3, P4]
6     Soul      Summit      DVS         105.0  [P3, P4, P1]
7    XROCK      Summit       WL         105.0  [P2, P3, P4]
8     XLR8    Hacienda     GodL          96.0  [P4, P1, P2]
9       WL      Summit    XROCK          75.0  [P4, P1, P2]
10      AG     Combine     Soul          71.0  [P2, P3, P4]
11   Xceed      Summit      DVS          64.0  [P1, P2, P3]


In [40]:
# Map-level stats
# 2. Mixiest hills (avg PossessionChanges)
mixiest = df_masters.groupby(['Map','Hill'])['PossessionChanges'].mean().reset_index()
mixiest = mixiest.sort_values(by='PossessionChanges', ascending=False, ignore_index=True).rename(columns={'PossessionChanges':'AvgPossessionChanges'})

In [41]:
mixiest[:10]

Unnamed: 0,Map,Hill,AvgPossessionChanges
0,Summit,P2,3.52
1,Hacienda,P1,3.285714
2,Summit,P1,3.0
3,Combine,P3,2.888889
4,Slums,P1,2.857143
5,Summit,P4,2.75
6,Apocalypse,P3,2.75
7,Combine,P1,2.708333
8,Apocalypse,P1,2.521739
9,Combine,P2,2.5


In [14]:
# Save stats
team_stats_df = team_stats_df.round(2)
team_stats_df.to_csv('../Week2/HP/team_stats.csv')

# Save mixiest hills
mixiest.to_csv('../Week2/HP/mixiest_hills.csv', index=False)

# Save best chains
best_chains.to_csv('../Week2/HP/best_chains.csv', index=False)

# Save shutouts
shutouts.to_csv('../Week2/HP/shutouts.csv', index=False)

# Save fastest breaks
fastest.to_csv('../Week2/HP/fastest_breaks.csv', index=False)

## Win Predictor Model

In [47]:
rotation_rows = []
for (date, map_, t1, t2), group in df_masters.groupby(['Date', 'Map', 'Team1', 'Team2']):
    # ensure sequential order is preserved
    group = group.reset_index(drop=True)

    # Get scores after first set of hills
    last_row = group[group['Hill'] == 'P4'].head(1)
    if last_row.empty:
        continue

    score1_last = last_row.iloc[0]['Score1']
    score2_last = last_row.iloc[0]['Score2']

    # Determine map winner
    final_row = group[(group['Score1'] == 250) | (group['Score2'] == 250)].head(1)
    if final_row.empty:
        continue

    if final_row.iloc[0]['Score1'] == 250:
        winner = t1
        loser = t2
        target = 1 # from team1's perspective

    else:
        winner = t2
        loser = t1
        target = 0

    # Final Score difference
    if winner == t1:
        score_diff = final_row.iloc[0]['Score1'] - final_row.iloc[0]['Score2']
    else:
        score_diff = final_row.iloc[0]['Score2'] - final_row.iloc[0]['Score1']
    
    rotation_rows.append({
        'Map': map_,
        'Team1': t1,
        'Team2': t2,
        'Score1_P4': score1_last,
        'Score2_P4': score2_last,
        'ScoreDiff_P4': score1_last - score2_last,
        'Winner': winner,
        'Loser': loser,
        'Target_T1': target,
        'FinalScoreDiff': score_diff,
    })

hp_model_df = pd.DataFrame(rotation_rows)


In [71]:
# Calculate match winrate for each team
team_wins = hp_model_df.groupby('Winner').size()
team_total_matches = hp_model_df.groupby('Team1').size() + hp_model_df.groupby('Team2').size()
team_total_matches.dropna(inplace=True)

team_winrate = (team_wins / team_total_matches).reset_index()

display(team_winrate)

Unnamed: 0,index,0
0,DVS,0.666667
1,GodL,0.5
2,OUG,0.9
3,Q9,0.545455
4,SPG,0.363636
5,Soul,1.0
6,WL,
7,Wolves,0.777778
8,XROCK,0.090909


In [72]:
team_wins

Winner
DVS       2
GodL      4
OUG       9
Q9        6
SPG       4
Soul      3
Wolves    7
XROCK     1
dtype: int64

In [43]:
# Average score difference per winner
avg_score_diff_winner = hp_model_df.groupby('Winner')['FinalScoreDiff'].mean().reset_index()

avg_score_diff_winner = avg_score_diff_winner.rename(columns={'FinalScoreDiff': 'AvgScoreDiff_Winner', 'Winner': 'Team'})

# Average score difference per loser
avg_score_diff_loser = hp_model_df.groupby('Loser')['FinalScoreDiff'].mean().reset_index()
avg_score_diff_loser = avg_score_diff_loser.rename(columns={'FinalScoreDiff': 'AvgScoreDiff_Loser', 'Loser': 'Team'})

# Merge winner and loser averages
avg_score_diff = avg_score_diff_winner.merge(avg_score_diff_loser, on='Team', how='outer').fillna(0)

avg_score_diff['AvgScoreDiff'] = avg_score_diff['AvgScoreDiff_Winner'] - avg_score_diff['AvgScoreDiff_Loser']

avg_score_diff = avg_score_diff[['Team', 'AvgScoreDiff']]

avg_score_diff = avg_score_diff.sort_values('AvgScoreDiff', ascending=False, ignore_index=True)

# team_mask = avg_score_diff['Team'].isin(masters)
# avg_score_diff = avg_score_diff[team_mask]
avg_score_diff

Unnamed: 0,Team,AvgScoreDiff
0,Soul,59.333333
1,GodL,58.75
2,Wolves,44.857143
3,OUG,28.777778
4,DVS,-12.5
5,Q9,-15.6
6,SPG,-22.035714
7,XROCK,-44.2
8,XLR8,-72.0
9,AG,-75.0


In [46]:
# team_stats_df = team_stats_df.merge(avg_score_diff, on='Team', how='left')

display(team_stats_df)

Unnamed: 0,Team,RotationWin,BreakSuccess,ScrapPoints,ControlSharePct,AvgScoreDiff
0,Wolves,42.5,47.619048,388,33.79065,44.857143
1,Soul,40.0,46.666667,101,31.833333,59.333333
2,OUG,40.0,45.238095,366,30.97079,28.777778
3,GodL,45.238095,35.897436,315,30.895062,58.75
4,Q9,42.222222,28.070175,378,25.457516,-15.6
5,XROCK,41.463415,20.967742,297,22.605178,-44.2
6,DVS,46.666667,7.692308,76,25.416667,-12.5
7,SPG,27.868852,19.047619,254,21.44822,-22.035714
8,WL,22.222222,22.222222,35,16.111111,-123.5
9,AG,50.0,0.0,29,19.0,-75.0


In [45]:
team_stats_df.to_csv('../rankings/hp_team_stats.csv', index=False)

In [54]:
import os
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score, brier_score_loss
from sklearn.calibration import calibration_curve
from sklearn.preprocessing import StandardScaler

# ---------- Config ----------
FEATURE = "ScoreDiff_P4"
TARGET  = "Target_T1"
OUT_DIR = "../Week2/HP/win_predictor_p4"
CURVE_POINTS = 201
SEED = 42
N_BOOT = 1000
os.makedirs(OUT_DIR, exist_ok=True)

df = hp_model_df.dropna(subset=[FEATURE, TARGET]).copy()
df[FEATURE] = df[FEATURE].astype(float)
df[TARGET]  = df[TARGET].astype(int)

scaler = StandardScaler()
X = scaler.fit_transform(df[[FEATURE]].values)
y = df[TARGET].values

model = LogisticRegression(max_iter=1000, solver="lbfgs", random_state=SEED)
model.fit(X, y)

In [55]:
# ---------- Build prediction grid ----------
xmin, xmax = float(df[FEATURE].min()), float(df[FEATURE].max())
pad = max(5.0, 0.1 * (xmax - xmin))
grid = np.linspace(xmin - pad, xmax + pad, CURVE_POINTS).reshape(-1, 1)
grid_scaled = scaler.transform(grid)
p_base = model.predict_proba(grid_scaled)[:, 1]

# ---------- Bootstrap CIs ----------
rng = np.random.default_rng(42)

# Each row = one bootstrap model’s curve over the grid
boot = np.full((N_BOOT, CURVE_POINTS), np.nan, dtype=float)

for i in range(N_BOOT):
    idx = rng.integers(0, len(df), size=len(df))  # sample rows with replacement
    Xb_raw = df[[FEATURE]].values[idx]
    yb     = df[TARGET].values[idx]

    sc = StandardScaler()
    Xb = sc.fit_transform(Xb_raw)
    try:
        m = LogisticRegression(max_iter=1000, solver="lbfgs")
        m.fit(Xb, yb)
        preds = m.predict_proba(sc.transform(grid))[:, 1]  # length = CURVE_POINTS
        boot[i, :] = preds
    except Exception:
        # rare: perfect separation in tiny resamples; leave this row as NaNs
        pass

# 95% CIs across bootstrap runs, per grid point
ci_low  = np.nanpercentile(boot,  2.5, axis=0)   # length = CURVE_POINTS
ci_high = np.nanpercentile(boot, 97.5, axis=0)   # length = CURVE_POINTS

# Sanity check (optional)
assert len(grid.ravel()) == len(p_base) == len(ci_low) == len(ci_high)

curve = pd.DataFrame({
    "ScoreDiff_P4": grid.ravel(),
    "WinProb_Team1": p_base,
    "CI_low": ci_low,
    "CI_high": ci_high
})
curve.to_csv(os.path.join(OUT_DIR, "curve_p4_scorediff_winprob_ci.csv"), index=False)

print(f"Saved: {os.path.join(OUT_DIR, 'curve_p4_scorediff_winprob_ci.csv')}")

Saved: ../Week2/HP/win_predictor_p4/curve_p4_scorediff_winprob_ci.csv


In [49]:
# ---------- Cross-validated metrics ----------
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=SEED)
aucs, briers = [], []
for tr, te in skf.split(X, y):
    m = LogisticRegression(max_iter=1000, solver="lbfgs", random_state=SEED)
    m.fit(X[tr], y[tr])
    p = m.predict_proba(X[te])[:, 1]
    aucs.append(roc_auc_score(y[te], p))
    briers.append(brier_score_loss(y[te], p))

metrics = {
    "n_matches": int(len(df)),
    "AUC_mean": float(np.mean(aucs)),
    "AUC_std": float(np.std(aucs)),
    "Brier_mean": float(np.mean(briers)),
    "Brier_std": float(np.std(briers)),
}
pd.DataFrame([metrics]).to_csv(os.path.join(OUT_DIR, "metrics_cv.csv"), index=False)

In [50]:
# ---------- Calibration sample (optional quick check) ----------
# Produces a small table you can plot elsewhere if you want
prob = model.predict_proba(X)[:, 1]
frac_pos, mean_prob = calibration_curve(y, prob, n_bins=10, strategy="uniform")
pd.DataFrame({"mean_pred": mean_prob, "frac_positive": frac_pos}).to_csv(
    os.path.join(OUT_DIR, "calibration_bins.csv"), index=False
)

In [60]:
hp_model_df.groupby('Map').agg(
    t1_wins = ('Target_T1', 'sum'),
    n_maps  = ('Target_T1', 'count'),
)

Unnamed: 0_level_0,t1_wins,n_maps
Map,Unnamed: 1_level_1,Unnamed: 2_level_1
Apocalypse,2,5
Combine,3,5
Hacienda,3,4
Slums,0,2
Summit,1,6
