# Features

In [12]:
import pandas as pd
import numpy as np
from sklearn.ensemble import GradientBoostingClassifier

# Load data
data = pd.read_csv("../data/match-csvs/Point_Visuals_RudyQuan_EdwardWinter.csv")

# Server flag
data['server_is_player1'] = (data['serverName'] == data['player1Name']).astype(int)

# Convert pointScore to numeric
point_map = {'0':0,'15':1,'30':2,'40':3,'Ad':4}
def point_to_numeric(ps):
    try:
        s,r = ps.split('-')
        return point_map.get(s,0), point_map.get(r,0)
    except:
        return 0,0
data[['server_points','returner_points']] = data['pointScore'].apply(lambda x: pd.Series(point_to_numeric(x)))

player1_points = []
player2_points = []
for _, row in data.iterrows():
    if row['server_is_player1'] == 1:
        player1_points.append(row['server_points'])
        player2_points.append(row['returner_points'])
    else:
        player1_points.append(row['returner_points'])
        player2_points.append(row['server_points'])
data['player1_points'] = player1_points
data['player2_points'] = player2_points

def score_numeric(set_score):
    try:
        s,r = set_score.split('-')
        return int(s), int(r)
    except:
        return 0,0
data[['player1_sets','player2_sets']] = data['setScore'].apply(lambda x: pd.Series(score_numeric(x)))
data[['player1_games','player2_games']] = data['gameScore'].apply(lambda x: pd.Series(score_numeric(x)))

data['rallyCount'] = pd.to_numeric(data['rallyCount'], errors='coerce').fillna(0)
data['is_break_point'] = data['isBreakPoint'].astype(int)

# Target
data['point_won'] = (data['pointWonBy'] == data['player1Name']).astype(int)

# Features for WP model
features = ['player1_sets','player2_sets', 
            'player1_games','player2_games',
            'player1_points','player2_points',
            'is_break_point', 
            'server_is_player1']


# Win Probability Model (in player1's pov)

In [13]:
import xgboost as xgb
from xgboost import XGBClassifier
X = data[features]
y = data['point_won']

wp_model = XGBClassifier(learning_rate=0.1, 
                         max_depth=3, 
                         min_child_weight=5, 
                         n_estimators=200, 
                         subsample=1.0, 
                         colsample_bytree=1.0, 
                         random_state=42)
wp_model.fit(X, y)


### Picking Model

In [None]:
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import roc_auc_score, accuracy_score
import numpy as np

X = data[features]
y = data['point_won']

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

xgb_model = XGBClassifier(
    random_state=42
)

param_grid = {
    'n_estimators': [100, 200, 300, 500],
    'learning_rate': [0.01, 0.05, 0.1],
    'max_depth': [3, 5, 7],
    'subsample': [0.7, 0.85, 1.0],
    'colsample_bytree': [0.7, 0.9, 1.0],
    'min_child_weight': [1, 3, 5]
}

grid_search = GridSearchCV(
    estimator=xgb_model,
    param_grid=param_grid,
    scoring='roc_auc',
    cv=5,
    verbose=1,
    n_jobs=-1
)

grid_search.fit(X_train, y_train)

best_model = grid_search.best_estimator_
print("Best Parameters:", grid_search.best_params_)

y_pred = best_model.predict(X_test)
y_prob = best_model.predict_proba(X_test)[:, 1]

print("Accuracy:", accuracy_score(y_test, y_pred))
print("ROC-AUC:", roc_auc_score(y_test, y_prob))



Fitting 5 folds for each of 972 candidates, totalling 4860 fits
Best Parameters: {'colsample_bytree': 1.0, 'learning_rate': 0.1, 'max_depth': 3, 'min_child_weight': 5, 'n_estimators': 100, 'subsample': 0.7}
Accuracy: 0.41379310344827586
ROC-AUC: 0.43137254901960786


Parameters: { "eval" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


# Leverage Index Model

In [None]:
import numpy as np
import pandas as pd
from tqdm import tqdm

def simulate_match_from_state(model, state, features, n_sims, max_points=200):
    wins = 0
    cache = {}

    for _ in range(n_sims):
        s = state.copy()
        p1_sets, p2_sets = s['player1_sets'], s['player2_sets']
        p1_games, p2_games = s['player1_games'], s['player2_games']
        p1_pts, p2_pts = s['player1_points'], s['player2_points']
        server = s['server_is_player1']
        break_point = s['is_break_point']

        for _ in range(max_points):
            # Stops if match is won
            if (p1_sets == 2 or p2_sets == 2 and abs(p1_sets - p2_sets) == 2) or (p1_sets > 2 or p2_sets > 2):
                break

            # Cache key based on state
                # Remembers predictions from same state
            key = (p1_sets, p2_sets, p1_games, p2_games, p1_pts, p2_pts, server)
            if key not in cache:
                df = pd.DataFrame([{
                    'player1_sets': p1_sets,
                    'player2_sets': p2_sets,
                    'player1_games': p1_games,
                    'player2_games': p2_games,
                    'player1_points': p1_pts,
                    'player2_points': p2_pts,
                    'is_break_point': break_point,
                    'server_is_player1': server
                }])
                cache[key] = model.predict_proba(df[features])[0, 1]
            p_point = cache[key]

            point_won = np.random.rand() < p_point

            # Update point score
            if point_won:
                p1_pts += 1
            else:
                p2_pts += 1

            # Updates game score, if necessary
            if p1_pts >= 4:
                p1_games += 1
                p1_pts = p2_pts = 0
                server = 1 - server
                break_point = 0
            elif p2_pts >= 4:
                p2_games += 1
                p1_pts = p2_pts = 0
                server = 1 - server
                break_point = 0
            elif (p1_pts == 3 and p2_pts < 3 and server == 1) or (p1_pts < 3 and p2_pts == 3 and server == 0):
                break_point = 1

            # Updates set score, if won
            if (p1_games >= 6 and p1_games - p2_games >= 2) or (p1_games > 6):
                p1_sets += 1
                p1_games = p2_games = 0
            elif (p2_games >= 6 and p2_games - p1_games >= 2) or (p2_games > 6):
                p2_sets += 1
                p1_games = p2_games = 0

        if p1_sets > p2_sets:
            wins += 1
    # Returns the probability of winning the match, based on simulated average
    return wins / n_sims


def sequential_match_simulator_player1_prior(df, wp_model, features, n_sims=50):
    match_wp_list = []
    leverage_list = []

    for idx, row in tqdm(df.iterrows(), total=len(df), desc="Simulating match probabilities"):
        base = {
            'player1_sets': row['player1_sets'],
            'player2_sets': row['player2_sets'],
            'player1_games': row['player1_games'],
            'player2_games': row['player2_games'],
            'player1_points': row['player1_points'],
            'player2_points': row['player2_points'],
            'is_break_point': row['is_break_point'],
            'server_is_player1': row['server_is_player1']
        }

        # If player1 wins next point
        win_feat = base.copy()
        win_feat['player1_points'] += 1
        if win_feat['player1_points'] > 3: 
            win_feat['player1_points'] = 0 
            win_feat['player2_points'] = 0 
            win_feat['player1_games'] += 1 
            if win_feat['player1_games'] > 6: 
                win_feat['player1_games'] = 0 
                win_feat['player2_games'] = 0 
                win_feat['player1_sets'] += 1
        wp_after_win = simulate_match_from_state(wp_model, win_feat, features, n_sims=n_sims)

        # If player1 loses next point
        lose_feat = base.copy()
        lose_feat['player2_points'] += 1
        if lose_feat['player2_points'] > 3: 
            lose_feat['player1_points'] = 0 
            lose_feat['player2_points'] = 0 
            lose_feat['player2_games'] += 1 
            if lose_feat['player2_games'] > 6: 
                lose_feat['player1_games'] = 0 
                lose_feat['player2_games'] = 0 
                lose_feat['player2_sets'] += 1
        wp_after_loss = simulate_match_from_state(wp_model, lose_feat, features, n_sims=n_sims)

        # Calculates Leverage
        leverage = abs(wp_after_win - wp_after_loss) * 100
        leverage_list.append(leverage)

        # Win probability of base state
        match_wp = simulate_match_from_state(wp_model, base.copy(), features, n_sims=n_sims)
        match_wp_list.append(match_wp)

    df['match_wp_player1'] = match_wp_list
    df['match_leverage_player1'] = leverage_list
    return df


In [None]:
data = sequential_match_simulator_player1_prior(data, wp_model, features)

Simulating match probabilities: 100%|██████████| 144/144 [25:50<00:00, 10.76s/it] 


## Check Results

In [None]:
data.head(70)

Unnamed: 0,pointNumber,player1Name,player2Name,pointScore,gameScore,setScore,tiebreakScore,side,serverName,returnerName,...,player2_points,player1_sets,player2_sets,player1_games,player2_games,is_break_point,point_won,momentum_player1,match_wp_player1,match_leverage_player1
0,1,Rudy Quan,Edward Winter,0-0,0-0,0-0,,Ad,Edward Winter,Rudy Quan,...,0,0,0,0,0,0,0,-1,0.20,0.04
1,2,Rudy Quan,Edward Winter,15-0,0-0,0-0,,Ad,Edward Winter,Rudy Quan,...,1,0,0,0,0,0,0,-2,0.30,0.20
2,3,Rudy Quan,Edward Winter,30-0,0-0,0-0,,Deuce,Edward Winter,Rudy Quan,...,2,0,0,0,0,0,0,-3,0.20,0.04
3,4,Rudy Quan,Edward Winter,40-0,0-0,0-0,,Ad,Edward Winter,Rudy Quan,...,3,0,0,0,0,0,0,-4,0.20,0.02
4,5,Rudy Quan,Edward Winter,0-0,0-1,0-0,,Deuce,Rudy Quan,Edward Winter,...,0,0,0,0,1,0,1,1,0.08,0.10
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
65,66,Rudy Quan,Edward Winter,15-15,0-1,0-1,,Deuce,Rudy Quan,Edward Winter,...,1,0,1,0,1,0,0,-2,0.00,0.00
66,67,Rudy Quan,Edward Winter,15-30,0-1,0-1,,Ad,Rudy Quan,Edward Winter,...,2,0,1,0,1,0,0,-3,0.00,0.00
67,68,Rudy Quan,Edward Winter,15-40,0-1,0-1,,Deuce,Rudy Quan,Edward Winter,...,3,0,1,0,1,1,1,1,0.00,0.00
68,69,Rudy Quan,Edward Winter,30-40,0-1,0-1,,Ad,Rudy Quan,Edward Winter,...,3,0,1,0,1,1,1,2,0.00,0.00


In [None]:
data[['Name', 'is_break_point', 'match_wp_player1', 'match_leverage_player1']].head(60)

Unnamed: 0,Name,is_break_point,match_wp_player1,match_leverage_player1
0,"Set 1: 0-0, 0-0 Edward Winter Serving",0,0.16,0.08
1,"Set 1: 0-0, 15-0 Edward Winter Serving",0,0.28,0.1
2,"Set 1: 0-0, 30-0 Edward Winter Serving",0,0.22,0.08
3,"Set 1: 0-0, 40-0 Edward Winter Serving",0,0.24,0.16
4,"Set 1: 0-1, 0-0 Rudy Quan Serving",0,0.32,0.02
5,"Set 1: 0-1, 15-0 Rudy Quan Serving",0,0.24,0.14
6,"Set 1: 0-1, 15-15 Rudy Quan Serving",0,0.3,0.14
7,"Set 1: 0-1, 30-15 Rudy Quan Serving",0,0.18,0.04
8,"Set 1: 0-1, 30-30 Rudy Quan Serving",0,0.22,0.06
9,"Set 1: 0-1, 40-30 Rudy Quan Serving",0,0.18,0.14
