Goal: predict one player's chance to win


Potential issues:
1. Data is NOT i.i.d. - observations are correlated over time and within each player
2. Must generalize across time - split the data chronologically rather than randomly (random splits leak future information into training)
    train + val: 2010-2021 - stacked
    test: 2022 - 2024

train/val multiples

3. Must drop all match-statistics features for the current match, because at prediction time we only have access to data up to the day of the match - instead, can calculate rolling measures based on the last N matches.
    Should include some data from earlier years so the rolling measures have history to draw on.
4. Need to implement a symmetric feature design (each match appears once from each player's perspective); otherwise the model simply learns that the "winner" side always wins

In [1]:
# packages
import os
import re
import pandas as pd
import glob
import numpy as np
from dotenv import load_dotenv

# paths
# Pull settings from a local .env file into the process environment.
load_dotenv()
root = os.getenv('root')
if root is None:
    # Fail here with an actionable message; os.path.join(None, ...) would
    # otherwise raise an opaque TypeError.
    raise RuntimeError(
        "Environment variable 'root' is not set; define it in .env or the shell."
    )
data = os.path.join(root, 'data')            # the match CSVs are globbed from here
processed = os.path.join(data, 'processed')  # the final feature table is written here

In [2]:
# load match files
# Sort the paths: glob returns them in arbitrary, filesystem-dependent order,
# and the row order of the concatenated frame follows the file order.
match_files = sorted(glob.glob(os.path.join(data, '*matches*.csv')))

if not match_files:
    # Without this check `df` would never be defined and a later cell would
    # fail with a NameError far away from the real cause.
    raise FileNotFoundError(f"No '*matches*.csv' files found in {data!r}")

# One frame per file, then a single stacked frame with a fresh 0..n-1 index.
dfs = [pd.read_csv(file) for file in match_files]
df = pd.concat(dfs, ignore_index=True)

In [3]:
# Tournament rounds listed from earliest to latest. Where 'RR' sits is a design
# choice: Tour Finals round-robin matches are not ordered, so court time before
# the elimination stage cannot be calculated for them.
round_list = ['R128', 'R64', 'R32', 'R16', 'RR', 'QF', 'SF', 'BR', 'F']

# An ordered categorical makes rounds sort in the order given above rather than
# alphabetically; round labels that are not in round_list become NaN.
round_dtype = pd.CategoricalDtype(categories=round_list, ordered=True)
df['round'] = df['round'].astype(round_dtype)

# Chronological order: tournament date first, then round within the tournament.
df_sorted = df.sort_values(by=['tourney_date', 'round'])

In [4]:
# convert score to games won
# Vectorised strip: a missing score stays NaN instead of raising
# AttributeError inside a Python-level x.strip() call.
df_sorted['score'] = df_sorted['score'].str.strip()
prefix = 'set_score'
set_scores = df_sorted['score'].str.split(' ', expand=True).add_prefix(prefix)

# Only the first five space-separated tokens are treated as set scores; a sixth
# token holds only the RET marker. Selecting by position works whether or not
# that sixth column exists.
max_sets = 5
set_scores = set_scores.iloc[:, :max_sets]

# Split every "W-L" token into a winner column and a loser column, set by set.
per_set_games = []
for col in set_scores.columns:
    halves = (
        set_scores[col]
        .str.split('-', n=1, expand=True)
        .reindex(columns=[0, 1])   # guarantee both halves exist even if no token has a '-'
        .astype(object)            # keep the .str accessor usable on an all-NaN half
    )
    halves.columns = ['winner_' + col, 'loser_' + col]
    per_set_games.append(halves)
set_scores_games = pd.concat(per_set_games, axis=1)

for col in set_scores_games.columns:
    # Keep the text before any "(" (e.g. a tie-break suffix), then convert;
    # tokens that are not numbers, such as RET, become NaN.
    games = set_scores_games[col].str.split('(', n=1).str[0]
    set_scores_games[col] = pd.to_numeric(games, errors='coerce')

winner_set_cols = [c for c in set_scores_games.columns if c.startswith('winner_')]
loser_set_cols = [c for c in set_scores_games.columns if c.startswith('loser_')]

# min_count=1: a match with no parseable set at all yields NaN rather than a
# fabricated total of 0 games, which would otherwise pull the last-N averages down.
set_scores_games['winner_gameswon'] = set_scores_games[winner_set_cols].sum(axis=1, min_count=1)
set_scores_games['loser_gameswon'] = set_scores_games[loser_set_cols].sum(axis=1, min_count=1)

# Index-aligned join back onto the match rows.
df_sorted = df_sorted.join(set_scores_games[['winner_gameswon', 'loser_gameswon']])
# Remove the inner underscore so the second "_"-separated token of these names
# is "rankpoints" and not "rank", which winner_rank / loser_rank already use.
df_sorted = df_sorted.rename(columns={'winner_rank_points': 'winner_rankpoints', 'loser_rank_points': 'loser_rankpoints'})

In [5]:
# calculate rolling features
# Match-level columns that are copied onto both the winner and the loser frame.
id_list = ['tourney_id', 'tourney_name', 'surface', 'draw_size', 'tourney_level', 'tourney_date',
           'match_num', 'round', 'best_of', 'minutes']

# A column belongs to the winner side when it has a word-initial "w_" or
# contains "winner_"; the loser side uses "l_" / "loser_" in the same way.
winner_pattern = re.compile(r'\bw_+')
loser_pattern = re.compile(r'\bl_+')

left_list = id_list + [
    name for name in df_sorted.columns
    if winner_pattern.search(name) or 'winner_' in name
]
right_list = id_list + [
    name for name in df_sorted.columns
    if loser_pattern.search(name) or 'loser_' in name
]

# One frame per side, each with a fresh 0..n-1 index.
df_left = df_sorted[left_list].reset_index(drop=True)
df_right = df_sorted[right_list].reset_index(drop=True)

In [6]:
def _to_player_columns(columns, side_pattern, side_prefix):
    """Rename one side's columns to the shared 'Player_<stat>' scheme.

    A column that matches *side_pattern* or contains *side_prefix* becomes
    'Player_' plus its second '_'-separated token (w_ace -> Player_ace,
    winner_id -> Player_id); every other column keeps its name.
    """
    return [
        'Player_' + col.split('_')[1]
        if re.search(side_pattern, col) or side_prefix in col
        else col
        for col in columns
    ]


df_left.columns = _to_player_columns(df_left.columns, r'\bw_+', 'winner_')
df_left['def'] = 1 # winner

df_right.columns = _to_player_columns(df_right.columns, r'\bl_+', 'loser_')
df_right['def'] = 0 # loser

# Long format: one row per (player, match), winner rows stacked above loser rows.
df_rolling = pd.concat([df_left,df_right], ignore_index=True)

# match_num is an outcome-independent tiebreak. Without it, a player's matches
# that share tourney_date and round (round robin) keep whatever order the sort
# leaves them in - at best their position in df_rolling, where every winner row
# precedes every loser row - so the lagged features computed from this order
# could depend on the results themselves.
# NOTE(review): match_num is used only to make the order deterministic; confirm
# whether it also reflects the order in which the matches were played.
df_rolling_sorted = df_rolling.sort_values(
    by=['Player_id', 'tourney_date', 'round', 'match_num']
).reset_index(drop=True)

In [7]:
# Per-match statistics that are turned into lagged rolling means.
mean_rolling_features = ['Player_ace', 'Player_df', 'Player_svpt', 'Player_1stIn', 'Player_1stWon', 'Player_2ndWon',
       'Player_SvGms', 'Player_bpSaved', 'Player_bpFaced', 'Player_gameswon']

# Look-back lengths, counted in rows (matches) of the same player.
windows = [1, 5, 10]


def _mean_of_previous(stat, window):
    """Rolling mean over up to *window* rows, moved down one row.

    The shift means the value stored on a row is built from the rows before
    it and never from the row itself.
    """
    return stat.rolling(window=window, min_periods=1).mean().shift(1)


for feature in mean_rolling_features:
    for span in windows:
        lagged_mean = (
            df_rolling_sorted
            .groupby("Player_id")[feature]
            .transform(_mean_of_previous, window=span)
        )
        df_rolling_sorted[f"last{span}_{feature}"] = lagged_mean

# new column suffix -> (numerator stat, denominator). The denominator is either
# a stat name or a callable that receives the "last{w}_Player" prefix and
# returns the denominator series (used for second-serve points = svpt - 1stIn).
ratio_specs = {
    "pct_1stIn":        ("1stIn", "svpt"),
    "pct_1stWon":       ("1stWon", "1stIn"),
    "pct_2ndWon":       ("2ndWon", lambda p: df_rolling_sorted[f"{p}_svpt"] - df_rolling_sorted[f"{p}_1stIn"]),
    "ace_per_SvGm":     ("ace", "SvGms"),
    "df_per_SvGm":      ("df", "SvGms"),
    "bpFaced_per_SvGm": ("bpFaced", "SvGms"),
    "bpSaved_per_SvGm": ("bpSaved", "SvGms"),
}

for span in windows:
    stem = f"last{span}_Player"

    for ratio_name, (top, bottom) in ratio_specs.items():
        top_values = df_rolling_sorted[f"{stem}_{top}"]
        bottom_values = bottom(stem) if callable(bottom) else df_rolling_sorted[f"{stem}_{bottom}"]

        # The ratio is undefined when either side is missing or the
        # denominator is zero; those rows are stored as NaN.
        undefined = top_values.isna() | bottom_values.isna() | bottom_values.eq(0)
        df_rolling_sorted[f"{stem}_{ratio_name}"] = (top_values / bottom_values).mask(undefined)

In [8]:
# Cumulative court time by player and tournament: the running total of
# "minutes" over the rows that precede the current one within each
# (Player_id, tourney_date, tourney_name) group. The first row of a group has
# nothing before it and stays NaN.
df_rolling_sorted["tourney_cumulative_minutes_bef_curr"] = (
    df_rolling_sorted
    .groupby(["Player_id", "tourney_date", "tourney_name"])["minutes"]
    .transform(lambda s: s.shift(1).cumsum())
)

# Round-robin rows: there is no way of knowing which RR match happened first,
# so the running total is not meaningful there and is overwritten with 0.
# NOTE(review): the earlier comment on this cell said RR rows are set to NaN,
# but the code has always written 0 - confirm which of the two is intended.
# One vectorised assignment addressed by column name replaces the row-by-row
# loop that wrote to "the last column" by position.
is_round_robin = df_rolling_sorted["round"] == "RR"
df_rolling_sorted.loc[is_round_robin, "tourney_cumulative_minutes_bef_curr"] = 0

In [9]:
# Missing-value count per column, largest first.
missing_per_column = df_rolling_sorted.isna().sum().sort_values(ascending = False)

# Lift the display limits for this one print so the listing is not truncated.
display_overrides = (
    'display.max_rows', None,
    'display.max_columns', None,
    'display.width', None,
    'display.max_colwidth', None,
)
with pd.option_context(*display_overrides):
    print(missing_per_column)

Player_entry                           70321
Player_seed                            56734
tourney_cumulative_minutes_bef_curr    40929
minutes                                 8766
last1_Player_pct_2ndWon                 6656
last1_Player_bpSaved_per_SvGm           6649
last1_Player_bpFaced_per_SvGm           6649
last1_Player_df_per_SvGm                6649
last1_Player_ace_per_SvGm               6649
last1_Player_pct_1stWon                 6645
last1_Player_pct_1stIn                  6643
last1_Player_SvGms                      6637
last1_Player_df                         6637
last1_Player_bpSaved                    6637
last1_Player_ace                        6637
last1_Player_bpFaced                    6637
last1_Player_1stWon                     6637
last1_Player_svpt                       6637
last1_Player_2ndWon                     6637
last1_Player_1stIn                      6637
Player_df                               5484
Player_bpSaved                          5484
Player_1st

In [10]:
# separate features back to left and right
# Columns removed from both halves: the raw per-match statistics, the match
# duration and the winner/loser flag itself.
current_match_columns = [*mean_rolling_features, 'minutes', 'def']

# left = rows flagged def == 1 (winner)
winner_rows = df_rolling_sorted[df_rolling_sorted['def'] == 1]
df_left_rolling = winner_rows.reset_index(drop=True).drop(columns=current_match_columns)

# right = rows flagged def == 0 (loser)
loser_rows = df_rolling_sorted[df_rolling_sorted['def'] == 0]
df_right_rolling = loser_rows.reset_index(drop=True).drop(columns=current_match_columns)

In [11]:
# merge horizontally - engineer fully symmetric data
# 'minutes' was dropped from both halves, so it cannot be a join key.
# Rebuilding the list (instead of list.remove) keeps this cell re-runnable:
# a second .remove('minutes') would raise ValueError.
id_list = [col for col in id_list if col != 'minutes']

# Same match seen from both sides: winner as player A, then loser as player A.
df_rolling_wide = pd.merge(df_left_rolling, df_right_rolling, on=id_list, how='inner', suffixes=('_A', '_B'))
df_rolling_wide_sym = pd.merge(df_right_rolling, df_left_rolling, on=id_list, how='inner', suffixes=('_A', '_B'))
df_rolling_wide['y'] = 1 # PlayerA wins
df_rolling_wide_sym['y'] = 0 # PlayerA loses

In [12]:
# stack vertically: the winner-as-A view on top of the loser-as-A view
both_views = [df_rolling_wide, df_rolling_wide_sym]
df_rolling_symmetric = pd.concat(both_views, ignore_index=True)

# order by tournament date, then round, then match number, with a fresh index
chronological_keys = ['tourney_date', 'round', 'match_num']
df_symmetric_sorted = df_rolling_symmetric.sort_values(by=chronological_keys, ignore_index=True)
print(df_symmetric_sorted.shape)

(85142, 134)


In [15]:
# resolve handedness and height conflicts
def resolve_hand(series):
    """Collapse one player's recorded handedness values into a single label.

    Missing entries are ignored and the remaining values are compared as
    whitespace-stripped strings. 'R' wins if it occurs at all, otherwise 'L'
    if it occurs at all. Everything else - ambidextrous 'A', unknown 'U',
    blanks, the text 'nan', or no usable value - resolves to 'U'.

    Args:
        series: iterable of raw handedness values for one player.

    Returns:
        'R', 'L' or 'U'.
    """
    observed = set(pd.Series(series).dropna().astype(str).str.strip())

    # Fixed precedence: right-handed first, then left-handed.
    for hand in ('R', 'L'):
        if hand in observed:
            return hand

    # Nothing but unknown / ambidextrous / empty values were seen.
    return 'U'

# Apply per player
# Keyed by Player_id - the same player key the rolling features are grouped
# on - rather than by display name, so two different players who happen to
# share a name cannot be merged into one handedness value.
# Grouping the A side alone is enough: every match is stored once from each
# player's perspective, so every player appears as player A.
hand_map = (
    df_symmetric_sorted.groupby("Player_id_A")["Player_hand_A"]
      .apply(resolve_hand)
)

# Map back to both A and B player columns
for side in ("A", "B"):
    df_symmetric_sorted[f"Player_hand_{side}"] = (
        df_symmetric_sorted[f"Player_id_{side}"].map(hand_map)
    )

In [17]:
def clean_height(value, min_cm=140, max_cm=225):
    """Convert a raw height value into a clean float (cm) or NaN.

    Args:
        value: raw height. May be a number, a string such as "185 cm", a
            missing value, or a list/tuple whose first element is the height.
        min_cm: smallest height accepted, inclusive.
        max_cm: largest height accepted, inclusive.

    Returns:
        The height as a float, or NaN when the value is missing, cannot be
        parsed as a number, or lies outside [min_cm, max_cm].
    """
    # Unwrap list-like values (rare) before the missing-value check: pd.isna
    # on a list returns an array, which cannot be used as an `if` condition.
    if isinstance(value, (list, tuple)):
        if len(value) == 0:
            return np.nan
        value = value[0]

    # Missing values (None, NaN, pd.NA, ...)
    if pd.api.types.is_scalar(value) and pd.isna(value):
        return np.nan

    # Clean strings: drop the unit and any spaces, case-insensitively
    if isinstance(value, str):
        value = value.strip().lower().replace("cm", "").replace(" ", "")
        if value in ("", "nan"):
            return np.nan

    # Single conversion point; only conversion failures are treated as
    # "not a height" instead of swallowing every exception.
    try:
        height = float(value)
    except (TypeError, ValueError, OverflowError):
        return np.nan

    # Remove impossible or erroneous values (a NaN height also ends up here)
    if not (min_cm <= height <= max_cm):
        return np.nan

    return height

# Normalise the height column of both players with clean_height.
# Heights are deliberately not reconciled per player: multiple heights are
# allowed because the player may grow over time.
for height_col in ("Player_ht_A", "Player_ht_B"):
    df_symmetric_sorted[height_col] = df_symmetric_sorted[height_col].apply(clean_height)

In [18]:
# export
# Create the output folder first so the final write does not fail just because
# the folder is missing.
os.makedirs(processed, exist_ok=True)
df_symmetric_sorted.to_csv(os.path.join(processed, 'symmetric_processed_not_encoded.csv'), index=False)