In [92]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from click import DateTime
from collections import defaultdict

In [160]:
df = pd.read_csv('/Users/anika/Desktop/BEM:EC 120/generated_datasets/wta_all_rally_info.csv')

In [161]:
# github_url_match_data = "https://raw.githubusercontent.com/JeffSackmann/tennis_MatchChartingProject/refs/heads/master/charting-m-matches.csv"
# df_return_outcomes = pd.read_csv("https://raw.githubusercontent.com/JeffSackmann/tennis_MatchChartingProject/refs/heads/master/charting-m-stats-ReturnOutcomes.csv")
# df_serve_outcomes = pd.read_csv("https://raw.githubusercontent.com/JeffSackmann/tennis_MatchChartingProject/refs/heads/master/charting-m-stats-ServeBasics.csv")
# df_matches = pd.read_csv(github_url_match_data)

github_url_match_data = "https://raw.githubusercontent.com/JeffSackmann/tennis_MatchChartingProject/refs/heads/master/charting-w-matches.csv"
df_return_outcomes = pd.read_csv("https://raw.githubusercontent.com/JeffSackmann/tennis_MatchChartingProject/refs/heads/master/charting-w-stats-ReturnOutcomes.csv")
df_serve_outcomes = pd.read_csv("https://raw.githubusercontent.com/JeffSackmann/tennis_MatchChartingProject/refs/heads/master/charting-w-stats-ServeBasics.csv")
df_matches = pd.read_csv(github_url_match_data)

In [162]:
# Add the columns Surface, Date, Pl 1 hand, Pl 2 hand
df = pd.merge(df, df_matches[['match_id', 'Surface', 'Date', 'Pl 1 hand', 'Pl 2 hand']], on='match_id')

In [163]:
# get types of all columns
df.dtypes

match_id                object
player1                 object
player2                 object
server                   int64
pt_number                int64
game_score              object
side                    object
game_number              int64
player1_sets             int64
player2_sets             int64
player1_games_in_set     int64
player2_games_in_set     int64
1st                     object
2nd                     object
pt_winner                int64
rally_length             int64
error_type              object
Surface                 object
Date                    object
Pl 1 hand               object
Pl 2 hand               object
dtype: object

In [164]:
# convert df['1st'] and df['2nd'] to a list of dictionaries
df['1st'] = df['1st'].apply(lambda x: eval(x) if isinstance(x, str) else None)
df['2nd'] = df['2nd'].apply(lambda x: eval(x) if isinstance(x, str) else None)

In [165]:
# Create a rally column where it is equal to df['1st'] if 1st is not none and 2nd is none, or df['2nd'] if both are not none
df['rally'] = df.apply(lambda x: x['1st'] if x['1st'] is not None and x['2nd'] is None else x['2nd'], axis=1)
df['serve_attempt'] = df.apply(lambda x: 1 if x['1st'] is not None and x['2nd'] is None else 2, axis=1)

In [166]:
# Find how many rallys do not have a serve + 1 shot, meaning that rally_length < 3
print(f'Total number of points with rally_length < 3: {df[df["rally_length"] < 3]["rally_length"].count()}')

Total number of points with rally_length < 3: 137202


In [167]:
df['Date'] = pd.to_datetime(df['Date'])

# Reshape to long format (one row per player per match)
players_df = pd.melt(
    df[['match_id', 'Date', 'player1', 'player2']],
    id_vars=['match_id', 'Date'],
    value_vars=['player1', 'player2'],
    var_name='player_role',
    value_name='player'
)

players_df = players_df.drop_duplicates(subset=['player', 'match_id'])
player_dates = []
split_date = pd.to_datetime('2024-01-01')
evaluated_players = set()

for player, group in players_df.groupby('player'):
    sorted_matches = group.sort_values('Date')
    dates = sorted_matches['Date'].unique()
        
    # Find all players that have at least 2 matches before and 3 after the split date
    if len(dates) >= 5 and dates[1] < split_date and dates[-3] >= split_date:
        evaluated_players.add(player)

In [168]:
# Add a column to df called 'data_split' with values 'train', 'fine_tune', or 'test'
df['data_split'] = None
df.loc[df['Date'] < split_date, 'data_split'] = 'train'
assigned_matches = set()
train_players1 = set(df[df['data_split'] == 'train']['player1'].unique())
train_players2 = set(df[df['data_split'] == 'train']['player2'].unique())

# For each evaluated player, assign fine_tune and test
for player in evaluated_players:
    assert player in train_players1 or player in train_players2

    # Get all matches where the player is server and date >= split_date
    matches = df[((df['player1'] == player) | (df['player2'] == player)) & (df['Date'] >= split_date)].sort_values('Date')

    # Get match IDs in order
    match_ids = matches['match_id'].unique()

    # Skip if not enough matches
    assert len(match_ids) >= 3

    # First two for fine_tune, next match after for test
    fine_tune_ids = match_ids[:2]
    test_ids = match_ids[2:]

    df.loc[df['match_id'].isin(fine_tune_ids), 'data_split'] = 'test'
    df.loc[df['match_id'].isin(test_ids), 'data_split'] = 'test'
    assigned_matches.update(fine_tune_ids)
    assigned_matches.update(test_ids)

In [169]:
players1 = df['player1'].unique()
players2 = df['player2'].unique()
players = np.append(players1, players2)
players = np.unique(players)
print(f'Total number of players: {len(players)}', f'Total number of matches: {df["match_id"].nunique()}', f'Total number of rallies: {len(df)}', sep = '\n')

Total number of players: 490
Total number of matches: 2353
Total number of rallies: 332480


In [170]:
len(df[df['data_split'] == 'train']), len(df[df['data_split'] == 'fine_tune']), len(df[df['data_split'] == 'test'])

(281602, 0, 46816)

In [171]:
def get_pt_diff(score):
    # Score is in the form '1-0' (Tiebreak) or '30-40' (Game)
    # Returns 'ad' for advantage, 'deuce' for deuce
    game_to_pts = {'0': 0, '15': 1, '30': 2, '40': 3, 'AD': 4}
    pt1, pt2 = score.split('-')

    if pt1 in game_to_pts and pt2 in game_to_pts:
        pts1 = game_to_pts[score.split('-')[0]]
        pts2 = game_to_pts[score.split('-')[1]]
    else: # Tiebreak score
        pts1 = int(pt1)
        pts2 = int(pt2)
    
    return pts1 + pts2, pts1 - pts2

def is_break_point(score):
    # Score is in the form '1-0' (Tiebreak) or '30-40' (Game)
    # Returns True if break point, False otherwise
    game_to_pts = {'0': 0, '15': 1, '30': 2, '40': 3, 'AD': 4}
    pt1, pt2 = score.split('-')

    if pt1 in game_to_pts and pt2 in game_to_pts:
        pts1 = game_to_pts[score.split('-')[0]]
        pts2 = game_to_pts[score.split('-')[1]]
    else:
        return False

    return pts2 >= 3 and pts1 < pts2

In [172]:
# Create a dataframe for ML using match_id

def featurize_match(df, match, match_tendencies=None):
    df_match = df[df['match_id'] == match]
    df_match = df_match.sort_values(by='pt_number')

    # 2. For each row, create a dictionary with the following keys:
    # server, opponent, score, hand_server_vs_opponent, surface, court_side, game_in_set, set_in_match
    data = []
    deuces = 0

    p1_err_sides_last20, p2_err_sides_last20 = [], []
    p1_s1_1st_success, p1_s1_2nd_success, p2_s1_1st_success, p2_s1_2nd_success = [], [], [], []
    p1_1st_serve_success, p1_2nd_serve_success, p2_1st_serve_success, p2_2nd_serve_success = [], [], [], []

    serve_encoder = {'wide deuce': '4D', 'wide ad': '4A', 'body deuce': '5D', 'body ad': '5A', 'T deuce': '6D', 'T ad': '6A', 1: 'v1st', 2: 'v2nd'}
    serve_decoder = {v: k for k, v in serve_encoder.items()}

    error_lists = {
        1: p1_err_sides_last20,
        2: p2_err_sides_last20
    }

    s1_success_lists = {
        1: {
            1: p1_s1_1st_success,
            2: p1_s1_2nd_success
        },
        2: {
            1: p2_s1_1st_success,
            2: p2_s1_2nd_success
        }
    }

    serve_success_lists = {
        1: {
            1: p1_1st_serve_success,
            2: p1_2nd_serve_success
        },
        2: {
            1: p2_1st_serve_success,
            2: p2_2nd_serve_success
        }
    }

    for i, row in df_match.iterrows():
        server_idx = row['server']
        opponent_idx = 3 - server_idx
        server = row['player1'] if server_idx == 1 else row['player2']
        opponent = row['player1'] if server_idx == 2 else row['player2']
        server_vs_op_hand = row['Pl 1 hand'] + row['Pl 2 hand'] if server_idx == 1 else row['Pl 2 hand'] + row['Pl 1 hand']
        surface = row['Surface']
        court_side = row['side']
        pt_match = row['pt_number']
        game_adv = row['player1_sets'] - row['player2_sets'] if server_idx == 1 else row['player2_sets'] - row['player1_sets']
        set_adv = row['player1_sets'] - row['player2_sets'] if server_idx == 1 else row['player2_sets'] - row['player1_sets']
        pts_in_game, point_adv = get_pt_diff(row['game_score'])
        if row['game_score'] == '40-40':
            deuces += 1
        if pts_in_game < 6:
            deuces = 0
        if deuces > 0:
            pts_in_game += (2 * (deuces - 1))

        break_point = is_break_point(row['game_score'])

        serve_loc = row['rally'][0]['location']
        if not serve_loc:
            serve_loc = 'unknown'
        first_shot_loc = row['rally'][2]['location'] if len(row['rally']) > 2 else 'no first shot'

        won_pt = row['pt_winner'] == server_idx
        first_serve_attempt = None
        if row['2nd'] and row['1st']:
            first_serve_attempt = row['1st'][0]['location']
        elif row['2nd'] is None:
            first_serve_attempt = 'made first serve'

        forced_err = True if row['error_type'] == 'forced' and won_pt else False

        rally_length = 'no rally'
        if 0 < row['rally_length'] <= 5:
            rally_length = 'short'
        if row['rally_length'] > 5:
            rally_length = 'medium'
        if row['rally_length'] > 10:
            rally_length = 'long'

        returner_success_serve = 0
        if row['serve_attempt'] == 1 and match_tendencies[opponent]['return_err'][1]:
            returner_success_serve = np.mean(match_tendencies[opponent]['return_err'][1])
        elif row['serve_attempt'] == 2 and match_tendencies[opponent]['return_err'][2]:
            returner_success_serve = np.mean(match_tendencies[opponent]['return_err'][2])

        wide_server_win_rate, body_server_win_rate, T_server_win_rate = 0, 0, 0
        if len(serve_success_lists[server_idx][row['serve_attempt']]) > 0:
            wide_server_win_rate = serve_success_lists[server_idx][row['serve_attempt']].count('wide') / len(serve_success_lists[server_idx][row['serve_attempt']])
            body_server_win_rate = serve_success_lists[server_idx][row['serve_attempt']].count('body') / len(serve_success_lists[server_idx][row['serve_attempt']])
            T_server_win_rate = serve_success_lists[server_idx][row['serve_attempt']].count('T') / len(serve_success_lists[server_idx][row['serve_attempt']])


        data.append({
            'match_id': match,
            'server': server,
            'opponent': opponent,
            'hand_combo': server_vs_op_hand,
            'surface': surface,
            'game_advantage': game_adv,
            'set_advantage': set_adv,
            'point_advantage': point_adv,
            'pts_in_game': pts_in_game,
            'pts_in_match': pt_match,
            'is_break_point': break_point,
            'op_error_deuce_side_last20': error_lists[opponent_idx].count('deuce court') / len(error_lists[opponent_idx]) if error_lists[opponent_idx] else 0,
            'op_error_ad_side_last20': error_lists[opponent_idx].count('ad court') / len(error_lists[opponent_idx]) if error_lists[opponent_idx] else 0,
            'op_error_middle_last20': error_lists[opponent_idx].count('middle') / len(error_lists[opponent_idx]) if error_lists[opponent_idx] else 0,
            's1_success_rate_whole_match': np.mean(s1_success_lists[server_idx][row['serve_attempt']]) if s1_success_lists[server_idx][row['serve_attempt']] else 0,
            'wide_returner_win_recent_matches': np.mean(match_tendencies[opponent]['return_err'][f'wide {court_side}']) if match_tendencies[opponent]['return_err'][f'wide {court_side}'] else 0,
            'body_returner_win_recent_matches': np.mean(match_tendencies[opponent]['return_err'][f'body {court_side}']) if match_tendencies[opponent]['return_err'][f'body {court_side}'] else 0,
            'wide_server_win_rate': wide_server_win_rate,
            'body_server_win_rate': body_server_win_rate,
            'T_server_win_rate': T_server_win_rate,
            'T_returner_win_recent_matches': np.mean(match_tendencies[opponent]['return_err'][f'T {court_side}']) if match_tendencies[opponent]['return_err'][f'T {court_side}'] else 0,
            'returner_win_1st/2nd_recent_matches': returner_success_serve,
            's1_success_recent_matches': np.mean(match_tendencies[server]['serve_s1_success'][row['serve_attempt']]) if match_tendencies[server]['serve_s1_success'][row['serve_attempt']] else 0,
            'court_side': court_side,
            'first_serve_attempt': first_serve_attempt,
            'serve_loc': serve_loc,
            'first_shot_loc': first_shot_loc,
            'won_pt': won_pt,
            'is_forced_err': forced_err,
            'rally_category': rally_length,
            'rally_length': row['rally_length'],
            's1_success': rally_length == 'short' and won_pt,
            'data_split': row['data_split']
        })
      
        if len(row['rally']) > 2:
            if row['rally'][-1]['unreturnable'] and row['rally'][-1]['player'] == row['pt_winner']:
                if row['rally'][-1]['location'] and any(loc in row['rally'][-1]['location'] for loc in ['deuce', 'ad', 'mid']):
                    error_lists[3 - row['rally'][-1]['player']].append(row['rally'][-1]['location'])
            if row['rally'][-2]['location'] and row['rally'][-2]['player'] == row['pt_winner']:
                if any(loc in row['rally'][-2]['location'] for loc in ['deuce', 'ad', 'mid']):
                    error_lists[3 - row['rally'][-2]['player']].append(row['rally'][-2]['location'])

        s1_success_lists[server_idx][row['serve_attempt']].append(rally_length == 'short' and won_pt)

        if won_pt:
            serve_success_lists[server_idx][row['serve_attempt']].append(serve_loc)
        else:
            serve_success_lists[server_idx][row['serve_attempt']].append('lost')

        if len(p1_err_sides_last20) > 20:
            p1_err_sides_last20.pop(0)
        if len(p2_err_sides_last20) > 20:
            p2_err_sides_last20.pop(0)
        

    positions = ['4D', '4A', '5D', '5A', '6D', '6A', 'v1st', 'v2nd']
    df_filtered = (
        df_return_outcomes
        .loc[df_return_outcomes['match_id'] == match]
        .query('row in @positions')
        .assign(win_percentage=lambda x: x['returnable_won'] / x['returnable'])
    )

    player_position_stats = (
        df_filtered[['player', 'row', 'win_percentage']]
        .groupby(['player', 'row'])
        .first()['win_percentage']
        .to_dict()
    )

    for (player, position), win_pct in player_position_stats.items():
        tendency_key = serve_decoder[position]
        match_tendencies[player]['return_err'][tendency_key].append(win_pct)
        
        # Maintain only last 5 matches
        if len(match_tendencies[player]['return_err'][tendency_key]) > 5:
            match_tendencies[player]['return_err'][tendency_key].pop(0)

    positions = ['1', '2']
    df_filtered = (
        df_serve_outcomes
        .loc[df_serve_outcomes['match_id'] == match]
        .query('row in @positions')
        .assign(win_percentage=lambda x: x['pts_won_lte_3_shots'] / x['pts_won'])
    )

    player_position_stats = (
        df_filtered[['player', 'row', 'win_percentage']]
        .groupby(['player', 'row'])
        .first()['win_percentage']
        .to_dict()
    )

    for (player, position), win_pct in player_position_stats.items():
        position = int(position)
        match_tendencies[player]['serve_s1_success'][position].append(win_pct)
        
        # Maintain only last 5 matches
        if len(match_tendencies[player]['serve_s1_success'][position]) > 5:
            match_tendencies[player]['serve_s1_success'][position].pop(0)

    return data

In [None]:
all_data = []
match_ids = df['match_id'].unique()
match_tendencies = defaultdict(lambda: defaultdict(lambda: defaultdict(list)))

for i, match in enumerate(match_ids):
    if i % 100 == 0:
        print(f'{i} of {len(match_ids)}')
    data = featurize_match(df, match, match_tendencies)
    all_data.extend(data)

features_df = pd.DataFrame(all_data)
features_df['hand_combo'] = features_df['hand_combo'].str.replace(' ', '').str.lower()
features_df['first_shot_loc'] = features_df['first_shot_loc'].apply(lambda x: None if x not in {'ad court', 'deuce court', 'middle', 'no first shot'} else x)

In [None]:
# print the values of each column
for col in features_df.columns[3:]:
    print(f'{col}: {features_df[col].unique()}')
    print(f'{col}: {features_df[col].nunique()}')
    print('\n')

In [176]:
# Save to csv called features_basic.csv
features_df.to_csv('generated_datasets/wta_features_0425.csv', index=False)