## Predict ATP tour match results!

In [1]:
import pandas as pd
import datetime as dt
import pickle

import plotly.graph_objects as go
from plotly.subplots import make_subplots

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score, f1_score, roc_auc_score

# display option: never truncate the column list when a wide DataFrame is shown
pd.set_option('display.max_columns', None)

## Load dataset and engineer features
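Feature engineering takes about 45 minutes on my laptop when `ENGINEER_FEATURES = True`. Leave it set to `False` to skip that step and load the previously saved `atp_tennis_engineered.csv` instead.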

In [None]:
# @TODO: more features to get round to testing:
# - H2H win proportion in last x days or last y matches
# - H2H win proportion on that surface
# - an intelligent indicator of match importance/pressure. This could be
#   a bool for grand slam, a bool for final or semi or both or some combination
# - number of H2H matches? Feels like there needs to be added context for the
#   H2H stat - a 100% H2H is misleading if they've only played each other once

# @TODO: think more about the default values used when there is no data for a
# feature. HistGradientBoostingClassifier accepts NaNs, so NaN is worth trying as the default

# toggle: True rebuilds the engineered CSV from the raw data,
# False reuses the engineered CSV that is already on disk
ENGINEER_FEATURES = False

if ENGINEER_FEATURES:
    # read the raw matches, parse the dates and put them in date order
    df = pd.read_csv('atp_tennis.csv', index_col=0)
    df = (
        df.assign(Date=pd.to_datetime(df['Date']))
          .sort_values(by='Date')
    )

    # the match date may carry some signal of its own, so expose
    # its month and year as separate columns for later use
    df = df.assign(Month=df['Date'].dt.month, Year=df['Date'].dt.year)

    # form = share of matches won by one of the two players during
    # the `days` days leading up to the match
    def calculate_form(match, winner_or_loser:str, days:int=90):
        """Return the recent win proportion of one player of `match`.

        `winner_or_loser` is the column of `match` ('Winner' or 'Loser') that
        names the player. Only rows of the module-level `df` dated strictly
        after `match['Date'] - days` and strictly before `match['Date']` are
        counted. Returns 0 when the player has no match in that window.
        """
        player = match[winner_or_loser]
        match_date = match['Date']
        window_start = match_date - pd.Timedelta(days=days)

        # rows inside the look-back window in which the player took part
        in_window = (df['Date'] > window_start) & (df['Date'] < match_date)
        involved = (df['Winner'] == player) | (df['Loser'] == player)
        recent = df.loc[in_window & involved]

        # nothing played recently: fall back to the default of 0
        if len(recent) == 0:
            return 0

        won = int((recent['Winner'] == player).sum())
        return won / len(recent)

    # all-time share of matches won on the surface this match is played on
    def calculate_surface_win_proportion(match, winner_or_loser:str):
        """Return one player's earlier win proportion on the match's surface.

        `winner_or_loser` is the column of `match` ('Winner' or 'Loser') that
        names the player. Only rows of the module-level `df` dated strictly
        before `match['Date']` and played on `match['Surface']` are counted.
        Returns 0 when the player has no earlier match on that surface.
        """
        player = match[winner_or_loser]

        # earlier matches on the same surface in which the player took part
        earlier = df['Date'] < match['Date']
        involved = (df['Winner'] == player) | (df['Loser'] == player)
        same_surface = df['Surface'] == match['Surface']
        history = df.loc[earlier & involved & same_surface]

        # no history on this surface: fall back to the default of 0
        if len(history) == 0:
            return 0

        won = int((history['Winner'] == player).sum())
        return won / len(history)

    # all-time head-to-head record, seen from the side of the match winner
    def calculate_winner_h2h_win_proportion(match):
        """Return the share of earlier meetings that the match winner won.

        Looks at rows of the module-level `df` dated strictly before
        `match['Date']` in which `match['Winner']` and `match['Loser']` faced
        each other. Returns 0.5 when the two have not met before.
        """
        winner, loser = match['Winner'], match['Loser']

        # earlier meetings between the two players, whoever won them
        earlier = df['Date'] < match['Date']
        won_by_winner = (df['Winner'] == winner) & (df['Loser'] == loser)
        won_by_loser = (df['Winner'] == loser) & (df['Loser'] == winner)
        meetings = df.loc[earlier & (won_by_winner | won_by_loser)]

        # never met before: treat the head to head as even
        if len(meetings) == 0:
            return 0.5

        victories = int((meetings['Winner'] == winner).sum())
        return victories / len(meetings)
    
    # how many times the two players met before this match
    def calculate_h2h_count(match):
        """Return the number of earlier meetings between winner and loser.

        Counts rows of the module-level `df` dated strictly before
        `match['Date']` in which `match['Winner']` and `match['Loser']` faced
        each other, regardless of who won.
        """
        winner, loser = match['Winner'], match['Loser']

        # earlier meetings between the two players, whoever won them
        earlier = df['Date'] < match['Date']
        won_by_winner = (df['Winner'] == winner) & (df['Loser'] == loser)
        won_by_loser = (df['Winner'] == loser) & (df['Loser'] == winner)

        # counting the True entries of the mask equals the number of matching rows
        return int((earlier & (won_by_winner | won_by_loser)).sum())

    # column prefix -> column of the match row that names the player
    # surface record of the winner, then of the loser
    for prefix, role in (('W', 'Winner'), ('L', 'Loser')):
        df[f'{prefix}SurfaceWinProportion'] = df.apply(
            calculate_surface_win_proportion, winner_or_loser=role, axis=1
        )

    # head to head: the loser's share is the complement of the winner's share
    df['WH2HWinProportion'] = df.apply(calculate_winner_h2h_win_proportion, axis=1)
    df['LH2HWinProportion'] = 1 - df['WH2HWinProportion']

    # number of earlier meetings, shared by both players
    df['H2HCount'] = df.apply(calculate_h2h_count, axis=1)

    # recent form of the winner, then of the loser, over the last FORM_DAYS days
    FORM_DAYS = 90
    for prefix, role in (('W', 'Winner'), ('L', 'Loser')):
        df[f'{prefix}Form'] = df.apply(
            calculate_form, winner_or_loser=role, days=FORM_DAYS, axis=1
        )

    # write the engineered dataset to a local CSV
    df.to_csv('atp_tennis_engineered.csv')

# read the engineered dataset back from disk, parse the dates
# and put the matches in date order
df = pd.read_csv('atp_tennis_engineered.csv', index_col=0)
df = df.assign(Date=pd.to_datetime(df['Date'])).sort_values(by='Date')

## Transform columns for ML
- Change references from Winner/Loser to Player A/Player B
- Add Winner as target column

In [3]:
# form looks back over the last x days, so at least the first x days of the
# data have to go; for now a whole year is dropped to be on the safe side
earliest_kept = df['Date'].min() + pd.Timedelta(days=365)
df = df.loc[df['Date'] > earliest_kept].reset_index(drop=True)

# Player A / Player B are assigned at random: split the matches into two
# random halves and label the players differently in each half
whole_df = df.copy()
A_df = whole_df.sample(frac=0.5, replace=False, random_state=1)
B_df = whole_df.drop(A_df.index)

# every per-player column as (winner column, loser column, neutral stem)
player_columns = [
    ('Winner', 'Loser', 'Player'),
    ('WRank', 'LRank', 'Rank'),
    ('WForm', 'LForm', 'Form'),
    ('WSurfaceWinProportion', 'LSurfaceWinProportion', 'SurfaceWinProportion'),
    ('WH2HWinProportion', 'LH2HWinProportion', 'H2HWinProportion'),
]

# A_df: the winner becomes A and the loser becomes B
# B_df: the mirror image, the winner becomes B and the loser becomes A
winner_as_A = {}
winner_as_B = {}
for winner_col, loser_col, stem in player_columns:
    winner_as_A.update({winner_col: f'{stem} A', loser_col: f'{stem} B'})
    winner_as_B.update({winner_col: f'{stem} B', loser_col: f'{stem} A'})

# rename, then add the label/target column:
# Winner is 1 if player A won and 0 if player B won
A_df = A_df.rename(columns=winner_as_A).assign(Winner=1)
B_df = B_df.rename(columns=winner_as_B).assign(Winner=0)

# stack the two halves and shuffle the rows
full_df = pd.concat([A_df, B_df]).sample(frac=1, random_state=1).reset_index(drop=True)

## Machine learning

In [None]:
# feature sets (FS) to compare

# FS1: match context, player names, rankings and date parts
fs_1 = [
    'Tournament', 'Series', 'Court', 'Surface', 'Round', 'Best of',
    'Player A', 'Player B', 'Rank A', 'Rank B', 'Year', 'Month',
]

# FS2: rankings plus the engineered form, surface and head-to-head proportions
fs_2 = [
    'Rank A', 'Rank B',
    'Form A', 'Form B',
    'SurfaceWinProportion A', 'SurfaceWinProportion B',
    'H2HWinProportion A', 'H2HWinProportion B',
]

# FS3: FS2 plus the number of earlier head-to-head meetings
fs_3 = [*fs_2, 'H2HCount']

# the sets that are actually trained on, with their display names
feature_sets = [fs_2, fs_3]
feature_sets_names = ['FS2', 'FS3']

def calculate_ml_metrics(name, y_test, pred_test, y_train, pred_train,
                         score_test=None, score_train=None) -> None:
    """Record train/test accuracy, F1 and ROC AUC for one model.

    The six metrics are written into the row labelled `name` of the
    module-level `results_df`; nothing is returned.

    Parameters
    ----------
    name : row label for this model in `results_df`.
    y_test, y_train : true labels (1 if Player A won, 0 if Player B won).
    pred_test, pred_train : hard class predictions for the same rows.
    score_test, score_train : optional continuous scores in which a larger
        value means "more likely class 1" (e.g. `predict_proba(...)[:, 1]`).
        ROC AUC is a ranking metric, so it is computed from these when they
        are given. When omitted, the hard predictions are used instead, which
        keeps the old behaviour but collapses the ROC curve to a single
        operating point.
    """
    if score_train is None:
        score_train = pred_train
    if score_test is None:
        score_test = pred_test

    results_df.loc[name, 'accuracy_train'] = accuracy_score(y_train, pred_train)
    results_df.loc[name, 'accuracy_test'] = accuracy_score(y_test, pred_test)

    results_df.loc[name, 'f1_train'] = f1_score(y_train, pred_train)
    results_df.loc[name, 'f1_test'] = f1_score(y_test, pred_test)

    results_df.loc[name, 'roc_auc_train'] = roc_auc_score(y_train, score_train)
    results_df.loc[name, 'roc_auc_test'] = roc_auc_score(y_test, score_test)

# start from an empty results table so re-running this cell never mixes in old rows
results_df = pd.DataFrame()

# one seed for the split and the models, so the saved metadata cannot drift from the code
RANDOM_STATE = 42

# columns the ranking benchmark below needs regardless of the feature sets
BENCHMARK_COLUMNS = ['Rank A', 'Rank B']

# keep only the columns that are actually used (target, benchmark, feature sets)
# before encoding: one-hot encoding the whole frame would also expand every
# unused text column into dummy columns. full_df's column order is preserved.
needed_columns = {'Winner', *BENCHMARK_COLUMNS}
for fs in feature_sets:
    needed_columns.update(fs)
model_df = full_df[[column for column in full_df.columns if column in needed_columns]]

# one-hot encode categorical features
one_hot_df = pd.get_dummies(model_df)

# split into test and train sets
X_train, X_test, y_train, y_test = train_test_split(
    one_hot_df.drop(columns=['Winner']), one_hot_df['Winner'],
    test_size=0.2, random_state=RANDOM_STATE
)
print(f"Train set size: {len(X_train)}")
print(f"Test set size: {len(X_test)}")

# now for the models. We'll begin with a 'dumb' benchmark model: predict the better ranked player to win
print(f"+ who's ranked better wins")
pred_train = X_train['Rank A'] < X_train['Rank B']
pred_test = X_test['Rank A'] < X_test['Rank B']
# continuous score for ROC AUC: the further A is ranked ahead of B, the larger
# the score. A missing rank gives no information, so it is scored as neutral (0).
score_train = (X_train['Rank B'] - X_train['Rank A']).fillna(0)
score_test = (X_test['Rank B'] - X_test['Rank A']).fillna(0)
calculate_ml_metrics('better_rank', y_test, pred_test, y_train, pred_train,
                     score_test=score_test, score_train=score_train)

for fs, fs_name in zip(feature_sets, feature_sets_names):
    print(f"+ {fs_name}")

    # cut down X_train and X_test to just the selected features. After one-hot
    # encoding, a feature is either a column of exactly that name or a group of
    # dummy columns named "<feature>_<value>". Matching on that (rather than on
    # a substring/regex) avoids picking up unrelated columns that merely
    # contain a feature name.
    feature_columns = [
        column for column in X_train.columns
        if any(column == feature or column.startswith(f"{feature}_") for feature in fs)
    ]
    X_train_fs = X_train[feature_columns]
    X_test_fs = X_test[feature_columns]

    # logistic regression
    # print(f"    - logistic regression")
    # lr = LogisticRegression()
    # lr.fit(X_train_fs, y_train)
    # pred_train = lr.predict(X_train_fs)
    # pred_test = lr.predict(X_test_fs)
    # calculate_ml_metrics(f"{fs_name}_lr", y_test, pred_test, y_train, pred_train)

    # random forest
    # print(f"    - random forest")
    # rf = RandomForestClassifier(n_estimators=150, max_depth=6, random_state=42)
    # rf.fit(X_train_fs, y_train)
    # pred_train = rf.predict(X_train_fs)
    # pred_test = rf.predict(X_test_fs)
    # calculate_ml_metrics(f"{fs_name}_rf", y_test, pred_test, y_train, pred_train)

    # gradient boosting
    print(f"    - gradient boosting")
    gb_max_depth = None
    gb = HistGradientBoostingClassifier(max_depth=gb_max_depth, random_state=RANDOM_STATE)
    gb.fit(X_train_fs, y_train)
    pred_train = gb.predict(X_train_fs)
    pred_test = gb.predict(X_test_fs)
    # probability of class 1 (Player A wins), used as the ranking score for ROC AUC
    proba_train = gb.predict_proba(X_train_fs)[:, 1]
    proba_test = gb.predict_proba(X_test_fs)[:, 1]
    calculate_ml_metrics(f"{fs_name}_gb", y_test, pred_test, y_train, pred_train,
                         score_test=proba_test, score_train=proba_train)

    # save gradient boosting model as pickle to local file if FS2. First add metadata
    if fs_name == 'FS2':
        print(f"        - saving model")

        # the metadata travels with the estimator inside the pickle, so whoever
        # loads the model knows which columns (and in which order) it expects
        gb.metadata = {
            'features': X_train_fs.columns.tolist(),
            'feature_set_name': fs_name,
            'model': 'gb',
            'date_created': dt.datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
            'model_params': {
                'max_depth': gb_max_depth,
                'random_state': RANDOM_STATE
            },
            'model_metrics': {
                'accuracy_train': accuracy_score(y_train, pred_train),
                'accuracy_test': accuracy_score(y_test, pred_test),
            }
        }

        with open('gb_model.pkl', 'wb') as f:
            pickle.dump(gb, f)

Train set size: 49735
Test set size: 12434
+ who's ranked better wins
+ FS2
    - gradient boosting
        - saving model
+ FS3
    - gradient boosting


In [25]:
# one colour per model (row of results_df), shared by all three panels
colors = ['red', 'blue', 'green', 'orange', 'purple', 'pink', 'yellow', 'brown', 'grey', 'black']

# metric column prefix -> panel title, in top-to-bottom order
metric_titles = {'accuracy': 'Accuracy', 'f1': 'F1', 'roc_auc': 'ROC AUC'}

fig = make_subplots(rows=3, cols=1, subplot_titles=tuple(metric_titles.values()))

# one panel per metric; in each panel the train bars are drawn half
# transparent and the test bars use the default opacity
for row, metric in enumerate(metric_titles, start=1):
    for split, bar_style in (('train', {'opacity': 0.5}), ('test', {})):
        column = f'{metric}_{split}'
        fig.add_trace(
            go.Bar(x=results_df.index, y=results_df[column], name=column,
                   marker_color=colors, **bar_style),
            row=row, col=1,
        )

fig.update_layout(title_text = 'Model Performance', height=1000)

# every metric is shown on the same 0.5-1 scale
fig.update_yaxes(range=[0.5, 1])
