In [44]:
import pandas as pd
import numpy as np

# Raise pandas' display limits so wide and long tables are rendered in full
for option_name in ('display.max_columns', 'display.max_rows'):
    pd.set_option(option_name, 500)

# Load the preprocessed match data and the 2019 player list;
# in both files the first row is the header and the first column is the index
df = pd.read_csv('_Data/Original_dataset/preprocessed_data_with_2019_matches.csv', index_col=0, header=0)
players_df = pd.read_csv('_Data/Predictions/players_2019.csv', index_col=0, header=0)

# Coerce every column after the first one (the player name) to a numeric
# dtype, downcast to the smallest float type that can hold the values
numeric_part = players_df.iloc[:, 1:].apply(pd.to_numeric, downcast='float')
players_df.iloc[:, 1:] = numeric_part
players_df

Unnamed: 0,PlayerA_Name,PlayerA_FR,PlayerA_righthanded,PlayerA_age,PlayerA_rank,PlayerA_rank_points
1,Novak Djokovic,0.0,1.0,31.0,1.0,11070.0
2,Rafael Nadal,0.0,0.0,32.0,2.0,8725.0
3,Alexander Zverev,0.0,1.0,21.0,3.0,6040.0
4,Roger Federer,0.0,1.0,37.0,4.0,5590.0
5,Dominic Thiem,0.0,1.0,25.0,5.0,4765.0
6,Kei Nishikori,0.0,1.0,29.0,6.0,4200.0
7,Kevin Anderson,0.0,1.0,32.0,7.0,4115.0
8,Stefanos Tsitsipas,0.0,1.0,20.0,8.0,3240.0
9,Juan Martin Del Potro,0.0,1.0,30.0,9.0,3225.0
10,John Isner,0.0,1.0,33.0,10.0,3085.0


In [45]:
#-------------------------------------------------------------------------------------------
# COMPUTE LATEST STATS OF EACH PLAYER
#-------------------------------------------------------------------------------------------

# Cutoff date: start of Roland-Garros 2019, expressed as (year, day of the year)
curr_year, max_day = 2019, 146

# Surface weights: the 'Clay' row of the surface-correlation table,
# one value per surface column
corr_df = pd.read_csv('_Data/New_stats_dataset/correlation_between_surfaces.csv', index_col=0, header=0)
weight_carpet, weight_grass, weight_hard, weight_clay = (
    corr_df.loc['Clay', surface] for surface in ('Carpet', 'Grass', 'Hard', 'Clay')
)

# Stat columns to be filled in for every player (created empty, i.e. NaN, by the reindex)
new_columns = [
    'PlayerA_' + stat_name
    for stat_name in (
        'Win%',
        'bestof',
        'minutes',
        'svpt%',
        '1st_serve%',
        '1st_serve_won%',
        '2nd_serve_won%',
        'ace%',
        'df%',
        'bp_faced%',
        'bp_saved%',
    )
]
players_df = players_df.reindex(columns=list(players_df.columns) + new_columns)

# Positional column indices of `df` holding each side's data: both sides share
# the leading block [2, 3, 4, 7] and the trailing block 36..39, and differ only
# in the middle block (17..24 for player A, 28..35 for player B)
shared_head_cols = [2, 3, 4, 7]
shared_tail_cols = [36, 37, 38, 39]
playerA_cols = shared_head_cols + list(range(17, 25)) + shared_tail_cols
playerB_cols = shared_head_cols + list(range(28, 36)) + shared_tail_cols

# Weighting of a player's past matches:
#   * recency : weight 1 for matches played at most one year before the cutoff,
#               TIME_DECAY_BASE ** elapsed_years for older ones;
#   * surface : the weight_* value of the surface the match was played on.
# The final weight blends both with the two *_SHARE factors.
TIME_DECAY_BASE = 0.6
TIME_WEIGHT_SHARE = 0.95
SURFACE_WEIGHT_SHARE = 0.05

# Loop-invariant pieces, computed once instead of twice per player
cutoff_date = curr_year + max_day/365
is_before_cutoff = (df['Year'] + df['Day']/365) < cutoff_date
surface_columns = ['surface_Carpet', 'surface_Clay', 'surface_Grass', 'surface_Hard']
# The stat columns of players_df filled from the per-match stats (position 9 onward)
stat_columns = players_df.columns[9:]

for i, player in players_df.iterrows():
    name = player['PlayerA_Name']

    # Past matches of the player on either side of the match record.
    # Rows are picked with a boolean mask (not by feeding index *labels* to
    # .iloc, which is only correct when the index is exactly 0..n-1) and
    # .assign() returns a new frame, so no value is written onto a slice of df.
    # Win is 1 for PlayerA rows and 0 for PlayerB rows
    # NOTE(review): presumably PlayerA is always the match winner - confirm.
    playerA_df = df.loc[(df['PlayerA_name'] == name) & is_before_cutoff].iloc[:, playerA_cols].assign(Win=1)
    playerB_df = df.loc[(df['PlayerB_name'] == name) & is_before_cutoff].iloc[:, playerB_cols].assign(Win=0)
    playerB_df.columns = playerA_df.columns
    tmp_df = pd.concat([playerA_df, playerB_df], ignore_index=True)
    if tmp_df.empty:
        # No match history: the stats stay NaN for this player
        continue

    # Compute a weight for each past match of the player
    elapsed_years = cutoff_date - (tmp_df['Year'] + tmp_df['Day']/365)
    recency_weight = (TIME_DECAY_BASE ** elapsed_years).mask(elapsed_years <= 1, 1.0)
    surface_weight = (weight_carpet*tmp_df['surface_Carpet']
                      + weight_clay*tmp_df['surface_Clay']
                      + weight_grass*tmp_df['surface_Grass']
                      + weight_hard*tmp_df['surface_Hard'])
    match_weight = TIME_WEIGHT_SHARE*recency_weight + SURFACE_WEIGHT_SHARE*surface_weight

    # Compute the weighted average of every remaining stat column
    stats_df = tmp_df.drop(columns=['Year', 'Day', *surface_columns])
    weighted_means = pd.Series(np.average(stats_df, weights=match_weight, axis=0),
                               index=stats_df.columns)

    # Add stats of the player in new dataframe.
    # Positions 2..9 of the averages are the serve/return stats; they are
    # matched to players_df by column label (what the original Series
    # assignment did) and written through .loc, because .at only accepts a
    # single row/column label and rejects slices on current pandas.
    players_df.loc[i, stat_columns] = weighted_means.iloc[2:10].reindex(stat_columns).to_numpy()
    # Scalars (not length-1 Series) go into the single-cell assignments
    players_df.at[i, 'PlayerA_bestof'] = weighted_means['best_of']
    players_df.at[i, 'PlayerA_minutes'] = weighted_means['minutes']
    players_df.at[i, 'PlayerA_Win%'] = weighted_means['Win']
    
# Updating columns names: drop the 'PlayerA_' prefix.
# Only columns that actually carry the prefix are touched, so re-running this
# cell does not chop 8 more characters off already-renamed columns.
name_prefix = 'PlayerA_'
players_df.columns = [
    col[len(name_prefix):] if col.startswith(name_prefix) else col
    for col in players_df.columns
]

# Fill last missing values by median.
# numeric_only=True keeps the string 'Name' column out of the median; without
# it pandas >= 2.0 raises a TypeError instead of silently skipping that column.
players_df = players_df.fillna(players_df.median(numeric_only=True))
players_df

Unnamed: 0,Name,FR,righthanded,age,rank,rank_points,Win%,bestof,minutes,svpt%,1st_serve%,1st_serve_won%,2nd_serve_won%,ace%,df%,bp_faced%,bp_saved%
1,Novak Djokovic,0.0,1.0,31.0,1.0,11070.0,0.840573,3.628799,117.689561,0.477383,0.661034,0.753069,0.581773,0.061714,0.024755,0.051859,0.551664
2,Rafael Nadal,0.0,0.0,32.0,2.0,8725.0,0.862078,3.644879,121.480015,0.476404,0.674588,0.730564,0.585381,0.03991,0.021473,0.061512,0.599861
3,Alexander Zverev,0.0,1.0,21.0,3.0,6040.0,0.713315,3.319305,107.308848,0.49722,0.635669,0.749971,0.526978,0.087063,0.037492,0.060193,0.49931
4,Roger Federer,0.0,1.0,37.0,4.0,5590.0,0.833374,3.514405,101.591005,0.484946,0.619415,0.786559,0.583518,0.093554,0.021617,0.04677,0.52118
5,Dominic Thiem,0.0,1.0,25.0,5.0,4765.0,0.679612,3.431492,111.442098,0.498491,0.595467,0.74874,0.525742,0.069376,0.036019,0.066965,0.582699
6,Kei Nishikori,0.0,1.0,29.0,6.0,4200.0,0.704632,3.443231,113.34896,0.488561,0.611474,0.714618,0.543342,0.043731,0.027346,0.072401,0.564015
7,Kevin Anderson,0.0,1.0,32.0,7.0,4115.0,0.652826,3.478637,122.633567,0.515894,0.639035,0.785842,0.545054,0.142507,0.02855,0.045375,0.554663
8,Stefanos Tsitsipas,0.0,1.0,20.0,8.0,3240.0,0.644902,3.198014,111.033448,0.507591,0.602374,0.76333,0.537733,0.076941,0.026703,0.056592,0.562044
9,Juan Martin Del Potro,0.0,1.0,30.0,9.0,3225.0,0.74485,3.606475,118.709664,0.489131,0.646994,0.767622,0.537921,0.098619,0.023704,0.049028,0.519958
10,John Isner,0.0,1.0,33.0,10.0,3085.0,0.627024,3.426314,118.306111,0.514494,0.687673,0.806648,0.581542,0.175058,0.020827,0.030824,0.550424


In [46]:
# Write the per-player stats table to CSV, floats formatted with 10 decimal places
stats_csv_path = '_Data/Predictions/stats_players_2019.csv'
players_df.to_csv(
    stats_csv_path,
    sep=',',
    decimal='.',
    float_format='%.10f',
    encoding='utf-8',
)