In [34]:
import pandas as pd
import numpy as np

# Let pandas render up to 500 columns / 500 rows when a table is displayed
for _opt in ('display.max_columns', 'display.max_rows'):
    pd.set_option(_opt, 500)

# Load the match history and the list of players; in both files the first
# row is the header and the first column becomes the index
_csv_opts = dict(header=0, index_col=0)
df = pd.read_csv('_Data/Original_dataset/preprocessed_data_with_2019_matches.csv', **_csv_opts)
players_df = pd.read_csv('_Data/Predictions/players_2019.csv', **_csv_opts)

# Parse every column after the first one as numeric, downcast to the
# smallest float type that can hold the values (floats, not ints)
players_df.iloc[:, 1:] = players_df.iloc[:, 1:].apply(
    lambda column: pd.to_numeric(column, downcast='float')
)
players_df

Unnamed: 0,PlayerA_name,PlayerA_FR,PlayerA_righthanded,PlayerA_age,PlayerA_rank,PlayerA_rank_points
1,Novak Djokovic,0.0,1.0,31.0,1.0,11070.0
2,Rafael Nadal,0.0,0.0,32.0,2.0,8725.0
3,Alexander Zverev,0.0,1.0,21.0,3.0,6040.0
4,Roger Federer,0.0,1.0,37.0,4.0,5590.0
5,Dominic Thiem,0.0,1.0,25.0,5.0,4765.0
6,Kei Nishikori,0.0,1.0,29.0,6.0,4200.0
7,Kevin Anderson,0.0,1.0,32.0,7.0,4115.0
8,Stefanos Tsitsipas,0.0,1.0,20.0,8.0,3240.0
9,Juan Martin del Potro,0.0,1.0,30.0,9.0,3225.0
10,John Isner,0.0,1.0,33.0,10.0,3085.0


In [32]:
#-------------------------------------------------------------------------------------------
# COMPUTE LATEST STATS OF EACH PLAYER
#-------------------------------------------------------------------------------------------
# For every player in `players_df`, collect the matches found in `df` that were
# played before the cutoff date (as PlayerA -> Win = 1, as PlayerB -> Win = 0),
# weight each match by recency and by surface, and store the weighted averages
# in new columns of `players_df`.
#
# NOTE: this cell mutates `players_df` (adds columns, then strips the
# 'PlayerA_' prefix from the headers). Re-run the loading cell before running
# this one again.

# Starting date of Roland-Garros 2019
curr_year = 2019
max_day = 146
cutoff = curr_year + max_day/365  # cutoff date expressed as a fractional year

# Weights of surface weighting (row 'Clay' of the surface correlation table)
corr_df = pd.read_csv('_Data/New_stats_dataset/correlation_between_surfaces.csv', header=0, index_col=0)
weight_carpet = corr_df.loc['Clay','Carpet']
weight_grass = corr_df.loc['Clay','Grass']
weight_hard = corr_df.loc['Clay','Hard']
weight_clay = corr_df.loc['Clay','Clay']

# Compute the stats of the players
new_columns = ['PlayerA_Win%',
               'PlayerA_bestof',
               'PlayerA_minutes',
               'PlayerA_svpt%',
               'PlayerA_1st_serve%',
               'PlayerA_1st_serve_won%',
               'PlayerA_2nd_serve_won%',
               'PlayerA_ace%',
               'PlayerA_df%',
               'PlayerA_bp_faced%',
               'PlayerA_bp_saved%']
players_df = players_df.reindex(columns=[*players_df.columns.tolist(), *new_columns])

# Destination columns of the eight serve / break-point stats
stat_columns = new_columns[3:]

# Columns of the players' stats (positions in `df`)
playerA_cols = [2,3,4,7] + list(range(17,25)) + [36,37,38,39]
playerB_cols = [2,3,4,7] + list(range(28,36)) + [36,37,38,39]

# The name header differs in case between the code and the loaded file
# ('PlayerA_Name' vs 'PlayerA_name'), so resolve it case-insensitively.
name_matches = [c for c in players_df.columns if str(c).lower() == 'playera_name']
if not name_matches:
    raise KeyError("players_df has no 'PlayerA_name' column")
name_col = name_matches[0]

# Loop-invariant: which matches were played before the cutoff date
played_before_cutoff = (df['Year'] + df['Day']/365 < cutoff).to_numpy()

for i, player in players_df.iterrows():
    name = player[name_col]

    # Take all past matches of the player, as PlayerA and as PlayerB.
    # Row *positions* are used because .iloc is positional; index labels are
    # only equivalent when df has a default 0..n-1 index.
    playerA_rows = np.flatnonzero((df['PlayerA_name'] == name).to_numpy() & played_before_cutoff)
    playerB_rows = np.flatnonzero((df['PlayerB_name'] == name).to_numpy() & played_before_cutoff)
    # .copy() so that adding 'Win' does not write into a view of df
    playerA_df = df.iloc[playerA_rows, playerA_cols].copy()
    playerA_df['Win'] = 1
    playerB_df = df.iloc[playerB_rows, playerB_cols].copy()
    playerB_df['Win'] = 0
    playerB_df.columns = list(playerA_df)
    tmp_df = pd.concat([playerA_df, playerB_df], ignore_index=True)
    if tmp_df.empty:
        # No match history: stats stay NaN and are median-filled below
        continue

    # Compute a weight for each past match of the player:
    # 1 for matches at most one year old, 0.6**years otherwise, then blended
    # 95% / 5% with the surface weight of the match
    tmp_df['elapsing_time'] = cutoff - (tmp_df['Year'] + tmp_df['Day']/365)
    tmp_df['weight'] = 0.6 ** tmp_df['elapsing_time']
    tmp_df.loc[tmp_df['elapsing_time'] <= 1, 'weight'] = 1
    tmp_df['weight'] = (0.95 * tmp_df['weight']) + (0.05 * (weight_carpet*tmp_df['surface_Carpet'] + weight_clay*tmp_df['surface_Clay'] + weight_grass*tmp_df['surface_Grass'] + weight_hard*tmp_df['surface_Hard']))
    tmp_df = tmp_df.drop(columns=['Year', 'Day', 'elapsing_time', 'surface_Carpet', 'surface_Clay', 'surface_Grass', 'surface_Hard'])

    # Compute the weighted average of the player
    weighted_means = np.average(tmp_df.to_numpy(), weights=tmp_df['weight'].to_numpy(), axis=0)
    weighted_df = pd.DataFrame(weighted_means.reshape(-1, len(weighted_means)), columns=list(tmp_df.columns))
    weighted_df = weighted_df.drop('weight', axis=1)
    weighted_row = weighted_df.iloc[0]

    # Add stats of the player in new dataframe.
    # Positions 2..9 of the averaged row are the eight stats copied from df.
    stats = weighted_row.iloc[2:10]
    if set(stats.index) == set(stat_columns):
        # Same labels on both sides: match by name rather than by position
        stats = stats[stat_columns]
    # Otherwise fall back to position; assumes df's stat columns are in the
    # same order as `stat_columns` -- TODO confirm against df's header
    players_df.loc[i, stat_columns] = stats.to_numpy(dtype=float)
    # .at is a scalar accessor, so hand it scalars instead of 1-element Series
    players_df.at[i, 'PlayerA_bestof'] = weighted_row['best_of']
    players_df.at[i, 'PlayerA_minutes'] = weighted_row['minutes']
    players_df.at[i, 'PlayerA_Win%'] = weighted_row['Win']

# Updating columns names: drop the 'PlayerA_' prefix where it is present
prefix = 'PlayerA_'
column_names = [s[len(prefix):] if s.startswith(prefix) else s for s in list(players_df.columns)]
players_df.columns = column_names

# Fill last missing values by median (numeric columns only; the name column
# has no median)
players_df = players_df.fillna(players_df.median(numeric_only=True))
players_df

Unnamed: 0,Name,FR,righthanded,age,rank,rank_points,Win%,bestof,minutes,svpt%,1st_serve%,1st_serve_won%,2nd_serve_won%,ace%,df%,bp_faced%,bp_saved%
1,Maximilian Marterer,0.0,1.0,23.0,101.0,594.0,0.407572,3.321623,100.532973,0.510377,0.626837,0.702074,0.540796,0.071214,0.022097,0.076351,0.547472
2,Roberto Carballes Baena,0.0,1.0,26.0,103.0,578.0,0.453234,3.284189,113.684119,0.491642,0.620324,0.698993,0.514664,0.037323,0.037174,0.085038,0.568504
3,Filip Krajinovic,0.0,1.0,27.0,104.0,576.0,0.493614,3.112264,100.25999,0.483757,0.623126,0.665844,0.509051,0.062943,0.026422,0.086023,0.550804
4,Denis Istomin,0.0,1.0,32.0,105.0,571.0,0.399835,3.31617,105.698588,0.498175,0.662671,0.716372,0.510863,0.077546,0.019757,0.07148,0.537572
5,Pedro Sousa,0.0,1.0,30.0,106.0,568.0,0.191023,3.0,89.440741,0.475733,0.68443,0.628838,0.542234,0.032107,0.018486,0.096495,0.490177
6,Marcel Granollers,0.0,1.0,33.0,107.0,557.0,0.439314,3.260212,104.046472,0.500451,0.609177,0.691409,0.516299,0.053884,0.028498,0.084431,0.500622
7,Yannick Maden,0.0,1.0,29.0,108.0,552.0,0.385076,3.244292,116.957987,0.480255,0.625611,0.653685,0.508722,0.030998,0.02245,0.098123,0.556929
8,Ryan Harrison,0.0,1.0,26.0,109.0,545.0,0.443499,3.28819,103.699488,0.504687,0.593429,0.733215,0.520612,0.087546,0.036734,0.070516,0.58544
9,Paolo Lorenzi,0.0,1.0,37.0,110.0,545.0,0.386934,3.410456,116.732215,0.503504,0.586043,0.698963,0.505381,0.050567,0.030288,0.083952,0.570506
10,Tennys Sandgren,0.0,1.0,27.0,111.0,543.0,0.418572,3.332564,107.774336,0.514232,0.611228,0.741867,0.514169,0.093038,0.025132,0.070141,0.620393


In [33]:
# Write the per-player stats table to disk: comma-separated, UTF-8 encoded,
# floats formatted with ten decimal places and '.' as the decimal mark
players_df.to_csv(
    '_Data/Predictions/stats_players_2019_qualifications.csv',
    sep=',',
    encoding='utf-8',
    float_format='%.10f',
    decimal='.',
)