In [41]:
import pandas as pd
import numpy as np

# Raise the pandas display limits so that up to 500 columns and 500 rows are rendered
for display_option in ('display.max_columns', 'display.max_rows'):
    pd.set_option(display_option, 500)

# Load the list of players to describe and the match-level dataset;
# in both files the first CSV column is used as the index
players_df = pd.read_csv('_Data/Predictions/players_2016.csv', index_col=0, header=0)
df = pd.read_csv('_Data/Original_dataset/preprocessed_data.csv', index_col=0, header=0)
players_df

Unnamed: 0,PlayerA_Name
1,Novak Djokovic
2,Andy Murray
3,Stanislas Wawrinka
4,Rafael Nadal
5,Kei Nishikori
6,Jo Wilfried Tsonga
7,Tomas Berdych
8,Milos Raonic
9,Richard Gasquet
10,Marin Cilic


In [42]:
#-------------------------------------------------------------------------------------------
# GET LATEST AGE, RANK, RANKING POINTS OF EACH PLAYER
#-------------------------------------------------------------------------------------------
# For every player listed in players_df, look up his most recent match in df played
# before the cut-off date and copy the profile values of that match into new_df.

# Limit date before Roland Garros (values for other editions kept for reference)
#curr_year = 2018 
#max_day = 148
#curr_year = 2017
#max_day = 149
curr_year = 2016
max_day = 143

# Create new dataframe to contain info about the 128 players.
# A float dtype is requested up front so that median() / fillna() below work on
# numeric columns rather than on object columns.
index = np.arange(1, 129)
columns = ['PlayerA_FR',
           'PlayerA_righthanded',
           'PlayerA_age',
           'PlayerA_rank',
           'PlayerA_rank_points']
new_df = pd.DataFrame(index=index, columns=columns, dtype=float)

# Positions (in df) of the columns to extract when the player is PlayerA / PlayerB.
# Presumably positions 2 and 3 are 'Year' and 'Day' (they are used for sorting below)
# and the remaining ones are the profile columns named in `columns` -- TODO confirm.
playerA_cols = [2,3,10,12,14,15,16]
playerB_cols = [2,3,11,13,25,26,27]

# Matches played strictly before the cut-off date. This does not depend on the player,
# so it is computed once instead of twice per loop iteration.
before_cutoff = ((df['Year'] + df['Day']/365) < (curr_year + max_day/365)).values

for i, player in players_df.iterrows():
    name = player['PlayerA_Name']

    # Get ranking, rank points and age of player just before Roland Garros.
    # Row *positions* are used with iloc: feeding index labels to iloc is only correct
    # when the index happens to be exactly 0..n-1.
    playerA_pos = np.flatnonzero(before_cutoff & (df['PlayerA_name'] == name).values)
    playerB_pos = np.flatnonzero(before_cutoff & (df['PlayerB_name'] == name).values)
    playerA_df = df.iloc[playerA_pos, playerA_cols]
    playerB_df = df.iloc[playerB_pos, playerB_cols].copy()
    # Give the PlayerB columns the PlayerA names so both sides stack into the same columns
    playerB_df.columns = list(playerA_df)
    tmp_df = pd.concat([playerA_df, playerB_df], ignore_index=True)
    if tmp_df.empty:
        # No match found for this player: the row stays NaN and is filled with the median below
        continue

    # Most recent match first
    latest_match = tmp_df.sort_values(by=['Year', 'Day'], ascending=[False, False]).iloc[0]

    # Add it in the new df. Values are matched to new_df by column name (so 'Year' and
    # 'Day' are dropped); .loc is used because .at only supports scalar access.
    new_df.loc[i, :] = latest_match.reindex(new_df.columns).values

# Fill last missing values by median
new_df = new_df.fillna(new_df.median())
new_df

Unnamed: 0,PlayerA_FR,PlayerA_righthanded,PlayerA_age,PlayerA_rank,PlayerA_rank_points
1,0.0,1.0,28.966461,1.0,16550.0
2,0.0,1.0,28.985626,3.0,7525.0
3,0.0,1.0,31.13484,4.0,6110.0
4,0.0,0.0,29.932922,5.0,5675.0
5,0.0,1.0,26.360027,6.0,4290.0
6,1.0,1.0,31.041752,7.0,3400.0
7,0.0,1.0,30.642027,8.0,2940.0
8,0.0,1.0,25.366188,10.0,2740.0
9,1.0,1.0,29.891855,12.0,2680.0
10,0.0,1.0,27.63039,11.0,2715.0


In [43]:
#-------------------------------------------------------------------------------------------
# COMPUTE LATEST STATS OF EACH PLAYER
#-------------------------------------------------------------------------------------------
# For every player in players_df, average his per-match statistics over all matches
# played before the cut-off date (curr_year / max_day). Each match gets a weight that
# combines how recent it is with a surface coefficient read from the 'Clay' row of the
# surface-correlation table.

# Weights of surface weighting
corr_df = pd.read_csv('_Data/New_stats_dataset/correlation_between_surfaces.csv', header=0, index_col=0)
weight_carpet = corr_df.loc['Clay','Carpet']
weight_grass = corr_df.loc['Clay','Grass']
weight_hard = corr_df.loc['Clay','Hard']
weight_clay = corr_df.loc['Clay','Clay']

# Parameters of the match weighting
decay_rate = 0.6       # recency weight is decay_rate ** (years elapsed), capped at 1 within one year
time_share = 0.95      # share of the final weight given to the recency term
surface_share = 0.05   # share of the final weight given to the surface term

# Compute the stats of the players
new_columns = ['PlayerA_Win%',
               'PlayerA_bestof',
               'PlayerA_minutes',
               'PlayerA_svpt%',
               'PlayerA_1st_serve%',
               'PlayerA_1st_serve_won%',
               'PlayerA_2nd_serve_won%',
               'PlayerA_ace%',
               'PlayerA_df%',
               'PlayerA_bp_faced%',
               'PlayerA_bp_saved%']
new_df = new_df.reindex(columns=[*new_df.columns.tolist(), *new_columns])

# The eight serve/break-point columns, i.e. everything after Win%, bestof and minutes
stat_columns = new_columns[3:]

# Columns of the players' stats (positions in df): Year, Day, best_of, minutes,
# eight per-player stats, then the four surface indicator columns
playerA_cols = [2,3,4,7] + list(range(17,25)) + [36,37,38,39]
playerB_cols = [2,3,4,7] + list(range(28,36)) + [36,37,38,39]
surface_cols = ['surface_Carpet', 'surface_Clay', 'surface_Grass', 'surface_Hard']

# Cut-off date and the matches played strictly before it; both are independent of the
# player, so they are computed once outside the loop
cutoff = curr_year + max_day/365
before_cutoff = ((df['Year'] + df['Day']/365) < cutoff).values

for i, player in players_df.iterrows():
    name = player['PlayerA_Name']

    # Take all past matches of the player, whether he appears as PlayerA or PlayerB.
    # Row *positions* are used with iloc (index labels are only valid positions when the
    # index is exactly 0..n-1), and the slices are copied before a column is added.
    playerA_pos = np.flatnonzero(before_cutoff & (df['PlayerA_name'] == name).values)
    playerB_pos = np.flatnonzero(before_cutoff & (df['PlayerB_name'] == name).values)
    playerA_df = df.iloc[playerA_pos, playerA_cols].copy()
    playerA_df['Win'] = 1   # matches as PlayerA count as wins (presumably PlayerA is the winner -- TODO confirm)
    playerB_df = df.iloc[playerB_pos, playerB_cols].copy()
    playerB_df['Win'] = 0   # matches as PlayerB count as losses
    playerB_df.columns = list(playerA_df)
    tmp_df = pd.concat([playerA_df, playerB_df], ignore_index=True)
    if tmp_df.empty:
        # No past match: the row stays NaN and is filled with the median below
        continue

    # Compute a weight for each past match of the player
    elapsing_time = cutoff - (tmp_df['Year'] + tmp_df['Day']/365)
    # Matches at most one year old keep full weight, older ones decay exponentially
    recency_weight = (decay_rate ** elapsing_time).mask(elapsing_time <= 1, 1)
    surface_weight = (weight_carpet*tmp_df['surface_Carpet']
                      + weight_clay*tmp_df['surface_Clay']
                      + weight_grass*tmp_df['surface_Grass']
                      + weight_hard*tmp_df['surface_Hard'])
    weight = (time_share * recency_weight) + (surface_share * surface_weight)

    # Compute the weighted average of the player.
    # Remaining columns: best_of, minutes, the eight stats, Win.
    # np.average does not skip NaN, so a missing value in any match makes that
    # statistic NaN for the player; it is then replaced by the column median below.
    stats_df = tmp_df.drop(columns=['Year', 'Day'] + surface_cols)
    weighted = pd.Series(np.average(stats_df.values.astype(float), weights=weight.values, axis=0),
                         index=stats_df.columns)

    # Add stats of the player in new dataframe. The eight stats are matched to the
    # target columns by name; .loc is used because .at only supports scalar access.
    new_df.loc[i, stat_columns] = weighted.iloc[2:10].reindex(stat_columns).values
    new_df.loc[i, 'PlayerA_bestof'] = weighted['best_of']
    new_df.loc[i, 'PlayerA_minutes'] = weighted['minutes']
    new_df.loc[i, 'PlayerA_Win%'] = weighted['Win']

# Concat and updating columns names: strip the 'PlayerA_' prefix where present
new_df = pd.concat([players_df, new_df], axis=1)
prefix = 'PlayerA_'
column_names = [s[len(prefix):] if s.startswith(prefix) else s for s in list(new_df.columns)]
new_df.columns = column_names

# Fill last missing values by median (numeric columns only: 'Name' holds strings)
new_df = new_df.fillna(new_df.median(numeric_only=True))
new_df

Unnamed: 0,Name,FR,righthanded,age,rank,rank_points,Win%,bestof,minutes,svpt%,1st_serve%,1st_serve_won%,2nd_serve_won%,ace%,df%,bp_faced%,bp_saved%
1,Novak Djokovic,0.0,1.0,28.966461,1.0,16550.0,0.885614,3.612817,112.689352,0.482623,0.658613,0.750467,0.583261,0.063613,0.022047,0.052525,0.559056
2,Andy Murray,0.0,1.0,28.985626,3.0,7525.0,0.796277,3.637335,120.868693,0.491493,0.59875,0.75344,0.533311,0.075475,0.027515,0.064552,0.570807
3,Stanislas Wawrinka,0.0,1.0,31.13484,4.0,6110.0,0.704663,3.503089,110.995367,0.503236,0.574703,0.756727,0.557363,0.076707,0.027602,0.061269,0.577922
4,Rafael Nadal,0.0,0.0,29.932922,5.0,5675.0,0.816974,3.434207,115.747499,0.476067,0.696307,0.721567,0.56832,0.035023,0.018406,0.064985,0.576296
5,Kei Nishikori,0.0,1.0,26.360027,6.0,4290.0,0.743902,3.413433,106.634699,0.483016,0.607943,0.724186,0.541272,0.041942,0.029202,0.068411,0.589158
6,Jo Wilfried Tsonga,1.0,1.0,31.041752,7.0,3400.0,0.672005,3.547243,109.660201,0.512713,0.6141,0.767849,0.543298,0.095682,0.026339,0.056388,0.598519
7,Tomas Berdych,0.0,1.0,30.642027,8.0,2940.0,0.701374,3.449327,107.626862,0.495549,0.574813,0.775241,0.538264,0.086986,0.027738,0.058871,0.5517
8,Milos Raonic,0.0,1.0,25.366188,10.0,2740.0,0.722723,3.416579,107.79561,0.518506,0.624257,0.814428,0.55998,0.144379,0.02995,0.042647,0.577429
9,Richard Gasquet,1.0,1.0,29.891855,12.0,2680.0,0.670074,3.391754,103.386905,0.493892,0.620309,0.730931,0.547695,0.060944,0.025489,0.067294,0.549296
10,Marin Cilic,0.0,1.0,27.63039,11.0,2715.0,0.651904,3.431714,110.654819,0.503783,0.565914,0.76766,0.530239,0.098204,0.027875,0.06086,0.581903


In [45]:
# Write the per-player table to disk as comma-separated UTF-8 text,
# formatting float columns with 10 decimal places
new_df.to_csv(
    '_Data/Predictions/stats_players_2016_weight06.csv',
    sep=',',
    decimal='.',
    float_format='%.10f',
    encoding='utf-8',
)