In [9]:
import pandas as pd
import numpy as np

# Set tables reading options
pd.set_option('display.max_columns', 500)
pd.set_option('display.max_rows', 500)

# Read csv
df = pd.read_csv('_Data/Original_dataset/preprocessed_data.csv', header=0, index_col=0)
players_df = pd.read_csv('_Data/Predictions/players_2017.csv', header=0, index_col=0)

In [10]:
#-------------------------------------------------------------------------------------------
# GET LATEST AGE, RANK, RANKING POINTS OF EACH PLAYER
#-------------------------------------------------------------------------------------------

# Create new dataframe to contain info about the 128 players
index = np.array(np.arange(1,129))
columns = ['PlayerA_FR',
          'PlayerA_righthanded',
           'PlayerA_age',
           'PlayerA_rank',
           'PlayerA_rank_points']
new_df = pd.DataFrame(index=index, columns=columns)

# Limit date before Roland Garros 2018
#curr_year = 2018
#max_day = 148
curr_year = 2017
max_day = 149

# Columns of the players' stats
playerA_cols = [2,3,10,12,14,15,16]
playerB_cols = [2,3,11,13,25,26,27]

for i, player in players_df.iterrows():
    name = player['PlayerA_Name']

    # Get ranking, rank points and age of player just before Roland Garros
    playerA_rows = df.index[(df['PlayerA_name'] == name) & (df['Year'] + df['Day']/365 < curr_year + max_day/365)].tolist()
    playerB_rows = df.index[(df['PlayerB_name'] == name) & (df['Year'] + df['Day']/365 < curr_year + max_day/365)].tolist()
    playerA_df = df.iloc[playerA_rows, playerA_cols]
    playerB_df = df.iloc[playerB_rows, playerB_cols]
    playerB_df.columns = list(playerA_df)
    tmp_df = pd.concat([playerA_df, playerB_df], ignore_index=True)
    if tmp_df.empty:
        continue

    # Sort by latest date
    tmp_df.sort_values(by=['Year', 'Day'], ascending=[False, False], inplace=True)

    # Add it in the new df
    new_df.at[i,:] = tmp_df.iloc[0,:]

# Fill last missing values by median
new_df.fillna(new_df.median(), inplace=True)
new_df

Unnamed: 0,PlayerA_FR,PlayerA_righthanded,PlayerA_age,PlayerA_rank,PlayerA_rank_points
1,0.0,1.0,30.001369,1.0,10360.0
2,0.0,1.0,29.982204,2.0,6845.0
3,0.0,1.0,32.150581,3.0,5445.0
4,0.0,0.0,30.948666,4.0,5195.0
5,0.0,1.0,26.401094,6.0,4360.0
6,0.0,1.0,23.696098,7.0,4035.0
7,0.0,1.0,28.626968,8.0,3735.0
8,0.0,1.0,27.394936,9.0,3560.0
9,0.0,1.0,20.068447,17.0,2300.0
10,0.0,1.0,26.436687,10.0,3055.0


In [11]:
#-------------------------------------------------------------------------------------------
# COMPUTE LATEST STATS OF EACH PLAYER
#-------------------------------------------------------------------------------------------

# Weights of surface weighting
corr_df = pd.read_csv('_Data/New_stats_dataset/correlation_between_surfaces.csv', header=0, index_col=0)
weight_carpet = corr_df.loc['Clay','Carpet']
weight_grass = corr_df.loc['Clay','Grass']
weight_hard = corr_df.loc['Clay','Hard']
weight_clay = corr_df.loc['Clay','Clay']

# Compute the stats of the players
new_columns = ['PlayerA_Win%',
               'PlayerA_bestof',
               'PlayerA_minutes',
               'PlayerA_svpt%',
               'PlayerA_1st_serve%',
               'PlayerA_1st_serve_won%',
               'PlayerA_2nd_serve_won%',
               'PlayerA_ace%',
               'PlayerA_df%',
               'PlayerA_bp_faced%',
               'PlayerA_bp_saved%']
new_df = new_df.reindex(columns=[*new_df.columns.tolist(), *new_columns])

# Limit date before Roland Garros 2018
curr_year = 2018
max_day = 148
#curr_year = 2017
#max_day = 149

# Columns of the players' stats
playerA_cols = [2,3,4,7] + list(range(17,25)) + [36,37,38,39]
playerB_cols = [2,3,4,7] + list(range(28,36)) + [36,37,38,39]

for i, player in players_df.iterrows():
    name = player['PlayerA_Name']
    
    # Take all past matches of PLAYER 1 and look for same id in playerA and playerB
    playerA_rows = df.index[(df['PlayerA_name'] == name) & (df['Year'] + df['Day']/365 < curr_year + max_day/365)].tolist()
    playerB_rows = df.index[(df['PlayerB_name'] == name) & (df['Year'] + df['Day']/365 < curr_year + max_day/365)].tolist()
    playerA_df = df.iloc[playerA_rows, playerA_cols]
    playerA_df['Win'] = 1
    playerB_df = df.iloc[playerB_rows, playerB_cols]
    playerB_df['Win'] = 0
    playerB_df.columns = list(playerA_df)
    tmp_df = pd.concat([playerA_df, playerB_df], ignore_index=True)
    if tmp_df.empty:
        continue
    
    # Compute a weight for each past match of the player
    tmp_df['elapsing_time'] = (curr_year + max_day/365) - (tmp_df['Year'] + tmp_df['Day']/365)
    tmp_df['weight'] = tmp_df['elapsing_time'].apply(lambda t: 0.6**t)
    tmp_df.loc[tmp_df['elapsing_time'] <= 1, 'weight'] = 1
    tmp_df['weight'] = (0.95 * tmp_df['weight']) + (0.05 * (weight_carpet*tmp_df['surface_Carpet'] + weight_clay*tmp_df['surface_Clay'] + weight_grass*tmp_df['surface_Grass'] + weight_hard*tmp_df['surface_Hard']))
    tmp_df.drop(columns=['Year', 'Day', 'elapsing_time', 'surface_Carpet', 'surface_Clay', 'surface_Grass', 'surface_Hard'], inplace = True)

    # Compute the weighted average of the player
    weighted_means = np.average(tmp_df, weights=tmp_df['weight'],axis=0)
    weighted_df = pd.DataFrame(weighted_means.reshape(-1, len(weighted_means)), columns=list(tmp_df.columns))
    weighted_df = weighted_df.drop('weight', axis=1)
    
    # Add stats of the player in new dataframe
    new_df.at[i, 8:] = weighted_df.iloc[0, 2:10]
    new_df.at[i, 'PlayerA_bestof'] = weighted_df['best_of']
    new_df.at[i, 'PlayerA_minutes'] = weighted_df['minutes']
    new_df.at[i, 'PlayerA_Win%'] = weighted_df['Win']
    
# Concat and updating columns names
new_df = pd.concat([players_df, new_df], axis=1)
column_names = [s[8:] for s in list(new_df.columns)]
new_df.columns = column_names

# Fill last missing values by median
new_df.fillna(new_df.median(), inplace=True)
new_df

Unnamed: 0,Name,FR,righthanded,age,rank,rank_points,Win%,bestof,minutes,svpt%,1st_serve%,1st_serve_won%,2nd_serve_won%,ace%,df%,bp_faced%,bp_saved%
1,Andy Murray,0.0,1.0,30.001369,1.0,10360.0,0.805144,3.727227,124.835132,0.49221,0.593023,0.753021,0.539693,0.07148,0.028468,0.06317,0.549156
2,Novak Djokovic,0.0,1.0,29.982204,2.0,6845.0,0.807746,3.666748,114.309573,0.487503,0.65726,0.739374,0.561297,0.055915,0.025421,0.058579,0.582838
3,Stanislas Wawrinka,0.0,1.0,32.150581,3.0,5445.0,0.662957,3.622182,114.000659,0.505912,0.583816,0.740277,0.555047,0.072325,0.026079,0.065869,0.59364
4,Rafael Nadal,0.0,0.0,30.948666,4.0,5195.0,0.852986,3.54803,114.260892,0.471381,0.680937,0.733318,0.600147,0.039176,0.019496,0.059076,0.59228
5,Milos Raonic,0.0,1.0,26.401094,6.0,4360.0,0.701483,3.50926,108.571306,0.513732,0.632042,0.806332,0.565328,0.148934,0.034238,0.040076,0.539961
6,Dominic Thiem,0.0,1.0,23.696098,7.0,4035.0,0.674672,3.422099,109.429873,0.493342,0.592711,0.749961,0.53245,0.069978,0.035567,0.066176,0.561099
7,Marin Cilic,0.0,1.0,28.626968,8.0,3735.0,0.686634,3.521421,112.334895,0.497354,0.574226,0.791357,0.550654,0.109662,0.029681,0.052863,0.5474
8,Kei Nishikori,0.0,1.0,27.394936,9.0,3560.0,0.702708,3.423306,113.641997,0.484209,0.603024,0.716148,0.541136,0.038162,0.028302,0.073776,0.600936
9,Alexander Zverev,0.0,1.0,20.068447,17.0,2300.0,0.723941,3.248545,105.028593,0.505604,0.641236,0.741784,0.539098,0.084871,0.035108,0.060745,0.54819
10,David Goffin,0.0,1.0,26.436687,10.0,3055.0,0.643519,3.32332,106.526121,0.49036,0.58695,0.725708,0.524099,0.064598,0.032991,0.076268,0.568441


In [12]:
new_df.to_csv('_Data/Predictions/stats_players_2017_weight06.csv', sep=',', encoding='utf-8', float_format='%.10f', decimal='.')