In [None]:
def get_last_stat(players_file, matches_file):
    """
    Get the latest stats from the matches file for the players in players_file.

    For every player listed in ``players_file`` the most recent match played
    before the cutoff date is looked up in ``matches_file`` (the player may
    appear either as player A or as player B) and the stats recorded for that
    match are copied into a per-player table. Players without any match
    before the cutoff keep NaN stats.

    The result is written to ``Data/stats_players_2018_with_nan.csv``; the
    function returns nothing.

    Args:
        players_file: Path to a CSV whose first column is the index and which
            contains a ``PlayerA_Name`` column.
        matches_file: Path to a CSV whose first column is the index and which
            contains ``PlayerA_name``, ``PlayerB_name``, ``Year`` and ``Day``
            columns. The stats are picked by column position (see
            ``playerA_cols`` / ``playerB_cols``); the selected player-A
            columns are assumed to carry the ``PlayerA_*`` names listed in
            ``columns`` -- TODO confirm against the actual file layout.
    """
    # Read the csv files
    df = pd.read_csv(matches_file, header=0, index_col=0)
    players_df = pd.read_csv(players_file, header=0, index_col=0)

    # Stats stored for each player
    columns = ['PlayerA_FR',
               'PlayerA_righthanded',
               'PlayerA_age',
               'PlayerA_rank',
               'PlayerA_rank_points',
               'PlayerA_Win%',
               'PlayerA_bestof',
               'PlayerA_minutes',
               'PlayerA_svpt%',
               'PlayerA_1st_serve%',
               'PlayerA_1st_serve_won%',
               'PlayerA_2nd_serve_won%',
               'PlayerA_ace%',
               'PlayerA_df%',
               'PlayerA_bp_faced%',
               'PlayerA_bp_saved%']

    # One row per player, keyed like players_df so the final concat lines up
    # whatever the number of players or their index labels.
    new_df = pd.DataFrame(index=players_df.index, columns=columns)

    # Limit date before Roland Garros 2018
    curr_year = 2018
    max_day = 148

    # Positional columns: Year, Day, then the stats of player A / player B
    playerA_cols = [2, 3, 9, 11] + list(range(13, 27))
    playerB_cols = [2, 3, 10, 12] + list(range(27, 41))

    # The date filter does not depend on the player: compute it once.
    before_cutoff = df['Year'] + df['Day'] / 365 < curr_year + max_day / 365

    for i, player in players_df.iterrows():
        name = player['PlayerA_Name']

        # Take all past matches of that player, looking for the name in
        # playerA and playerB. Rows are selected with the boolean mask itself
        # so the result does not depend on the index labels of the file.
        playerA_df = df[(df['PlayerA_name'] == name) & before_cutoff].iloc[:, playerA_cols]
        playerB_df = df[(df['PlayerB_name'] == name) & before_cutoff].iloc[:, playerB_cols]
        playerB_df.columns = list(playerA_df)
        tmp_df = pd.concat([playerA_df, playerB_df], ignore_index=True)
        if tmp_df.empty:
            continue

        # Sort by latest date
        tmp_df = tmp_df.sort_values(by=['Year', 'Day'], ascending=[False, False])

        # Add latest stats just before French Open 2018 in the new df.
        # The match row also holds Year/Day, so pick the stats by name
        # explicitly instead of relying on implicit alignment on assignment.
        new_df.loc[i, columns] = tmp_df.iloc[0].reindex(columns).values

    # Concat and strip the 8-character 'PlayerA_' prefix from every column name
    new_df = pd.concat([players_df, new_df], axis=1)
    new_df.columns = [s[8:] for s in new_df.columns]

    # Save dataset
    new_df.to_csv('Data/stats_players_2018_with_nan.csv', sep=',', encoding='utf-8', float_format='%.10f', decimal='.')


In [None]:
def fill_in_missing_stats(all_games_file, missing_stats_2018_file):
    """
    Fill in missing values of the 2018 players stats.

    The filling happens in three passes:

    1. Players whose ``age`` is missing get the five columns at positions
       1..5 of the stats table (FR, righthanded, age, rank, rank_points in
       the expected layout) from their most recent match before the cutoff
       date found in ``all_games_file``.
    2. Players that now have an age but still no ``Win%`` get, for every
       column from position 6 onwards, the mean of the (up to) five complete
       players whose ``rank_points`` are closest to theirs.
    3. Whatever is still missing is replaced by the column median.

    The result is written to ``Data/stats_players_2018.csv``; the function
    returns nothing.

    Args:
        all_games_file: Path to a CSV whose first column is the index and
            which contains ``PlayerA_name``, ``PlayerB_name``, ``Year`` and
            ``Day`` columns; the player info is picked by column position
            (see ``playerA_cols`` / ``playerB_cols``).
        missing_stats_2018_file: Path to the per-player stats CSV with
            missing values (first column is the index; must contain
            ``Name``, ``age``, ``rank_points`` and ``Win%``).
    """
    # Read the csv files. The stats table is read from the path given by the
    # caller rather than from a hard-coded location.
    df = pd.read_csv(all_games_file, header=0, index_col=0)
    players_df = pd.read_csv(missing_stats_2018_file, header=0, index_col=0)

    # Limit date before Roland Garros 2018
    # NOTE(review): get_last_stat uses day 148 as cutoff -- confirm which of
    # the two values is intended.
    curr_year = 2018
    max_day = 146

    # Positional columns: Year, Day, FR, righthanded, age, rank, rank_points
    playerA_cols = [2, 3, 10, 12, 14, 15, 16]
    playerB_cols = [2, 3, 11, 13, 25, 26, 27]
    match_cols = ['Year', 'Day', 'FR', 'righthanded', 'age', 'rank', 'rank_points']

    # Target columns of the stats table, addressed by position
    info_cols = players_df.columns[1:6]
    stat_cols = players_df.columns[6:]

    # The date filter does not depend on the player: compute it once.
    before_cutoff = df['Year'] + df['Day'] / 365 < curr_year + max_day / 365

    # For each player that misses his stats
    for i, player in players_df[players_df['age'].isna()].iterrows():
        name = player['Name']

        # Take all past matches of that player, looking for the name in
        # playerA and playerB. Rows are selected with the boolean mask itself
        # so the result does not depend on the index labels of the file.
        playerA_df = df[(df['PlayerA_name'] == name) & before_cutoff].iloc[:, playerA_cols]
        playerB_df = df[(df['PlayerB_name'] == name) & before_cutoff].iloc[:, playerB_cols]
        playerB_df.columns = list(playerA_df)
        tmp_df = pd.concat([playerA_df, playerB_df], ignore_index=True)
        if tmp_df.empty:
            continue

        # Sort by latest date
        tmp_df = tmp_df.sort_values(by=['Year', 'Day'], ascending=[False, False])

        # Add part of player info in dataset. `.at` only accepts a single
        # row/column label, so the multi-column write goes through `.loc`;
        # values are matched to the target columns by name.
        tmp_df.columns = match_cols
        latest = tmp_df.iloc[0, 2:]
        players_df.loc[i, info_cols] = latest.reindex(info_cols).values

    # For each player that misses his stats, make a mean of the players with
    # closest rank_points values
    needs_stats = players_df['Win%'].isna() & players_df['age'].notnull()
    for i, player in players_df[needs_stats].iterrows():
        player_points = player['rank_points']
        # Recomputed on every pass: rows completed earlier in this loop
        # become candidates for the following players.
        complete = players_df.dropna()
        order = (complete['rank_points'] - player_points).abs().argsort().iloc[:5]
        nearest = complete.iloc[order.values]
        neighbour_mean = nearest.iloc[:, 6:].mean(numeric_only=True)
        players_df.loc[i, stat_cols] = neighbour_mean.reindex(stat_cols).values

    # Fill last missing values by median (numeric columns only: the table
    # also holds the player names)
    players_df.fillna(players_df.median(numeric_only=True), inplace=True)

    players_df.to_csv('Data/stats_players_2018.csv', sep=',', encoding='utf-8', float_format='%.10f', decimal='.')


In [None]:
if __name__ == "__main__":
    # Step 1 (currently disabled): extract the most recent stats of each
    # player from the matches file.
    # get_last_stat(players_file, matches_file)

    # Step 2: fill in the values that are still missing.
    # NOTE(review): all_games_file and missing_stats_2018_file are not
    # defined in this part of the file -- presumably set elsewhere; confirm.
    fill_in_missing_stats(
        all_games_file=all_games_file,
        missing_stats_2018_file=missing_stats_2018_file,
    )
