In [1]:
import re

import numpy as np
import pandas as pd

In [2]:
def get_direct_link(shared_link):
    """Turn a Google Drive sharing URL into a direct-download URL.

    The file id is looked up, in order, as:

    1. the path segment following ``/d/`` (``.../file/d/<id>/view?...``),
    2. the value of an ``id=<id>`` query parameter (``.../open?id=<id>``),
    3. the second-to-last ``/``-separated segment of the link (the original
       positional lookup, kept so previously accepted links behave the same).

    Args:
        shared_link: Sharing URL as a string.

    Returns:
        ``'https://drive.google.com/uc?export=download&id=' + <id>``.

    Raises:
        IndexError: If no id pattern matches and the link has fewer than two
            ``/``-separated segments.
    """
    id_pattern = r'[A-Za-z0-9_-]+'
    match = (
        re.search(r'/d/(' + id_pattern + r')', shared_link)
        or re.search(r'[?&]id=(' + id_pattern + r')', shared_link)
    )
    if match:
        file_id = match.group(1)
    else:
        # Positional fallback: identical to the previous implementation.
        file_id = shared_link.split('/')[-2]
    return 'https://drive.google.com/uc?export=download&id=' + file_id

# Sharing URL of the source CSV; it is converted to a direct-download URL and
# read straight into a DataFrame.
link = "https://drive.google.com/file/d/1Aw6GPSBIVI1TQDEkEH5nRxBxm0SBfBR4/view?usp=drive_link"
direct_link = get_direct_link(shared_link=link)
df = pd.read_csv(filepath_or_buffer=direct_link)

In [3]:
# Parse the lower-case 'date' column into datetimes, then order the rows by 'Date'.
# NOTE(review): 'date' and 'Date' are two different column names — confirm both
# exist in the CSV and that sorting on 'Date' (which is only converted to
# datetime in a later cell) is intended.
df['date'] = pd.to_datetime(df['date'])
df = df.sort_values('Date')

In [6]:
# Column names ending in '_w_l' lose their last two characters (the trailing
# '_l'); every other column name is kept unchanged.
df.columns = [
    name[:-len('_l')] if name.endswith('_w_l') else name
    for name in df.columns
]

In [7]:
def rankdist(rank_w, rank_l):
    """Return the negated difference of inverse ranks, ``-(1/rank_w - 1/rank_l)``.

    A rank that is NaN contributes 0 in place of its inverse.

    Args:
        rank_w: First rank (numeric, may be NaN).
        rank_l: Second rank (numeric, may be NaN).

    Returns:
        ``-(inv_w - inv_l)`` where each ``inv_*`` is ``1/rank`` or 0 for NaN.
    """
    inv_w = 0 if np.isnan(rank_w) else 1 / rank_w
    inv_l = 0 if np.isnan(rank_l) else 1 / rank_l
    return -(inv_w - inv_l)

# Row-wise rank distance, once with the winner's rank as first argument and
# once with the loser's rank as first argument.
df['rankdist_w'] = list(map(rankdist, df['WRank'], df['LRank']))
df['rankdist_l'] = list(map(rankdist, df['LRank'], df['WRank']))

In [8]:
# Show (rows, columns) of df after the rankdist_w / rankdist_l columns were added.
df.shape

(17550, 28)

In [9]:
def wikibuzz(winner_pageviews, winner_median, loser_pageviews, loser_median):
    """Return ``log(winner_pageviews / winner_median) - log(loser_pageviews / loser_median)``.

    Args:
        winner_pageviews: Page-view count for the first player.
        winner_median: Median page views for the first player.
        loser_pageviews: Page-view count for the second player.
        loser_median: Median page views for the second player.

    Returns:
        The difference of the two log-ratios, or the string
        ``'ZeroDivisionError'`` when either division raises
        ``ZeroDivisionError`` (e.g. a plain Python zero as a median).
        A zero ratio does not raise: ``np.log`` then yields an infinite value.
    """
    try:
        winner_log_ratio = np.log(winner_pageviews / winner_median)
        loser_log_ratio = np.log(loser_pageviews / loser_median)
    except ZeroDivisionError:
        # Sentinel string; callers filter on this exact value.
        return 'ZeroDivisionError'
    return winner_log_ratio - loser_log_ratio

# Row-wise buzz, once with the winner's page-view columns first and once with
# the loser's page-view columns first.
df['wikibuzz_w'] = list(map(
    wikibuzz,
    df['wiki_yesterday_w'], df['wiki_med365_w'],
    df['wiki_yesterday_l'], df['wiki_med365_l'],
))
df['wikibuzz_l'] = list(map(
    wikibuzz,
    df['wiki_yesterday_l'], df['wiki_med365_l'],
    df['wiki_yesterday_w'], df['wiki_med365_w'],
))

  buzz = np.log(winner_pageviews / winner_median) - np.log(loser_pageviews / loser_median)
  buzz = np.log(winner_pageviews / winner_median) - np.log(loser_pageviews / loser_median)


In [10]:
# Show (rows, columns) of df after the wikibuzz_w / wikibuzz_l columns were added.
df.shape

(17550, 30)

In [11]:
# Each new column holds the element-wise reciprocal (1 / value) of an odds column.
# Pairs are (new column, source odds column), in the order the columns are added.
inverse_odds_columns = (
    ('inverse_bestodds_w', 'MaxW'),
    ('inverse_bestodds_l', 'MaxL'),
    ('inverse_avg_w', 'AvgW'),
    ('inverse_avg_l', 'AvgL'),
    ('inverse_B365_w', 'B365W'),
    ('inverse_B365_l', 'B365L'),
)
for inverse_column, odds_column in inverse_odds_columns:
    df[inverse_column] = [1 / odds for odds in df[odds_column]]

In [12]:
# Parse the 'Date' column as datetimes using the explicit YYYY-MM-DD format.
df['Date'] = pd.to_datetime(df['Date'], format='%Y-%m-%d')

In [13]:
# Show (rows, columns) of df after the six inverse-odds columns were added.
df.shape

(17550, 36)

In [14]:
# Tag each match with its index label so the two player rows built below can
# be tied back to the same match.
df['match_id'] = df.index

# One frame per side of the match: take that side's columns, give them
# side-neutral names, and append the outcome flag (1 = winner, 0 = loser).
df_winners = (
    df[['match_id', 'Winner', 'Date', 'Year', 'rankdist_w', 'wikibuzz_w',
        'inverse_B365_w', 'inverse_avg_w', 'inverse_bestodds_w']]
    .rename(columns={
        'Winner': 'player',
        'rankdist_w': 'rankdist',
        'wikibuzz_w': 'wikibuzz',
        'inverse_B365_w': 'inverse_B365',
        'inverse_avg_w': 'inverse_avg',
        'inverse_bestodds_w': 'inverse_best',
    })
    .assign(outcome=1)
)
df_losers = (
    df[['match_id', 'Loser', 'Date', 'Year', 'rankdist_l', 'wikibuzz_l',
        'inverse_B365_l', 'inverse_avg_l', 'inverse_bestodds_l']]
    .rename(columns={
        'Loser': 'player',
        'rankdist_l': 'rankdist',
        'wikibuzz_l': 'wikibuzz',
        'inverse_B365_l': 'inverse_B365',
        'inverse_avg_l': 'inverse_avg',
        'inverse_bestodds_l': 'inverse_best',
    })
    .assign(outcome=0)
)

In [15]:
# Render the winners frame (renamed columns plus outcome = 1) as the cell output.
df_winners

Unnamed: 0,match_id,player,Date,Year,rankdist,wikibuzz,inverse_B365,inverse_avg,inverse_best,outcome
3460,3460,Mattek-Sands B.,2015-07-01,2015,0.136528,,0.250000,0.255102,0.239234,1
16330,16330,Azarenka V.,2015-07-01,2015,-0.031250,ZeroDivisionError,0.925926,0.934579,0.909091,1
1485,1485,Petkovic A.,2015-07-01,2015,-0.061328,,0.909091,0.900901,0.892857,1
3200,3200,Bencic B.,2015-07-01,2015,-0.033960,,0.862069,0.869565,0.854701,1
6682,6682,Kulichkova E.,2015-07-01,2015,0.002189,,0.444444,0.444444,0.416667,1
...,...,...,...,...,...,...,...,...,...,...
2437,2437,Rus A.,2023-07-29,2023,-0.011836,3.307854,0.735294,0.735294,0.699301,1
6471,6471,Cocciaretto E.,2023-07-29,2023,-0.017358,ZeroDivisionError,0.636943,0.632911,0.595238,1
7632,7632,Swiatek I.,2023-07-30,2023,-0.990826,-0.460734,0.961538,0.970874,0.952381,1
7633,7633,Swiatek I.,2023-07-30,2023,-0.993464,-1.590987,0.952381,0.961538,0.952381,1


In [16]:
# Stack winner rows on top of loser rows. From here on `df` is the long-format
# frame with one row per player per match; index labels are kept, so each
# match's label appears twice.
df = pd.concat(objs=[df_winners, df_losers], axis='index')

In [17]:
# Show (rows, columns) of the stacked winners + losers frame.
df.shape

(35100, 10)

In [18]:
# Keep only rows with a usable wikibuzz value (not +/-inf, not the
# 'ZeroDivisionError' sentinel) and no missing value in wikibuzz, rankdist,
# inverse_avg or inverse_B365; then order the rows by Date.
# NOTE(review): inverse_best is not checked for missing values — confirm that is intended.
df = (
    df
    .loc[lambda frame: (frame['wikibuzz'] != np.inf)
                       & (frame['wikibuzz'] != -np.inf)
                       & (frame['wikibuzz'] != 'ZeroDivisionError')]
    .dropna(subset=['wikibuzz', 'rankdist', 'inverse_avg', 'inverse_B365'])
    .sort_values(by='Date')
)

In [19]:
# Show (rows, columns) of df after the wikibuzz / missing-value filters.
df.shape

(32226, 10)

In [20]:
# Render the filtered, Date-sorted long-format frame as the cell output.
df

Unnamed: 0,match_id,player,Date,Year,rankdist,wikibuzz,inverse_B365,inverse_avg,inverse_best,outcome
9107,9107,Pliskova K.,2015-07-02,2015,0.032537,-2.164335,0.250000,0.242131,0.215054,1
15520,15520,Duan Y.Y.,2015-07-02,2015,0.004274,1.140344,0.363636,0.369004,0.347222,0
4324,4324,Svitolina E.,2015-07-02,2015,-0.042430,2.274586,0.546448,0.526316,0.487805,0
12904,12904,Cornet A.,2015-07-02,2015,-0.028840,2.163103,0.602410,0.595238,0.568182,0
4156,4156,Allertova D.,2015-07-02,2015,0.187952,0.993304,0.111111,0.115607,0.105263,0
...,...,...,...,...,...,...,...,...,...,...
10391,10391,Stefanini L.,2023-07-29,2023,-0.002723,-1.355923,0.454545,0.446429,0.418410,0
7633,7633,Swiatek I.,2023-07-30,2023,-0.993464,-1.590987,0.952381,0.961538,0.952381,1
7632,7632,Swiatek I.,2023-07-30,2023,-0.990826,-0.460734,0.961538,0.970874,0.952381,1
7632,7632,Wickmayer Y.,2023-07-30,2023,0.990826,0.460734,0.076923,0.078247,0.054855,0


In [21]:
# Write the filtered long-format frame to '<data_dir>final.csv' without the index column.
# NOTE(review): `data_dir` is not defined in any cell visible here (execution
# counts 4-5 are missing) — confirm it is set before this cell runs and that it
# ends with a path separator, since the file name is appended by plain `+`.
df.to_csv(data_dir + 'final.csv', index=False)