In [1]:
import pandas as pd
import numpy as np

In [2]:
def get_direct_link(shared_link):
    """Convert a Google Drive sharing URL into a direct-download URL.

    The file id is read from the path segment that follows ``/d/`` (the
    layout of ``https://drive.google.com/file/d/<id>/view?...`` links), so
    the result does not depend on how many segments trail the id. Links
    without a ``/d/`` segment fall back to the second-to-last
    ``/``-separated segment.

    Parameters
    ----------
    shared_link : str
        Sharing URL of the file.

    Returns
    -------
    str
        ``https://drive.google.com/uc?export=download&id=<id>``.

    Raises
    ------
    IndexError
        If the link has no ``/d/`` segment and fewer than two
        ``/``-separated parts.
    """
    marker = '/d/'
    if marker in shared_link:
        after_marker = shared_link.split(marker, 1)[1]
        # The id ends at the next path separator or at the query string.
        file_id = after_marker.split('/', 1)[0].split('?', 1)[0]
    else:
        file_id = shared_link.split('/')[-2]
    return 'https://drive.google.com/uc?export=download&id=' + file_id

# Sharing URL of the source CSV on Google Drive.
link = "https://drive.google.com/file/d/1IGqTjvCbZy5qRVbYRdv_H9av3wg-PLFL/view?usp=drive_link"
# Rewrite it into the direct-download form and read the CSV straight from that URL.
direct_link = get_direct_link(link)
df = pd.read_csv(direct_link)

In [3]:
# Parse the lowercase 'date' column into datetimes.
# NOTE(review): the sort below and a later pd.to_datetime(df.Date, ...) call both
# use the capitalised 'Date' column — confirm that a separate lowercase 'date'
# column really exists in the CSV; if it does not, df.date raises AttributeError.
df['date'] = pd.to_datetime(df.date)
# Order the rows by the 'Date' column.
df = df.sort_values(by='Date')

In [6]:
# Column names ending in '_w_l' lose their trailing '_l' (e.g. 'x_w_l' -> 'x_w');
# every other column name is kept unchanged.
df.columns = [
    name[:-len('_l')] if name.endswith('_w_l') else name
    for name in df.columns
]

In [7]:
def rankdist(rank_w, rank_l):
    """Return the inverse-rank distance ``-(1/rank_w - 1/rank_l)``.

    A rank that is NaN contributes an inverse rank of 0 instead of
    propagating NaN into the result.

    Parameters
    ----------
    rank_w : float
        Rank of the first player (NaN when missing).
    rank_l : float
        Rank of the second player (NaN when missing).

    Returns
    -------
    float or int
        The negated difference of the two inverse ranks.
    """
    inverse_first = 0 if np.isnan(rank_w) else 1 / rank_w
    inverse_second = 0 if np.isnan(rank_l) else 1 / rank_l
    return -(inverse_first - inverse_second)

# Row-wise inverse-rank distance, once from the winner's side and once from the
# loser's side (the same function with its two arguments swapped).
df['rankdist_w'] = list(map(rankdist, df['WRank'], df['LRank']))
df['rankdist_l'] = list(map(rankdist, df['LRank'], df['WRank']))

In [9]:
def wikibuzz(winner_pageviews, winner_median, loser_pageviews, loser_median):
    """Return the difference of the two players' log pageview ratios.

    Computes ``log(winner_pageviews / winner_median)
    - log(loser_pageviews / loser_median)``.

    Returns
    -------
    float or str
        The log-ratio difference, or the literal string
        ``'ZeroDivisionError'`` when one of the divisions raises
        ``ZeroDivisionError``.
    """
    try:
        winner_log_ratio = np.log(winner_pageviews / winner_median)
        loser_log_ratio = np.log(loser_pageviews / loser_median)
    except ZeroDivisionError:
        # Sentinel string instead of a number; callers must filter it out.
        return 'ZeroDivisionError'
    return winner_log_ratio - loser_log_ratio

# Row-wise wikibuzz from the winner's side, then from the loser's side
# (the same function with the winner and loser column pairs swapped).
df['wikibuzz_w'] = list(map(
    wikibuzz,
    df['wiki_yesterday_w'], df['wiki_med365_w'],
    df['wiki_yesterday_l'], df['wiki_med365_l'],
))
df['wikibuzz_l'] = list(map(
    wikibuzz,
    df['wiki_yesterday_l'], df['wiki_med365_l'],
    df['wiki_yesterday_w'], df['wiki_med365_w'],
))

  buzz = np.log(winner_pageviews / winner_median) - np.log(loser_pageviews / loser_median)
  buzz = np.log(winner_pageviews / winner_median) - np.log(loser_pageviews / loser_median)


In [11]:
# Each new column holds the element-wise reciprocal (1 / value) of one odds column.
# The pairs are listed in the order the columns are added to the frame.
for new_column, odds_column in (
    ('inverse_bestodds_w', 'MaxW'),
    ('inverse_bestodds_l', 'MaxL'),
    ('inverse_avg_w', 'AvgW'),
    ('inverse_avg_l', 'AvgL'),
    ('inverse_B365_w', 'B365W'),
    ('inverse_B365_l', 'B365L'),
):
    df[new_column] = [1 / odds for odds in df[odds_column]]

In [12]:
# Parse the 'Date' column as datetimes using the explicit YYYY-MM-DD format.
df['Date'] = pd.to_datetime(df['Date'], format='%Y-%m-%d')

In [14]:
# Use the current index labels as the match identifier; both per-player frames
# built below carry it.
df['match_id'] = df.index

# One row per match from the winner's side: outcome = 1, with the '_w' feature
# columns renamed to side-neutral names.
df_winners = (
    df[['match_id', 'Winner', 'Date', 'Year', 'rankdist_w', 'wikibuzz_w',
        'inverse_B365_w', 'inverse_avg_w', 'inverse_bestodds_w']]
    .assign(outcome=1)
    .rename(columns={
        'Winner': 'player',
        'rankdist_w': 'rankdist',
        'wikibuzz_w': 'wikibuzz',
        'inverse_B365_w': 'inverse_B365',
        'inverse_avg_w': 'inverse_avg',
        'inverse_bestodds_w': 'inverse_best',
    })
)

# The same layout from the loser's side: outcome = 0, with the '_l' columns renamed.
df_losers = (
    df[['match_id', 'Loser', 'Date', 'Year', 'rankdist_l', 'wikibuzz_l',
        'inverse_B365_l', 'inverse_avg_l', 'inverse_bestodds_l']]
    .assign(outcome=0)
    .rename(columns={
        'Loser': 'player',
        'rankdist_l': 'rankdist',
        'wikibuzz_l': 'wikibuzz',
        'inverse_B365_l': 'inverse_B365',
        'inverse_avg_l': 'inverse_avg',
        'inverse_bestodds_l': 'inverse_best',
    })
)

In [16]:
# Stack the winner rows on top of the loser rows; this rebinds `df`, replacing
# the per-match frame with the combined per-player frame.
df = pd.concat([df_winners, df_losers], axis=0)

In [18]:
# Keep only rows whose wikibuzz is neither +inf, -inf nor the
# 'ZeroDivisionError' sentinel string, then drop rows with a missing value in
# any of the four listed feature columns, and finally order the rows by 'Date'.
df = (
    df.loc[lambda frame: (frame['wikibuzz'] != np.inf)
           & (frame['wikibuzz'] != -np.inf)
           & (frame['wikibuzz'] != 'ZeroDivisionError')]
    .dropna(subset=['wikibuzz', 'rankdist', 'inverse_avg', 'inverse_B365'])
    .sort_values(by='Date')
)

In [21]:
# Write the filtered per-player frame to 'unclean_final.csv' in the working
# directory, without the index column.
df.to_csv('unclean_final.csv', index=False)