In [1]:
import pandas as pd
import numpy as np

import warnings
# Silence two warning categories for the whole kernel session.
# NOTE(review): ignoring RuntimeWarning also hides numpy's divide-by-zero and
# log-of-zero warnings produced by the feature cells further down.
warnings.filterwarnings(action="ignore", category=UnicodeWarning)
warnings.filterwarnings(action="ignore", category=RuntimeWarning)

In [3]:
def get_direct_link(shared_link):
    """Convert a Google Drive sharing URL into a direct-download URL.

    The file id is read from the path segment that follows ``/d/`` (as in
    ``https://drive.google.com/file/d/<ID>/view?usp=...``), so the result is
    the same whether or not the URL ends with ``/view``, a trailing slash or
    a query string.  When the URL has no ``d`` segment, the second-to-last
    ``/``-separated segment is used, which is the rule this function
    originally applied to every URL.

    Args:
        shared_link: The sharing URL, as a string.

    Returns:
        ``'https://drive.google.com/uc?export=download&id='`` followed by the
        extracted file id.

    Raises:
        IndexError: If the fallback rule is used and the URL has fewer than
            two ``/``-separated segments.
    """
    segments = shared_link.split('/')
    if 'd' in segments[:-1]:
        # The id sits right after the literal "d" segment; drop any query
        # string or fragment that is glued to it when "/view" is missing.
        file_id = segments[segments.index('d') + 1].split('?')[0].split('#')[0]
    else:
        # Unknown URL layout: keep the historical "second-to-last" rule.
        file_id = segments[-2]
    return 'https://drive.google.com/uc?export=download&id=' + file_id

# Sharing URL of the source dataset (a Stata file hosted on Google Drive).
link = "https://drive.google.com/file/d/1DXSKofF4c3YYFXybH1slRyPascey8Bt1/view?usp=drive_link"

# Turn the sharing URL into a direct-download URL and let pandas fetch it.
direct_link = get_direct_link(shared_link=link)
df = pd.read_stata(direct_link)

In [40]:
# Combine the separate year / month / day columns into one datetime column.
df['date'] = pd.to_datetime(df.loc[:, ['year', 'month', 'day']])

In [8]:
def rankdist(rank_w: int = 0, rank_l: int = 0) -> float:
    """Return the negated difference of reciprocal ranks.

    Computes ``-(1/rank_w - 1/rank_l)``.  A rank that is NaN contributes a
    reciprocal of 0 instead of being inverted.

    Args:
        rank_w: First rank; NaN is accepted.
        rank_l: Second rank; NaN is accepted.

    Returns:
        The negated gap between the two reciprocals.

    Note:
        A rank of 0 is inverted like any other value, so the default
        arguments (plain Python ``0``) raise ``ZeroDivisionError``.
    """
    reciprocal_w = 0 if np.isnan(rank_w) else 1/rank_w
    reciprocal_l = 0 if np.isnan(rank_l) else 1/rank_l
    return - (reciprocal_w - reciprocal_l)

# Row-wise rank distance from each side's point of view: the "_l" column is
# the same computation with the two rank arguments swapped.
df['rankdist_w'] = list(map(rankdist, df['wrank'], df['lrank']))
df['rankdist_l'] = list(map(rankdist, df['lrank'], df['wrank']))

In [9]:
def wikibuzz(winner_pageviews, winner_median, loser_pageviews, loser_median):
    """Return the difference between two log page-view ratios.

    Computes ``log(winner_pageviews / winner_median)
    - log(loser_pageviews / loser_median)``.

    Returns:
        The numeric difference, or the string ``'ZeroDivisionError'`` when
        one of the divisions raises ``ZeroDivisionError``.  Callers must
        filter that sentinel out before treating the result as a number.
    """
    try:
        winner_log_ratio = np.log(winner_pageviews / winner_median)
        loser_log_ratio = np.log(loser_pageviews / loser_median)
    except ZeroDivisionError:
        # Sentinel value, not an exception: downstream code filters on it.
        return 'ZeroDivisionError'
    return winner_log_ratio - loser_log_ratio

# Row-wise buzz score from each side's point of view: the "_l" column passes
# the loser's page-view columns first and the winner's second.
df['wikibuzz_w'] = list(map(
    wikibuzz,
    df['wiki_yesterday_w'], df['wiki_med365_w'],
    df['wiki_yesterday_l'], df['wiki_med365_l'],
))
df['wikibuzz_l'] = list(map(
    wikibuzz,
    df['wiki_yesterday_l'], df['wiki_med365_l'],
    df['wiki_yesterday_w'], df['wiki_med365_w'],
))

In [10]:
# Element-wise reciprocal of each source column, stored under a new name.
# NOTE(review): the source columns look like bookmaker odds (max / average /
# B365) -- confirm against the dataset's codebook.
for _inverse_col, _source_col in (
    ('inverse_bestodds_w', 'maxw'),
    ('inverse_bestodds_l', 'maxl'),
    ('inverse_avg_w', 'avgw'),
    ('inverse_avg_l', 'avgl'),
    ('inverse_B365_w', 'b365w'),
    ('inverse_B365_l', 'b365l'),
):
    df[_inverse_col] = [1/x for x in df[_source_col]]

In [11]:
# Use the row index as the match identifier, so that the winner row and the
# loser row created below for the same match share one id.
df['match_id'] = df.index

# Winner-side view: select the "_w" features, give them side-neutral names
# and label the rows with outcome = 1.
df_winners = (
    df[['match_id', 'winner', 'date', 'year', 'rankdist_w', 'wikibuzz_w',
        'inverse_B365_w', 'inverse_avg_w', 'inverse_bestodds_w']]
    .rename(columns={
        'winner': 'player',
        'rankdist_w': 'rankdist',
        'wikibuzz_w': 'wikibuzz',
        'inverse_B365_w': 'inverse_B365',
        'inverse_avg_w': 'inverse_avg',
        'inverse_bestodds_w': 'inverse_best',
    })
    .assign(outcome=1)
)

# Loser-side view: same layout built from the "_l" features, outcome = 0.
df_losers = (
    df[['match_id', 'loser', 'date', 'year', 'rankdist_l', 'wikibuzz_l',
        'inverse_B365_l', 'inverse_avg_l', 'inverse_bestodds_l']]
    .rename(columns={
        'loser': 'player',
        'rankdist_l': 'rankdist',
        'wikibuzz_l': 'wikibuzz',
        'inverse_B365_l': 'inverse_B365',
        'inverse_avg_l': 'inverse_avg',
        'inverse_bestodds_l': 'inverse_best',
    })
    .assign(outcome=0)
)

# Stack both views: `df` now holds one row per (match, player).
df = pd.concat([df_winners, df_losers], axis=0)

In [13]:
# Drop rows whose wikibuzz is +/-infinity or the 'ZeroDivisionError' sentinel,
# then rows with a missing wikibuzz, rankdist or inverse_avg, and finally
# order the remaining rows by date.
df = (
    df
    .loc[lambda frame: frame["wikibuzz"] != np.inf]
    .loc[lambda frame: frame["wikibuzz"] != -np.inf]
    .loc[lambda frame: frame["wikibuzz"] != 'ZeroDivisionError']
    .dropna(subset=['wikibuzz', 'rankdist', 'inverse_avg'])
    .sort_values(by='date')
)

In [16]:
# Write the filtered long-format table to disk, without the index column.
df.to_csv(path_or_buf='their_final_data.csv', index=False)