In [1]:
import pandas as pd
import numpy as np

def get_direct_link(shared_link):
    """Turn a Google Drive share URL into a direct-download URL.

    The file id is taken from the path segment that follows ``/d/``
    (e.g. ``.../file/d/<id>/view?usp=...``). Any query string or fragment is
    ignored, so links without a trailing ``/view`` segment also work.

    If no ``/d/<id>`` segment is found, the second-to-last ``/``-separated
    piece of the link is used instead.

    Parameters
    ----------
    shared_link : str
        Share URL of the file.

    Returns
    -------
    str
        ``https://drive.google.com/uc?export=download&id=<id>``.
    """
    # Strip query string and fragment so they can never end up inside the id.
    path = shared_link.split('?', 1)[0].split('#', 1)[0]
    segments = path.split('/')

    file_id = ''
    # Only look at segments that have a successor, so index + 1 is always valid.
    if 'd' in segments[:-1]:
        file_id = segments[segments.index('d') + 1]
    if not file_id:
        # Unrecognised layout: fall back to the positional rule.
        file_id = shared_link.split('/')[-2]

    return 'https://drive.google.com/uc?export=download&id=' + file_id

# Share URL of the source CSV on Google Drive.
link = "https://drive.google.com/file/d/1ArH8wifpOapYT8vw2dsEe3kuYeVqLVQP/view?usp=drive_link"

# Convert the share URL to its direct-download form, then read the CSV
# straight from that URL into the working frame.
direct_link = get_direct_link(shared_link=link)
df = pd.read_csv(filepath_or_buffer=direct_link)

In [2]:
# Parse the 'date' column into datetimes, then order the rows chronologically.
# The sort key must be the same 'date' column that was just parsed: the rest
# of the notebook refers to it in lower case, and sorting by 'Date' either
# raises KeyError or orders by an unparsed column.
# sort_values keeps the original index labels (no reset_index here).
df['date'] = pd.to_datetime(df['date'])
df = df.sort_values(by='date')

In [5]:
# Columns whose name ends in '_h_a' lose their last two characters
# (the trailing '_a'); every other column name is kept unchanged.
df.columns = [
    name if not name.endswith('_h_a') else name[:-2]
    for name in df.columns
]

In [6]:
def wikibuzz(home_pageviews, home_median, away_pageviews, away_median):
    """Difference of log page-view ratios between two sides.

    Computes ``log(home_pageviews / home_median) - log(away_pageviews / away_median)``.

    Parameters
    ----------
    home_pageviews, home_median : number
        Page views and the median they are normalised by, for the first side.
    away_pageviews, away_median : number
        Same quantities for the second side.

    Returns
    -------
    float
        The log-ratio difference. NaN is returned when a division raises
        ``ZeroDivisionError`` (a zero median given as a plain Python number),
        so the result is always numeric and a column built from it keeps a
        float dtype instead of mixing in a string marker.

    Notes
    -----
    Zero page views do not raise: ``np.log(0)`` gives ``-inf`` (with a NumPy
    RuntimeWarning), so the result can be ``+inf``, ``-inf`` or NaN. Callers
    are expected to filter non-finite values.
    """
    try:
        return np.log(home_pageviews / home_median) - np.log(away_pageviews / away_median)
    except ZeroDivisionError:
        # Missing value rather than a string sentinel: still removable with
        # notna()/dropna(), but does not turn the column into object dtype.
        return np.nan

# Row-wise buzz with the '_h' columns as the first side and '_a' as the second.
df['wikibuzz_h'] = [
    wikibuzz(views_first, median_first, views_second, median_second)
    for views_first, median_first, views_second, median_second in zip(
        df['wiki_yesterday_h'], df['wiki_med365_h'],
        df['wiki_yesterday_a'], df['wiki_med365_a'],
    )
]
# Same computation with the two sides swapped.
df['wikibuzz_a'] = [
    wikibuzz(views_first, median_first, views_second, median_second)
    for views_first, median_first, views_second, median_second in zip(
        df['wiki_yesterday_a'], df['wiki_med365_a'],
        df['wiki_yesterday_h'], df['wiki_med365_h'],
    )
]

  buzz = np.log(home_pageviews / home_median) - np.log(away_pageviews / away_median)
  buzz = np.log(home_pageviews / home_median) - np.log(away_pageviews / away_median)
  buzz = np.log(home_pageviews / home_median) - np.log(away_pageviews / away_median)
  buzz = np.log(home_pageviews / home_median) - np.log(away_pageviews / away_median)


In [7]:
# Reciprocal of each B365* column, stored under the matching inverse_B365_* name.
# NOTE(review): B365H/D/A look like bookmaker odds for home/draw/away — confirm.
for inverse_col, odds_col in (
    ('inverse_B365_h', 'B365H'),
    ('inverse_B365_d', 'B365D'),
    ('inverse_B365_a', 'B365A'),
):
    df[inverse_col] = [1 / odds for odds in df[odds_col]]

In [9]:
# Each row gets its current index label as match_id, so the two per-side
# frames built below can be traced back to the same source row.
df['match_id'] = df.index

# Home-side view: the '_h' buzz column plus the three inverse-odds columns.
df_home = df.loc[:, [
    'match_id', 'HomeTeam', 'date', 'year', 'wikibuzz_h',
    'inverse_B365_h', 'inverse_B365_d', 'inverse_B365_a', 'FTR',
]].copy()

# Away-side view: the '_a' buzz column, with the '_a' and '_h' odds columns
# listed in the opposite order.
df_away = df.loc[:, [
    'match_id', 'AwayTeam', 'date', 'year', 'wikibuzz_a',
    'inverse_B365_a', 'inverse_B365_d', 'inverse_B365_h', 'FTR',
]].copy()

In [10]:
# FTR code -> binary outcome: 1 only for the side's own code, 0 otherwise.
outcome_mapping_h = dict(H=1, D=0, A=0)
outcome_mapping_a = dict(A=1, D=0, H=0)

In [11]:
# Translate each frame's FTR code into its binary 'outcome' column using the
# mapping that belongs to that side.
for side_frame, side_mapping in (
    (df_home, outcome_mapping_h),
    (df_away, outcome_mapping_a),
):
    side_frame['outcome'] = side_frame['FTR'].map(side_mapping)

In [13]:
# Give both per-side frames the same column names: the side's own team and
# buzz become 'team'/'wikibuzz', its own inverse odds become '_w' and the
# other side's become '_l'.
df_home.rename(
    columns={
        'HomeTeam': 'team',
        'wikibuzz_h': 'wikibuzz',
        'inverse_B365_h': 'inverse_B365_w',
        'inverse_B365_d': 'inverse_B365_d',
        'inverse_B365_a': 'inverse_B365_l',
    },
    inplace=True,
)
df_away.rename(
    columns={
        'AwayTeam': 'team',
        'wikibuzz_a': 'wikibuzz',
        'inverse_B365_a': 'inverse_B365_w',
        'inverse_B365_d': 'inverse_B365_d',
        'inverse_B365_h': 'inverse_B365_l',
    },
    inplace=True,
)

# Stack the two frames row-wise; each match_id now appears in two rows.
df = pd.concat([df_home, df_away], axis='index')

In [15]:
# Drop rows whose wikibuzz equals +inf, -inf or the literal string
# 'ZeroDivisionError', one value at a time.
for rejected_value in (np.inf, -np.inf, 'ZeroDivisionError'):
    df = df.loc[df["wikibuzz"] != rejected_value]

# Then keep only rows where every model input is present.
for required_col in ('wikibuzz', 'inverse_B365_w', 'inverse_B365_d', 'inverse_B365_l'):
    df = df[df[required_col].notna()]

# Final chronological ordering of the remaining rows.
df.sort_values(by='date', inplace = True)

In [17]:
# Write the cleaned frame to disk, without the index column.
df.to_csv(path_or_buf='football_final.csv', index=False)