In [1]:
from bs4 import BeautifulSoup as bs
import requests
import re
import pandas as pd
import numpy as np
import os

In [34]:
# scrape main wiki page for The Amazing Race US
url = "https://amazingrace.fandom.com/wiki/The_Amazing_Race_(US)"
req = requests.get(url)
soup = bs(req.content, 'html.parser')

In [35]:
# get the ends of the urls for each of the US seasons using the table at the bottom of the page
seasons_table = soup.find('table', class_=lambda s:s.startswith("wikitable mw-collapsible"))
season_url_tails = [t['href'] for t in seasons_table.td.find_all('a')]

In [36]:
teams = []
leg_rankings = [[] for _ in range(13)]

In [37]:
season_url_head = "https://amazingrace.fandom.com"

dfs = []
for season_url_tail in season_url_tails:
    season_url = season_url_head + season_url_tail
    req = requests.get(season_url)
    soup = bs(req.content, 'html.parser')

    # get the leaderboard table for a specific season
    leaderboard_table = soup.find_all('table', attrs={'class' : 'wikitable', 
                                                    'style' : "margin:auto; text-align:center; font-size: 8pt; line-height:18px;"})[1]
    df = pd.read_html(str(leaderboard_table))[0]


    if type(df.columns) == pd.core.indexes.multi.MultiIndex:  # fixing tables with multiple column headers
        df.columns = ['_'.join(col) for col in df.columns]
    elif type(df.columns[0]) == np.int64:  # fixing tables with an extra header row
        df.columns = df.iloc[0]
        df = df.drop(np.nan, axis=1)

    # keep only rows corresponding to each team
    df = df[df.iloc[:,0].apply(lambda s: s.isnumeric())]

    # keep only columns with season placement, team members, and leg placements
    relavant_cols = [s for s in df.columns if s[0] in ['P', 'T', 'L']]  
    df = df[relavant_cols]

    dfs.append(df)

In [38]:
num_seasons = len(dfs)
for i in range(num_seasons):
    df = dfs[i]

    # standardize column names to prepare for concatinating the dataframes
    num_cols = len(df.columns)
    col_names = ['Place', 'Team']
    col_names.extend([f'Leg {i}' for i in range(1, num_cols-1)])
    df.columns = col_names

    # add in season number and number of teams for that season
    num_teams = len(df)
    df['Season'] = (i+1) * np.ones(num_teams)
    df['Number of teams'] = num_teams * np.ones(num_teams)

In [39]:
df = pd.concat(dfs, ignore_index=True)

In [40]:
df = df.replace({"Dis": np.nan})

In [41]:
def get_numeric(x):
    if type(x) == float:
        return np.nan
    else:
        return re.findall('\d+', x)[0]

for col in df.columns:
    if col.startswith("Leg"):
        df[col] = df[col].apply(get_numeric)

In [42]:
df['Season'] = df['Season'].astype(int)
df['Number of teams'] = df['Number of teams'].astype(int)

In [46]:
os.mkdir('data')
df.to_csv('data/placements.csv', index=False)

In [3]:
df = pd.read_csv('data/placements.csv')

In [7]:
leg_cols = [c for c in df.columns if c[0] == 'L']

In [8]:
for leg_col in leg_cols:
    df[f'{leg_col} normalized'] = df[leg_col]/df['Number of teams']

In [11]:
df['Place normalized'] = df['Place']/df['Number of teams']

In [29]:
np.random.seed(20)
season_shuffle = np.random.permutation(np.arange(1,36))

In [33]:
df_test = df[df['Season'].isin(season_shuffle[0:7])]
df_valid = df[df['Season'].isin(season_shuffle[8:14])]
df_train = df[df['Season'].isin(season_shuffle[15:])]

In [34]:
df_train['Season'].unique()

array([ 1,  3,  7,  8,  9, 10, 12, 16, 17, 19, 20, 21, 23, 27, 29, 30, 31,
       32, 33, 35], dtype=int64)