In [198]:
from bs4 import BeautifulSoup as bs
import requests
import re
import pandas as pd
import numpy as np

In [3]:
# scrape main wiki page for The Amazing Race US
url = "https://amazingrace.fandom.com/wiki/The_Amazing_Race_(US)"
req = requests.get(url)
soup = bs(req.content, 'html.parser')

In [4]:
# get the ends of the urls for each of the US seasons using the table at the bottom of the page
seasons_table = soup.find('table', class_=lambda s:s.startswith("wikitable mw-collapsible"))
season_url_tails = [t['href'] for t in seasons_table.td.find_all('a')]

In [59]:
teams = []
leg_rankings = [[] for _ in range(13)]

In [217]:
season_url_head = "https://amazingrace.fandom.com"

dfs = []
for season_url_tail in season_url_tails:
    season_url = season_url_head + season_url_tail
    req = requests.get(season_url)
    soup = bs(req.content, 'html.parser')

    # get the leaderboard table for a specific season
    leaderboard_table = soup.find_all('table', attrs={'class' : 'wikitable', 
                                                    'style' : "margin:auto; text-align:center; font-size: 8pt; line-height:18px;"})[1]
    df = pd.read_html(str(leaderboard_table))[0]


    if type(df.columns) == pd.core.indexes.multi.MultiIndex:  # fixing tables with multiple column headers
        df.columns = ['_'.join(col) for col in df.columns]
    elif type(df.columns[0]) == np.int64:  # fixing tables with an extra header row
        df.columns = df.iloc[0]
        df = df.drop(np.nan, axis=1)

    # keep only rows corresponding to each team
    df = df[df.iloc[:,0].apply(lambda s: s.isnumeric())]

    # keep only columns with season placement, team members, and leg placements
    relavant_cols = [s for s in df.columns if s[0] in ['P', 'T', 'L']]  
    df = df[relavant_cols]

    dfs.append(df)

In [233]:
num_seasons = len(dfs)
for i in range(num_seasons):
    df = dfs[i]

    # standardize column names to prepare for concatinating the dataframes
    num_cols = len(df.columns)
    col_names = ['Place', 'Team']
    col_names.extend([f'Leg {i}' for i in range(1, num_cols-1)])
    df.columns = col_names

    # add in season number and number of teams for that season
    num_teams = len(df)
    df['Season'] = (i+1) * np.ones(num_teams)
    df['Number of teams'] = num_teams * np.ones(num_teams)

In [234]:
df = pd.concat(dfs, ignore_index=True)

In [235]:
df

Unnamed: 0,Place,Team,Leg 1,Leg 2,Leg 3,Leg 4,Leg 5,Leg 6,Leg 7,Leg 8,Leg 9,Leg 10,Leg 11,Leg 12,Leg 13,Season,Number of teams
0,1,Rob & Brennan,1►,3,3,6,4,3,3,3,1,1,2,1,1,1.0,11.0
1,2,Frank & Margarita,3,4,2,4,5,1,1►,2,2,2,1,2,2,1.0,11.0
2,3,Joe & Bill,2,2,4,1,2,2,2,4,4►,4,3,3,3,1.0,11.0
3,4,Kevin & Drew,9,5,1►,2,1,4,4,1,3,3,4,,,1.0,11.0
4,5,Nancy & Emily,10,8,8,3,3,5,5,5,5,,,,,1.0,11.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
386,9,Liam & Yeremi,8,8,11,6,8,9,,,,,,,,35.0,13.0
387,10,Joe & Ian,9,7,7,4,10,,,,,,,,,35.0,13.0
388,11,Jocelyn & Victor,1,1,10,11,,,,,,,,,,35.0,13.0
389,12,Elizabeth & Iliana,12,12,,,,,,,,,,,,35.0,13.0
