Scraping [derby names](https://en.wikipedia.org/wiki/Roller_derby#Derby_names) from publicly-accessible lists

In [80]:
import sys
!{sys.executable} -m pip install -qq pandas requests bs4 lxml advertools html5lib tqdm

In [81]:
import string
import random
from datetime import datetime
from pathlib import Path

from bs4 import BeautifulSoup
import requests

import numpy as np
import pandas as pd
import advertools as adv
from tqdm import tqdm

# from tqdm.auto import tqdm


In [82]:
if "google.colab" in sys.modules:
    from google.colab import files
    from sklearn.utils import shuffle


In [83]:
session = requests.Session()


In [84]:
try:
    wftda_df = pd.DataFrame()
    url = "https://resources.wftda.org/officiating/roller-derby-certification-program-for-officials/roster-of-certified-officials/"
    print("Downloading names from {}".format(url))
    session.headers.update({"User-Agent": "Mozilla/5.0"})
    r = session.get(url)
    soup = BeautifulSoup(r.text, "lxml")
    rows = soup.find_all("h5")
    urls = [r.find("a")["href"] for r in rows]
    names = [r.find("a").get_text() for r in rows]
    wftda_df = pd.DataFrame({"Name": names, "url": urls})
except Exception as e:
    print(e)


Downloading names from https://resources.wftda.org/officiating/roller-derby-certification-program-for-officials/roster-of-certified-officials/


In [85]:
if len(wftda_df) == 0:
    wftda_df = pd.read_csv("wftda.csv")
else:
    wftda_df.to_csv("wftda.csv", index=False)
wftda_df


Unnamed: 0,Name,url
0,9mm Ram-Paige,https://resources.wftda.org/officiating/roller...
1,A. Grue,https://resources.wftda.org/officiating/roller...
2,A’Blazing Grace,https://resources.wftda.org/officiating/roller...
3,Adam Smasher,https://resources.wftda.org/officiating/roller...
4,Admiral Mayhem,https://resources.wftda.org/officiating/roller...
...,...,...
546,Yu Cypher,https://resources.wftda.org/officiating/roller...
547,Yvel Saint Laurent,https://resources.wftda.org/officiating/roller...
548,Zebra 3,https://resources.wftda.org/officiating/roller...
549,Zero,https://resources.wftda.org/officiating/roller...


In [86]:
url = "https://www.twoevils.org/rollergirls/"
print("Downloading names from %s" % url)
twoevils_df = pd.DataFrame()

try:
    twoevils_df = pd.read_html(url, skiprows=1)[0]
    twoevils_df.columns = [h.replace("Skater", "").strip() for h in twoevils_df.iloc[0]]
    twoevils_df = twoevils_df.rename(columns={"Date Added": "Registered"})
    twoevils_df = twoevils_df.iloc[1:-1, :].dropna(how="all")
    twoevils_df["url"] = url
except Exception as e:
    print(e)


Downloading names from https://www.twoevils.org/rollergirls/
<urlopen error [WinError 10060] A connection attempt failed because the connected party did not properly respond after a period of time, or established connection failed because connected host has failed to respond>


In [87]:
if len(twoevils_df) == 0:
    twoevils_df = pd.read_csv("twoevils.csv")
else:
    twoevils_df.to_csv("twoevils.csv", index=False)
twoevils_df


Unnamed: 0,Name,Number,Registered,League,url
0,!(ED,REF,2012-02-20,Santiago Roller Derby,https://www.twoevils.org/rollergirls/
1,!Wolfespit,16,2011-02-01,Chilli Padi Derby Grrrls,https://www.twoevils.org/rollergirls/
2,"""A"" Cup Annihilator",36A,2010-10-13,Billings Roller Derby,https://www.twoevils.org/rollergirls/
3,"""Bobby"" Val Halen",1984,2011-02-24,Roller Derby Quebec,https://www.twoevils.org/rollergirls/
4,"""Chupa'clark'bra""",333,2012-01-09,Killa Hurtz Roller Girls,https://www.twoevils.org/rollergirls/
...,...,...,...,...,...
40536,Zuul,REF,2008-04-13,Orange County Roller Girls,https://www.twoevils.org/rollergirls/
40537,Zuzi Power,33,2012-04-20,White Horse Warriors,https://www.twoevils.org/rollergirls/
40538,Zwen Garden,E13,2011-03-24,Deja Vu Roller Derby,https://www.twoevils.org/rollergirls/
40539,Zyklon C,HCN1,2010-03-08,Croydon Roller Derby,https://www.twoevils.org/rollergirls/


In [88]:
try:
    drc_df = pd.DataFrame()
    url = "http://www.derbyrollcall.com/everyone"
    print("Downloading names from %s" % url)
    drc_df = pd.concat(pd.read_html(url))
    drc_df = drc_df.rename(columns={"#": "Number"})
    drc_df["url"] = url
except Exception as e:
    print(e)


Downloading names from http://www.derbyrollcall.com/everyone
HTTP Error 503: Service Unavailable


In [89]:
if len(drc_df) == 0:
    drc_df = pd.read_csv("derbyrollcall.csv")
else:
    drc_df.to_csv("derbyrollcall.csv", index=False)
drc_df


Unnamed: 0,Name,Number,League,Country,Registered,url
0,Sausage Roller,M14,Arcadia Roller Derby,United Kingdom,"1st January, 1970",http://www.derbyrollcall.com/everyone
1,James Mean,27,Manchester Roller Derby,United Kingdom,"20th January, 2014",http://www.derbyrollcall.com/everyone
2,Yvel Saint Laurent,14,Knights of Oldham Roller Derby & Rainy City Ro...,United Kingdom,"20th January, 2014",http://www.derbyrollcall.com/everyone
3,Pancake,1928,Rainy City Roller Derby,United Kingdom,"20th January, 2014",http://www.derbyrollcall.com/everyone
4,Robert Quadriguez,101,Crash Test Brummies,United Kingdom,"20th January, 2014",http://www.derbyrollcall.com/everyone
...,...,...,...,...,...,...
38861,Sinister Sixx,6,,United States,"8th August, 2022",http://www.derbyrollcall.com/everyone
38862,Bust'yer Bubbles,,SRDL,Canada,"9th August, 2022",http://www.derbyrollcall.com/everyone
38863,Kimical Burn,,,United States,"10th August, 2022",http://www.derbyrollcall.com/everyone
38864,Rumbleteaser,26,Rock Coast Roller Derby,United States,"10th August, 2022",http://www.derbyrollcall.com/everyone


In [90]:
rdn_sitemap_url = "https://rdnation.com/sitemap.xml"
rdn_sitemaps = adv.sitemap_to_df(rdn_sitemap_url)
rdn_sitemaps["is_league"] = (
    rdn_sitemaps["loc"].str.contains("roller-derby-league/").fillna(False)
)


2022-08-31 14:13:43,438 | INFO | sitemaps.py:536 | sitemap_to_df | Getting https://rdnation.com/sitemaps?p=0


In [91]:
# rdn_league_urls = sorted(rdn_sitemaps[rdn_sitemaps['is_league']]['loc'].tolist())
rdn_league_urls = sorted(
    rdn_sitemaps[
        rdn_sitemaps["is_league"]
        & (
            rdn_sitemaps["loc"].str.contains("/2/")
            | rdn_sitemaps["loc"].str.contains("/1/")
        )
    ]["loc"].tolist()
)
# rdn_league_urls


In [92]:
rdn_df = pd.DataFrame()


In [93]:
pbar = tqdm(rdn_league_urls)

for url in pbar:
    try:
        # pbar.set_description(f"Processing {url}")
        # pbar.set_description(f"{len(rdn_df)} names found - Processing {url}")
        pbar.set_description(f"{len(rdn_df)} names found")
        league_df = pd.concat(pd.read_html(url), ignore_index=True)
        league_df.rename(columns={"Derby Name": "Name"}, inplace=True)
        league_df["url"] = url
        # print(league_df)
        rdn_df = pd.concat([rdn_df, league_df])
        # print(f'Found {len(league_df)} names at {url} - {len(rdn_df)} total names')
    except ValueError as e:
        # print(f'No names found at {url}')
        continue


37545 names found: 100%|██████████| 1389/1389 [07:27<00:00,  3.11it/s]


In [99]:
if len(rdn_df) == 0:
    rdn_df = pd.read_csv("rdnation.csv")
else:
    rdn_df.dropna(how="all", axis="columns", inplace=True)
    rdn_df.drop_duplicates(subset=['Name','Number'], inplace=True)
    rdn_df.sort_values(by="Name", inplace=True)
    rdn_df.to_csv("rdnation.csv", index=False)
rdn_df


Unnamed: 0,Name,Number,url
5,!Wolfespit,16,https://rdnation.com/roller-derby-league/2/chi...
1,$3 bill,STAFF,https://rdnation.com/roller-derby-league/1/car...
0,'Lil Diablo,72,https://rdnation.com/roller-derby-league/2/ham...
1,.357 Maggie,6040,https://rdnation.com/roller-derby-league/1/cha...
11,.38 SpeShell,38,https://rdnation.com/roller-derby-league/1/lav...
...,...,...,...
14,Zulu Mother Smother,360,https://rdnation.com/roller-derby-league/1/m-i...
10,Zulu Xray,911,https://rdnation.com/roller-derby-league/1/rol...
88,Zuul,REF,https://rdnation.com/roller-derby-league/1/ora...
1,Zyklon C,HCN1,https://rdnation.com/roller-derby-league/2/cro...


In [100]:
def get_page_names(initial_letter, timeout=30):
    temp_names = []
    url = "https://rollerderbyroster.com/view-names/?ini={}".format(letter)
    # print("Downloading names from {}".format(url))
    try:
        response = session.get(url=url, timeout=timeout)
        r = session.get(url)
        soup = BeautifulSoup(r.text, "lxml")
        rows = soup.find_all("ul")
        # Use only last unordered list - this is where names are!
        for idx, li in enumerate(rows[-1]):
            # Name should be the text of the link within the list item
            name = li.find("a").get_text()
            temp_names.append(name)
    except requests.Timeout:
        print("Timeout!")
        pass
    return temp_names


In [101]:
initial_letters = string.ascii_letters + string.digits + string.punctuation
rdr_df = pd.DataFrame()


In [102]:
pbar = tqdm(initial_letters)
for letter in pbar:
    try:
        pbar.set_description(f"{len(rdr_df)} names found - Processing {letter}")
        temp_names = get_page_names(initial_letter=letter)
        temp_df = pd.DataFrame(data={"Name": temp_names, "url": url})
        rdr_df = pd.concat([rdr_df, temp_df])
    except Exception as e:
        print(e)
        pass


82551 names found - Processing &:  71%|███████▏  | 67/94 [06:48<04:38, 10.32s/it]

Timeout!


82584 names found - Processing `:  95%|█████████▍| 89/94 [08:03<00:52, 10.42s/it]

Timeout!


82584 names found - Processing ~: 100%|██████████| 94/94 [08:17<00:00,  5.29s/it]


In [103]:
if len(rdr_df) == 0:
    rdr_df = rdn_df.sort_values(by="Name").drop_duplicates(subset=['Name'])
    rdr_df = pd.read_csv("rollerderbyroster.csv")
else:
    rdr_df.to_csv("rollerderbyroster.csv", index=False)
rdr_df


Unnamed: 0,Name,url
0,A B Old Tricksee,https://rdnation.com/roller-derby-league/2/yuk...
1,A Blonde with No Name,https://rdnation.com/roller-derby-league/2/yuk...
2,A Bomb,https://rdnation.com/roller-derby-league/2/yuk...
3,A Bout To Snap,https://rdnation.com/roller-derby-league/2/yuk...
4,A Boy Named Rob,https://rdnation.com/roller-derby-league/2/yuk...
...,...,...
3,.50 Cal,https://rdnation.com/roller-derby-league/2/yuk...
4,.50 Caliber Killer,https://rdnation.com/roller-derby-league/2/yuk...
5,.50 Kal Kitten,https://rdnation.com/roller-derby-league/2/yuk...
6,.CC Rip Jaw,https://rdnation.com/roller-derby-league/2/yuk...


In [104]:
name_df = pd.concat([twoevils_df, drc_df, rdr_df, wftda_df, rdn_df], ignore_index=True)
# remove parenthetical phrases from names - eg "(cleared)"
name_df["Name"] = name_df["Name"].str.replace(r"\([^()]*\)", "").str.strip()
name_df = name_df.loc[name_df["Name"].str.len() > 1]
name_df


  name_df["Name"] = name_df["Name"].str.replace(r"\([^()]*\)", "").str.strip()


Unnamed: 0,Name,Number,Registered,League,url,Country
0,!(ED,REF,2012-02-20,Santiago Roller Derby,https://www.twoevils.org/rollergirls/,
1,!Wolfespit,16,2011-02-01,Chilli Padi Derby Grrrls,https://www.twoevils.org/rollergirls/,
2,"""A"" Cup Annihilator",36A,2010-10-13,Billings Roller Derby,https://www.twoevils.org/rollergirls/,
3,"""Bobby"" Val Halen",1984,2011-02-24,Roller Derby Quebec,https://www.twoevils.org/rollergirls/,
4,"""Chupa'clark'bra""",333,2012-01-09,Killa Hurtz Roller Girls,https://www.twoevils.org/rollergirls/,
...,...,...,...,...,...,...
188546,Zulu Mother Smother,360,,,https://rdnation.com/roller-derby-league/1/m-i...,
188547,Zulu Xray,911,,,https://rdnation.com/roller-derby-league/1/rol...,
188548,Zuul,REF,,,https://rdnation.com/roller-derby-league/1/ora...,
188549,Zyklon C,HCN1,,,https://rdnation.com/roller-derby-league/2/cro...,


In [105]:
csv_file = Path("derby_names.csv")
name_df.drop_duplicates().sort_values(by=["Name"]).to_csv(csv_file, index=False)

if "google.colab" in sys.modules:
    files.download(csv_file)


In [106]:
names_only = name_df[["Name"]].drop_duplicates().sort_values(by=["Name"])
names_only.to_csv("derby_names.txt", index=False, header=False)

if "google.colab" in sys.modules:
    files.download("derby_names.txt")


In [113]:
numbers = name_df[["Number"]].copy()
numbers['Number'] = numbers['Number'].astype(str)
numbers = numbers.sort_values(by=['Number']).drop_duplicates()
numbers.to_csv("derby_numbers.txt", index=False, header=False)

if "google.colab" in sys.modules:
    files.download("derby_numbers.txt")


In [109]:
names_numbers = (
    name_df[~name_df["Number"].isna()][["Name", "Number"]]
    .drop_duplicates()
    .sort_values(by=["Name", "Number"])
)
names_numbers.to_csv("derby_names_numbers.tsv", index=False, header=False, sep="\t")

if "google.colab" in sys.modules:
    files.download("derby_names_numbers.tsv")
