Scraping [derby names](https://en.wikipedia.org/wiki/Roller_derby#Derby_names) from publicly-accessible lists

In [17]:
import sys
!{sys.executable} -m pip install -qq pandas requests bs4 lxml advertools html5lib tqdm

In [18]:
import string
from pathlib import Path

from bs4 import BeautifulSoup
import requests

import pandas as pd
import advertools as adv
from tqdm import tqdm

# from tqdm.auto import tqdm

In [19]:
if "google.colab" in sys.modules:
    from google.colab import files
    from sklearn.utils import shuffle

In [20]:
session = requests.Session()

In [21]:
# Download the WFTDA list of certified officials
try:
    wftda_df = pd.DataFrame()
    url = "https://resources.wftda.org/officiating/roller-derby-certification-program-for-officials/roster-of-certified-officials/"
    print("Downloading names from {}".format(url))
    
    r = session.get(url)
    soup = BeautifulSoup(r.text, "lxml")
    rows = soup.find_all("h5")
    urls = [r.find("a")["href"] for r in rows]
    names = [r.find("a").get_text() for r in rows]
    wftda_df = pd.DataFrame({"Name": names, "url": urls})
except Exception as e:
    print(e)

Downloading names from https://resources.wftda.org/officiating/roller-derby-certification-program-for-officials/roster-of-certified-officials/


In [22]:
if len(wftda_df) == 0:
    wftda_df = pd.read_csv("wftda.csv")
else:
    wftda_df.to_csv("wftda.csv", index=False)
wftda_df

Unnamed: 0,Name,url
0,9mm Ram-Paige,https://resources.wftda.org/officiating/roller...
1,A. Grue,https://resources.wftda.org/officiating/roller...
2,A’Blazing Grace,https://resources.wftda.org/officiating/roller...
3,Adam Smasher,https://resources.wftda.org/officiating/roller...
4,Adam Splitter,https://resources.wftda.org/officiating/roller...
...,...,...
570,Yu Cypher,https://resources.wftda.org/officiating/roller...
571,Yvel Saint Laurent,https://resources.wftda.org/officiating/roller...
572,Zebra 3,https://resources.wftda.org/officiating/roller...
573,Zero,https://resources.wftda.org/officiating/roller...


In [23]:
# Download TwoEvils list of skaters
url = "https://www.twoevils.org/rollergirls/"
print("Downloading names from %s" % url)
twoevils_df = pd.DataFrame()

try:
    twoevils_df = pd.read_html(io=url, skiprows=1)[0]
    twoevils_df.columns = [h.replace("Skater", "").strip() for h in twoevils_df.iloc[0]]
    twoevils_df = twoevils_df.rename(columns={"Date Added": "Registered"})
    twoevils_df = twoevils_df.iloc[1:-1, :].dropna(how="all")
    twoevils_df["url"] = url
except Exception as e:
    print(e)

Downloading names from https://www.twoevils.org/rollergirls/
<urlopen error [WinError 10060] A connection attempt failed because the connected party did not properly respond after a period of time, or established connection failed because connected host has failed to respond>


In [24]:
if len(twoevils_df) == 0:
    twoevils_df = pd.read_csv("twoevils.csv")
else:
    twoevils_df.to_csv("twoevils.csv", index=False)
twoevils_df

Unnamed: 0,Name,Number,Registered,League,url
0,!(ED,REF,2012-02-20,Santiago Roller Derby,https://www.twoevils.org/rollergirls/
1,!Wolfespit,16,2011-02-01,Chilli Padi Derby Grrrls,https://www.twoevils.org/rollergirls/
2,"""A"" Cup Annihilator",36A,2010-10-13,Billings Roller Derby,https://www.twoevils.org/rollergirls/
3,"""Bobby"" Val Halen",1984,2011-02-24,Roller Derby Quebec,https://www.twoevils.org/rollergirls/
4,"""Chupa'clark'bra""",333,2012-01-09,Killa Hurtz Roller Girls,https://www.twoevils.org/rollergirls/
...,...,...,...,...,...
40536,Zuul,REF,2008-04-13,Orange County Roller Girls,https://www.twoevils.org/rollergirls/
40537,Zuzi Power,33,2012-04-20,White Horse Warriors,https://www.twoevils.org/rollergirls/
40538,Zwen Garden,E13,2011-03-24,Deja Vu Roller Derby,https://www.twoevils.org/rollergirls/
40539,Zyklon C,HCN1,2010-03-08,Croydon Roller Derby,https://www.twoevils.org/rollergirls/


In [25]:
# Download list from Derby Roll Call
try:
    drc_url =  "http://www.derbyrollcall.com/everyone"
    drc_df = pd.read_html(io=drc_url)[0].rename(columns={"#": "Number"}).dropna(subset=["Name"])
    drc_df["url"] = drc_url
except Exception as e:
    print(e)

In [26]:
if len(drc_df) == 0:
    drc_df = pd.read_csv(filepath_or_buffer="derbyrollcall.csv")
else:
    drc_df.to_csv(path_or_buf="derbyrollcall.csv", index=False)
drc_df

Unnamed: 0,Name,Number,League,Country,Registered,url
0,Sausage Roller,14,Manchester Roller Derby,United Kingdom,"1st January, 1970",http://www.derbyrollcall.com/everyone
1,James Mean,27,Manchester Roller Derby,United Kingdom,"20th January, 2014",http://www.derbyrollcall.com/everyone
2,Yvel Saint Laurent,14,Knights of Oldham Roller Derby & Rainy City Ro...,United Kingdom,"20th January, 2014",http://www.derbyrollcall.com/everyone
3,Pancake,1928,Rainy City Roller Derby,United Kingdom,"20th January, 2014",http://www.derbyrollcall.com/everyone
4,Robert Quadriguez,101,Crash Test Brummies,United Kingdom,"20th January, 2014",http://www.derbyrollcall.com/everyone
...,...,...,...,...,...,...
40581,Dani Phantom,09,,United States,"1st June, 2024",http://www.derbyrollcall.com/everyone
40582,Smushroom,5783,Coventry roller derby,United Kingdom,"1st June, 2024",http://www.derbyrollcall.com/everyone
40583,D-apeX,5,Wine Town Rollers,United States,"1st June, 2024",http://www.derbyrollcall.com/everyone
40584,BrowSin,137,Texoma Roller Derby,United States,"3rd June, 2024",http://www.derbyrollcall.com/everyone


In [27]:
# Fetch all names beginning with a letter from RollerDerbyRoster
def get_page_names(initial_letter, timeout=30):
    temp_names = []
    url = "https://rollerderbyroster.com/view-names/?ini={}".format(letter)
    # print("Downloading names from {}".format(url))
    try:
        r = session.get(url)
        soup = BeautifulSoup(r.text, "lxml")
        rows = soup.find_all("ul")
        # Use only last unordered list - this is where names are!
        for _, li in enumerate(rows[-1]):
            # Name should be the text of the link within the list item
            name = li.find("a").get_text()
            temp_names.append(name)
    except requests.Timeout:
        print("Timeout!")
        pass
    return temp_names

In [28]:
# Get all names from RollerDerbyRoster
initial_letters = string.ascii_letters + string.digits + string.punctuation
rdr_df = pd.DataFrame()

pbar = tqdm(iterable=initial_letters)
for letter in pbar:
    try:
        pbar.set_description(desc=f"{len(rdr_df)} names found - Processing {letter}")
        temp_names = get_page_names(initial_letter=letter)
        temp_df = pd.DataFrame(data={"Name": temp_names, "url": url})
        rdr_df = pd.concat(objs=[rdr_df, temp_df])
    except Exception as e:
        print(e)
        pass

0 names found - Processing a:   0%|          | 0/94 [00:00<?, ?it/s]

0 names found - Processing b:   1%|          | 1/94 [00:10<16:24, 10.58s/it]

'int' object has no attribute 'get_text'


0 names found - Processing c:   2%|▏         | 2/94 [00:23<18:19, 11.95s/it]

'int' object has no attribute 'get_text'


0 names found - Processing d:   3%|▎         | 3/94 [00:32<16:05, 10.61s/it]

'int' object has no attribute 'get_text'


0 names found - Processing e:   4%|▍         | 4/94 [00:44<16:29, 11.00s/it]

'int' object has no attribute 'get_text'


0 names found - Processing f:   5%|▌         | 5/94 [00:50<13:48,  9.30s/it]

'int' object has no attribute 'get_text'


0 names found - Processing g:   6%|▋         | 6/94 [00:56<11:58,  8.17s/it]

'int' object has no attribute 'get_text'


0 names found - Processing h:   7%|▋         | 7/94 [01:00<09:55,  6.84s/it]

'int' object has no attribute 'get_text'


0 names found - Processing i:   9%|▊         | 8/94 [01:06<09:21,  6.52s/it]

'int' object has no attribute 'get_text'


0 names found - Processing j:  10%|▉         | 9/94 [01:11<08:35,  6.06s/it]

'int' object has no attribute 'get_text'


0 names found - Processing k:  11%|█         | 10/94 [01:17<08:33,  6.11s/it]

'int' object has no attribute 'get_text'


0 names found - Processing l:  12%|█▏        | 11/94 [01:22<08:03,  5.83s/it]

'int' object has no attribute 'get_text'


0 names found - Processing m:  13%|█▎        | 12/94 [01:30<08:44,  6.39s/it]

'int' object has no attribute 'get_text'


0 names found - Processing n:  14%|█▍        | 13/94 [01:52<14:59, 11.10s/it]

'int' object has no attribute 'get_text'


0 names found - Processing o:  15%|█▍        | 14/94 [01:56<11:59,  8.99s/it]

'int' object has no attribute 'get_text'


0 names found - Processing p:  16%|█▌        | 15/94 [01:58<09:10,  6.97s/it]

'int' object has no attribute 'get_text'


0 names found - Processing q:  17%|█▋        | 16/94 [02:04<08:33,  6.59s/it]

'int' object has no attribute 'get_text'


0 names found - Processing r:  18%|█▊        | 17/94 [02:07<07:15,  5.65s/it]

'int' object has no attribute 'get_text'


0 names found - Processing s:  19%|█▉        | 18/94 [02:14<07:33,  5.97s/it]

'int' object has no attribute 'get_text'


0 names found - Processing t:  20%|██        | 19/94 [02:32<12:05,  9.68s/it]

'int' object has no attribute 'get_text'


0 names found - Processing u:  21%|██▏       | 20/94 [02:39<10:45,  8.73s/it]

'int' object has no attribute 'get_text'


0 names found - Processing v:  22%|██▏       | 21/94 [02:41<08:07,  6.68s/it]

'int' object has no attribute 'get_text'


0 names found - Processing v:  22%|██▏       | 21/94 [02:43<09:30,  7.81s/it]


KeyboardInterrupt: 

In [None]:
# If no data was found, load from CSV
if len(rdr_df) == 0:
    rdr_df = pd.read_csv("rollerderbyroster.csv")
else:
    rdr_df.to_csv("rollerderbyroster.csv", index=False)
rdr_df

Unnamed: 0,Name,url
0,A B Old Tricksee,http://www.derbyrollcall.com/everyone
1,A Blonde with No Name,http://www.derbyrollcall.com/everyone
2,A Bomb,http://www.derbyrollcall.com/everyone
3,A Bout To Snap,http://www.derbyrollcall.com/everyone
4,A Boy Named Rob,http://www.derbyrollcall.com/everyone
...,...,...
3,.50 Cal,http://www.derbyrollcall.com/everyone
4,.50 Caliber Killer,http://www.derbyrollcall.com/everyone
5,.50 Kal Kitten,http://www.derbyrollcall.com/everyone
6,.CC Rip Jaw,http://www.derbyrollcall.com/everyone


In [None]:
# Get RDNation sitemap
rdn_sitemap_url = "https://rdnation.com/sitemap.xml"
rdn_sitemaps = adv.sitemap_to_df(rdn_sitemap_url)
# League pages have a specific URL structure
rdn_sitemaps["is_league"] = (
    rdn_sitemaps["loc"].str.contains("roller-derby-league/").fillna(False)
)

In [None]:
# Extract league URLs
rdn_league_urls = sorted(
    rdn_sitemaps[
        rdn_sitemaps["is_league"]
        & (
            rdn_sitemaps["loc"].str.contains("/2/")
            | rdn_sitemaps["loc"].str.contains("/1/")
        )
    ]["loc"].tolist()
)

In [None]:
rdn_df = pd.DataFrame()


In [None]:
# Loop through league pages and extract derby names
pbar = tqdm(rdn_league_urls)

for url in pbar:
    try:
        pbar.set_description(f"{len(rdn_df)} names found")
        league_df = pd.concat(pd.read_html(url), ignore_index=True)
        league_df.rename(columns={"Derby Name": "Name"}, inplace=True)
        league_df["url"] = url
        rdn_df = pd.concat([rdn_df, league_df])
    except ValueError as e:
        continue

In [None]:
# If no data was found, load from CSV
if len(rdn_df) == 0:
    rdn_df = pd.read_csv("rdnation.csv")
else:
    rdn_df.dropna(how="all", axis="columns", inplace=True)
    rdn_df.drop_duplicates(subset=["Name", "Number"], inplace=True)
    rdn_df.sort_values(by="Name", inplace=True)
    rdn_df.to_csv("rdnation.csv", index=False)
rdn_df

Unnamed: 0,Name,Number,url
0,!Wolfespit,16,https://rdnation.com/roller-derby-league/2/chi...
1,$3 bill,STAFF,https://rdnation.com/roller-derby-league/1/car...
2,'Lil Diablo,72,https://rdnation.com/roller-derby-league/1/ham...
3,.357 Maggie,6040,https://rdnation.com/roller-derby-league/1/cha...
4,.38 SpeShell,38,https://rdnation.com/roller-derby-league/1/lav...
...,...,...,...
26004,Zulu Mother Smother,360,https://rdnation.com/roller-derby-league/1/m-i...
26005,Zulu Xray,911,https://rdnation.com/roller-derby-league/1/rol...
26006,Zuul,REF,https://rdnation.com/roller-derby-league/1/ora...
26007,Zyklon C,HCN1,https://rdnation.com/roller-derby-league/2/cro...


In [None]:
name_df = pd.concat([twoevils_df, drc_df, rdr_df, wftda_df, rdn_df], ignore_index=True)
# remove parenthetical phrases from names - eg "(cleared)"
name_df["Name"] = name_df["Name"].str.replace(r"\([^()]*\)", "").str.strip()
# remove names with only one character
name_df = name_df.loc[name_df["Name"].str.len() > 1]
name_df

  name_df["Name"] = name_df["Name"].str.replace(r"\([^()]*\)", "").str.strip()


Unnamed: 0,Name,Number,Registered,League,url,Country
0,!(ED,REF,2012-02-20,Santiago Roller Derby,https://www.twoevils.org/rollergirls/,
1,!Wolfespit,16,2011-02-01,Chilli Padi Derby Grrrls,https://www.twoevils.org/rollergirls/,
2,"""A"" Cup Annihilator",36A,2010-10-13,Billings Roller Derby,https://www.twoevils.org/rollergirls/,
3,"""Bobby"" Val Halen",1984,2011-02-24,Roller Derby Quebec,https://www.twoevils.org/rollergirls/,
4,"""Chupa'clark'bra""",333,2012-01-09,Killa Hurtz Roller Girls,https://www.twoevils.org/rollergirls/,
...,...,...,...,...,...,...
189678,Zulu Mother Smother,360,,,https://rdnation.com/roller-derby-league/1/m-i...,
189679,Zulu Xray,911,,,https://rdnation.com/roller-derby-league/1/rol...,
189680,Zuul,REF,,,https://rdnation.com/roller-derby-league/1/ora...,
189681,Zyklon C,HCN1,,,https://rdnation.com/roller-derby-league/2/cro...,


In [None]:
name_df["Number"].value_counts()

REF       2718
13        1740
11        1102
7         1039
22         980
          ... 
800cc        1
G40          1
8 bit        1
0012         1
128ï¿½       1
Name: Number, Length: 11496, dtype: int64

In [None]:
csv_file = Path("derby_names.csv")
name_df.drop_duplicates().sort_values(by=["Name"]).to_csv(csv_file, index=False)

if "google.colab" in sys.modules:
    files.download(csv_file)

In [None]:
names_only = name_df[["Name"]].drop_duplicates().sort_values(by=["Name"])
names_only.to_csv("derby_names.txt", index=False, header=False)

if "google.colab" in sys.modules:
    files.download("derby_names.txt")

In [None]:
numbers = name_df[["Number"]].copy()
numbers["Number"] = numbers["Number"].astype(str)
numbers = numbers.sort_values(by=["Number"]).drop_duplicates()
numbers.to_csv("derby_numbers.txt", index=False, header=False)

if "google.colab" in sys.modules:
    files.download("derby_numbers.txt")

In [None]:
names_numbers = (
    name_df[~name_df["Number"].isna()][["Name", "Number"]]
    .drop_duplicates()
    .sort_values(by=["Name", "Number"])
)
names_numbers.to_csv("derby_names_numbers.tsv", index=False, header=False, sep="\t")

if "google.colab" in sys.modules:
    files.download("derby_names_numbers.tsv")