Scraping [derby names](https://en.wikipedia.org/wiki/Roller_derby#Derby_names) from publicly accessible lists

In [12]:
import sys
from pathlib import Path
import pandas as pd
import utils

In [2]:
# Load the WFTDA list through the project-local `utils` helper and display it.
# The displayed frame has `Name` and `url` columns.
wftda_df = utils.fetch_wftda()
wftda_df

Unnamed: 0,Name,url
0,5284 Kim Tonkin,https://resources.wftda.org/officiating/roller...
1,9mm Ram-Paige,https://resources.wftda.org/officiating/roller...
2,A. Grue,https://resources.wftda.org/officiating/roller...
3,A’Blazing Grace,https://resources.wftda.org/officiating/roller...
4,Adam Smasher,https://resources.wftda.org/officiating/roller...
...,...,...
613,Yvel Saint Laurent,https://resources.wftda.org/officiating/roller...
614,Zebra 3,https://resources.wftda.org/officiating/roller...
615,Zed,https://resources.wftda.org/officiating/roller...
616,Zero,https://resources.wftda.org/officiating/roller...


In [3]:
# Load the DRC list through the project-local `utils` helper and display it.
# The displayed frame has `Name`, `Number`, `League`, `Country` and `Registered` columns.
drc_df = utils.fetch_drc()
drc_df

Unnamed: 0,Name,Number,League,Country,Registered
0,Sausage Roller,14,Manchester Roller Derby,United Kingdom,"1st January, 1970"
1,James Mean,27,Manchester Roller Derby,United Kingdom,"20th January, 2014"
2,Yvel Saint Laurent,14,Knights of Oldham Roller Derby & Rainy City Ro...,United Kingdom,"20th January, 2014"
3,Pancake,1928,Rainy City Roller Derby,United Kingdom,"20th January, 2014"
4,Robert Quadriguez,101,Crash Test Brummies,United Kingdom,"20th January, 2014"
...,...,...,...,...,...
40581,Dani Phantom,09,,United States,"1st June, 2024"
40582,Smushroom,5783,Coventry roller derby,United Kingdom,"1st June, 2024"
40583,D-apeX,5,Wine Town Rollers,United States,"1st June, 2024"
40584,BrowSin,137,Texoma Roller Derby,United States,"3rd June, 2024"


In [4]:
# Load the RDN list through the project-local `utils` helper and display it.
# NOTE(review): the recorded run printed "Error fetching data" and showed no
# frame, so `rdn_df` is presumably None or empty when the fetch fails —
# confirm against `utils.fetch_rdn`.
rdn_df = utils.fetch_rdn()
rdn_df

Error fetching data: <urlopen error [Errno 110] Connection timed out>
Error fetching data: 'NoneType' object is not iterable


In [5]:
# Load the Two Evils list through the project-local `utils` helper and display it.
# NOTE(review): the recorded run printed a connection-timeout error and showed
# no frame, so `twoevils_df` is presumably None or empty when the fetch fails —
# confirm against `utils.fetch_twoevils`.
twoevils_df = utils.fetch_twoevils()
twoevils_df

Error fetching data: <urlopen error [Errno 110] Connection timed out>


In [6]:
# Fetch all names beginning with a letter from RollerDerbyRoster
# NOTE(review): the recorded run printed "Error fetching data" and showed no
# frame, so `rdr_df` is presumably None or empty when the fetch fails —
# confirm against `utils.fetch_rdr`.
rdr_df = utils.fetch_rdr()
rdr_df

Error fetching data: 'int' object has no attribute 'get_text'


In [13]:
# Combine every source into a single frame with a fresh 0..n-1 index.
# pd.concat silently drops None entries (unless all are None), so a source whose
# fetch failed — presumably leaving None; confirm against `utils` — does not
# break the merge.
name_df = pd.concat([twoevils_df, drc_df, rdr_df, wftda_df, rdn_df], ignore_index=True)
# remove parenthetical phrases from names - eg "(cleared)"
# regex=True is required: since pandas 2.0, Series.str.replace treats the
# pattern as a literal string by default, which would leave every parenthetical
# in place.
name_df["Name"] = (
    name_df["Name"].str.replace(r"\([^()]*\)", "", regex=True).str.strip()
)
# remove names with only one character (missing names compare False here and
# are dropped as well)
name_df = name_df.loc[name_df["Name"].str.len() > 1]
name_df

Unnamed: 0,Name,Number,League,Country,Registered,url
0,Sausage Roller,14,Manchester Roller Derby,United Kingdom,"1st January, 1970",
1,James Mean,27,Manchester Roller Derby,United Kingdom,"20th January, 2014",
2,Yvel Saint Laurent,14,Knights of Oldham Roller Derby & Rainy City Ro...,United Kingdom,"20th January, 2014",
3,Pancake,1928,Rainy City Roller Derby,United Kingdom,"20th January, 2014",
4,Robert Quadriguez,101,Crash Test Brummies,United Kingdom,"20th January, 2014",
...,...,...,...,...,...,...
41196,Yvel Saint Laurent,,,,,https://resources.wftda.org/officiating/roller...
41197,Zebra 3,,,,,https://resources.wftda.org/officiating/roller...
41198,Zed,,,,,https://resources.wftda.org/officiating/roller...
41199,Zero,,,,,https://resources.wftda.org/officiating/roller...


In [14]:
# Frequency of each number in the combined list, most common first; rows with
# no Number are excluded because value_counts drops NaN by default.
name_df["Number"].value_counts()

Number
13       702
11       409
42       401
7        393
22       381
        ... 
1853       1
11:58      1
4H         1
T1D        1
5783       1
Name: count, Length: 5933, dtype: int64

In [15]:
# Write the full combined table (all columns) to CSV, de-duplicated on whole
# rows and sorted by name.
csv_file = Path("derby_names.csv")
name_df.drop_duplicates().sort_values(by=["Name"]).to_csv(csv_file, index=False)

# When running under Google Colab, offer the file as a browser download.
if "google.colab" in sys.modules:
    # `files` is not among the notebook's top-level imports, so import it here;
    # the guard keeps this import from running outside Colab.
    from google.colab import files

    files.download(csv_file)

In [16]:
# Write the unique names, sorted, one per line with no header.
names_only = name_df[["Name"]].drop_duplicates().sort_values(by=["Name"])
names_only.to_csv("derby_names.txt", index=False, header=False)

# When running under Google Colab, offer the file as a browser download.
if "google.colab" in sys.modules:
    # `files` is not among the notebook's top-level imports, so import it here;
    # the guard keeps this import from running outside Colab.
    from google.colab import files

    files.download("derby_names.txt")

In [17]:
# Write the unique numbers, sorted as strings, one per line with no header.
# Rows without a number are dropped first: astype(str) would otherwise turn the
# missing values into the literal text "nan" and write it to the file.
numbers = name_df[["Number"]].dropna().copy()
numbers["Number"] = numbers["Number"].astype(str)
numbers = numbers.sort_values(by=["Number"]).drop_duplicates()
numbers.to_csv("derby_numbers.txt", index=False, header=False)

# When running under Google Colab, offer the file as a browser download.
if "google.colab" in sys.modules:
    # `files` is not among the notebook's top-level imports, so import it here;
    # the guard keeps this import from running outside Colab.
    from google.colab import files

    files.download("derby_numbers.txt")

In [18]:
# Write unique (name, number) pairs as tab-separated values with no header,
# keeping only rows that actually have a number.
names_numbers = (
    name_df.loc[name_df["Number"].notna(), ["Name", "Number"]]
    .drop_duplicates()
    .sort_values(by=["Name", "Number"])
)
names_numbers.to_csv("derby_names_numbers.tsv", index=False, header=False, sep="\t")

# When running under Google Colab, offer the file as a browser download.
if "google.colab" in sys.modules:
    # `files` is not among the notebook's top-level imports, so import it here;
    # the guard keeps this import from running outside Colab.
    from google.colab import files

    files.download("derby_names_numbers.tsv")