Scraping [derby names](https://en.wikipedia.org/wiki/Roller_derby#Derby_names) from publicly-accessible lists

In [1]:
import sys
!{sys.executable} -m pip install -qq pandas requests bs4 lxml

In [2]:
import string
import random
from datetime import datetime
from pathlib import Path

from bs4 import BeautifulSoup
import requests

import numpy as np
import pandas as pd




In [3]:
if "google.colab" in sys.modules:
    from google.colab import files
    from sklearn.utils import shuffle


In [4]:
training_file = Path("derby_names.txt")
model_name = "derbynames"


In [5]:
session = requests.Session()


In [6]:
try:
    drc_df=pd.DataFrame()
    url = "http://www.derbyrollcall.com/everyone"
    print("Downloading names from %s" % url)
    drc_df = pd.concat(pd.read_html(url))
    drc_df = drc_df.rename(columns={"#": "Number"})
    drc_df["url"] = url
    drc_df.to_csv("derbyrollcall.csv", index=False)
    drc_df
except Exception as e:
    print(e)


Downloading names from http://www.derbyrollcall.com/everyone
HTTP Error 503: Service Unavailable


In [7]:
url = "https://www.twoevils.org/rollergirls/"
print("Downloading names from %s" % url)
twoevils_df = pd.read_html(url, skiprows=1)[0]
twoevils_df.columns = [h.replace("Skater", "").strip() for h in twoevils_df.iloc[0]]
twoevils_df = twoevils_df.rename(columns={"Date Added": "Registered"})
twoevils_df = twoevils_df.iloc[1:-1, :].dropna(how="all")
twoevils_df["url"] = url
twoevils_df.to_csv("twoevils.csv", index=False)
twoevils_df


Downloading names from https://www.twoevils.org/rollergirls/


Unnamed: 0,Name,Number,Registered,League,url
1,!(ED,REF,2012-02-20,Santiago Roller Derby,https://www.twoevils.org/rollergirls/
2,!Wolfespit,16,2011-02-01,Chilli Padi Derby Grrrls,https://www.twoevils.org/rollergirls/
3,"""A"" Cup Annihilator",36A,2010-10-13,Billings Roller Derby,https://www.twoevils.org/rollergirls/
4,"""Bobby"" Val Halen",1984,2011-02-24,Roller Derby Quebec,https://www.twoevils.org/rollergirls/
5,"""Chupa'clark'bra""",333,2012-01-09,Killa Hurtz Roller Girls,https://www.twoevils.org/rollergirls/
...,...,...,...,...,...
40537,Zuul,REF,2008-04-13,Orange County Roller Girls,https://www.twoevils.org/rollergirls/
40538,Zuzi Power,33,2012-04-20,White Horse Warriors,https://www.twoevils.org/rollergirls/
40539,Zwen Garden,E13,2011-03-24,Deja Vu Roller Derby,https://www.twoevils.org/rollergirls/
40540,Zyklon C,HCN1,2010-03-08,Croydon Roller Derby,https://www.twoevils.org/rollergirls/


In [8]:
initial_letters = string.ascii_letters + string.digits + string.punctuation
rdr_df = pd.DataFrame()


def get_page_names(initial_letter, timeout=30):
    temp_names = []
    url = "https://rollerderbyroster.com/view-names/?ini={}".format(letter)
    print("Downloading names from {}".format(url))
    try:
        response = session.get(url=url, timeout=timeout)
        r = session.get(url)
        soup = BeautifulSoup(r.text, "lxml")
        rows = soup.find_all("ul")
        # Use only last unordered list - this is where names are!
        for idx, li in enumerate(rows[-1]):
            # Name should be the text of the link within the list item
            name = li.find("a").get_text()
            temp_names.append(name)
    except requests.Timeout:
        print("Timeout!")
        pass
    return temp_names


for letter in initial_letters:
    try:
        temp_names = get_page_names(initial_letter=letter)
        temp_df = pd.DataFrame(data={"Name": temp_names, "url": url})
        rdr_df = pd.concat([rdr_df, temp_df])
    except Exception as e:
        print(e)
        pass

rdr_df.to_csv("rollerderbyroster.csv", index=False)
rdr_df


Downloading names from https://rollerderbyroster.com/view-names/?ini=a
Downloading names from https://rollerderbyroster.com/view-names/?ini=b
Downloading names from https://rollerderbyroster.com/view-names/?ini=c
Downloading names from https://rollerderbyroster.com/view-names/?ini=d
Downloading names from https://rollerderbyroster.com/view-names/?ini=e
Downloading names from https://rollerderbyroster.com/view-names/?ini=f
Downloading names from https://rollerderbyroster.com/view-names/?ini=g
Downloading names from https://rollerderbyroster.com/view-names/?ini=h
Downloading names from https://rollerderbyroster.com/view-names/?ini=i
Downloading names from https://rollerderbyroster.com/view-names/?ini=j
Downloading names from https://rollerderbyroster.com/view-names/?ini=k
Downloading names from https://rollerderbyroster.com/view-names/?ini=l
Downloading names from https://rollerderbyroster.com/view-names/?ini=m
Downloading names from https://rollerderbyroster.com/view-names/?ini=n
Downlo

Unnamed: 0,Name,url
0,A B Old Tricksee,https://www.twoevils.org/rollergirls/
1,A Blonde with No Name,https://www.twoevils.org/rollergirls/
2,A Bomb,https://www.twoevils.org/rollergirls/
3,A Bout To Snap,https://www.twoevils.org/rollergirls/
4,A Boy Named Rob,https://www.twoevils.org/rollergirls/
...,...,...
3,.50 Cal,https://www.twoevils.org/rollergirls/
4,.50 Caliber Killer,https://www.twoevils.org/rollergirls/
5,.50 Kal Kitten,https://www.twoevils.org/rollergirls/
6,.CC Rip Jaw,https://www.twoevils.org/rollergirls/


In [9]:
url = "https://resources.wftda.org/officiating/roller-derby-certification-program-for-officials/roster-of-certified-officials/"
print("Downloading names from {}".format(url))
session.headers.update({"User-Agent": "Mozilla/5.0"})
r = session.get(url)
soup = BeautifulSoup(r.text, "lxml")
rows = soup.find_all("h5")
urls = [r.find("a")["href"] for r in rows]
names = [r.find("a").get_text() for r in rows]
wftda_df = pd.DataFrame({"Name": names, "url": urls})
wftda_df.to_csv("wftda.csv", index=False)
wftda_df


Downloading names from https://resources.wftda.org/officiating/roller-derby-certification-program-for-officials/roster-of-certified-officials/


Unnamed: 0,Name,url
0,9mm Ram-Paige,https://resources.wftda.org/officiating/roller...
1,A. Grue,https://resources.wftda.org/officiating/roller...
2,A’Blazing Grace,https://resources.wftda.org/officiating/roller...
3,Adam Smasher,https://resources.wftda.org/officiating/roller...
4,Admiral Mayhem,https://resources.wftda.org/officiating/roller...
...,...,...
546,Yu Cypher,https://resources.wftda.org/officiating/roller...
547,Yvel Saint Laurent,https://resources.wftda.org/officiating/roller...
548,Zebra 3,https://resources.wftda.org/officiating/roller...
549,Zero,https://resources.wftda.org/officiating/roller...


In [12]:
name_df = pd.concat([twoevils_df, drc_df, rdr_df, wftda_df], ignore_index=True)
# remove parenthetical phrases from names - eg "(cleared)"
name_df["Name"] = name_df["Name"].str.replace(r"\([^()]*\)", "").str.strip()
name_df = name_df.loc[name_df["Name"].str.len() > 1]
name_df


  name_df["Name"] = name_df["Name"].str.replace(r"\([^()]*\)", "").str.strip()


Unnamed: 0,Name,Number,Registered,League,url
0,!(ED,REF,2012-02-20,Santiago Roller Derby,https://www.twoevils.org/rollergirls/
1,!Wolfespit,16,2011-02-01,Chilli Padi Derby Grrrls,https://www.twoevils.org/rollergirls/
2,"""A"" Cup Annihilator",36A,2010-10-13,Billings Roller Derby,https://www.twoevils.org/rollergirls/
3,"""Bobby"" Val Halen",1984,2011-02-24,Roller Derby Quebec,https://www.twoevils.org/rollergirls/
4,"""Chupa'clark'bra""",333,2012-01-09,Killa Hurtz Roller Girls,https://www.twoevils.org/rollergirls/
...,...,...,...,...,...
123649,Yu Cypher,,,,https://resources.wftda.org/officiating/roller...
123650,Yvel Saint Laurent,,,,https://resources.wftda.org/officiating/roller...
123651,Zebra 3,,,,https://resources.wftda.org/officiating/roller...
123652,Zero,,,,https://resources.wftda.org/officiating/roller...


In [14]:
csv_file = Path("derby_names.csv")
name_df.drop_duplicates().sort_values(by=["Name"]).to_csv(csv_file, index=False)

if 'google.colab' in sys.modules:
    files.download(csv_file)


In [15]:
names_only = name_df[["Name"]].drop_duplicates().sort_values(by=["Name"])
names_only.to_csv("derby_names.txt", index=False, header=False)

if 'google.colab' in sys.modules:
    files.download("derby_names.txt")


In [16]:
numbers = name_df[["Number"]].drop_duplicates().sort_values(by=["Number"])
numbers.to_csv("derby_numbers.txt", index=False, header=False)

if 'google.colab' in sys.modules:
    files.download("derby_numbers.txt")


In [None]:
names_numbers = (
    name_df[~name_df["Number"].isna()][["Name", "Number"]]
    .drop_duplicates()
    .sort_values(by=["Name", "Number"])
)
names_numbers.to_csv("derby_names_numbers.tsv", index=False, header=False, sep="\t")

if 'google.colab' in sys.modules:
    files.download("derby_names_numbers.tsv")
