Generating [derby names](https://en.wikipedia.org/wiki/Roller_derby#Derby_names) from publicly accessible lists

Adapted from Max Woolf's notebook: https://drive.google.com/file/d/1mMKGnVxirJnqDViH7BDJxFqWrsXlPSoK/view?usp=sharing

Inspired by Janelle Shane's blog post: http://aiweirdness.com/post/174466734677/neural-network-generated-roller-derby-names

In [13]:
import string
import random
from datetime import datetime
from pathlib import Path
import concurrent.futures
from urllib.parse import quote

from google.colab import files
from bs4 import BeautifulSoup
import requests

import numpy as np
import pandas as pd
from sklearn.utils import shuffle

In [14]:
# Label for the model; not referenced by the cells visible in this notebook
# section (presumably used by a later training step -- confirm).
model_name = "derbynames"
# Path of the plain-text names file; likewise not referenced by the visible
# cells, which spell the same filename out literally when exporting.
training_file = Path("derby_names.txt")

In [15]:
# One shared HTTP session, reused by every cell below that calls session.get().
session = requests.Session()

In [None]:
# Source 1: Derby Roll Call's full listing.
url = "http://www.derbyrollcall.com/everyone"
print("Downloading names from %s" % url)
# read_html yields one frame per HTML table on the page; stack them all,
# give the "#" column a readable name, and tag every row with its source.
drc_df = (
    pd.concat(pd.read_html(url))
    .rename(columns={"#": "Number"})
    .assign(url=url)
)
drc_df

Downloading names from http://www.derbyrollcall.com/everyone


In [None]:
# Source 2: the Two Evils roller girls registry.
url = "https://www.twoevils.org/rollergirls/"
print("Downloading names from %s" % url)
# Only the first table on the page is used, ignoring its first row.
twoevils_df = pd.read_html(url, skiprows=1)[0]
# The first remaining row holds the column titles; strip the "Skater" prefix
# from each so they read e.g. "Name" rather than "Skater Name".
twoevils_df.columns = [
    title.replace('Skater', '').strip() for title in twoevils_df.iloc[0]
]
twoevils_df = (
    twoevils_df
    .rename(columns={'Date Added': 'Registered'})
    # Drop the title row (first) and the final row, then any all-empty rows.
    .iloc[1:-1, :]
    .dropna(how='all')
    .assign(url=url)
)
twoevils_df

In [None]:
# Characters to request roster pages for, one page per character:
# A-Z first, then 0-9, then ASCII punctuation.
initial_letters = "".join(
    (string.ascii_uppercase, string.digits, string.punctuation)
)
# Start from an empty frame for the rollerderbyroster.com names.
rdr_df = pd.DataFrame()

def get_page_names(initial_letter, timeout=30):
    """Scrape the names listed on one rollerderbyroster.com index page.

    Uses the module-level ``session`` for the HTTP request and prints the URL
    being fetched.

    Args:
        initial_letter: Character sent as the ``ini`` query parameter to pick
            which page of names to fetch.
        timeout: Seconds to wait for the HTTP response.

    Returns:
        A list with the link text of each list item in the last ``<ul>`` on
        the page. The list is empty when the request times out or the page
        contains no ``<ul>``.
    """
    temp_names = []
    # Percent-encode the initial: punctuation such as '#', '&' or '+' would
    # otherwise be parsed as URL syntax instead of being sent as the value.
    url = "https://rollerderbyroster.com/view-names/?ini={}".format(
        quote(initial_letter, safe="")
    )
    print("Downloading names from {}".format(url))
    try:
        # Fetch once, always bounded by the timeout.
        response = session.get(url=url, timeout=timeout)
    except requests.Timeout:
        print("Timeout!")
        return temp_names

    soup = BeautifulSoup(response.text, "lxml")
    rows = soup.find_all('ul')
    if not rows:
        # No lists on the page at all, so there are no names to collect.
        return temp_names

    # Use only last unordered list - this is where names are!
    # Restrict to its direct <li> children so whitespace/text nodes between
    # the items are not mistaken for entries.
    for li in rows[-1].find_all('li', recursive=False):
        # Name should be the text of the link within the list item
        link = li.find('a')
        if link is not None:
            temp_names.append(link.get_text())
    return temp_names

# Fetch every index page, then build rdr_df once from the collected pages.
# (DataFrame.append returned a new frame -- and no longer exists in pandas 2 --
# so appending inside the loop without reassigning left rdr_df empty.)
rdr_frames = []
for letter in initial_letters:
    # Record the roster page each name came from, rather than whatever value
    # the global `url` was left holding by an earlier cell.
    page_url = "https://rollerderbyroster.com/view-names/?ini={}".format(
        quote(letter, safe="")
    )
    try:
        page_names = get_page_names(initial_letter=letter)
        if page_names:
            rdr_frames.append(
                pd.DataFrame(data={'Name': page_names, 'url': page_url})
            )
    except Exception as e:
        # Best effort: report the failure and carry on with the next page.
        print(e)

if rdr_frames:
    rdr_df = pd.concat(rdr_frames, ignore_index=True)
else:
    # Nothing was scraped; keep the expected columns so later cells still work.
    rdr_df = pd.DataFrame(columns=['Name', 'url'])

rdr_df

In [None]:
# Source 4: WFTDA roster of certified officials.
url = 'https://resources.wftda.org/officiating/roller-derby-certification-program-for-officials/roster-of-certified-officials/'
print("Downloading names from {}".format(url))
# Send a browser-like User-Agent (presumably the site rejects the default
# requests one -- confirm). This changes the shared session's headers, so it
# also applies to any request made with `session` afterwards.
session.headers.update({'User-Agent': 'Mozilla/5.0'})
# Bound the wait (same 30 s default as get_page_names) so an unresponsive
# server cannot hang the notebook.
r = session.get(url, timeout=30)
soup = BeautifulSoup(r.text, "lxml")
rows = soup.find_all('h5')
# Each entry is read from the link inside an <h5> heading. Look the link up
# once per heading and skip headings with no linked anchor instead of raising.
links = [heading.find('a', href=True) for heading in rows]
links = [link for link in links if link is not None]
urls = [link['href'] for link in links]
names = [link.get_text() for link in links]
wftda_df = pd.DataFrame({'Name': names, 'url': urls})
wftda_df

In [None]:
# Stack all four sources into one frame; columns a source lacks become NaN.
name_df = pd.concat([twoevils_df, drc_df, rdr_df, wftda_df], ignore_index=True)
# remove parenthetical phrases from names - eg "(cleared)"
# regex=True is required: pandas >= 2.0 treats the pattern as a literal string
# by default, which would leave every parenthetical in place.
name_df['Name'] = (
    name_df['Name']
    .str.replace(r"\([^()]*\)", "", regex=True)
    .str.strip()
)
# Keep only names longer than one character (this also drops missing names,
# whose length is NaN).
name_df = name_df.loc[name_df['Name'].str.len() > 1]
name_df

In [None]:
# Export the full table (all columns), de-duplicated and ordered by name,
# then hand the file to the browser via Colab.
csv_file = Path('derby_names.csv')
deduped_by_name = name_df.drop_duplicates().sort_values(by=['Name'])
deduped_by_name.to_csv(csv_file, index=False)
files.download(csv_file)

In [None]:
# Export just the distinct names, one per line with no header row.
names_only = (
    name_df[['Name']]
    .drop_duplicates()
    .sort_values(by=['Name'])
)
names_only.to_csv('derby_names.txt', header=False, index=False)
files.download('derby_names.txt')

In [None]:
# Export the distinct skater numbers, one per line with no header row.
# Rows from sources that have no Number column carry NaN after the concat;
# drop them first so the file does not gain an empty entry (the names+numbers
# export likewise excludes rows without a number).
numbers = (
    name_df[['Number']]
    .dropna()
    .drop_duplicates()
    .sort_values(by=['Number'])
)
numbers.to_csv('derby_numbers.txt', index=False, header=False)
files.download('derby_numbers.txt')

In [None]:
# Export distinct (name, number) pairs as tab-separated text with no header,
# keeping only rows that actually have a number.
has_number = name_df['Number'].notna()
names_numbers = (
    name_df.loc[has_number, ['Name', 'Number']]
    .drop_duplicates()
    .sort_values(by=['Name', 'Number'])
)
names_numbers.to_csv('derby_names_numbers.tsv', sep='\t', index=False, header=False)
files.download('derby_names_numbers.tsv')