<a href="https://colab.research.google.com/github/bdunnette/derby_name_generator/blob/notebook/derby_names.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Based on Max Woolf's notebook: https://drive.google.com/file/d/1mMKGnVxirJnqDViH7BDJxFqWrsXlPSoK/view?usp=sharing

Inspired by Janelle Shane's blog post: http://aiweirdness.com/post/174466734677/neural-network-generated-roller-derby-names

In [0]:
# Install the training (tensorflow, textgenrnn) and export (tensorflowjs) packages
# into the notebook runtime.
# NOTE(review): versions are unpinned; the recorded output of this cell shows pip
# reporting requirement conflicts for tensorflowjs 1.2.2.1 (six, tensorflow) —
# consider pinning compatible versions.
!pip install tensorflow textgenrnn tensorflowjs
from google.colab import files
from textgenrnn import textgenrnn
# Base name used for the saved model artifacts and the generated-text files.
model_name = 'derbynames'
# Plain-text corpus: written by the scraping cell, read by the training cell.
training_file = "derby_names.txt"

Collecting tensorflowjs
  Downloading https://files.pythonhosted.org/packages/2f/ea/5ef7904c720f22c78e9efbd7aa7473ac08c21f09b495961c93ca374e3423/tensorflowjs-1.2.2.1-py3-none-any.whl
Collecting tensorflow-hub==0.3.0 (from tensorflowjs)
[?25l  Downloading https://files.pythonhosted.org/packages/9e/f0/3a3ced04c8359e562f1b91918d9bde797c8a916fcfeddc8dc5d673d1be20/tensorflow_hub-0.3.0-py2.py3-none-any.whl (73kB)
[K     |████████████████████████████████| 81kB 7.2MB/s 
[31mERROR: tensorflowjs 1.2.2.1 has requirement six==1.11.0, but you'll have six 1.12.0 which is incompatible.[0m
[31mERROR: tensorflowjs 1.2.2.1 has requirement tensorflow==1.14.0, but you'll have tensorflow 1.14.0rc1 which is incompatible.[0m
Installing collected packages: tensorflow-hub, tensorflowjs
  Found existing installation: tensorflow-hub 0.4.0
    Uninstalling tensorflow-hub-0.4.0:
      Successfully uninstalled tensorflow-hub-0.4.0
Successfully installed tensorflow-hub-0.3.0 tensorflowjs-1.2.2.1


Using TensorFlow backend.


In [26]:
from bs4 import BeautifulSoup
import requests
import string
import re
import random

# A single HTTP session is reused for every request made in this cell.
session = requests.Session()
# Names from all sources accumulate here; using a set drops exact duplicates.
name_set = set()

def clean_name(text):
  """Return ``text`` without parenthetical notes or surrounding whitespace.

  Every ``(...)`` group (including an empty ``()``) is removed together with
  one space immediately before it, e.g. ``"Anna Conda (retired)"`` becomes
  ``"Anna Conda"``.

  Args:
    text: Raw name string as extracted from a page.

  Returns:
    The cleaned name; may be an empty string if nothing else remains.
  """
  # Trim *after* removing parentheticals: stripping first leaves stray
  # whitespace behind when the note leads the name ("(x) Name") or is
  # preceded by more than one space ("Name  (x)").
  without_notes = re.sub(r" ?\([^)]*\)", "", text)
  return without_notes.strip()
  
# Source 1: names are read from the first <td> of each table row whose class
# is 'trc1' or 'trc2'.
url1 = "https://www.twoevils.org/rollergirls/"
print("Getting names from {}".format(url1))
r1 = session.get(url1)
# Fail loudly on an HTTP error instead of parsing an error page into zero names.
r1.raise_for_status()
d1 = r1.text
soup1 = BeautifulSoup(d1, "lxml")
rows1 = soup1.find_all('tr', {'class':['trc1', 'trc2']})

for row in rows1:
    td = row.find('td')
    if td is None:
        # A matching row without any cell has no name to extract.
        continue
    name = clean_name(td.get_text())
    if name:
        # Skip cells that are empty once cleaned; an empty line is not a name.
        name_set.add(name)

print("Downloaded {} names".format(len(name_set)))

# Source 2: names are read from every <td class="name"> cell on the page.
url2 = "http://www.derbyrollcall.com/everyone"
print("Getting names from {}".format(url2))
r2 = session.get(url2)
# Fail loudly on an HTTP error instead of parsing an error page into zero names.
r2.raise_for_status()
d2 = r2.text
soup2 = BeautifulSoup(d2, "lxml")
rows2 = soup2.find_all('td', {'class':'name'})

for td in rows2:
    name = clean_name(td.get_text())
    if name:
        # Skip cells that are empty once cleaned; an empty line is not a name.
        name_set.add(name)

# The count is cumulative across all sources scraped so far.
print("Downloaded {} names".format(len(name_set)))
    
# Source 3: one page per initial letter; names are the link texts inside the
# items of the last <ul> on each page.
initial_letters = string.ascii_uppercase
# Loop through initial letters (A-Z)
for letter in initial_letters:
  url3 = "https://rollerderbyroster.com/view-names/?ini={}".format(letter)
  print("Getting names from {}".format(url3))
  r3 = session.get(url3)
  # Fail loudly on an HTTP error instead of parsing an error page.
  r3.raise_for_status()
  d3 = r3.text
  soup3 = BeautifulSoup(d3, "lxml")

  rows3 = soup3.find_all('ul')
  if not rows3:
    # No list at all on this page: nothing to index with [-1].
    print("No name list found at {}".format(url3))
    continue
  # Use only last unordered list - this is where names are!
  # Ask for the <li> children explicitly: iterating the <ul> tag directly also
  # yields the text nodes between items, which have no link to look up.
  for li in rows3[-1].find_all('li', recursive=False):
    # Name should be the text of the link within the list item
    link = li.find('a')
    if link is None:
      continue
    name = clean_name(link.get_text())
    if name:
      name_set.add(name)
  # The count is cumulative across all sources scraped so far.
  print("Downloaded {} names".format(len(name_set)))
    
# Write the collected names to the training corpus, one name per line, in
# random order. `name_list` stays defined at cell level because the generation
# cell later filters generated names against it.
name_list = list(name_set)
random.shuffle(name_list)
print("Writing {} names to {}".format(len(name_list),training_file))
# Explicit UTF-8 so names containing non-ASCII characters are written the same
# way regardless of the runtime's default locale encoding.
with open(training_file, "w", encoding="utf-8") as names_file:
    names_file.writelines("%s\n" % n for n in name_list)

Getting names from https://www.twoevils.org/rollergirls/
Downloaded 40509 names
Getting names from http://www.derbyrollcall.com/everyone
Downloaded 69258 names
Getting names from https://rollerderbyroster.com/view-names/?ini=A
Downloaded 69364 names
Getting names from https://rollerderbyroster.com/view-names/?ini=B
Downloaded 69601 names
Getting names from https://rollerderbyroster.com/view-names/?ini=C
Downloaded 69756 names
Getting names from https://rollerderbyroster.com/view-names/?ini=D
Downloaded 69972 names
Getting names from https://rollerderbyroster.com/view-names/?ini=E
Downloaded 70024 names
Getting names from https://rollerderbyroster.com/view-names/?ini=F
Downloaded 70124 names
Getting names from https://rollerderbyroster.com/view-names/?ini=G
Downloaded 70200 names
Getting names from https://rollerderbyroster.com/view-names/?ini=H
Downloaded 70331 names
Getting names from https://rollerderbyroster.com/view-names/?ini=I
Downloaded 70382 names
Getting names from https://rol

In [0]:
# Architecture options handed to the textgenrnn training call.
model_cfg = dict(
    # True trains a word-level model (needs more data and a smaller max_length).
    word_level=False,
    # Number of LSTM cells in each layer (128/256 recommended).
    rnn_size=128,
    # Number of LSTM layers (>=2 recommended).
    rnn_layers=3,
    # Consider text both forwards and backwards; can give a training boost.
    rnn_bidirectional=True,
    # Tokens considered before predicting the next one
    # (20-40 for characters, 5-10 for words recommended).
    max_length=20,
    # Maximum number of words to model; the rest are ignored (word-level only).
    max_words=10000,
)

# Training-run options handed to the textgenrnn training call.
train_cfg = dict(
    # True if each text has its own line in the source file.
    line_delimited=False,
    # Set higher to train the model for longer.
    num_epochs=20,
    # Generate sample text from the model after this many epochs.
    gen_epochs=5,
    # Proportion of the input data to train on; < 1.0 limits the model from
    # learning the data perfectly.
    train_size=1.0,
    # Ignore a random proportion of source tokens each epoch so the model
    # generalizes better.
    dropout=0.2,
    # If train_size < 1.0, test on the holdout dataset (slows training).
    validation=False,
    # True if the file is a CSV exported from Excel/BigQuery/pandas.
    is_csv=False,
)

In [0]:
# Create a fresh textgenrnn instance named after the model and train it on the
# scraped corpus.
textgen = textgenrnn(name=model_name)

# Pick the trainer according to how the corpus should be read: per-line texts
# or one large text.
if train_cfg['line_delimited']:
    train_function = textgen.train_from_file
else:
    train_function = textgen.train_from_largetext_file

# Fixed arguments first, then the values taken verbatim from the two config dicts.
train_kwargs = dict(
    file_path=training_file,
    new_model=True,
    batch_size=1024,
    dim_embeddings=100,
)
train_kwargs.update(
    (key, train_cfg[key])
    for key in ('num_epochs', 'gen_epochs', 'train_size', 'dropout', 'validation', 'is_csv')
)
train_kwargs.update(
    (key, model_cfg[key])
    for key in ('rnn_layers', 'rnn_size', 'rnn_bidirectional', 'max_length', 'word_level')
)

train_function(**train_kwargs)

Training new model w/ 3-layer, 128-cell Bidirectional LSTMs
Training on 984,669 character sequences.
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
####################
Temperature: 0.2
####################
 Mangler
Trixie Bomb
Silver Slammer
Bashing Bang Bang
Baby Bang Bang
Shank Streak
Torture Storm
Slam Dead
Star Machine
Suzie Slammer
Cherry Bitch
Mary Mary Mayhem
Mary Mary Slammer
Delia Deadly
Sassy Streak
Darth Storm
Roller Roller
The Bomb
Danger Stone
Bella Bang Bang
Slam Danger
Dangerous Bang Ba

a Bang Bang
Bambi Bullet
Mary Manson
Bang Bang Bang
Malice Angel
Shell Bang Bang
Bang Bang Bang
Shank Star
Slam Danger
Shelly Striper
Sally Bang Bang
Sally Storm
Shank Storm
Scarlet Block
Sugar Streak
Samming Slammer
Slam Candy
Shell Butter
Suzy Stones
Tara Bomber
Slam Danger
The Bang Bang
Slam Dang

ry Steel
Sin Destroyer
Slam Steel
Tara Bang Bang
Mary Star
Lil Miss Slammer
Bang Bang Bang
Buster Bomb
Harley Scream
Scarlet Striper
Big Bomb
Judge Slammer
Gold Stripes
Sassy Slamme

In [0]:
# Download the three files that make up the trained model (vocabulary, config,
# weights), in that order; each file name is prefixed with the model name.
for artifact_template in ('{}_vocab.json', '{}_config.json', '{}_weights.hdf5'):
    files.download(artifact_template.format(model_name))

In [0]:
import tensorflowjs as tfjs
# Convert the trained Keras model held by textgen into TensorFlow.js format,
# writing the result to a directory named after the model.
model_export_dir = "{}_tfjs".format(model_name)
keras_model = textgen.model
tfjs.converters.save_keras_model(keras_model, model_export_dir)

In [0]:
from datetime import datetime
import random

# Generate text with the trained model, keep only names that were not in the
# training data, save them to a timestamped file and download it.

prefix = None   # if you want each generated text to start with a given seed text

# Four sampling temperatures, each a random value rounded to one decimal place.
temperature = [round(random.random(), 1) for _ in range(4)]

# Number of texts to generate and the length cap for each one, chosen by how
# the model was trained (per-line texts vs. one large text).
if train_cfg['line_delimited']:
  n = 1000
  max_gen_length = 60 if model_cfg['word_level'] else 300
else:
  n = 10
  max_gen_length = 2000 if model_cfg['word_level'] else 10000

# Pass prefix and max_gen_length through: previously both were computed above
# but never handed to generate(), so they had no effect.
# NOTE(review): with the large-text settings this generates far more text than
# before and may take noticeably longer — lower n / max_gen_length if needed.
generated_texts = textgen.generate(
    n=n,
    prefix=prefix,
    temperature=temperature,
    max_gen_length=max_gen_length,
    return_as_list=True)

# Use every generated text, not just the first one; each text may hold many
# newline-separated names.
generated_names = [
    line.strip()
    for text in generated_texts
    for line in text.split('\n')
]

# Set lookup instead of scanning the full name list once per generated name.
known_names = set(name_list)
# Drop blank lines and anything that already exists in the training data.
new_names = [name for name in generated_names if name and name not in known_names]
print(new_names)

timestring = datetime.now().strftime('%Y%m%d_%H%M%S')
gen_file = '{}_gentext_{}.txt'.format(model_name, timestring)

# Explicit UTF-8 so non-ASCII characters are written consistently.
with open(gen_file, 'w', encoding='utf-8') as f:
  f.writelines("%s\n" % name for name in new_names)

files.download(gen_file)