<a href="https://colab.research.google.com/github/bdunnette/derby_name_generator/blob/master/derby_names.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Based on Max Woolf's notebook: https://drive.google.com/file/d/1mMKGnVxirJnqDViH7BDJxFqWrsXlPSoK/view?usp=sharing

Inspired by Janelle Shane's blog post: http://aiweirdness.com/post/174466734677/neural-network-generated-roller-derby-names

In [0]:
!pip install -q textgenrnn tensorflowjs
from google.colab import files
from textgenrnn import textgenrnn
training_file = "derby_names.txt"
model_name = 'derbynames' 

In [0]:
# Architecture settings for the network.
model_cfg = dict(
    # True trains a word-level model (needs more data and a smaller max_length).
    word_level=False,
    # Number of LSTM cells in each layer (128/256 recommended).
    rnn_size=128,
    # Number of LSTM layers (>=2 recommended).
    rnn_layers=3,
    # Read text both forwards and backwards; can give a training boost.
    rnn_bidirectional=False,
    # Tokens considered before predicting the next one
    # (20-40 recommended for characters, 5-10 for words).
    max_length=20,
    # Maximum number of words to model; the rest are ignored (word-level only).
    max_words=10000,
)

# Settings that control how training is run.
train_cfg = dict(
    # True if each text has its own line in the source file.
    line_delimited=False,
    # Set higher to train the model for longer.
    num_epochs=20,
    # Sample text is generated from the model after this many epochs.
    gen_epochs=5,
    # Proportion of the input data to train on; a value < 1.0 keeps the model
    # from learning the data perfectly.
    train_size=0.8,
    # Ignore a random proportion of source tokens each epoch so the model
    # generalizes better.
    dropout=0.2,
    # If train_size < 1.0, test on the holdout data; slows overall training.
    validation=False,
    # True if the file is a CSV exported from Excel/BigQuery/pandas.
    is_csv=False,
)

In [0]:
from bs4 import BeautifulSoup
import requests

# Seconds to wait on each roster site before giving up, so an unresponsive
# server cannot hang the notebook indefinitely.
REQUEST_TIMEOUT = 30


def _fetch_soup(session, url):
    """Download `url` with `session` and return the page parsed by lxml.

    Raises:
        requests.HTTPError: if the server answers with a 4xx/5xx status, so an
            error page is never scraped as if it were the roster.
        requests.Timeout: if the server does not answer within REQUEST_TIMEOUT.
    """
    response = session.get(url, timeout=REQUEST_TIMEOUT)
    response.raise_for_status()
    return BeautifulSoup(response.text, "lxml")


def _clean_name(cell):
    """Return the text of a table cell as one trimmed line.

    Runs of whitespace (including embedded newlines) collapse to a single
    space, because the training file stores exactly one name per line.
    Returns "" when `cell` is None or holds only whitespace.
    """
    if cell is None:
        return ""
    return " ".join(cell.get_text().split())


name_set = set()
session = requests.Session()

# First roster: the name is the first <td> of every 'trc1'/'trc2' table row.
soup1 = _fetch_soup(session, "https://www.twoevils.org/rollergirls/")
for row in soup1.find_all('tr', {'class': ['trc1', 'trc2']}):
    # row.find('td') is None for a row without cells; _clean_name handles that.
    name_set.add(_clean_name(row.find('td')))

# Second roster: every <td class="name"> holds one name.
soup2 = _fetch_soup(session, "http://www.derbyrollcall.com/everyone")
for td in soup2.find_all('td', {'class': 'name'}):
    name_set.add(_clean_name(td))

# Blank cells all clean to "", which is not a usable name.
name_set.discard("")

if not name_set:
    # Fail here with a clear message instead of training on an empty file.
    raise RuntimeError("No names were scraped; the roster pages may have changed layout.")

# name_list is read again by the generation cell to filter out known names.
name_list = list(name_set)

# Write one name per line. UTF-8 is set explicitly so non-ASCII names are
# written the same way regardless of the platform's default encoding.
with open(training_file, "w", encoding="utf-8") as names_file:
    names_file.writelines("%s\n" % n for n in name_list)

In [0]:
# Create a fresh textgenrnn model; its saved files are prefixed with model_name.
textgen = textgenrnn(name=model_name)

# Line-delimited sources are trained one text per line; otherwise the whole
# file is treated as one large text.
train_function = textgen.train_from_file if train_cfg['line_delimited'] else textgen.train_from_largetext_file

# batch_size and dim_embeddings are fixed here; every other setting comes from
# the two config dicts. max_words is forwarded as well so that changing it in
# model_cfg actually takes effect (it was previously defined but never used).
train_function(
    file_path=training_file,
    new_model=True,
    num_epochs=train_cfg['num_epochs'],
    gen_epochs=train_cfg['gen_epochs'],
    batch_size=1024,
    train_size=train_cfg['train_size'],
    dropout=train_cfg['dropout'],
    validation=train_cfg['validation'],
    is_csv=train_cfg['is_csv'],
    rnn_layers=model_cfg['rnn_layers'],
    rnn_size=model_cfg['rnn_size'],
    rnn_bidirectional=model_cfg['rnn_bidirectional'],
    max_length=model_cfg['max_length'],
    max_words=model_cfg['max_words'],
    dim_embeddings=100,
    word_level=model_cfg['word_level'])

In [0]:
# This temperature schedule cycles between 1 very unexpected token (1.0),
# 2 unexpected tokens (0.5), 1 expected token (0.2), repeat.
# Changing the temperature schedule can result in wildly different output!
from datetime import datetime
temperature = [1.0, 0.5, 0.5, 0.2]
prefix = None   # if you want each generated text to start with a given seed text

# Line-delimited models produce many short texts; large-text models produce a
# few long ones. max_gen_length caps the length of each generated text, so
# lower it (or n) if generation takes too long.
if train_cfg['line_delimited']:
  n = 1000
  max_gen_length = 60 if model_cfg['word_level'] else 300
else:
  n = 10
  max_gen_length = 2000 if model_cfg['word_level'] else 10000

# prefix and max_gen_length are passed through so the settings above are
# actually honoured by the generator.
generated_texts = textgen.generate(
    n=n,
    prefix=prefix,
    temperature=temperature,
    max_gen_length=max_gen_length,
    return_as_list=True)

# Use every generated text (not only the first one); each line is one candidate name.
generated_names = [
    line.strip()
    for text in generated_texts
    for line in text.split('\n')]

# A set gives constant-time membership checks against the scraped names.
known_names = {known.strip() for known in name_list}

# dict.fromkeys drops repeated candidates while keeping their first-seen order;
# blank lines and names that already exist in the training data are skipped.
new_names = [
    candidate for candidate in dict.fromkeys(generated_names)
    if candidate and candidate not in known_names]
print(new_names)

# Timestamp the output file so repeated runs do not overwrite each other.
timestring = datetime.now().strftime('%Y%m%d_%H%M%S')
gen_file = '{}_gentext_{}.txt'.format(model_name, timestring)

with open(gen_file, 'w') as f:
  f.writelines("%s\n" % candidate for candidate in new_names)

files.download(gen_file)

In [0]:
import tensorflowjs as tfjs

# Convert the trained Keras model and save it under a directory named after the model.
model_export_dir = "{}_tfjs".format(model_name)
tfjs.converters.save_keras_model(textgen.model, model_export_dir)

# Download the vocab, config and weights files, in that order; each file name
# is the model name followed by a fixed suffix.
for artifact_template in ('{}_vocab.json', '{}_config.json', '{}_weights.hdf5'):
    files.download(artifact_template.format(model_name))