<a href="https://colab.research.google.com/github/bdunnette/fictional-spork/blob/master/ml5_lstm_training.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# tf.contrib (used by the model cell) only exists in TensorFlow 1.x, lxml is the
# parser handed to BeautifulSoup, and six provides the cPickle import used below.
# %pip installs into the environment of the running kernel.
%pip install numpy scipy "tensorflow<2" beautifulsoup4 requests lxml six



In [0]:
import tensorflow as tf
from tensorflow.contrib import rnn
from tensorflow.contrib import legacy_seq2seq

import numpy as np

class Model():
    """Character-level recurrent language model built as a TensorFlow 1.x graph.

    The constructor wires up the whole graph (placeholders, stacked RNN cells,
    softmax projection, loss, gradient clipping and the Adam train op) and
    exposes the tensors needed by a training loop as attributes:
    ``input_data``, ``targets``, ``initial_state``, ``final_state``, ``logits``,
    ``probs``, ``cost``, ``lr`` and ``train_op``.
    """

    def __init__(self, args, training=True):
        """Build the graph.

        Args:
            args: namespace-like object providing ``model``, ``rnn_size``,
                ``num_layers``, ``batch_size``, ``seq_length``, ``vocab_size``,
                ``grad_clip``, ``input_keep_prob`` and ``output_keep_prob``.
                NOTE: it is mutated when ``training`` is False
                (``batch_size`` and ``seq_length`` are forced to 1).
            training: build the training graph when True; otherwise build a
                one-character-at-a-time graph for sampling.

        Raises:
            Exception: if ``args.model`` is not 'rnn', 'gru', 'lstm' or 'nas'.
        """
        self.args = args
        if not training:
            # Sampling feeds a single character per step.
            args.batch_size = 1
            args.seq_length = 1

        if args.model == 'rnn':
            cell_fn = rnn.BasicRNNCell
        elif args.model == 'gru':
            cell_fn = rnn.GRUCell
        elif args.model == 'lstm':
            cell_fn = rnn.BasicLSTMCell
        elif args.model == 'nas':
            cell_fn = rnn.NASCell
        else:
            raise Exception("model type not supported: {}".format(args.model))

        # One cell per layer; dropout is only wrapped in while training and
        # only when at least one keep probability is below 1.
        cells = []
        for _ in range(args.num_layers):
            cell = cell_fn(args.rnn_size)
            if training and (args.output_keep_prob < 1.0 or args.input_keep_prob < 1.0):
                cell = rnn.DropoutWrapper(cell,
                                          input_keep_prob=args.input_keep_prob,
                                          output_keep_prob=args.output_keep_prob)
            cells.append(cell)

        self.cell = cell = rnn.MultiRNNCell(cells, state_is_tuple=True)

        # Integer character ids, shaped [batch_size, seq_length].
        self.input_data = tf.placeholder(
            tf.int32, [args.batch_size, args.seq_length])
        self.targets = tf.placeholder(
            tf.int32, [args.batch_size, args.seq_length])
        self.initial_state = cell.zero_state(args.batch_size, tf.float32)

        # Output projection from the RNN state size to the vocabulary.
        with tf.variable_scope('rnnlm'):
            softmax_w = tf.get_variable("softmax_w",
                                        [args.rnn_size, args.vocab_size])
            softmax_b = tf.get_variable("softmax_b", [args.vocab_size])

        # The embedding table is created outside the 'rnnlm' variable scope.
        embedding = tf.get_variable("embedding", [args.vocab_size, args.rnn_size])
        inputs = tf.nn.embedding_lookup(embedding, self.input_data)

        # dropout beta testing: double check which one should affect next line
        # NOTE(review): the embedded *inputs* are dropped with output_keep_prob,
        # and the branch is taken for any non-zero value (including 1.0).
        if training and args.output_keep_prob:
            inputs = tf.nn.dropout(inputs, args.output_keep_prob)

        # Turn [batch, seq, rnn_size] into a list of seq_length tensors of
        # shape [batch, rnn_size], as expected by rnn_decoder.
        inputs = tf.split(inputs, args.seq_length, 1)
        inputs = [tf.squeeze(input_, [1]) for input_ in inputs]

        def loop(prev, _):
            # Feed the embedding of the most likely previous symbol back in.
            prev = tf.matmul(prev, softmax_w) + softmax_b
            prev_symbol = tf.stop_gradient(tf.argmax(prev, 1))
            return tf.nn.embedding_lookup(embedding, prev_symbol)

        # The feedback loop is only used when not training.
        outputs, last_state = legacy_seq2seq.rnn_decoder(inputs, self.initial_state, cell, loop_function=loop if not training else None, scope='rnnlm')
        output = tf.reshape(tf.concat(outputs, 1), [-1, args.rnn_size])


        self.logits = tf.matmul(output, softmax_w) + softmax_b
        self.probs = tf.nn.softmax(self.logits)
        loss = legacy_seq2seq.sequence_loss_by_example(
                [self.logits],
                [tf.reshape(self.targets, [-1])],
                [tf.ones([args.batch_size * args.seq_length])])
        with tf.name_scope('cost'):
            # Total loss divided by the number of characters in the batch.
            self.cost = tf.reduce_sum(loss) / args.batch_size / args.seq_length
        self.final_state = last_state
        # Learning rate lives in a non-trainable variable (initially 0.0) so the
        # caller can assign a new value without rebuilding the optimizer.
        self.lr = tf.Variable(0.0, trainable=False)
        tvars = tf.trainable_variables()
        grads, _ = tf.clip_by_global_norm(tf.gradients(self.cost, tvars),
                args.grad_clip)
        with tf.name_scope('optimizer'):
            optimizer = tf.train.AdamOptimizer(self.lr)
        self.train_op = optimizer.apply_gradients(zip(grads, tvars))

        # instrument tensorboard
        tf.summary.histogram('logits', self.logits)
        tf.summary.histogram('loss', loss)
        tf.summary.scalar('train_loss', self.cost)

    def sample(self, sess, chars, vocab, num=200, prime='The ', sampling_type=1):
        """Generate text one character at a time.

        Args:
            sess: session (or session-like object) used to run the graph.
            chars: sequence mapping a character id to its character.
            vocab: mapping from a character to its id.
            num: number of characters to generate after ``prime``.
            prime: non-empty seed text; every character must be in ``vocab``.
            sampling_type: 0 always takes the most likely character, 2 samples
                from the distribution only right after a space (argmax
                otherwise), any other value samples at every step.

        Returns:
            ``prime`` followed by the ``num`` generated characters.
        """
        state = sess.run(self.cell.zero_state(1, tf.float32))
        # Warm the state up on every prime character except the last one,
        # which becomes the first input of the generation loop below.
        for char in prime[:-1]:
            x = np.zeros((1, 1))
            x[0, 0] = vocab[char]
            feed = {self.input_data: x, self.initial_state: state}
            [state] = sess.run([self.final_state], feed)

        def weighted_pick(weights):
            # Inverse-CDF sampling over (possibly unnormalised) weights.
            cumulative = np.cumsum(weights)
            # Scale by the last cumulative value instead of np.sum(weights):
            # the two can differ by float rounding, which could push the
            # threshold past the end of the table and yield an invalid index.
            threshold = np.random.rand() * cumulative[-1]
            index = int(np.searchsorted(cumulative, threshold))
            return min(index, len(cumulative) - 1)

        ret = prime
        char = prime[-1]
        for n in range(num):
            x = np.zeros((1, 1))
            x[0, 0] = vocab[char]
            feed = {self.input_data: x, self.initial_state: state}
            [probs, state] = sess.run([self.probs, self.final_state], feed)
            p = probs[0]

            if sampling_type == 0:
                sample = np.argmax(p)
            elif sampling_type == 2:
                if char == ' ':
                    sample = weighted_pick(p)
                else:
                    sample = np.argmax(p)
            else:  # sampling_type == 1 default:
                sample = weighted_pick(p)

            pred = chars[sample]
            ret += pred
            char = pred
        return ret

In [0]:
import codecs
import os
import collections
from six.moves import cPickle
import numpy as np


class TextLoader():
    """Load a text corpus as character ids and serve it in fixed-size batches.

    ``data_dir`` must contain ``input.txt``. On first use the text is encoded
    and cached next to it as ``vocab.pkl`` (the characters, most frequent
    first) and ``data.npy`` (the encoded corpus); later runs reuse the cache.
    """

    def __init__(self, data_dir, batch_size, seq_length, encoding='utf-8'):
        """Prepare the corpus found in ``data_dir``.

        Args:
            data_dir: directory holding ``input.txt`` and the cache files.
            batch_size: number of sequences per batch.
            seq_length: number of characters per sequence.
            encoding: text encoding of ``input.txt``.

        Raises:
            AssertionError: if the corpus is too short for a single batch.
            ValueError: if ``input.txt`` is empty.
        """
        self.data_dir = data_dir
        self.batch_size = batch_size
        self.seq_length = seq_length
        self.encoding = encoding

        input_file = os.path.join(data_dir, "input.txt")
        vocab_file = os.path.join(data_dir, "vocab.pkl")
        tensor_file = os.path.join(data_dir, "data.npy")

        if not (os.path.exists(vocab_file) and os.path.exists(tensor_file)):
            print("Here we go! Reading text file...")
            self.preprocess(input_file, vocab_file, tensor_file)
        else:
            print("Loading preprocessed files")
            self.load_preprocessed(vocab_file, tensor_file)
        self.create_batches()
        self.reset_batch_pointer()

    def _count_batches(self):
        """Return how many full batches fit into the encoded corpus."""
        return self.tensor.size // (self.batch_size * self.seq_length)

    def preprocess(self, input_file, vocab_file, tensor_file):
        """Encode ``input_file`` and write the vocabulary and tensor caches.

        Sets ``chars`` (characters sorted by descending frequency),
        ``vocab_size``, ``vocab`` (character -> id) and ``tensor``.
        """
        with codecs.open(input_file, "r", encoding=self.encoding) as f:
            data = f.read()
        if not data:
            # Without this the unpacking below fails with an obscure message.
            raise ValueError("{} is empty; nothing to train on.".format(input_file))
        counter = collections.Counter(data)
        # Stable sort: characters with equal counts keep first-seen order.
        count_pairs = sorted(counter.items(), key=lambda x: -x[1])
        self.chars, _ = zip(*count_pairs)
        self.vocab_size = len(self.chars)
        self.vocab = dict(zip(self.chars, range(len(self.chars))))
        with open(vocab_file, 'wb') as f:
            cPickle.dump(self.chars, f)
        self.tensor = np.array(list(map(self.vocab.get, data)))
        np.save(tensor_file, self.tensor)

    def load_preprocessed(self, vocab_file, tensor_file):
        """Restore ``chars``, ``vocab``, ``vocab_size`` and ``tensor`` from cache."""
        # NOTE: unpickling executes arbitrary code; only load caches that this
        # class wrote itself.
        with open(vocab_file, 'rb') as f:
            self.chars = cPickle.load(f)
        self.vocab_size = len(self.chars)
        self.vocab = dict(zip(self.chars, range(len(self.chars))))
        self.tensor = np.load(tensor_file)
        self.num_batches = self._count_batches()

    def create_batches(self):
        """Split the corpus into ``num_batches`` input/target batch pairs.

        Targets are the inputs shifted left by one character; the very last
        target wraps around to the first character of the (trimmed) corpus.

        Raises:
            AssertionError: if not even one full batch fits.
        """
        self.num_batches = self._count_batches()

        # When the data (tensor) is too small,
        # let's give them a better error message.
        # Raised explicitly so the check survives ``python -O``.
        if self.num_batches == 0:
            raise AssertionError("Not enough data. Make seq_length and batch_size small.")

        # Drop the tail that does not fill a complete batch.
        self.tensor = self.tensor[:self.num_batches * self.batch_size * self.seq_length]
        xdata = self.tensor
        ydata = np.copy(self.tensor)
        ydata[:-1] = xdata[1:]
        ydata[-1] = xdata[0]
        self.x_batches = np.split(xdata.reshape(self.batch_size, -1),
                                  self.num_batches, 1)
        self.y_batches = np.split(ydata.reshape(self.batch_size, -1),
                                  self.num_batches, 1)

    def next_batch(self):
        """Return the next ``(inputs, targets)`` pair and advance the pointer.

        The pointer does not wrap; call ``reset_batch_pointer`` after
        ``num_batches`` calls.
        """
        x, y = self.x_batches[self.pointer], self.y_batches[self.pointer]
        self.pointer += 1
        return x, y

    def reset_batch_pointer(self):
        """Rewind to the first batch."""
        self.pointer = 0

In [0]:
"""
ml5js
A script to dump tensorflow checkpoint variables to tensorflow.js.

This script takes a checkpoint file and writes all of the variables in the
checkpoint to a directory.

"""

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import argparse
import json
import os
import re
import string
import tensorflow as tf

# Export settings read by dump_checkpoints:
#   output_dir             - default export directory
#   remove_variables_regex - checkpoint variables whose names match are not
#                            exported (presumably optimizer state)
OPTIONS = dict(
  output_dir='./models/lstm/',
  remove_variables_regex='.*Adam.*|.*beta.*',
)
# Characters that may appear in an exported variable's file name.
FILENAME_CHARS = ''.join([string.ascii_letters, string.digits, '_'])

def _var_name_to_filename(var_name):
  """Turn a checkpoint variable name into a flat file name.

  Characters listed in FILENAME_CHARS are kept, every '/' becomes '_', and
  all other characters are dropped.
  """
  return ''.join(
      '_' if ch == '/' else ch
      for ch in var_name
      if ch == '/' or ch in FILENAME_CHARS)

def dump_checkpoints(checkpoint_path, model_path, vocab, model_name, final_model):
  """Export a TensorFlow checkpoint as an ml5.js model directory.

  Every variable in the checkpoint is written as raw bytes to its own file,
  except 'global_step' and variables matching
  OPTIONS["remove_variables_regex"]. A 'manifest.json' (variable name ->
  file name and shape) and a 'vocab.json' are written next to them.

  Args:
    checkpoint_path: directory that holds one sub-directory per model.
    model_path: directory under which '<model_name>/' is created for output.
    vocab: JSON-serialisable vocabulary, written to 'vocab.json'.
    model_name: name of the model sub-directory (input and output).
    final_model: checkpoint prefix inside the model's checkpoint directory.
  """
  print('Converting model to ml5js:', model_name, final_model)
  # os.path.join keeps absolute directories intact, matching how the
  # training code builds its checkpoint directory.
  chk_fpath = os.path.expanduser(
      os.path.join(checkpoint_path, model_name, final_model))
  reader = tf.train.NewCheckpointReader(chk_fpath)
  var_to_shape_map = reader.get_variable_to_shape_map()
  output_dir = os.path.expanduser(os.path.join(model_path, model_name))
  tf.gfile.MakeDirs(output_dir)
  manifest = {}
  # Only compile the exclusion pattern when one is configured.
  remove_pattern = OPTIONS["remove_variables_regex"]
  remove_vars_compiled_re = re.compile(remove_pattern) if remove_pattern else None

  for name in var_to_shape_map:
    excluded = (remove_vars_compiled_re is not None and
                remove_vars_compiled_re.match(name))
    if excluded or name == 'global_step':
      continue
    var_filename = _var_name_to_filename(name)
    manifest[name] = {'filename': var_filename, 'shape': var_to_shape_map[name]}

    # Raw tensor bytes; the shape needed to decode them is in the manifest.
    tensor = reader.get_tensor(name)
    with open(os.path.join(output_dir, var_filename), 'wb') as f:
      f.write(tensor.tobytes())

  # save the vocab
  vocab_fpath = os.path.join(output_dir, 'vocab.json')
  with open(vocab_fpath, 'w') as f:
    json.dump(vocab, f, indent=2, sort_keys=True)

  # save the manifest
  manifest_fpath = os.path.join(output_dir, 'manifest.json')
  with open(manifest_fpath, 'w') as f:
    json.dump(manifest, f, indent=2, sort_keys=True)
  print('Done! The output model is in', model_path)
  print('Check https://ml5js.org/docs/training-lstm for more information.')


In [5]:
from bs4 import BeautifulSoup
import os
import requests
import string
import re
import random

# Name of the model; also the directory that receives the training text.
model_name = "derby_names"
# Path of the training corpus (both names refer to the same file).
training_file = file_name = model_name + "/input.txt"
# Collects every scraped name exactly once.
name_set = set()
# One HTTP session shared by all downloads.
session = requests.Session()

def clean_name(text):
  """Return ``text`` without parenthesised remarks or surrounding whitespace.

  Each "(...)" group is removed together with one preceding space. The result
  is stripped again afterwards so a parenthetical at the start of the text
  does not leave a leading space behind.
  """
  return re.sub(r" ?\([^)]+\)", "", text.strip()).strip()
  
def fetch_soup(url):
    """Download ``url`` with the shared session and parse it with lxml.

    A failed request is reported and an empty document is returned, so one
    unreachable site does not abort the whole scrape.
    """
    print("Getting names from {}".format(url))
    try:
        response = session.get(url, timeout=60)
        response.raise_for_status()
    except requests.RequestException as err:
        print("Skipping {}: {}".format(url, err))
        return BeautifulSoup("", "lxml")
    return BeautifulSoup(response.text, "lxml")


def add_names(elements):
    """Add the cleaned, non-empty text of each element to ``name_set``.

    ``None`` entries (a row without the expected child tag) are ignored.
    """
    for element in elements:
        if element is None:
            continue
        name = clean_name(element.get_text())
        if name:
            name_set.add(name)
    print("Downloaded {} names".format(len(name_set)))


# Source 1: names are in the first cell of each striped table row.
soup1 = fetch_soup("https://www.twoevils.org/rollergirls/")
add_names(row.find('td')
          for row in soup1.find_all('tr', {'class': ['trc1', 'trc2']}))

# Source 2: names are in table cells with class "name".
soup2 = fetch_soup("http://www.derbyrollcall.com/everyone")
add_names(soup2.find_all('td', {'class': 'name'}))

# Source 3: one page per initial letter (A-Z).
for letter in string.ascii_uppercase:
    soup3 = fetch_soup(
        "https://rollerderbyroster.com/view-names/?ini={}".format(letter))
    lists = soup3.find_all('ul')
    # Use only last unordered list - this is where names are!
    # Restricting to <li> children skips the text nodes between items.
    items = lists[-1].find_all('li', recursive=False) if lists else []
    # Name should be the text of the link within the list item
    add_names(li.find('a') for li in items)

os.makedirs(model_name, exist_ok=True)
# Sort before shuffling so the final order depends only on the RNG state,
# not on set iteration order.
name_list = sorted(name_set)
random.shuffle(name_list)
print("Writing {} names to {}".format(len(name_list), training_file))
# Write UTF-8 explicitly: that is the encoding TextLoader reads by default.
with open(training_file, "w", encoding="utf-8") as names_file:
    names_file.writelines("%s\n" % n for n in name_list)

Getting names from https://www.twoevils.org/rollergirls/
Downloaded 40509 names
Getting names from http://www.derbyrollcall.com/everyone
Downloaded 69290 names
Getting names from https://rollerderbyroster.com/view-names/?ini=A
Downloaded 69397 names
Getting names from https://rollerderbyroster.com/view-names/?ini=B
Downloaded 69635 names
Getting names from https://rollerderbyroster.com/view-names/?ini=C
Downloaded 69790 names
Getting names from https://rollerderbyroster.com/view-names/?ini=D
Downloaded 70006 names
Getting names from https://rollerderbyroster.com/view-names/?ini=E
Downloaded 70058 names
Getting names from https://rollerderbyroster.com/view-names/?ini=F
Downloaded 70158 names
Getting names from https://rollerderbyroster.com/view-names/?ini=G
Downloaded 70234 names
Getting names from https://rollerderbyroster.com/view-names/?ini=H
Downloaded 70365 names
Getting names from https://rollerderbyroster.com/view-names/?ini=I
Downloaded 70416 names
Getting names from https://rol

In [6]:
"""
Multi-layer Recurrent Neural Networks (LSTM, RNN) for 
character-level language models in Python using Tensorflow 
and modified to work with tensorflow.js and ml5.js

Based on https://github.com/sherjilozair/char-rnn-tensorflow.
 
This script will train and dump the checkpoints to javascript
"""

from __future__ import print_function
import os
os.environ['TF_CPP_MIN_LOG_LEVEL']='2'
import tensorflow as tf
import logging

import argparse
import time
import glob
from six.moves import cPickle

# from utils import TextLoader
# from model import Model
from pprint import pprint

from six import text_type
# from json_checkpoint_vars import dump_checkpoints

# hide logs
tf.logging.set_verbosity(tf.logging.ERROR)

def main(argv=None):
    """Parse the training options and start training.

    Args:
        argv: optional list of command-line style arguments, e.g.
            ``['--num_epochs', '5']``. When omitted, every option keeps its
            default (the real ``sys.argv`` is deliberately ignored, since in
            a notebook it holds the kernel's own arguments).
    """
    parser = argparse.ArgumentParser(
                        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('--data_dir', type=str, default=model_name,
                        help='data directory containing input.txt')
    parser.add_argument('--save_model', type=str, default='models',
                        help='directory to store the ml5js model')
    parser.add_argument('--save_checkpoints', type=str, default='checkpoints',
                        help='directory to store checkpointed models')
    parser.add_argument('--log_dir', type=str, default='logs',
                        help='directory to store tensorboard logs')
    parser.add_argument('--rnn_size', type=int, default=128,
                        help='size of RNN hidden state')
    parser.add_argument('--num_layers', type=int, default=2,
                        help='number of layers in the RNN')
    parser.add_argument('--model', type=str, default='lstm',
                        help='rnn, gru, lstm, or nas')
    parser.add_argument('--batch_size', type=int, default=50,
                        help='minibatch size')
    parser.add_argument('--seq_length', type=int, default=50,
                        help='RNN sequence length')
    parser.add_argument('--num_epochs', type=int, default=50,
                        help='number of epochs')
    parser.add_argument('--save_every', type=int, default=1000,
                        help='save frequency')
    parser.add_argument('--grad_clip', type=float, default=5.,
                        help='clip gradients at this value')
    parser.add_argument('--learning_rate', type=float, default=0.002,
                        help='learning rate')
    # The training loop multiplies the learning rate by decay_rate ** epoch.
    parser.add_argument('--decay_rate', type=float, default=0.97,
                        help='per-epoch decay rate applied to the learning rate')
    parser.add_argument('--output_keep_prob', type=float, default=1.0,
                        help='probability of keeping weights in the hidden layer')
    parser.add_argument('--input_keep_prob', type=float, default=1.0,
                        help='probability of keeping weights in the input layer')
    parser.add_argument('--init_from', type=str, default=None,
                        help="""continue training from saved model at this path. Path must contain files saved by previous training process:
                            'config.pkl'        : configuration;
                            'chars_vocab.pkl'   : vocabulary definitions;
                            'checkpoint'        : paths to model file(s) (created by tf).
                                                  Note: this file contains absolute paths, be careful when moving files around;
                            'model.ckpt-*'      : file(s) with model definition (created by tf)
                        """)
    args = parser.parse_args(args=[] if argv is None else argv)
    train(args)

def getModelVocab(path, model_name):
    """Return the character -> id mapping stored with a model's checkpoints.

    Reads ``<path>/<model_name>/chars_vocab.pkl``, which holds a
    ``(chars, vocab)`` pair, and returns only the ``vocab`` part.
    """
    vocab_file = os.path.join(path, model_name, 'chars_vocab.pkl')
    with open(vocab_file, 'rb') as handle:
        _, vocab = cPickle.load(handle)
    return vocab

def _load_resume_checkpoint(args, data_loader):
    """Validate ``args.init_from`` and return its checkpoint state.

    Checks that the directory holds 'config.pkl', 'chars_vocab.pkl' and a
    checkpoint, and that the saved configuration and vocabulary agree with
    the current arguments and data. Raises AssertionError otherwise.
    """
    # check if all necessary files exist
    assert os.path.isdir(args.init_from), "%s must be a path" % args.init_from
    assert os.path.isfile(os.path.join(args.init_from, "config.pkl")), "config.pkl file does not exist in path %s" % args.init_from
    assert os.path.isfile(os.path.join(args.init_from, "chars_vocab.pkl")), "chars_vocab.pkl file does not exist in path %s" % args.init_from
    ckpt = tf.train.get_checkpoint_state(args.init_from)
    assert ckpt, "No checkpoint found"
    assert ckpt.model_checkpoint_path, "No model path found in checkpoint"

    # open old config and check if models are compatible
    # NOTE: unpickling executes arbitrary code; only resume from directories
    # written by this training script.
    with open(os.path.join(args.init_from, 'config.pkl'), 'rb') as f:
        saved_model_args = cPickle.load(f)
    need_be_same = ["model", "rnn_size", "num_layers", "seq_length"]
    for checkme in need_be_same:
        assert vars(saved_model_args)[checkme] == vars(args)[checkme], "Command line argument and saved model disagree on '%s' " % checkme

    # open saved vocab/dict and check if vocabs/dicts are compatible
    with open(os.path.join(args.init_from, 'chars_vocab.pkl'), 'rb') as f:
        saved_chars, saved_vocab = cPickle.load(f)
    assert saved_chars == data_loader.chars, "Data and loaded model disagree on character set!"
    assert saved_vocab == data_loader.vocab, "Data and loaded model disagree on dictionary mappings!"
    return ckpt


def train(args):
    """Train the character model, checkpoint it and export it for ml5.js.

    The model is named after the last component of ``args.data_dir``.
    Checkpoints, 'config.pkl' and 'chars_vocab.pkl' go to
    ``<args.save_checkpoints>/<model name>``, TensorBoard summaries to a
    time-stamped directory under ``args.log_dir``, and the exported model to
    ``<args.save_model>/<model name>``.

    Args:
        args: parsed options as produced by ``main``. ``args.save_dir`` and
            ``args.vocab_size`` are set on it as a side effect.

    Raises:
        AssertionError: if ``args.init_from`` is set but is not a compatible
            saved model.
    """
    # normpath tolerates a trailing slash, which would otherwise give an
    # empty model name.
    model_name = os.path.basename(os.path.normpath(args.data_dir))
    # make a dir to store checkpoints
    args.save_dir = os.path.join(args.save_checkpoints, model_name)
    os.makedirs(args.save_dir, exist_ok=True)

    data_loader = TextLoader(args.data_dir, args.batch_size, args.seq_length)
    args.vocab_size = data_loader.vocab_size

    # check compatibility if training is continued from previously saved model
    ckpt = None
    if args.init_from is not None:
        ckpt = _load_resume_checkpoint(args, data_loader)

    with open(os.path.join(args.save_dir, 'config.pkl'), 'wb') as f:
        cPickle.dump(args, f)
    with open(os.path.join(args.save_dir, 'chars_vocab.pkl'), 'wb') as f:
        cPickle.dump((data_loader.chars, data_loader.vocab), f)

    model = Model(args)

    # Stays None when no training step runs (e.g. num_epochs == 0).
    final_model = None
    total_steps = args.num_epochs * data_loader.num_batches
    with tf.Session() as sess:
        # instrument for tensorboard
        summaries = tf.summary.merge_all()
        writer = tf.summary.FileWriter(
                os.path.join(args.log_dir, time.strftime("%Y-%m-%d-%H-%M-%S")))
        try:
            writer.add_graph(sess.graph)

            sess.run(tf.global_variables_initializer())
            saver = tf.train.Saver(tf.global_variables())
            # restore model
            if ckpt is not None:
                saver.restore(sess, ckpt.model_checkpoint_path)
            for e in range(args.num_epochs):
                # Exponentially decay the learning rate once per epoch.
                sess.run(tf.assign(model.lr, args.learning_rate * (args.decay_rate ** e)))
                data_loader.reset_batch_pointer()
                state = sess.run(model.initial_state)
                for b in range(data_loader.num_batches):
                    step = e * data_loader.num_batches + b
                    start = time.time()
                    x, y = data_loader.next_batch()
                    # Feed the previous final state as a whole nested
                    # structure (as Model.sample does); this works for every
                    # cell type, not only for LSTM (c, h) state tuples.
                    feed = {model.input_data: x,
                            model.targets: y,
                            model.initial_state: state}

                    # instrument for tensorboard
                    summ, train_loss, state, _ = sess.run([summaries, model.cost, model.final_state, model.train_op], feed)
                    writer.add_summary(summ, step)

                    end = time.time()
                    print("{}/{} (epoch {}), train_loss = {:.3f}, time/batch = {:.3f}"
                          .format(step, total_steps, e, train_loss, end - start))

                    is_last_step = (e == args.num_epochs - 1
                                    and b == data_loader.num_batches - 1)
                    if step % args.save_every == 0 or is_last_step:
                        # remove previous checkpoints; match the
                        # "<model_name>-<step>" prefix so config.pkl and
                        # chars_vocab.pkl can never be deleted by accident
                        checkpoint_prefix = model_name + '-'
                        for f in os.listdir(args.save_dir):
                            old_file = os.path.join(args.save_dir, f)
                            if f.startswith(checkpoint_prefix) and os.path.isfile(old_file):
                                os.remove(old_file)
                        # save for the last result
                        checkpoint_path = os.path.join(args.save_dir, model_name)
                        saver.save(sess, checkpoint_path, global_step=step)
                        final_model = '{}-{}'.format(model_name, step)
                        print("Model saved to {}!".format(checkpoint_path))
        finally:
            # Flush pending summaries even if training is interrupted.
            writer.close()

    if final_model is None:
        print("No training step was run, so there is no checkpoint to export.")
        return

    # get the vocab
    model_vocab = getModelVocab(args.save_checkpoints, model_name)
    # dump the checkpoints to javascript
    dump_checkpoints(args.save_checkpoints, args.save_model, model_vocab, model_name, final_model)

# Start training with the default options when this cell/script is executed
# directly (the recorded cell output shows this branch running).
if __name__ == '__main__':
    main()

Here we go! Reading text file...
0/19700 (epoch 0), train_loss = 5.162, time/batch = 1.417
Model saved to checkpoints/derby_names/derby_names!
1/19700 (epoch 0), train_loss = 5.140, time/batch = 0.037
2/19700 (epoch 0), train_loss = 5.104, time/batch = 0.036
3/19700 (epoch 0), train_loss = 4.980, time/batch = 0.039
4/19700 (epoch 0), train_loss = 4.654, time/batch = 0.034
5/19700 (epoch 0), train_loss = 4.217, time/batch = 0.032
6/19700 (epoch 0), train_loss = 3.973, time/batch = 0.033
7/19700 (epoch 0), train_loss = 3.829, time/batch = 0.032
8/19700 (epoch 0), train_loss = 3.720, time/batch = 0.027
9/19700 (epoch 0), train_loss = 3.704, time/batch = 0.027
10/19700 (epoch 0), train_loss = 3.626, time/batch = 0.028
11/19700 (epoch 0), train_loss = 3.650, time/batch = 0.028
12/19700 (epoch 0), train_loss = 3.543, time/batch = 0.027
13/19700 (epoch 0), train_loss = 3.562, time/batch = 0.027
14/19700 (epoch 0), train_loss = 3.498, time/batch = 0.033
15/19700 (epoch 0), train_loss = 3.525, 

In [9]:
import os
import zipfile

# Bundle every file of the exported model into models/<model_name>.zip.
dir_name = 'models/{}'.format(model_name)

# Full path of each file anywhere below the model directory.
filePaths = [
    os.path.join(root, filename)
    for root, directories, files in os.walk(dir_name)
    for filename in files
]

with zipfile.ZipFile(dir_name+'.zip', 'w') as zip_file:
    # writing each file one by one
    for file in filePaths:
        zip_file.write(file)

print(dir_name+'.zip file is created successfully!')

models/derby_names.zip file is created successfully!
