In [None]:
# Word Prediction using Recurrent Neural Networks (RNNs)
## Experiment 2016-12-30

Experiment setup

### Table of Contents

1. Initialize
2. Prepare Data
3. Explore Data
4. Experiments
5. Test Models
6. Generate Text

## 1. Initialize
### Import

In [None]:
# import python modules
from __future__ import print_function, division
import os.path
import random

In [None]:
# import libraries (slow)
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from nltk import tokenize

In [None]:
# import our wp modules
import sys; sys.path.append('../../src')
import wp

In [None]:
# reload wp modules in case changed (for development purposes)
# `reload` is a builtin only on Python 2; on Python 3 it lives in importlib,
# so fall back to that when the bare name is missing.
try:
    reload
except NameError:
    from importlib import reload

# reload the package itself first, then each submodule listed below
reload(wp)
reload(wp.data)
reload(wp.util)
reload(wp.model)
reload(wp.ngram)
reload(wp.rnn)
reload(wp.experiment);  # trailing ';' keeps the module repr out of the cell output

## 2. Prepare Data

Clean and merge raw text files, split into train, validate, and test sets.

In [None]:
# get wrapper around all data and tokenization
# ('gutenbergs' presumably names the dataset to load — confirm against wp.data)
data = wp.data.Data('gutenbergs')

# clean, merge, split files
data.prepare()

In [None]:
# Individual preparation steps, left disabled (every call below is commented out).
# NOTE(review): presumably data.prepare() already performs all three — confirm
# against wp.data before re-enabling any of them.

# clean the raw data files - remove Gutenberg headers and footers, and non-ascii characters (nltk complains otherwise).
#data.clean()

# merge the cleaned data files into one.
#data.merge()

# split the merged file by sentences into train, validate, and test sets.
#data.split()

## 3. Explore Data

### Show some statistics

In [None]:
# slow
# run the dataset analysis and keep the result in `stats`
stats = data.analyze()
# last expression of the cell, so the result is displayed as the cell output
stats

### Show sentence lengths

In [None]:
# parameters handed to data.histogram; the result is stored in `df`
# NOTE(review): presumably the number of sentences to sample and the maximum
# sentence length (in words) to include — confirm against wp.data
nsentences = 100
nwordsmax = 100
df = data.histogram(nsentences, nwordsmax)

In [None]:
# horizontal box plot of the histogram table: rows are sorted in descending
# index order, then the table is transposed so each original row becomes a
# column (boxplot draws one box per column); ';' suppresses the axes repr
df.sort_index(axis=0,ascending=False).T.boxplot(vert=False);

In [None]:
# Horizontal swarm plot of the transposed histogram table on a 12x8 inch figure.
plt.figure(figsize=(12,8))
plt.xlabel('Words/Sentence')
# clip the x axis at the same maximum that was passed to data.histogram
plt.xlim([0, nwordsmax])
# The former `split=True` argument is dropped: it only affects plots that use
# `hue` nesting (not used here), and newer seaborn releases renamed it to
# `dodge` and no longer accept `split`. Without it the cell runs on both old
# and new seaborn with the same result.
sns.swarmplot(data=df.T, orient='h');

In [None]:
# NOTE(review): `stop` is not assigned anywhere in the visible notebook, so
# evaluating it raises NameError and halts a "Run All" at this point —
# presumably a deliberate brake before the cells below; confirm or remove.
stop

### Show some samples of the text

In [None]:
# Print a handful of 200-character excerpts taken at evenly spaced offsets
# through the merged text.
nsamples = 4
s_merged = data.text('merged')
nchars = len(s_merged)
nskip = int(nchars / nsamples)  # distance between consecutive excerpt starts
for start in (k * nskip for k in range(nsamples)):
    excerpt = s_merged[start:start + 200]
    # flatten newlines so each excerpt prints as a single paragraph
    excerpt = excerpt.replace('\n', ' ').strip()
    # excerpt followed by one blank line
    print(excerpt, end='\n\n')

### Show some text split into sentences

The merged text is split by sentence into the train, validate, and test sets; the samples below show what individual sentences look like.

In [None]:
# we'll just look at the first 50k characters, because parsing sentences is slow
sentences = data.sentences('merged', 50000)
# fixed seed so the same four sentences are drawn on every run
random.seed(2)
samples = random.sample(sentences, 4)
# print the sampled sentences separated by a blank line
print('\n\n'.join(samples))

### Show the text split into tokens

Note that punctuation marks are treated as separate tokens.

In [None]:
# tokenize the merged text
# NOTE(review): 50000 is presumably a character limit, as in the sentences
# cell — confirm against wp.data
tokens = data.tokens('merged', 50000)
# report how many tokens were produced
print('ntokens',len(tokens))
# show the last 50 tokens
print(tokens[-50:])

## 4. Experiments

Conduct some experiments

### 4.1 Test models on some test data

After preparing the 6 MB of Gutenberg data, let's first try out the models on a much smaller dataset.

In [None]:
# switch to the 'animals' dataset; note that this rebinds `data`, replacing
# the 'gutenbergs' wrapper created in the Prepare Data section
data = wp.data.Data('animals')
# last expression of the cell, so its return value is displayed
data.text()

In [None]:
# define models to train and test: each spec is a [model class, keyword dict]
# pair — n-gram models for n = 1..4, followed by one small RNN
model_specs = list([wp.ngram.Ngram, {'n': order}] for order in range(1, 5))
model_specs.append([wp.rnn.Rnn, {'nvocabmax': 10, 'nhidden': 10}])

In [None]:
# define parameters to run experiments on
# NOTE(review): 'train_amount' values are presumably fractions of the
# training data (0.5 = half, 1.0 = all) — confirm against wp.experiment
params = {'train_amount':[0.5, 1.0]}

# create experiment
# (built from the model specs, the current `data` wrapper, and the parameters above)
exper = wp.experiment.Experiment(model_specs, data, params)

In [None]:
# run it
# (executes the experiment object created in the previous cell)
exper.run()

**TODO:** Plot the results.

### 4.2 Compare n-gram and RNN performance

In [None]:
# define models to train and test: each spec is a [model class, keyword dict]
# pair — n-gram models for n = 1..4, then a larger and a smaller RNN
model_specs = list([wp.ngram.Ngram, {'n': order}] for order in range(1, 5))
model_specs.extend([
    [wp.rnn.Rnn, {'nvocabmax': 1000, 'nhidden': 100}],
    [wp.rnn.Rnn, {'nvocabmax': 10, 'nhidden': 10}],
])

In [None]:
# train models on different amounts of training data

# NOTE(review): `train_amounts` is defined but not used by the call below.
train_amounts = [0.0001, 0.001, 0.01, 0.1, 1.0] # fraction of total training data

# `nchars_list` is passed to init_model_table below; it had been left commented
# out, so this cell failed with NameError on a fresh kernel. It is restored here
# with the single size that was still active in the commented-out line.
# NOTE(review): presumably training-set sizes in characters; earlier runs also
# listed 10000, 100000, 1000000, 6000000 — extend as time allows.
nchars_list = [1000]
model_table = wp.analyze.init_model_table(model_specs, data, nchars_list)
print('done')

## 5. Test Models

Test all models on held-out test data.

In [None]:
# test all models and save results to a pandas dataframe

# settings handed to wp.analyze.test_model_table
# NOTE(review): presumably the number of test characters to read and a cap on
# the number of predictions scored — confirm against wp.analyze
ntest_chars = 10000
npredictions_max = 1000
k = 3 # predict top k tokens

# note: this rebinds `df`, which earlier held the sentence-length table
df = wp.analyze.test_model_table(model_table, data, ntest_chars, npredictions_max, k)

In [None]:
# display the results table returned by test_model_table
df

In [None]:
# Plot one curve per row of the results table, with the column labels on the
# x axis (presumably one row per model and one column per training-set size).
# Rows are walked with iterrows() rather than the `.ix` indexer, which has been
# removed from pandas. Each line is labelled with its row's index value so the
# legend has entries to show; without labels plt.legend() would be empty.
for name, row in df.iterrows():
    plt.plot(df.columns, row, label=name)
# place the legend outside the axes, to the right of the plot
plt.legend(loc=(1.1,0.5))
plt.xscale('log')
plt.xlabel('Training set size (chars)')
plt.ylabel('Accuracy')
plt.show()

## 6. Generate Text

In [None]:
# Generate a few sample texts from each model and print them under a header.
nsentences = 5
rule = '-'*80
models = model_table[-1] # use models with most training data
# NOTE(review): the first entry of `models` is skipped — reason not visible here
for model in models[1:]:
    # header: model name followed by a horizontal rule
    print(model.name, rule, sep='\n')
    for seed in range(nsentences):
        # reseed per sample so each generated text is reproducible
        random.seed(seed)
        # generated text followed by one blank line
        print(model.generate(), end='\n\n')
    # extra blank line between models
    print()