# RNN

Read texts, train an RNN and plot results

Adapted from https://github.com/fchollet/keras/blob/master/examples/pretrained_word_embeddings.py


In [None]:
# import python modules

from __future__ import print_function, division
import sys
print(sys.version)
import os
import os.path
import random
import codecs
import re

In [None]:
# import libraries

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

from nltk import tokenize
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils.np_utils import to_categorical
from keras.layers import Dense, Input, Flatten
from keras.layers import Conv1D, MaxPooling1D, Embedding
from keras.models import Model

In [None]:
# set parameters

# Seed every RNG this notebook draws from so reruns are reproducible.
# np.random.seed only affects NumPy; the paragraph shuffle further down
# calls the stdlib random.shuffle, which needs its own seed.
np.random.seed(0)
random.seed(0)

# Input locations, all relative to the parent directory of this notebook.
BASE_DIR = '..'
TEXT_DIR = BASE_DIR + '/data/gutenbergs/1-raw'  # TODO: move to parent dir
GLOVE_DIR = BASE_DIR + '/_vectors/glove.6B'

# Width of the pretrained word vectors; also selects which GloVe file is read.
EMBEDDING_DIM = 50
GLOVE_FILE = GLOVE_DIR + '/glove.6B.%dd.txt' % EMBEDDING_DIM

# Vocabulary cap handed to the tokenizer and used to size the embedding matrix.
NVOCAB = 10000

# Not used in the cells visible here; presumably the fractions of the data
# held out for validation and testing -- confirm where they are consumed.
VALIDATION_SPLIT = 0.05
TEST_SPLIT = 0.05

In [None]:
# read texts
# Concatenate every file in TEXT_DIR, in sorted filename order, into the
# single string `text`.
print('Reading texts')
encoding = 'utf-8'
parts = []  # per-file contents, joined once below instead of growing a string with +=
for filename in sorted(os.listdir(TEXT_DIR)):
    filepath = TEXT_DIR +'/' + filename
    print(filepath)
    # errors='ignore' silently drops bytes that are not valid UTF-8
    with codecs.open(filepath, 'r', encoding=encoding, errors='ignore') as f:
        s = f.read()
    # codecs.open does no newline translation, so normalize CRLF by hand
    s = s.replace('\r\n','\n')
    parts.append(s)
text = ''.join(parts)
del parts  # the joined string is all that is needed from here on
print('done')

In [None]:
# Shuffle the corpus at paragraph granularity: cut at runs of two or more
# newlines, permute the pieces in place, then rejoin with one blank line.
paragraphs = re.compile(r"\n\n+").split(text)
print(len(paragraphs))  # the original author recorded 22989 here
random.shuffle(paragraphs)
text = '\n\n'.join(paragraphs)
del paragraphs  # the list is not needed once the text has been rejoined
text[0:1000]  # cell output: preview of the first 1000 characters

In [None]:
# tokenize text into word indexes
# The whole corpus is passed as a single document, so only sequences[0]
# is inspected below.
texts = [text] # just one giant text
# NOTE(review): `nb_words` is the older Keras spelling of this argument;
# later releases name it `num_words` -- confirm against the installed version.
tokenizer = Tokenizer(nb_words=NVOCAB)
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)
# show the first 100 word indexes as a sanity check
print(sequences[0][:100])

In [None]:
# Fetch the word -> integer index mapping built by the tokenizer.
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))
# Spot-check one entry; presumably raises KeyError if 'a' never occurred
# in the corpus (assuming word_index is a plain dict).
print('a:', word_index['a'])

In [None]:
# clear some memory
# Both names must go: `texts` is a one-element list that still references
# the corpus string, so deleting `text` alone would not release it.
del text
del texts

In [None]:
# get word vectors
# Parse the GloVe text file into `word_vectors`, a dict mapping each word to
# a float32 NumPy array. Each line is split on whitespace: the first field
# is the word, the remaining fields are its coefficients.
import io  # io.open accepts `encoding` on Python 2 as well as Python 3

print('Reading word vectors...')
word_vectors = {}
with io.open(GLOVE_FILE, encoding='utf8') as f:
    for line in f:
        values = line.split()
        if not values:
            # tolerate blank lines instead of failing on values[0]
            continue
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        word_vectors[word] = coefs
print('Found %s word vectors.' % len(word_vectors))

In [None]:
# Sanity-check the loaded vectors: print the vector stored for 'a',
# then a sample of ten keys.
print('a:',word_vectors['a'])
print(list(word_vectors.keys())[:10])

In [None]:
# Assemble the embedding matrix E: row i receives the pretrained vector of
# the word whose tokenizer index is i, for every index up to NVOCAB.
nwords = min(NVOCAB, len(word_index))
E = np.zeros((nwords + 1, EMBEDDING_DIM))
for word, iword in word_index.items():
    if iword <= NVOCAB:
        word_vector = word_vectors.get(word)
        # a word with no pretrained vector keeps its all-zero row
        if word_vector is not None:
            E[iword] = word_vector

In [None]:
# Inspect the embedding matrix: its number of rows, then its first three rows.
# NOTE(review): row 0 stays all zeros if tokenizer indexes start at 1 --
# presumably a padding row; confirm.
print(len(E))
print(E[:3])

In [None]:
# clear some memory
# The vectors needed for the vocabulary were copied into E, so the full
# GloVe dict can be released.
del word_vectors