# Modeling Character Sequences using RNNs
## Author: Ankit Gupta

In [1]:
import numpy as np
import tensorflow as tf

In [2]:
# helper function for reading from a file

def open_read(path):
    """Read a text file and collapse it into one line of text.

    Blank lines are dropped and every remaining line is stripped of its
    surrounding whitespace. If at least one kept line contains a period the
    lines are joined with a single space; otherwise they are joined with
    '. ' so that each line ends up reading as its own sentence.

    Bytes that cannot be decoded are skipped (``errors='ignore'``).

    Args:
        path: path of the text file to read.

    Returns:
        The joined text as a single stripped string ('' for an empty file).
    """
    lines = []
    has_period = False
    # The context manager guarantees the handle is closed; the corpus has
    # thousands of files, so leaking one descriptor per call adds up.
    with open(path, errors='ignore') as handle:
        for raw_line in handle:
            line = raw_line.strip()
            if not line:
                continue
            lines.append(line)
            if '.' in line:
                has_period = True

    delimiter = ' ' if has_period else '. '
    return delimiter.join(lines).strip()

In [3]:
# reading every .txt file under the OANC 'slate' journal directory (subdirectories included)
import os

dirs = ['./datasets/text_generator/OANC/written_1/journal/slate/']
texts, files = [], []

# `dirs` doubles as a work queue: sub-directories found along the way are
# appended to it and visited later, so the whole tree is covered without
# recursion.
cursor = 0
while cursor < len(dirs):
    current_dir = dirs[cursor]
    cursor += 1
    for entry in os.listdir(current_dir):
        full_path = os.path.join(current_dir, entry)
        if os.path.isdir(full_path):
            dirs.append(full_path)
        elif full_path.endswith('.txt'):
            # texts[i] is the content of files[i]
            texts.append(open_read(full_path))
            files.append(full_path)

total_chars = sum(len(doc) for doc in texts)

print('# .txt files', len(texts), 'total chars', total_chars)
print(' '.join(texts[:2]))

# .txt files 3913 total chars 21641381
Harmonic Convergences You're right, Maxim's strong point is that it's totally unsentimental and ungenteel. It's a sendup of the old model, but in a different way than, say, Hustler was, and the difference (surprise) reflects the sexual culture of the '90s. With its belligerent grossness and misogyny, Hustler rebelled against the establishment men's mags' class condescension, the earnest philosophizing about the sexual revolution, the "thinking men's sex bomb" syndrome, at the same time that it was deliberately goading feminists. It came right out with the anger that the regular men's mags tried to hide. Maxim pokes fun at its progenitors but with considerable ironic affection. It's not angry. In fact, while its fondness for the most idiotic, juvenile humor knows no bounds, any strong emotion is taboo (unless you count horror at having your penis mangled)--that's part of the British influence, I guess. And feminism isn't an issue, at least not dire

In [4]:
# Use at most the first 10000 documents (the slice simply takes every
# document when fewer were read), joined by single spaces, and split the
# result into an array holding one character per element.

selected_docs = texts[:10000]
text = ' '.join(selected_docs)
text_list = np.array([ch for ch in text])
print(len(text_list), text_list)

21645293 ['H' 'a' 'r' ..., 'r' 'i' 's']


In [5]:
# sorted array of the distinct characters that occur in text_list
np.unique(text_list)

array(['\t', ' ', '!', '"', '#', '$', '%', '&', "'", '(', ')', '*', '+',
       ',', '-', '.', '/', '0', '1', '2', '3', '4', '5', '6', '7', '8',
       '9', ':', ';', '<', '=', '>', '?', '@', 'A', 'B', 'C', 'D', 'E',
       'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R',
       'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', '[', '\\', ']', '^', '_',
       '`', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l',
       'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y',
       'z', '{', '}'],
      dtype='<U1')

In [6]:
from sklearn.preprocessing import LabelBinarizer

binarizer = LabelBinarizer()

# Small demo on five characters: each row of the printed matrix is the
# one-hot vector of the corresponding character, with one column per
# distinct character of the sample in sorted order.
demo_chars = text_list[19:24]
print(demo_chars)
print(binarizer.fit_transform(demo_chars))

# Refit on the whole corpus so the binarizer knows every character; the
# number of classes is the width of the one-hot vectors fed to the network.
binarizer.fit(text_list)
print(len(binarizer.classes_))

['e' 's' ' ' 'Y' 'o']
[[0 0 1 0 0]
 [0 0 0 0 1]
 [1 0 0 0 0]
 [0 1 0 0 0]
 [0 0 0 1 0]]
94


In [7]:
# encoding the data

from sklearn.preprocessing import LabelEncoder

encoder = LabelEncoder()

# Small demo on five characters: each character is mapped to the integer
# index of its class among the sample's distinct characters.
demo_slice = text_list[19:24]
print(demo_slice)
print(encoder.fit_transform(demo_slice))

# Refit on the whole corpus so every character has an integer label.
encoder.fit(text_list)
print(len(encoder.classes_))

['e' 's' ' ' 'Y' 'o']
[2 4 0 1 3]
94


In [8]:
# preparing a big dataset - not very efficient to make a big dataset at once
n_steps = 100

# Single-entry cache {(epoch, n_starts): permutation}. Shuffling every start
# index of the corpus is expensive, and consecutive calls almost always ask
# for the same epoch, so the last permutation is kept and reused.
_perm_cache = {}


def _epoch_permutation(epoch, n_starts):
    """Return the shuffled window start indices for ``epoch`` (cached)."""
    key = (epoch, n_starts)
    if key not in _perm_cache:
        _perm_cache.clear()  # hold only one epoch's permutation in memory
        # A private RandomState seeded with `epoch` yields the same
        # permutation as np.random.seed(epoch) + np.random.permutation, but
        # leaves the global NumPy RNG untouched.
        _perm_cache[key] = np.random.RandomState(epoch).permutation(n_starts)
    return _perm_cache[key]


def get_next_batch(epoch, batch, batch_size):
    """Build one training batch of character windows from ``text_list``.

    The start indices of all windows are shuffled deterministically per
    epoch (seeded with ``epoch``); batch number ``batch`` takes the
    ``batch``-th slice of ``batch_size`` start indices from that shuffle.

    Args:
        epoch: epoch number, used as the shuffle seed.
        batch: zero-based index of the batch within the epoch.
        batch_size: number of sequences requested.

    Returns:
        (X, y), both int32:
        X has shape (rows, n_steps, n_classes) and holds the one-hot encoded
        characters of each window; y has shape (rows, n_steps) and holds the
        integer labels of the same window shifted one character to the right
        (the next-character targets). ``rows`` equals ``batch_size`` except
        for a trailing partial batch, which is returned shorter.

    Raises:
        ValueError: if the corpus is too short to contain a single window.
    """
    # starting indices of the sequences of length n_steps + 1
    n_starts = len(text_list) - n_steps - 1
    if n_starts <= 0:
        raise ValueError('text_list is too short for sequences of n_steps + 1 characters')

    perm = _epoch_permutation(epoch, n_starts)
    starts_indices = perm[batch * batch_size: (batch + 1) * batch_size]

    # Size the arrays from the slice actually obtained so that the last,
    # possibly shorter, batch of an epoch does not index out of range.
    n_rows = len(starts_indices)
    X_ret = np.zeros((n_rows, n_steps, len(binarizer.classes_)), dtype='int32')
    y_ret = np.zeros((n_rows, n_steps), dtype='int32')

    for row, start in enumerate(starts_indices):
        X_ret[row] = binarizer.transform(text_list[start: start + n_steps])
        y_ret[row] = encoder.transform(text_list[start + 1: start + n_steps + 1])

    return X_ret, y_ret

# Sanity check on a tiny batch: the second printed row (targets) should be
# the first printed row (inputs) shifted left by one character.
X_temp, y_temp = get_next_batch(0, 0, 3)
print(X_temp.shape, y_temp.shape)

# integer label of each input character, recovered from its one-hot row
for label in X_temp[0].argmax(axis=1):
    print(label, end=' ')
print()
# integer label of each target character
for label in y_temp[0]:
    print(label, end=' ')

(3, 100, 94) (3, 100)
73 79 84 80 79 1 68 80 79 85 70 79 69 84 1 85 73 66 85 1 36 77 74 79 85 80 79 1 74 84 1 85 73 70 1 3 78 80 84 85 1 69 74 84 83 70 81 86 85 66 67 77 70 1 81 83 70 84 74 69 70 79 85 1 70 87 70 83 15 3 1 53 73 70 1 81 83 80 71 86 84 74 80 79 1 80 71 1 84 68 66 79 69 66 77 84 14 14 71 83 
79 84 80 79 1 68 80 79 85 70 79 69 84 1 85 73 66 85 1 36 77 74 79 85 80 79 1 74 84 1 85 73 70 1 3 78 80 84 85 1 69 74 84 83 70 81 86 85 66 67 77 70 1 81 83 70 84 74 69 70 79 85 1 70 87 70 83 15 3 1 53 73 70 1 81 83 80 71 86 84 74 80 79 1 80 71 1 84 68 66 79 69 66 77 84 14 14 71 83 80 

In [9]:
# constructing the RNN

# Model hyper-parameters. Inputs and outputs both range over the character
# vocabulary learned by `binarizer`, so they have the same width.
n_inputs = len(binarizer.classes_)
n_outputs = n_inputs
n_neurons = 200  # units per GRU layer
n_layers = 3     # number of stacked GRU layers

# start from an empty default graph so re-running this cell does not pile up
# duplicate ops
tf.reset_default_graph()

X = tf.placeholder(dtype='int32', shape=(None, n_steps, n_inputs), name='X') # one hot encoding of chars
y = tf.placeholder(dtype='int32', shape=(None, n_steps), name='y')  # integer labels of chars

# stack of n_layers GRU cells
layers = [tf.contrib.rnn.GRUCell(num_units=n_neurons, activation=tf.nn.tanh) for i in range(n_layers)]
multi_cell = tf.contrib.rnn.MultiRNNCell(layers)
# convert outputs of len n_neurons to n_outputs
multi_cell_opw = tf.contrib.rnn.OutputProjectionWrapper(multi_cell, output_size=n_outputs)
# X is fed as int32 one-hot vectors and cast to float32 for the RNN;
# the printed tensor shows logits with shape (batch, n_steps, n_outputs)
logits, states = tf.nn.dynamic_rnn(multi_cell_opw, tf.cast(X, dtype='float32'), dtype='float32')
print(logits)

# convert the output at each time step into probs using softmax
probs = tf.nn.softmax(logits, name='probs')

# loss: cross entropy between the logits and the integer label at every time
# step (shape (batch, n_steps)), averaged into a single scalar
xentropy = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=logits, labels=y)
loss = tf.reduce_mean(xentropy, name='loss')
print(xentropy.shape, loss.shape)

# optimizer: Adam minimising the mean cross entropy
optimizer = tf.train.AdamOptimizer(learning_rate=0.001)
training_op = optimizer.minimize(loss)

# saver, used later to write the trained model to disk
saver = tf.train.Saver()

# op that initialises all variables; it is run when the session is created
init = tf.global_variables_initializer()

Tensor("rnn/transpose:0", shape=(?, 100, 94), dtype=float32)
(?, 100) ()


In [None]:
# running a tf session for training

# An InteractiveSession stays open across cells and acts as the default
# session, which is what the `.eval()` calls in later cells rely on.
sess = tf.InteractiveSession()
sess.run(init)

# Training position, kept at notebook level so the training cell can be
# interrupted and re-run without starting over.
epoch = 0
batch = 0

In [50]:
batch_size = 600

# number of full batches in one pass over all sequence start positions
n_batches = (len(text_list) - n_steps - 1) // batch_size

# `batch` lives at notebook level: interrupting this cell and running it
# again resumes from the batch that was reached.
while batch < n_batches:
    X_batch, y_batch = get_next_batch(epoch, batch, batch_size)
    feed = {X: X_batch, y: y_batch}
    sess.run(training_op, feed_dict=feed)
    if batch % 50 == 0:
        # report the loss on the batch that was just trained on
        eval_loss = loss.eval(feed_dict=feed)
        print('batch', batch, 'loss:', eval_loss, end='  ')
        # optional checkpointing once the loss is low enough:
        #if eval_loss < 1.7:
        #    saver.save(sess, './datasets/savers/text_generator_chars/text_generator_chars')
    batch += 1

# loss should be < 1.6 for it to produce meaningful words

batch 1500 loss: 1.45361  

KeyboardInterrupt: 

In [51]:
# final loss

# loss on the last batch produced by the training cell
final_loss = loss.eval(feed_dict={X: X_batch, y: y_batch})
print('loss:', final_loss)

loss: 1.48638


In [55]:
# saving the trained model: writes the variables of the running session to
# the given checkpoint path (the call's return value is shown as the cell output)

saver.save(sess, './datasets/savers/text_generator_chars/text_generator_chars')

'./datasets/savers/text_generator_chars/text_generator_chars'

In [88]:
# generating new sequence from the model

def _top_k_distribution(prob_distr, top_k):
    """Keep only the ``top_k`` largest probabilities and renormalise.

    Entries smaller than the ``top_k``-th largest value are set to zero so
    that very unlikely characters can never be sampled; entries tied with
    that value are all kept. The result is returned as a new float64 array:
    normalising in float64 keeps the sum close enough to 1 for
    ``np.random.choice`` even though the network outputs float32.
    """
    distr = np.asarray(prob_distr, dtype='float64').copy()
    threshold = np.sort(distr)[-top_k:][0]
    distr[distr < threshold] = 0
    return distr / distr.sum()


starting_context = 'The '
output_length = 1000
top_k = 3

if not starting_context:
    # with an empty seed there is no "last seen character" to predict from
    raise ValueError('starting_context must contain at least one character')

output_seq = list(starting_context)
# the network input holds exactly n_steps characters, so only the most
# recent n_steps characters of the context can be used as the seed
seed = output_seq[-n_steps:]
n_classes = len(binarizer.classes_)

while len(output_seq) < output_length:
    # if needed pad the seed on the right to make its length equal to n_steps;
    # the prediction is read at the last real character, before the padding
    seed_padded = seed + [' '] * (n_steps - len(seed))

    # one-hot encode the padded seed as a batch of one sequence
    X_seed = np.zeros((1, n_steps, n_classes), dtype='int32')
    X_seed[0] = binarizer.transform(seed_padded)

    # distribution over the next character, given everything up to the
    # last real character of the seed
    seed_probs = probs.eval(feed_dict={X: X_seed})[0]
    next_char_prob_distr = _top_k_distribution(seed_probs[len(seed) - 1], top_k)

    # sample the next char label from this distribution
    next_char_label = np.random.choice(n_classes, p=next_char_prob_distr)
    # inverse_transform expects a 1-D array of labels, so wrap the scalar
    next_char = encoder.inverse_transform([next_char_label])[0]
    output_seq.append(next_char)

    # update seed: slide the window once it is full
    seed.append(next_char)
    if len(seed) > n_steps:
        seed = seed[1:]

print(''.join(output_seq))

The New York Times , but we contract with the state in the story. The NYT and Texas, the New Yorker Can and New York Times , the New York Times , because the case of the secret off the party first, the president as the country, the start of the same actually are sent of any services are an endorsemory and to see their sense of the conservation of the second counter of their companience. It was a campaign to the present of the president. The New York Times lines to take a standing to the case to an antile that the state in the same touring to be a story is that his case offer and self-and tripping only be a street concern that they've a case of the same anyone who was the complete change of a care on the party of controssed to be able out on the same time in the same titled and the characterial considerable to the president was a contraction. The NYT and Newsweeks without a party on this in a contribution of the submit that the second second improval on this way of cannal to the contest