# RNN Variable Length Text Classifier - Sequence 2 Sequence

In this Notebook, we'll extend our sequence classification model to do sequence-to-sequence learning. We’ll use the same dataset, but instead of having our model guess the author’s age bracket and gender at the end of the sequence (i.e., only once), we’ll have it guess at every timestep.

<a href="https://r2rt.com/recurrent-neural-networks-in-tensorflow-iii-variable-length-sequences.html#a-basic-model-for-sequence-to-sequence-learning">[Ref]</a>

In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
import os
import math

import blogs_data #available at https://github.com/spitis/blogs_data

## Hyperparameters

In [2]:
DATA_PCT_LOADED = 0.06

MAX_SEQ_LEN = 30

STATE_SIZE = 64

BATCH_SIZE = 256

NUM_EPOCHS = 10

## Read data

In [3]:
df = blogs_data.loadBlogs().sample(frac=DATA_PCT_LOADED).reset_index(drop=True)
df.head(3)

Unnamed: 0,post_id,gender,age_bracket,string,as_numbers,length
0,135203,0,0,failed to capture the <UNK> killer who tried t...,"[1572, 5, 4450, 4, 0, 3202, 74, 434, 5, 2933, ...",22
1,157615,1,0,i am obsessed with eerie <UNK> and <UNK> . and...,"[3, 73, 3296, 26, 9980, 0, 6, 0, 2, 6, 341, 74...",20
2,42895,1,2,"we did some shopping after that at the mall , ...","[32, 80, 67, 932, 109, 9, 35, 4, 1224, 1, 6, 6...",15


In [4]:
vocab,reverse_vocab = blogs_data.loadVocab()
train_len, test_len = math.floor(len(df)*0.8), math.floor(len(df)*0.2)
train_len,test_len

(73382, 18345)

In [5]:
train = df.iloc[:train_len-1]
test = df.iloc[train_len:train_len + test_len]

In [6]:
train.head(2)

Unnamed: 0,post_id,gender,age_bracket,string,as_numbers,length
0,135203,0,0,failed to capture the <UNK> killer who tried t...,"[1572, 5, 4450, 4, 0, 3202, 74, 434, 5, 2933, ...",22
1,157615,1,0,i am obsessed with eerie <UNK> and <UNK> . and...,"[3, 73, 3296, 26, 9980, 0, 6, 0, 2, 6, 341, 74...",20


In [7]:
test.head(2)

Unnamed: 0,post_id,gender,age_bracket,string,as_numbers,length
73382,93043,1,1,then <UNK> asks me to be is date to a dinner p...,"[65, 0, 1963, 19, 5, 30, 14, 742, 5, 7, 528, 4...",14
73383,89410,0,1,i <UNK> with <UNK> to no avail .,"[3, 0, 26, 0, 5, 81, 8141, 2]",8


In [8]:
df = None

## Manage data

### Data iterator

In [9]:
class SimpleDataIterator():
    def __init__(self,df):
        self.df = df
        self.size = len(self.df)
        self.epochs = 0
        self.shuffle()
        
    def shuffle(self):
        self.df = self.df.sample(frac=1).reset_index(drop=True)
        self.cursor = 0
        
    def next_batch(self,n):
        if(self.cursor+n-1 > self.size):
            self.epochs += 1
            self.shuffle()
        res = self.df.iloc[self.cursor:self.cursor+n]
        self.cursor += n
        return res['as_numbers'],res['gender']*3 + res['age_bracket'],res['length']

In [10]:
data = SimpleDataIterator(train)
d = data.next_batch(3)
print('Input sequences\n', d[0], end='\n\n')
print('Target values\n', d[1], end='\n\n')
print('Sequence lengths\n', d[2])

Input sequences
 0    [4, 1003, 140, 14, 5, 59, 0, 6, 824, 5, 4, 533...
1    [1, 3, 75, 3, 214, 30, 5859, 57, 4, 257, 148, ...
2    [0, 7291, 1, 143, 54, 3099, 186, 5, 6985, 821,...
Name: as_numbers, dtype: object

Target values
 0    1
1    1
2    3
dtype: int64

Sequence lengths
 0    14
1    24
2    19
Name: length, dtype: int64


### Data padding

In [11]:
class BucketedDataIterator(SimpleDataIterator):
    
    def __init__(self,df,num_buckets=5):
        df = df.sort_values('length').reset_index(drop=True)
        self.size = len(df) // num_buckets
        self.dfs = []
        for bucket in range(num_buckets):
            self.dfs.append(df.iloc[bucket*self.size:(bucket+1)*self.size - 1])
        self.num_buckets = num_buckets
        
        #cursor[i] will be the cursor for the ith bucket
        self.cursor = np.array([0] * num_buckets)
        self.shuffle()
        self.epochs = 0
        
    def shuffle(self):
        #sorts dataframe by sequence length, but keeps it random within the same length
        for i in range(self.num_buckets):
            self.dfs[i] = self.dfs[i].sample(frac=1).reset_index(drop=True)
            self.cursor[i] = 0
    
    def next_batch(self,n):
        
        if(np.any(self.cursor+n+1 > self.size)):
            self.epochs += 1
            self.shuffle()
        
        i = np.random.randint(0,self.num_buckets)
    
        res = self.dfs[i].iloc[self.cursor[i]:self.cursor[i]+n]
        self.cursor[i] += n
        
        #Pad sequences with 0s so they are all the same length
        maxlen = max(res['length'])
        x = np.zeros([n,maxlen],dtype=np.int32)
        for i,x_i in enumerate(x):
            x_i[:res['length'].values[i]] = res['as_numbers'].values[i]
        
        return x,res['gender']*3 + res['age_bracket'], res['length']

In [12]:
data = BucketedDataIterator(train)
d = data.next_batch(3)
print('Input sequences\n',d[0],end='\n\n')

Input sequences
 [[  82 5021  209  116  118   21    0    6 1078   45    6    9   17  220
    25    0    0    0]
 [  43   54   17   28   55  127  205  395    1   54   17   11    0    2
    39    0    0    0]
 [4813   16 7447    7  213  791 1981   19    8    7    0  379    9   85
     7    0  298    2]]



## Model

In [28]:
def reset_graph():
    if 'sess' in globals() and sess:
        sess.close()
    tf.reset_default_graph()
    
def build_graph(vocab_size = len(vocab), state_size = 64, batch_size = 256, num_classes = 6):
    
    reset_graph()
    
    #Placeholders
    x = tf.placeholder(tf.int32,[batch_size,None]) #[batch_size, num_steps]
    seqlen = tf.placeholder(tf.int32,[batch_size])
    y = tf.placeholder(tf.int32,[batch_size])
    keep_prob = tf.placeholder(1.0,name='keep_prob')
    
    #Tile the target indices
    y_ = tf.tile(tf.expand_dims(y,1),[1,tf.shape(x)[1]]) #[batch_size,num_steps]
    
    #Create a mask that we will use for the cost function
    lower_triangular_ones = tf.constant(value=np.tril(np.ones([MAX_SEQ_LEN,MAX_SEQ_LEN])),dtype=tf.float32)
    seqlen_mask = tf.slice(tf.gather(lower_triangular_ones,seqlen-1),[0,0],[batch_size,tf.reduce_max(seqlen)])
    
    #Embedding layer
    embeddings = tf.get_variable('embedding_matrix',[vocab_size,state_size])
    rnn_inputs = tf.nn.embedding_lookup(embeddings,x)
    
    #RNN
    cell = tf.nn.rnn_cell.GRUCell(state_size)
    init_state = tf.get_variable('init_state',[1,state_size],initializer=tf.constant_initializer(0.0))
    init_state = tf.tile(init_state,[batch_size,1])
    rnn_outputs, final_state = tf.nn.dynamic_rnn(cell,rnn_inputs,sequence_length=seqlen,initial_state=init_state)
    rnn_outputs = tf.nn.dropout(rnn_outputs,keep_prob) #Dropout
    
    #Last revelant output no needed anymore
    #last_rnn_output = tf.gather_nd(rnn_outputs,tf.stack([tf.range(batch_size),seqlen-1],axis=1))
    
    #Reshape rnn_outputs and y
    rnn_outputs = tf.reshape(rnn_outputs,[-1,state_size])
    y_reshaped = tf.reshape(y_,[-1])
    
    #Softmax layer - Prediction
    with tf.variable_scope('softmax'):
        W = tf.get_variable('W',[state_size,num_classes])
        b = tf.get_variable('b',[num_classes],initializer=tf.constant_initializer(0.0))
    logits = tf.matmul(rnn_outputs,W) + b
    preds = tf.nn.softmax(logits)
    
    #To calculate number correct, we want to count padded steps as incorrect
    correct = tf.cast(tf.equal(tf.cast(tf.argmax(preds,1),tf.int32), y_reshaped),tf.int32) *\
                tf.cast(tf.reshape(seqlen_mask, [-1]),tf.int32)
    
    
    
    #To calculate accuracy we want to divide by the number of non-paded time-steps, rather than taking the mean
    accuracy = tf.reduce_sum(tf.cast(correct,tf.float32)) / tf.reduce_sum(tf.cast(seqlen,tf.float32))
    
    loss = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=logits,labels=y_reshaped)
    loss = loss * tf.reshape(seqlen_mask,[-1])
    
    #To calculate average loss we need to divide by number of non-padded time-steps, rather than taking the mean
    loss = tf.reduce_sum(loss) / tf.reduce_sum(seqlen_mask)
    
    #Optimizer
    train_step = tf.train.AdamOptimizer(1e-4).minimize(loss)
    
    ret_dict = {'x':x,'seqlen':seqlen,'y':y,'dropout':keep_prob,'loss':loss,'ts':train_step,'preds':preds,'accuracy':accuracy}
    
    return ret_dict

## Train function

In [29]:
def train_graph(graph,batch_size = 256, num_epochs = 10, iterator=BucketedDataIterator):
    with tf.Session() as sess:
        tf.global_variables_initializer().run()
        tr = iterator(train)
        te = iterator(test)
        
        step,accuracy = 0,0
        tr_losses,te_losses = [],[]
        current_epoch = 0
        
        while current_epoch < num_epochs:
            step += 1
            batch = tr.next_batch(batch_size)
            feed = {g['x']:batch[0],g['y']:batch[1],g['seqlen']:batch[2],g['dropout']:0.6}
            
            accuracy_,_ = sess.run([g['accuracy'],g['ts']],feed_dict = feed)
            accuracy += accuracy_
            
            if(tr.epochs > current_epoch):
                current_epoch += 1
                tr_losses.append(accuracy/step)
                step,accuracy = 0,0
                
                #eval test set
                te_epoch = te.epochs
                while (te.epochs == te_epoch):
                    step =+ 1
                    batch = te.next_batch(batch_size)
                    feed = {g['x']:batch[0],g['y']:batch[1],g['seqlen']:batch[2],g['dropout']:1.0}
                    accuracy_ = sess.run([g['accuracy']],feed_dict = feed)[0]
                    accuracy += accuracy_
                    
                te_losses.append(accuracy/step)
                step,accuracy = 0,0
                print('Accuracy after epoch',current_epoch," - tr:", tr_losses[-1]," -te:", te_losses[-1])
                
    return tr_losses,te_losses

## Train & Test

In [30]:
g = build_graph(state_size=STATE_SIZE,batch_size=BATCH_SIZE)

In [31]:
train_graph(g,batch_size=BATCH_SIZE,num_epochs=NUM_EPOCHS)

Accuracy after epoch 1  - tr: 0.22006205223  -te: 14.8814923018
Accuracy after epoch 2  - tr: 0.23790044311  -te: 15.3518848717
Accuracy after epoch 3  - tr: 0.274168842421  -te: 14.7157956362
Accuracy after epoch 4  - tr: 0.288378841404  -te: 15.2641283125
Accuracy after epoch 5  - tr: 0.296289797159  -te: 16.6624484658
Accuracy after epoch 6  - tr: 0.303212320251  -te: 13.0666888058
Accuracy after epoch 7  - tr: 0.307922588902  -te: 13.9947309792
Accuracy after epoch 8  - tr: 0.314735962609  -te: 17.2314655781
Accuracy after epoch 9  - tr: 0.323095282339  -te: 14.3281701207
Accuracy after epoch 10  - tr: 0.331531378601  -te: 16.4985881597


([0.2200620522304457,
  0.23790044311007022,
  0.2741688424213366,
  0.28837884140425712,
  0.29628979715895143,
  0.3032123202513039,
  0.30792258890165436,
  0.31473596260902731,
  0.32309528233913276,
  0.33153137860113174],
 [14.881492301821709,
  15.351884871721268,
  14.715795636177063,
  15.264128312468529,
  16.662448465824127,
  13.066688805818558,
  13.994730979204178,
  17.231465578079224,
  14.328170120716095,
  16.498588159680367])