# RNN Variable Length Text Classifier

In this Notebook, we’ll use Tensorflow to construct an RNN that operates on input sequences of variable lengths. We’ll use this RNN to classify bloggers by age bracket and gender using sentence-long writing samples. One time step will represent a single word, with the complete input sequence representing a single sentence. The challenge is to build a model that can classify multiple sentences of different lengths at the same time.

<a href="https://r2rt.com/recurrent-neural-networks-in-tensorflow-iii-variable-length-sequences.html">[Ref]</a>

In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
import os
import math

import blogs_data #available at https://github.com/spitis/blogs_data

## Hyperparameters

## Read data

In [2]:
df = blogs_data.loadBlogs().sample(frac=1).reset_index(drop=True)
df.head(3)

Unnamed: 0,post_id,gender,age_bracket,string,as_numbers,length
0,16013,1,0,i have to babysit the bitter - beer - faced ba...,"[3, 23, 5, 6900, 4, 2894, 33, 1103, 33, 3272, ...",13
1,55584,0,2,"anyway , hopefully that will be enough <UNK> f...","[255, 1, 622, 9, 60, 30, 240, 0, 16, 4, 6894, ...",18
2,102139,1,2,"i keep thinking about all those <UNK> <UNK> , ...","[3, 239, 288, 47, 37, 161, 0, 0, 1, 612, 2]",11


In [3]:
vocab,reverse_vocab = blogs_data.loadVocab()
train_len, test_len = math.floor(len(df)*0.05), math.floor(len(df)*0.01)
train_len,test_len

(76440, 15288)

In [4]:
train = df.iloc[:train_len-1]
test = df.iloc[train_len:train_len + test_len]

In [5]:
train.head(2)

Unnamed: 0,post_id,gender,age_bracket,string,as_numbers,length
0,16013,1,0,i have to babysit the bitter - beer - faced ba...,"[3, 23, 5, 6900, 4, 2894, 33, 1103, 33, 3272, ...",13
1,55584,0,2,"anyway , hopefully that will be enough <UNK> f...","[255, 1, 622, 9, 60, 30, 240, 0, 16, 4, 6894, ...",18


In [6]:
test.head(2)

Unnamed: 0,post_id,gender,age_bracket,string,as_numbers,length
76440,169221,0,0,i figure this class will either be really easy...,"[3, 636, 29, 320, 60, 395, 30, 69, 614, 6, 220...",18
76441,93206,1,1,"luckily for him , <UNK> 's wife <UNK> heard wh...","[1942, 16, 84, 1, 0, 22, 802, 0, 433, 51, 58, ...",21


In [7]:
df = None

## Manage data

### Data iterator

In [8]:
class SimpleDataIterator():
    def __init__(self,df):
        self.df = df
        self.size = len(self.df)
        self.epochs = 0
        self.shuffle()
        
    def shuffle(self):
        self.df = self.df.sample(frac=1).reset_index(drop=True)
        self.cursor = 0
        
    def next_batch(self,n):
        if(self.cursor+n-1 > self.size):
            self.epochs += 1
            self.shuffle()
        res = self.df.iloc[self.cursor:self.cursor+n]
        self.cursor += n
        return res['as_numbers'],res['gender']*3 + res['age_bracket'],res['length']

In [9]:
data = SimpleDataIterator(train)
d = data.next_batch(3)
print('Input sequences\n', d[0], end='\n\n')
print('Target values\n', d[1], end='\n\n')
print('Sequence lengths\n', d[2])

Input sequences
 0    [4, 0, 2989, 88, 130, 2241, 11, 0, 6, 5142, 27...
1    [0, 1516, 63, 45, 1288, 0, 4160, 4160, 251, 57...
2    [16, 249, 1, 3, 80, 28, 59, 1527, 21, 5178, 22...
Name: as_numbers, dtype: object

Target values
 0    4
1    0
2    3
dtype: int64

Sequence lengths
 0    16
1    26
2    13
Name: length, dtype: int64


### Data padding

In [10]:
class PaddedDataIterator(SimpleDataIterator):
    def next_batch(self,n):
        if self.cursor+n > self.size:
            self.epochs += 1
            self.shuffle()
        res = self.df.iloc[self.cursor:self.cursor+n]
        self.cursor += n
        
        #Pad sequences with 0s so they are all the same length
        maxlen = max(res['length'])
        x = np.zeros([n,maxlen],dtype=np.int32)
        for i,x_i in enumerate(x):
            x_i[:res['length'].values[i]] = res['as_numbers'].values[i]
        
        return x,res['gender']*3 + res['age_bracket'], res['length']

In [11]:
data = PaddedDataIterator(train)
d = data.next_batch(3)
print('Input sequences\n',d[0],end='\n\n')

Input sequences
 [[ 304   52   55   22  112  126   96    0    0 7318    2    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0]
 [  51    3   53 6581   14   48    3   73   90    5   71    5   76 3336
     8 3719   41   12 1259  192   11 3294 4516   21  106 1109  150    2]
 [ 230    4  915  196    1    4  244  954   14   47  874    8    4 2261
    24  452   10    2    0    0    0    0    0    0    0    0    0    0]]



## Model

In [12]:
def reset_graph():
    if 'sess' in globals() and sess:
        sess.close()
    tf.reset_default_graph()
    
def build_graph(vocab_size = len(vocab), state_size = 64, batch_size = 256, num_classes = 6):
    
    reset_graph()
    
    #Placeholders
    x = tf.placeholder(tf.int32,[batch_size,None]) #[batch_size, num_steps]
    seqlen = tf.placeholder(tf.int32,[batch_size])
    y = tf.placeholder(tf.int32,[batch_size])
    keep_prob = tf.placeholder(1.0,name='keep_prob')
    
    #Embedding layer
    embeddings = tf.get_variable('embedding_matrix',[vocab_size,state_size])
    rnn_inputs = tf.nn.embedding_lookup(embeddings,x)
    
    #RNN
    cell = tf.nn.rnn_cell.GRUCell(state_size)
    init_state = tf.get_variable('init_state',[1,state_size],initializer=tf.constant_initializer(0.0))
    init_state = tf.tile(init_state,[batch_size,1])
    rnn_outputs, final_state = tf.nn.dynamic_rnn(cell,rnn_inputs,sequence_length=seqlen,initial_state=init_state)
    rnn_outputs = tf.nn.dropout(rnn_outputs,keep_prob) #Dropout
    
    #Last revelant output
    last_rnn_output = tf.gather_nd(rnn_outputs,tf.stack([tf.range(batch_size),seqlen-1],axis=1))
    
    #Softmax layer - Prediction
    with tf.variable_scope('softmax'):
        W = tf.get_variable('W',[state_size,num_classes])
        b = tf.get_variable('b',[num_classes],initializer=tf.constant_initializer(0.0))
    logits = tf.matmul(last_rnn_output,W) + b
    preds = tf.nn.softmax(logits)
    correct = tf.equal(tf.cast(tf.argmax(preds,1),tf.int32),y)
    accuracy = tf.reduce_mean(tf.cast(correct,tf.float32))
    
    loss = tf.reduce_mean(tf.nn.sparse_softmax_cross_entropy_with_logits(logits=logits,labels=y))
    train_step = tf.train.AdamOptimizer(1e-4).minimize(loss)
    
    ret_dict = {'x':x,'seqlen':seqlen,'y':y,'dropout':keep_prob,'loss':loss,'ts':train_step,'preds':preds,'accuracy':accuracy}
    
    return ret_dict

## Train function

In [13]:
def train_graph(graph,batch_size = 256, num_epochs = 10, iterator=PaddedDataIterator):
    with tf.Session() as sess:
        tf.global_variables_initializer().run()
        tr = iterator(train)
        te = iterator(test)
        
        step,accuracy = 0,0
        tr_losses,te_losses = [],[]
        current_epoch = 0
        
        while current_epoch < num_epochs:
            step += 1
            batch = tr.next_batch(batch_size)
            feed = {g['x']:batch[0],g['y']:batch[1],g['seqlen']:batch[2],g['dropout']:0.6}
            
            accuracy_,_ = sess.run([g['accuracy'],g['ts']],feed_dict = feed)
            accuracy += accuracy_
            
            if(tr.epochs > current_epoch):
                current_epoch += 1
                tr_losses.append(accuracy/step)
                step,accuracy = 0,0
                
                #eval test set
                te_epoch = te.epochs
                while (te.epochs == te_epoch):
                    step =+ 1
                    batch = te.next_batch(batch_size)
                    feed = {g['x']:batch[0],g['y']:batch[1],g['seqlen']:batch[2],g['dropout']:1.0}
                    accuracy_ = sess.run([g['accuracy']],feed_dict = feed)[0]
                    accuracy += accuracy_
                    
                te_losses.append(accuracy/step)
                step,accuracy = 0,0
                print('Accuracy after epoch',current_epoch," - tr:", tr_losses[-1]," -te:", te_losses[-1])
                
    return tr_losses,te_losses

## Train & Test

In [14]:
g = build_graph()

In [15]:
train_graph(g)

Accuracy after epoch 1  - tr: 0.219625313545  -te: 15.38671875
Accuracy after epoch 2  - tr: 0.250235947987  -te: 17.65625
Accuracy after epoch 3  - tr: 0.296953649329  -te: 18.21875
Accuracy after epoch 4  - tr: 0.313102978188  -te: 18.5859375
Accuracy after epoch 5  - tr: 0.320928586409  -te: 18.7734375
Accuracy after epoch 6  - tr: 0.330156774329  -te: 18.796875
Accuracy after epoch 7  - tr: 0.336868183725  -te: 18.56640625
Accuracy after epoch 8  - tr: 0.344628250839  -te: 18.40625
Accuracy after epoch 9  - tr: 0.353450083893  -te: 18.45703125
Accuracy after epoch 10  - tr: 0.359807571309  -te: 18.39453125


([0.2196253135451505,
  0.25023594798657717,
  0.29695364932885904,
  0.31310297818791949,
  0.32092858640939598,
  0.33015677432885904,
  0.33686818372483224,
  0.34462825083892618,
  0.35345008389261745,
  0.35980757130872482],
 [15.38671875,
  17.65625,
  18.21875,
  18.5859375,
  18.7734375,
  18.796875,
  18.56640625,
  18.40625,
  18.45703125,
  18.39453125])