In [1]:
'''
  Reference : https://github.com/roomylee/rcnn-text-classification
'''

import tensorflow as tf
import numpy as np
import pandas as pd
import nltk
import os, re
from tqdm import tqdm

## Dataset

In [2]:
class Dataset:
    def load_directory_data(self, directory):
        data = {}
        data["sentence"] = []
        data["sentiment"] = []
        for file_path in tqdm(os.listdir(directory)):
            with tf.gfile.GFile(os.path.join(directory, file_path), "r") as f:
                data["sentence"].append(f.read())
                data["sentiment"].append(re.match("\d+_(\d+)\.txt", file_path).group(1))
        return pd.DataFrame.from_dict(data)

    def load_dataset(self, directory):
        pos_df = self.load_directory_data(os.path.join(directory, "pos"))
        neg_df = self.load_directory_data(os.path.join(directory, "neg"))
        pos_df["polarity"] = 1
        neg_df["polarity"] = 0
        return pd.concat([pos_df, neg_df]).sample(frac=1).reset_index(drop=True)

    def download_and_load_datasets(self):
        dataset = tf.keras.utils.get_file(
          fname="aclImdb.tar.gz", 
          origin="http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz", 
          extract=True)
        
        train_df = self.load_dataset(os.path.join(os.path.dirname(dataset),"aclImdb", "train"))
        test_df = self.load_dataset(os.path.join(os.path.dirname(dataset), "aclImdb", "test"))
        return train_df, test_df

dataset = Dataset()
train_df, test_df = dataset.download_and_load_datasets()

100%|██████████| 12500/12500 [00:00<00:00, 22656.32it/s]
100%|██████████| 12500/12500 [00:00<00:00, 22315.48it/s]
100%|██████████| 12500/12500 [00:00<00:00, 22314.52it/s]
100%|██████████| 12500/12500 [00:00<00:00, 21673.04it/s]


In [3]:
train_df.head()

Unnamed: 0,sentence,sentiment,polarity
0,"The banner says it all, this is one really bad...",2,0
1,I had high expectations of this movie (the tit...,3,0
2,This is a sad movie about this woman who thoug...,1,0
3,"This is just typical Bruce Willis, action movi...",4,0
4,Bette Davis turns in a coldly amusing performa...,8,1


## Parameter

In [4]:
class Config:
    sequence_length = 200
    num_classes = 2 # 0 or 1
    
    batch_size=128
    total_epoch=15
    
    learning_rate = 0.001
    
    word_embedding_dim = 100
    context_embedding_dim = 256
    cell_type = 'gru'
    hidden_size = 256
    l2_reg_lambda = 0.5
    dropout_keep_prob = 0.7

## Input Preprocessing

In [5]:
def get_word_list(texts):
    import string
    word_list = []
    for text in tqdm(texts, disable = len(texts) < 10):
        for c in string.punctuation:
            text = text.replace(c,"")
        word_list.extend(text.lower().split(' '))
    return word_list

word_list = ['<eos>', '<pad>'] + get_word_list(train_df['sentence']) + get_word_list(test_df['sentence'])
word_list = list(set(word_list))
word_dict = {w: i for i, w in enumerate(word_list)}
vocab_size = len(word_dict)
print('vocab_size :', vocab_size)

100%|██████████| 25000/25000 [00:00<00:00, 30337.07it/s]
100%|██████████| 25000/25000 [00:00<00:00, 32522.85it/s]


vocab_size : 182792


In [6]:
train_X, test_X, train_y, test_y = [], [], [], []

def get_X(texts):
    X = []
    for text in texts:
        x = [word_dict[n] for n in get_word_list([text])][:Config.sequence_length]
        if len(x) < Config.sequence_length:
            x += [word_dict['<pad>']] * (Config.sequence_length - len(x))
        X.append(np.asarray(x))
    return X

def get_y(polarities):
    y = []
    for out in polarities:
        y.append(np.eye(Config.num_classes)[out])
    return y
        

train_X = get_X(train_df['sentence'])
test_X = get_X(test_df['sentence'])

train_y = get_y(train_df['polarity'])
test_y = get_y(test_df['polarity'])

## Model

In [7]:
class TextRCNN:
    def __init__(self, 
                 sequence_length, 
                 num_classes, 
                 vocab_size, 
                 word_embedding_size, 
                 context_embedding_size,
                 cell_type, 
                 hidden_size, 
                 l2_reg_lambda=0.0):
        
        self.input_text = tf.placeholder(tf.int32, shape=[None, sequence_length], name='input_text') # [batch_size, sequence_length]
        self.input_y = tf.placeholder(tf.float32, shape=[None, num_classes], name='input_y') # [batch_size, num_classes]
        self.dropout_keep_prob = tf.placeholder(tf.float32, name='dropout_keep_prob') 
        
        l2_loss = tf.constant(0.0)
        text_length = self._length(self.input_text)
        
        # Embeddings
        with tf.device('/cpu:0'), tf.name_scope("embedding"):
            self.W_text = tf.Variable(tf.random_uniform([vocab_size, word_embedding_size], -1.0, 1.0), name="W_text") # [vocab_size, word_embedding_size]
            self.embedded_chars = tf.nn.embedding_lookup(self.W_text, self.input_text)  # [batch_size, sequence_length, word_embedding_size]
        
        # Bidirectional(Left&Right) Recurrent Structure
        with tf.name_scope("bi-rnn"):
            fw_cell = self._get_cell(context_embedding_size, cell_type) 
            fw_cell = tf.nn.rnn_cell.DropoutWrapper(fw_cell, output_keep_prob=self.dropout_keep_prob)
            bw_cell = self._get_cell(context_embedding_size, cell_type)
            bw_cell = tf.nn.rnn_cell.DropoutWrapper(bw_cell, output_keep_prob=self.dropout_keep_prob)
            (self.output_fw, self.output_bw), states = tf.nn.bidirectional_dynamic_rnn(
                cell_fw=fw_cell,
                cell_bw=bw_cell,
                inputs=self.embedded_chars,
                sequence_length=text_length,
                dtype=tf.float32)
            
            # self.output_fw: [batch_size, sequence_length, context_embedding_dim]
            
        with tf.name_scope("context"):
            shape = [tf.shape(self.output_fw)[0], 1, tf.shape(self.output_fw)[2]] # [batch_size, 1, context_embedding_dim]
            self.c_left = tf.concat([tf.zeros(shape), self.output_fw[:, :-1]], axis=1, name="context_left")
            self.c_right = tf.concat([self.output_bw[:, 1:], tf.zeros(shape)], axis=1, name="context_right")
        
        with tf.name_scope("word-representation"):
            embedding_size = 2*context_embedding_size + word_embedding_size
            self.x = tf.concat([self.c_left, self.embedded_chars, self.c_right], axis=2, name="x") 
            # self.x : [batch_size, sequence_length, embedding_size(2*context_embedding_size + word_embedding_size)]

        with tf.name_scope("text-representation"):
            W2 = tf.Variable(tf.random_uniform([embedding_size, hidden_size], -1.0, 1.0), name="W2")
            b2 = tf.Variable(tf.constant(0.1, shape=[hidden_size]), name="b2")
            
            self.y2 = tf.einsum('aij,jk->aik', self.x, W2) # y2[a,i,k] = sum_j (x[a,i,j] * W2[j, k])
            self.y2 = tf.tanh(self.y2 + b2)
            # y2: [batch_size, sequence_length, hidden_size]

        with tf.name_scope("max-pooling"):
            self.y3 = tf.reduce_max(self.y2, axis=1)
            # y3: [batch_size, hidden_size]

        with tf.name_scope("output"):
            W4 = tf.get_variable("W4", shape=[hidden_size, num_classes], initializer=tf.contrib.layers.xavier_initializer())
            b4 = tf.Variable(tf.constant(0.1, shape=[num_classes]), name="b4")
            l2_loss += tf.nn.l2_loss(W4) ## sum(W4 ** 2) / 2
            l2_loss += tf.nn.l2_loss(b4) ## sum(b4 ** 2) / 2
            self.logits = tf.nn.xw_plus_b(self.y3, W4, b4, name="logits")
            self.predictions = tf.argmax(self.logits, 1, name="predictions")

        # Calculate mean cross-entropy loss
        with tf.name_scope("loss"):
            losses = tf.nn.softmax_cross_entropy_with_logits(logits=self.logits, labels=self.input_y)
            self.loss = tf.reduce_mean(losses) + l2_reg_lambda * l2_loss

        # Accuracy
        with tf.name_scope("accuracy"):
            correct_predictions = tf.equal(self.predictions, tf.argmax(self.input_y, axis=1))
            self.accuracy = tf.reduce_mean(tf.cast(correct_predictions, tf.float32), name="accuracy")

        
    @staticmethod
    def _get_cell(hidden_size, cell_type):
        if cell_type == "vanilla":
            return tf.nn.rnn_cell.BasicRNNCell(hidden_size)
        elif cell_type == "lstm":
            return tf.nn.rnn_cell.BasicLSTMCell(hidden_size)
        elif cell_type == "gru":
            return tf.nn.rnn_cell.GRUCell(hidden_size)
        else:
            print("ERROR: '" + cell_type + "' is a wrong cell type !!!")
            return None
        
    # Length of the sequence data
    @staticmethod
    def _length(seq):
        # Ex) seq : [[1, 2, 3, 4], [5, 6, 7, 8]
        relevant = tf.sign(tf.abs(seq)) #   ->  [[1, 1, 1, 1], [1, 1, 1, 1]]
        length = tf.reduce_sum(relevant, reduction_indices=1) #   ->  [[4], [4]]
        length = tf.cast(length, tf.int32)
        return length
                                                                             

## Train

In [8]:

dev_sample_index = int(0.1 * float(len(test_X)))
dev_X = test_X[:dev_sample_index]
dev_y = test_y[:dev_sample_index]

tf.reset_default_graph()

rcnn = TextRCNN(
    sequence_length=Config.sequence_length,
    num_classes=Config.num_classes,
    vocab_size=vocab_size,
    word_embedding_size=Config.word_embedding_dim,
    context_embedding_size=Config.context_embedding_dim,
    cell_type=Config.cell_type,
    hidden_size=Config.hidden_size,
    l2_reg_lambda=Config.l2_reg_lambda
)      

# Define Training procedure
global_step = tf.Variable(0, name="global_step", trainable=False)
train_op = tf.train.AdamOptimizer(Config.learning_rate).minimize(rcnn.loss, global_step=global_step)

sess = tf.Session()
sess.run(tf.global_variables_initializer())

total_batch = int(len(train_X) / Config.batch_size)

for epoch in range(Config.total_epoch):
    total_cost = 0
    for i in range(total_batch):
        batch_X = train_X[Config.batch_size * i:Config.batch_size * (i+1)]
        batch_y = train_y[Config.batch_size * i:Config.batch_size * (i+1)]
        _, step, loss, accuracy = sess.run([train_op, global_step, rcnn.loss, rcnn.accuracy], feed_dict={
            rcnn.input_text: batch_X, 
            rcnn.input_y: batch_y,
            rcnn.dropout_keep_prob: Config.dropout_keep_prob,
        })
        total_cost += loss
            
    print('Epoch :', '%04d' % (epoch), 
          'Avg. cost = ', '{:.4f}'.format(total_cost/ total_batch), 
          'Acc: {:.4f}'.format(accuracy))

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
This class is equivalent as tf.keras.layers.GRUCell, and will be replaced by that in Tensorflow 2.0.
Instructions for updating:
Please use `keras.layers.Bidirectional(keras.layers.RNN(cell))`, which is equivalent to this API
Instructions for updating:
Please use `keras.layers.RNN(cell)`, which is equivalent to this API
Instructions for updating:
Use tf.cast instead.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.

For more information, please see:
  * https://github.com/tensorflow/community/blob/master/rfcs/20180907-contrib-sunset.md
  * https://github.com/tensorflow/addons
If you depend on functionality not listed there, please file an issue.

Instructions for updating:

Future major versions of TensorFlow will allow gradients to flow
into the labels input on backprop by default.

See `tf.nn.softmax_cross_entropy_with_log

## Evaluation

In [36]:
 # Collect the predictions here
all_predictions = []
total_batch = int(len(test_X) / Config.batch_size)

predictions = []
answers = []

for i in  range(total_batch):
    batch_X = test_X[Config.batch_size * i:Config.batch_size * (i+1)]
    batch_y = test_y[Config.batch_size * i:Config.batch_size * (i+1)]
    batch_predictions = sess.run([rcnn.predictions], feed_dict={
        rcnn.input_text: batch_X, 
        rcnn.dropout_keep_prob: 1.0,
    })
    
    predictions += batch_predictions[0].tolist()
    answers += batch_y

from sklearn.metrics import classification_report
print(classification_report(np.argmax(answers, 1), predictions))

              precision    recall  f1-score   support

           0       0.88      0.75      0.81     12484
           1       0.78      0.90      0.84     12476

   micro avg       0.82      0.82      0.82     24960
   macro avg       0.83      0.82      0.82     24960
weighted avg       0.83      0.82      0.82     24960

