In [1]:
'''
  Reference : https://github.com/graykode/nlp-tutorial
'''

import tensorflow as tf
import numpy as np
import pandas as pd
import nltk
import os, re
from tqdm import tqdm

## Dataset

In [2]:
class Dataset:
    def load_directory_data(self, directory):
        data = {}
        data["sentence"] = []
        data["sentiment"] = []
        for file_path in tqdm(os.listdir(directory)):
            with tf.gfile.GFile(os.path.join(directory, file_path), "r") as f:
                data["sentence"].append(f.read())
                data["sentiment"].append(re.match("\d+_(\d+)\.txt", file_path).group(1))
        return pd.DataFrame.from_dict(data)

    def load_dataset(self, directory):
        pos_df = self.load_directory_data(os.path.join(directory, "pos"))
        neg_df = self.load_directory_data(os.path.join(directory, "neg"))
        pos_df["polarity"] = 1
        neg_df["polarity"] = 0
        return pd.concat([pos_df, neg_df]).sample(frac=1).reset_index(drop=True)

    def download_and_load_datasets(self):
        dataset = tf.keras.utils.get_file(
          fname="aclImdb.tar.gz", 
          origin="http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz", 
          extract=True)
        
        train_df = self.load_dataset(os.path.join(os.path.dirname(dataset),"aclImdb", "train"))
        test_df = self.load_dataset(os.path.join(os.path.dirname(dataset), "aclImdb", "test"))
        return train_df, test_df

dataset = Dataset()
train_df, test_df = dataset.download_and_load_datasets()

100%|██████████| 12500/12500 [00:00<00:00, 22772.60it/s]
100%|██████████| 12500/12500 [00:00<00:00, 22199.33it/s]
100%|██████████| 12500/12500 [00:00<00:00, 21402.17it/s]
100%|██████████| 12500/12500 [00:00<00:00, 21914.88it/s]


In [3]:
train_df.head()

Unnamed: 0,sentence,sentiment,polarity
0,Tasteless. I can't even write intelligently ab...,1,0
1,Having watched 10 minutes of this movie I was ...,1,0
2,There are moments in this unique cartoon of pu...,4,0
3,"A good film, and one I'll watch a number of ti...",8,1
4,"In Stand By Me, Vern and Teddy discuss who was...",7,1


## Parameter

In [4]:
embedding_dim = 50
n_class = 2 
n_hidden = 50
sequence_length = n_step = 100
total_epoch=30
batch_size=128

## Input Preprocessing

In [5]:
def get_word_list(texts):
    import string
    word_list = []
    for text in tqdm(texts, disable = len(texts) < 10):
        for c in string.punctuation:
            text = text.replace(c,"")
        word_list.extend(text.lower().split(' '))
    return word_list

word_list = ['<eos>', '<pad>'] + get_word_list(train_df['sentence']) + get_word_list(test_df['sentence'])
word_list = list(set(word_list))
word_dict = {w: i for i, w in enumerate(word_list)}
vocab_size = len(word_dict)
print('vocab_size :', vocab_size)

100%|██████████| 25000/25000 [00:00<00:00, 30466.00it/s]
100%|██████████| 25000/25000 [00:00<00:00, 31730.08it/s]


vocab_size : 182792


In [6]:
train_X, test_X, train_y, test_y = [], [], [], []

def get_X(texts):
    X = []
    for text in texts:
        x = [word_dict[n] for n in get_word_list([text])][:sequence_length]
        if len(x) < sequence_length:
            x += [word_dict['<pad>']] * (sequence_length - len(x))
        X.append(np.asarray(x))
    return X

def get_y(polarities):
    y = []
    for out in polarities:
        y.append(np.eye(n_class)[out])
    return y
        

train_X = get_X(train_df['sentence'])
test_X = get_X(test_df['sentence'])

train_y = get_y(train_df['polarity'])
test_y = get_y(test_df['polarity'])

## Model

In [9]:
tf.reset_default_graph()

X = tf.placeholder(tf.int32, [None, n_step])
Y = tf.placeholder(tf.int32, [None, n_class])
out = tf.Variable(tf.random_normal([n_hidden*2, n_class]))

embedding = tf.Variable(tf.random_uniform([vocab_size, embedding_dim]))
X_embedded = tf.nn.embedding_lookup(embedding, X) # [batch_size, n_step, embedding_dim]

lstm_fw_cell = tf.nn.rnn_cell.LSTMCell(n_hidden)
lstm_bw_cell = tf.nn.rnn_cell.LSTMCell(n_hidden)

output, final_state = tf.nn.bidirectional_dynamic_rnn(lstm_fw_cell, lstm_bw_cell, X_embedded, dtype=tf.float32)
# output: [2, batch_size, n_step, n_hidden]
# output[0] : lstm_fw, output[1] : lstm_bw

# final_state[1]: LSTMStateTuple, final_state[1][0]: LSTM hidden state, final_state[1][1]: LSTM output

output = tf.concat([output[0], output[1]], 2) # output: [batch_size, n_step, n_hidden * 2]
final_hidden_state = tf.concat([final_state[1][0], final_state[1][1]], 1) # final_hidden_state : [batch_size, n_hidden * 2]]     
final_hidden_state = tf.expand_dims(final_hidden_state, 2) # final_hidden_state : [batch_size, n_hidden * 2, 1]     

attn_weights = tf.matmul(output, final_hidden_state)  # [batch_size, n_step, 1]
attn_weights = tf.squeeze(attn_weights, 2) # [batch_size, n_step]
soft_attn_weights = tf.nn.softmax(attn_weights, 1) # [batch_size, n_step]

output = tf.transpose(output, [0, 2, 1]) # [batch_size, n_hidden * 2, n_step]
soft_attn_weights_expanded = tf.expand_dims(soft_attn_weights, 2) # [batch_size, n_step, 1]
context = tf.matmul(output, soft_attn_weights_expanded)  # [batch_size, n_hidden * 2, 1]
context = tf.squeeze(context, 2) # [batch_size, n_hidden * 2]

model = tf.matmul(context, out)

cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits_v2(logits=model, labels=Y))
optimizer = tf.train.AdamOptimizer(0.001).minimize(cost)

hypothesis = tf.nn.softmax(model)
is_correct = tf.equal(tf.argmax(hypothesis, 1), tf.argmax(Y, 1))
accuracy = tf.reduce_mean(tf.cast(is_correct, tf.float32))

final_state[1] : LSTMStateTuple(c=<tf.Tensor 'bidirectional_rnn/bw/bw/while/Exit_3:0' shape=(?, 50) dtype=float32>, h=<tf.Tensor 'bidirectional_rnn/bw/bw/while/Exit_4:0' shape=(?, 50) dtype=float32>)


## Train

In [8]:
sess = tf.Session()
sess.run(tf.global_variables_initializer())

total_batch = int(len(train_X) / batch_size)

for epoch in range(total_epoch):
    total_cost = 0
    for i in range(total_batch):
        batch_X = train_X[batch_size * i:batch_size * (i+1)]
        batch_y = train_y[batch_size * i:batch_size * (i+1)]
        _, loss = sess.run([optimizer, cost], feed_dict={X: batch_X, Y: batch_y})
        total_cost += loss
    
    import datetime
    time_str = datetime.datetime.now().isoformat()
    print('{}: '.format(time_str), 'Epoch :', '%04d' % (epoch), 'Avg. cost = ', '{:.4f}'.format(total_cost/ total_batch))
    
    if epoch % 5 == 0:
        print("\nEvaluation:")
        print('acc :', sess.run(accuracy, feed_dict={X: test_X, Y: test_y}))

2019-05-03T20:54:16.812126:  Epoch : 0000 Avg. cost =  0.6721

Evaluation:
acc : 0.74368
2019-05-03T20:54:30.613949:  Epoch : 0001 Avg. cost =  0.4161
2019-05-03T20:54:43.751618:  Epoch : 0002 Avg. cost =  0.3005
2019-05-03T20:54:56.891496:  Epoch : 0003 Avg. cost =  0.2353
2019-05-03T20:55:10.027332:  Epoch : 0004 Avg. cost =  0.1984
2019-05-03T20:55:23.194406:  Epoch : 0005 Avg. cost =  0.1619

Evaluation:
acc : 0.80616
2019-05-03T20:55:36.930170:  Epoch : 0006 Avg. cost =  0.1324
2019-05-03T20:55:50.070052:  Epoch : 0007 Avg. cost =  0.1080
2019-05-03T20:56:03.202011:  Epoch : 0008 Avg. cost =  0.0622
2019-05-03T20:56:16.369505:  Epoch : 0009 Avg. cost =  0.0567
2019-05-03T20:56:29.509454:  Epoch : 0010 Avg. cost =  0.0454

Evaluation:
acc : 0.71588
2019-05-03T20:56:43.265214:  Epoch : 0011 Avg. cost =  0.0525
2019-05-03T20:56:56.427623:  Epoch : 0012 Avg. cost =  0.0806
2019-05-03T20:57:09.560878:  Epoch : 0013 Avg. cost =  0.0180
2019-05-03T20:57:22.700452:  Epoch : 0014 Avg. cost

## Result

In [9]:
hypothesis = tf.nn.softmax(model)
is_correct = tf.equal(tf.argmax(hypothesis, 1), tf.argmax(Y, 1))
accuracy = tf.reduce_mean(tf.cast(is_correct, tf.float32))

print('Train acc :', sess.run(accuracy, feed_dict={X: train_X, Y: train_y}))
print('Test acc :', sess.run(accuracy, feed_dict={X: test_X, Y: test_y}))

Train acc : 0.9998
Test acc : 0.78844
