In [1]:
'''
  Reference : https://github.com/graykode/nlp-tutorial
'''

import tensorflow as tf
import numpy as np
import pandas as pd
import nltk
import os, re
from tqdm import tqdm

## Dataset

In [2]:
class Dataset:
    def load_directory_data(self, directory):
        data = {}
        data["sentence"] = []
        data["sentiment"] = []
        for file_path in tqdm(os.listdir(directory)):
            with tf.gfile.GFile(os.path.join(directory, file_path), "r") as f:
                data["sentence"].append(f.read())
                data["sentiment"].append(re.match("\d+_(\d+)\.txt", file_path).group(1))
        return pd.DataFrame.from_dict(data)

    def load_dataset(self, directory):
        pos_df = self.load_directory_data(os.path.join(directory, "pos"))
        neg_df = self.load_directory_data(os.path.join(directory, "neg"))
        pos_df["polarity"] = 1
        neg_df["polarity"] = 0
        return pd.concat([pos_df, neg_df]).sample(frac=1).reset_index(drop=True)

    def download_and_load_datasets(self):
        dataset = tf.keras.utils.get_file(
          fname="aclImdb.tar.gz", 
          origin="http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz", 
          extract=True)
        
        train_df = self.load_dataset(os.path.join(os.path.dirname(dataset),"aclImdb", "train"))
        test_df = self.load_dataset(os.path.join(os.path.dirname(dataset), "aclImdb", "test"))
        return train_df, test_df

dataset = Dataset()
train_df, test_df = dataset.download_and_load_datasets()

100%|██████████| 12500/12500 [00:00<00:00, 22212.95it/s]
100%|██████████| 12500/12500 [00:00<00:00, 21819.29it/s]
100%|██████████| 12500/12500 [00:00<00:00, 22109.83it/s]
100%|██████████| 12500/12500 [00:00<00:00, 22257.68it/s]


In [3]:
train_df.head()

Unnamed: 0,sentence,sentiment,polarity
0,"I never much liked the Myra movie, tho I appre...",3,0
1,Richard Attenborough who already given us magn...,7,1
2,"Yeah, well, I definitely had regrets about giv...",2,0
3,This was obviously the prototype for Mick Dund...,8,1
4,"As usual, Sean Connery does a great job. Lawre...",9,1


## Parameter

In [4]:
embedding_size = 50
sequence_length = 200
num_classes = 2 # 0 or 1
filter_sizes = [2,3,4] # n-gram window
num_filters = 3
batch_size=128
total_epoch=40

## Input Preprocessing

In [5]:
def get_word_list(texts):
    import string
    word_list = []
    for text in tqdm(texts, disable = len(texts) < 10):
        for c in string.punctuation:
            text = text.replace(c,"")
        word_list.extend(text.lower().split(' '))
    return word_list

word_list = ['<eos>', '<pad>'] + get_word_list(train_df['sentence']) + get_word_list(test_df['sentence'])
word_list = list(set(word_list))
word_dict = {w: i for i, w in enumerate(word_list)}
vocab_size = len(word_dict)
print('vocab_size :', vocab_size)

100%|██████████| 25000/25000 [00:00<00:00, 30712.08it/s]
100%|██████████| 25000/25000 [00:00<00:00, 31713.83it/s]


vocab_size : 182792


In [6]:
train_X, test_X, train_y, test_y = [], [], [], []

def get_X(texts):
    X = []
    for text in texts:
        x = [word_dict[n] for n in get_word_list([text])][:sequence_length]
        if len(x) < sequence_length:
            x += [word_dict['<pad>']] * (sequence_length - len(x))
        X.append(np.asarray(x))
    return X

def get_y(polarities):
    y = []
    for out in polarities:
        y.append(np.eye(num_classes)[out])
    return y
        

train_X = get_X(train_df['sentence'])
test_X = get_X(test_df['sentence'])

train_y = get_y(train_df['polarity'])
test_y = get_y(test_df['polarity'])

## Model

In [7]:
X = tf.placeholder(tf.int32, [None, sequence_length])
Y = tf.placeholder(tf.int32, [None, num_classes])

W = tf.Variable(tf.random_uniform([vocab_size, embedding_size], -1.0, 1.0))
embedded_chars = tf.nn.embedding_lookup(W, X) #[batch_size, sequence_length, embedding_size]
embedded_chars = tf.expand_dims(embedded_chars, -1) #[batch_size, sequence_length, embedding_size, 1]

pooled_outputs = []
for i, filter_size in enumerate(filter_sizes):
    filter_shape = [filter_size, embedding_size, 1, num_filters]
    W = tf.Variable(tf.truncated_normal(filter_shape, stddev=0.1))
    b = tf.Variable(tf.constant(0.1, shape=[num_filters]))
    
    conv = tf.nn.conv2d(embedded_chars, # [batch_size, sequence_length, embedding_size, 1]
                        W,  # [filter_size(n-gram window), embedding_size, 1, num_filters]
                        strides=[1, 1, 1, 1], padding='VALID')
    # conv: [batch_size, sequence_length - filter_size + 1, 1, num_filters]
    
    h = tf.nn.relu(tf.nn.bias_add(conv, b))
    pooled = tf.nn.max_pool(h, 
                           ksize=[1, sequence_length - filter_size + 1, 1, 1],
                           strides=[1, 1, 1, 1],  # [batch_size, filter_height, filter_width, channel]
                           padding='VALID'
                          )
    # pooled: [batch_size, 1, 1, 3]
    pooled_outputs.append(pooled) 
    

num_filters_total = num_filters * len(filter_sizes)
h_pool = tf.concat(pooled_outputs, 3) # [batch_size, output_height(=1), output_width(=1), channel(=1) * 3]
h_pool_flat = tf.reshape(h_pool, [-1, num_filters_total]) # [batch_size, ]

Weight = tf.get_variable('W', shape=[num_filters_total, num_classes],initializer=tf.contrib.layers.xavier_initializer())
Bias = tf.Variable(tf.constant(0.1, shape=[num_classes]))
model = tf.nn.xw_plus_b(h_pool_flat, Weight, Bias)
cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits_v2(logits=model, labels=Y))
optimizer = tf.train.AdamOptimizer(0.001).minimize(cost)

Instructions for updating:
Colocations handled automatically by placer.

For more information, please see:
  * https://github.com/tensorflow/community/blob/master/rfcs/20180907-contrib-sunset.md
  * https://github.com/tensorflow/addons
If you depend on functionality not listed there, please file an issue.

Instructions for updating:
Use tf.cast instead.


## Train

In [8]:
sess = tf.Session()
sess.run(tf.global_variables_initializer())

total_batch = int(len(train_X) / batch_size)

for epoch in range(total_epoch):
    total_cost = 0
    for i in range(total_batch):
        batch_X = train_X[batch_size * i:batch_size * (i+1)]
        batch_y = train_y[batch_size * i:batch_size * (i+1)]
        _, loss = sess.run([optimizer, cost], feed_dict={X: batch_X, Y: batch_y})
        total_cost += loss
        
    if epoch % 5 == 0:
        print('Epoch :', '%04d' % (epoch), 'Avg. cost = ', '{:.4f}'.format(total_cost/ total_batch))

Epoch : 0000 Avg. cost =  0.7026
Epoch : 0005 Avg. cost =  0.2757
Epoch : 0010 Avg. cost =  0.0827
Epoch : 0015 Avg. cost =  0.0216
Epoch : 0020 Avg. cost =  0.0066
Epoch : 0025 Avg. cost =  0.0026
Epoch : 0030 Avg. cost =  0.0011
Epoch : 0035 Avg. cost =  0.0006


## Result

In [9]:
hypothesis = tf.nn.softmax(model)

is_correct = tf.equal(tf.argmax(hypothesis, 1), tf.argmax(Y, 1))
accuracy = tf.reduce_mean(tf.cast(is_correct, tf.float32))
print('Train accuracy :', sess.run(accuracy, feed_dict={X: train_X, Y: train_y}))
print('Test accuracy :', sess.run(accuracy, feed_dict={X: test_X, Y: test_y}))

train accuracy : 0.9998
test accuracy : 0.77316
