In [1]:
import os
from os import listdir, remove
from os.path import isfile, join
import numpy as np

## Download Dataset

In [2]:
import nltk
import ssl

try:
    _create_unverified_https_context = ssl._create_unverified_context
except AttributeError:
    pass
else:
    ssl._create_default_https_context = _create_unverified_https_context

nltk.download("movie_reviews")

[nltk_data] Downloading package movie_reviews to
[nltk_data]     /home/aksdmj/nltk_data...
[nltk_data]   Package movie_reviews is already up-to-date!


True

In [3]:
from nltk.corpus import movie_reviews as mr
from collections import defaultdict

In [4]:
# divide filenames by its sentiment
# error-avoiding method
documents = defaultdict(list)
for i in mr.fileids():
    documents[i.split('/')[0]].append(i)

In [5]:
# calculate maximum length of text
lens = [len(mr.raw(i)) for i  in mr.fileids()]
max_num_characters = max(lens)
max_num_characters

14957

In [6]:
# count the number of unique words used in all texts
# 70 non sapce characted is used in original paper.
unique_chars = len(set(mr.raw()))
unique_chars

74

In [7]:
chars = sorted(list(set(mr.raw())))

In [8]:
char_dict_inv = {v:i for i,v in enumerate(chars)}
char_dict_inv['!']

7

# Model

In [9]:
import tensorflow as tf
import tensorflow.contrib.eager as tfe

In [10]:
tfe.enable_eager_execution(device_policy=tfe.DEVICE_PLACEMENT_SILENT)

# Data preprocess

In [11]:
# maximum character counts
l0 = 1014

In [12]:
def cut_or_padding(chars):
    if len(chars) > l0:
        # from back
        # return chars[-l0:]
        # from front
        return chars[:l0]
    else:
        return chars
def chars2indexs(chars):
    return np.array([char_dict_inv[char] for char in chars])

def one_hot(indexs):
    temp = np.zeros([l0, unique_chars], dtype="float64")
    for i, idx in enumerate(indexs):
        if(idx==-1):
            continue
        temp[i][idx] = 1.0
    return temp

def preprocess(document):
    indexs = chars2indexs(cut_or_padding(mr.raw(document)))
    temp = np.concatenate([-1*np.ones([l0 - indexs.shape[0]], dtype="int64"), indexs])
    return one_hot(temp)

In [13]:
X = list()
y = list()

# 0 label for negative , 1 for positive
for i in documents['neg']:
    tx = preprocess(i)
    X.append(tx)
    y.append(0)
    
for i in documents['pos']:
    tx = preprocess(i)
    X.append(tx)
    y.append(1)

In [14]:
# train-validation-test split with the ratio of (0.72, 0.18, 0.1)
from sklearn.model_selection import train_test_split

X_trainval, X_test, y_trainval, y_test = train_test_split(X, y, test_size=0.1, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_trainval, y_trainval, test_size=0.2, random_state=42)


In [15]:
data_train = tf.data.Dataset.from_tensor_slices((X_train, y_train))
data_val = tf.data.Dataset.from_tensor_slices((X_val, y_val)).batch(len(y_val))
data_test = tf.data.Dataset.from_tensor_slices((X_test, y_test)).batch(len(y_test))


In [16]:
class Character_CNN_Sentence(tf.keras.Model):
    """ Classifier for Movie Review dataset
    Args:
        num_words: uniwue charaters in dataset. 
        in_dim: dimension of input array, which is l0 in paper.
        out_dim: softmax output dimension.
    """
    def __init__(self,
                 num_chars=unique_chars,
                 in_dim= l0,
                 out_dim=2,
                 learning_rate=0.001,
                 checkpoint_directory="checkpoints/",
                 device_name="cpu:0"):
        super(Character_CNN_Sentence, self).__init__()
        self.in_dim = in_dim
        self.num_chars = num_chars
        self.out_dim = out_dim
        self.learning_rate = learning_rate
        self.checkpoint_directory = checkpoint_directory
        self.device_name = device_name
        
        self.conv11 = tf.layers.Conv1D(filters=1024, kernel_size=7, padding="valid")
        self.conv12 = tf.layers.Conv1D(filters=1024, kernel_size=7, padding="valid")
        self.conv13 = tf.layers.Conv1D(filters=1024, kernel_size=3, padding="valid")
        self.conv14 = tf.layers.Conv1D(filters=1024, kernel_size=3, padding="valid")
        self.conv15 = tf.layers.Conv1D(filters=1024, kernel_size=3, padding="valid")
        self.conv16 = tf.layers.Conv1D(filters=1024, kernel_size=3, padding="valid")

        self.maxpool1 = tf.layers.MaxPooling1D(pool_size=3, strides=3)
        self.maxpool2 = tf.layers.MaxPooling1D(pool_size=3, strides=3)
        self.maxpool3 = tf.layers.MaxPooling1D(pool_size=3, strides=3)

        self.flatten = tf.layers.Flatten()
        self.dropout = tf.layers.Dropout(0.5)

        self.fc1 = tf.layers.Dense(2048)        
        self.fc2 = tf.layers.Dense(2048)        
        self.out = tf.layers.Dense(self.out_dim)        
        
        
        # optimizer
        self.optimizer = tf.train.AdamOptimizer(learning_rate=self.learning_rate)
        
        # global step
        self.global_step = 0
        
        # verbose logging
        self.epoch_loss = 0
        
        # plotting
        self.train_losses = []
        self.val_losses = []
        self.test_accuracies = []
        

    def predict(self, X, training):
                
        x = self.conv11(X)
        x = self.maxpool1(x)
        x = self.conv12(x)
        x = self.maxpool2(x)
        x = self.conv13(x)
        x = self.conv14(x)
        x = self.conv15(x)
        x = self.conv16(x)
        x = self.maxpool3(x)
        
        x = self.flatten(x)

        x = self.fc1(x)
        if training:
            x = self.dropout(x)
            
        x = self.fc2(x)
        if training:
            x = self.dropout(x)
            
        pred = self.out(x)
        return pred

    def loss(self, X, y, training):
        prediction = self.predict(X, training)
        loss_val = tf.losses.sparse_softmax_cross_entropy(y, prediction)
        # add this for normalization... check otherwise easily overfitted.
        loss_val += tf.nn.l2_loss(self.out.weights[0])
        self.epoch_loss += loss_val.numpy()
        
        return loss_val

    def grad(self, X, y, training):
        with tfe.GradientTape() as tape:
            loss_val = self.loss(X, y, training)
        return tape.gradient(loss_val, self.variables) 
        
    def fit(self, train_data, validation_data, test_data, epochs=1, verbose=1, 
            batch_size=50, saving=False, early_stopping=0):
    
        train_data_batch = train_data.shuffle(100).batch(batch_size)
        
        with tf.device(self.device_name):
            for i in range(epochs):
                self.epoch_loss = 0
                for X, y in tfe.Iterator(train_data_batch):
                    grads = self.grad(X, y, True)
                    self.optimizer.apply_gradients(zip(grads, self.variables))
                    
                self.global_step += 1
                
                self.train_losses.append(self.epoch_loss)
                
                for X, y in tfe.Iterator(validation_data):
                    self.val_losses.append(self.loss(X, y, False).numpy())
                    
                    
                accuracy = tfe.metrics.Accuracy('train_acc')

                for X, y in tfe.Iterator(test_data):
                    logits = self.predict(X=X, training=False)
                    predictions = tf.argmax(logits, axis=1, output_type="int32")
                    accuracy(predictions, y)     
                    
                self.test_accuracies.append(accuracy.result().numpy())

                if (i+1)%verbose == 0 :
                    print("[EPOCH %d / STEP %d]" % ((i + 1), self.global_step))
                    print("Train loss : %s" % self.train_losses[-1])
                    print("Val   loss : %s" % self.val_losses[-1])
                    print("TEST accuracy: %.4f%%" % (self.test_accuracies[-1] * 100))
                    print()
                          

                accuracy.init_variables()
                
                if early_stopping:
                    ok = False
                    if len(self.val_losses) <= early_stopping:
                        continue
                    for i in range(early_stopping):
                        if self.val_losses[-(i+1)] < self.val_losses[-(i+2)]:
                            ok = True
                            break
                    if ok:
                        continue
                    print("early stopping on step %s" % self.global_step)
                    break
                                              
                    
                    
    def save(self):
        tfe.Saver(self.variables).save(self.checkpoint_directory, global_step=self.global_step)

    def load(self, global_step="latest"):
        # init
        self.predict(tf.convert_to_tensor(np.zeros([1, self.num_words]), dtype="float64"), False)

        saver = tfe.Saver(self.variables)
        if global_step == "latest":
            saver.restore(tf.train.latest_checkpoint(self.checkpoint_directory))
            self.global_step = int(tf.train.latest_checkpoint(self.checkpoint_directory).split('/')[-1][1:])
        else:
            saver.restore(self.checkpoint_directory + "-" + str(global_step))
            self.global_step = global_step

            

In [17]:
charcnn = Character_CNN_Sentence(device_name="gpu:0")

In [18]:
charcnn.fit(data_train, data_val,  data_test, epochs=100, verbose=5, early_stopping=False)

[EPOCH 5 / STEP 5]
Train loss : 60.09034442901611
Val   loss : 2.0160692
TEST accuracy: 47.5000%

[EPOCH 10 / STEP 10]
Train loss : 53.206207036972046
Val   loss : 1.9905567
TEST accuracy: 50.5000%

[EPOCH 15 / STEP 15]
Train loss : 52.241761803627014
Val   loss : 2.14337
TEST accuracy: 48.5000%

[EPOCH 20 / STEP 20]
Train loss : 49.60462129116058
Val   loss : 2.0144515
TEST accuracy: 55.0000%

[EPOCH 25 / STEP 25]
Train loss : 39.84679687023163
Val   loss : 2.1091306
TEST accuracy: 55.0000%

[EPOCH 30 / STEP 30]
Train loss : 32.776662945747375
Val   loss : 2.3623846
TEST accuracy: 54.5000%

[EPOCH 35 / STEP 35]
Train loss : 31.832338213920593
Val   loss : 2.446187
TEST accuracy: 57.5000%

[EPOCH 40 / STEP 40]
Train loss : 31.103715777397156
Val   loss : 2.59159
TEST accuracy: 55.0000%

[EPOCH 45 / STEP 45]
Train loss : 30.30930757522583
Val   loss : 2.4889421
TEST accuracy: 57.5000%

[EPOCH 50 / STEP 50]
Train loss : 29.477823615074158
Val   loss : 2.484789
TEST accuracy: 57.5000%

[E