### Deep Sentiment Analysis Tutorial

### Setup GPU
- On Google Colab make sure you select Python 3/GPU runtime before running the code

In [1]:
%env CUDA_VISIBLE_DEVICES=0

env: CUDA_VISIBLE_DEVICES=0


### Download Data

In [2]:
# !mkdir data/
# !wget https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz -P data/

In [3]:
# %%time
# !tar -xzf data/aclImdb_v1.tar.gz -C data/

### Imports

In [4]:
import os
import re
import nltk
from collections import Counter
from tqdm import tqdm_notebook
import numpy as np
import tensorflow as tf
from tensorflow.contrib import seq2seq
from tensorflow.contrib.rnn import DropoutWrapper
import random

In [60]:
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

In [5]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to /home/bishal/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [6]:
MAX_SEQ_LEN = 50
BATCH_SIZE = 64

In [7]:
class Lang:
    def __init__(self, counter, vocab_size):
        self.word2id = {}
        self.id2word = {}
        self.pad = "<PAD>"
        self.sos = "<SOS>"
        self.eos = "<EOS>"
        self.unk = "<UNK>"
        
        self.ipad = 0
        self.isos = 1
        self.ieos = 2
        self.iunk = 3
        
        self.word2id[self.pad] = 0
        self.word2id[self.sos] = 1
        self.word2id[self.eos] = 2
        self.word2id[self.unk] = 3
        
        self.id2word[0] = self.pad
        self.id2word[1] = self.sos
        self.id2word[2] = self.eos
        self.id2word[3] = self.unk
        
        curr_id = 4
        for w, c in counter.most_common(vocab_size):
            self.word2id[w] = curr_id
            self.id2word[curr_id] = w
            curr_id += 1
    
    def encodeSentence(self, wseq, max_len=-1):
        # wseq = nltk.tokenize.word_tokenize(s.lower().strip())
        if max_len == -1:
            return [self.word2id[w] if w in self.word2id else self.iunk for w in wseq]
        else:
            return ([self.word2id[w] if w in self.word2id else self.iunk for w in wseq] + [self.ieos] + [self.ipad]*max_len)[:max_len]
        
    def encodeSentence2(self, wseq, max_len=-1):
        # wseq = nltk.tokenize.word_tokenize(s.lower().strip()) 
        return min(max_len, len(wseq)+1), \
            ([self.word2id[w] if w in self.word2id else self.iunk for w in wseq] + \
                [self.ieos] + [self.ipad]*max_len)[:max_len]
    
    def decodeSentence(self, id_seq):
        id_seq = np.array(id_seq + [self.ieos])
        j = np.argmax(id_seq==self.ieos)
        s = ' '.join([self.id2word[x] for x in id_seq[:j]])
        s = s.replace(self.unk, "UNK")
        return s

### Let's read in the data

In [8]:
data_folder = 'data/aclImdb/'

In [9]:
rp = os.path.join(data_folder, 'train/pos')
train_positive = [os.path.join(rp, f) for f in os.listdir(rp)]
rp = os.path.join(data_folder, 'train/neg')
train_negative = [os.path.join(rp, f) for f in os.listdir(rp)]

rp = os.path.join(data_folder, 'test/pos')
test_positive = [os.path.join(rp, f) for f in os.listdir(rp)]
rp = os.path.join(data_folder, 'test/neg')
test_negative = [os.path.join(rp, f) for f in os.listdir(rp)]

#### Limit number of samples
To quickly train a small model, consider setting n_train and n_test to some relatively small numbers e.g. `1000`. Set, 
`n_train = n_test = -1` to use all the samples available.

In [91]:
n_train = 10000
n_test = 2500

In [92]:
re_html_cleaner = re.compile(r"<.*?>")

In [93]:
en_counter = Counter()
train_data = []
for _fname in tqdm_notebook(train_positive[:n_train], desc="Crunching +ve samples: "):
    with open(_fname) as f:
        text = f.read().strip()
        text = re_html_cleaner.sub(" ", text)
        wseq = nltk.tokenize.word_tokenize(text.lower())
        en_counter += Counter(wseq)
        train_data.append((wseq, 1))
        
for _fname in tqdm_notebook(train_negative[:n_train], desc="Crunching -ve samples: "):
    with open(_fname) as f:
        text = f.read().strip()
        text = re_html_cleaner.sub(" ", text)
        wseq = nltk.tokenize.word_tokenize(text.lower())
        en_counter += Counter(wseq)
        train_data.append((wseq, 0))

HBox(children=(IntProgress(value=0, description='Crunching +ve samples: ', max=10000, style=ProgressStyle(desc…

HBox(children=(IntProgress(value=0, description='Crunching -ve samples: ', max=10000, style=ProgressStyle(desc…

In [94]:
test_data = []
for _fname in tqdm_notebook(test_positive[:n_test], desc="Crunching +ve samples: "):
    with open(_fname) as f:
        text = f.read().strip()
        text = re_html_cleaner.sub(" ", text)
        wseq = nltk.tokenize.word_tokenize(text.lower())
        test_data.append((wseq, 1))
        
for _fname in tqdm_notebook(test_negative[:n_test], desc="Crunching -ve samples: "):
    with open(_fname) as f:
        text = f.read().strip()
        text = re_html_cleaner.sub(" ", text)
        wseq = nltk.tokenize.word_tokenize(text.lower())
        test_data.append((wseq, 0))

HBox(children=(IntProgress(value=0, description='Crunching +ve samples: ', max=2500, style=ProgressStyle(descr…

HBox(children=(IntProgress(value=0, description='Crunching -ve samples: ', max=2500, style=ProgressStyle(descr…

In [14]:
# A few sample english words
print("\nMost common en words in dataset:\n", en_counter.most_common(10))

print("\nTotal (en)words gathered from dataset:", len(en_counter))


Most common en words in dataset:
 [('the', 66942), (',', 55132), ('.', 54084), ('and', 32778), ('a', 32376), ('of', 29094), ('to', 27116), ('is', 22131), ('it', 19161), ('in', 18658)]

Total (en)words gathered from dataset: 46829


In [15]:
V = 10000

In [16]:
en_lang = Lang(en_counter, V)

In [17]:
wseq = nltk.tokenize.word_tokenize("Where are you going?".lower())
print("Test en encoding:", en_lang.encodeSentence(wseq))
print("Test en decoding:", en_lang.decodeSentence(en_lang.encodeSentence(wseq, 10)))

Test en encoding: [127, 32, 28, 182, 59]
Test en decoding: where are you going ?


### The RNN based Sentence Classifier architecture
- We will implement a RNN based classifier architecture for sentiment analysis in Tensorflow r1.13.1 / r1.14
- Debugging Tip: Always keep track of tensor dimensions!
- **Tensorflow Computation Graph** - We will build a tf computation graph first. This is the representation used by tf for any neural network architecture. Once the computation graph is built, you can feed data to it for training or inference

#### Word Embedding Matrix

In [18]:
en_word_emb_matrix = tf.get_variable("en_word_emb_matrix", (V, 300), dtype=tf.float32)

Instructions for updating:
Colocations handled automatically by placer.


#### Placeholders

In [19]:
keep_prob = tf.placeholder(tf.float32)

In [20]:
input_ids = tf.placeholder(tf.int32, (None, MAX_SEQ_LEN))
input_lens = tf.placeholder(tf.int32, (None, ))

In [21]:
y_placeholder = tf.placeholder(tf.int32, (None,))

#### Tensorflow Graphs

In [22]:
input_emb = tf.nn.embedding_lookup(en_word_emb_matrix, input_ids)

In [23]:
input_emb.shape

TensorShape([Dimension(None), Dimension(50), Dimension(300)])

#### Encoder

##### RNN Units

In [24]:
encoder_cell = tf.nn.rnn_cell.GRUCell(128)
encoder_cell = DropoutWrapper(encoder_cell, output_keep_prob=keep_prob)

Instructions for updating:
This class is equivalent as tf.keras.layers.GRUCell, and will be replaced by that in Tensorflow 2.0.


In [25]:
enc_outputs, enc_state = tf.nn.dynamic_rnn(
    encoder_cell, input_emb, sequence_length=input_lens, initial_state=encoder_cell.zero_state(BATCH_SIZE, dtype=tf.float32)
)

Instructions for updating:
Please use `keras.layers.RNN(cell)`, which is equivalent to this API
Instructions for updating:
Use tf.cast instead.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.


In [26]:
enc_outputs.shape

TensorShape([Dimension(64), Dimension(50), Dimension(128)])

In [27]:
enc_state.shape

TensorShape([Dimension(64), Dimension(128)])

### Classifier Layer

In [28]:
dense_layer = tf.layers.Dense(1)

#### Approaches:
As input to the final linear layers use mean of the hidden states?

or

As input to the final linear layers use the last hidden state?

##### Approch 1: Take mean of enc_outputs across dimension 1
- **IMPORTANT:** Need to **mask** the positions in input sentence that doesn't contain any inputs

In [45]:
masks = tf.sequence_mask(input_lens, MAX_SEQ_LEN, dtype=tf.float32, name='masks')
class_prob = tf.nn.sigmoid(dense_layer(tf.reduce_mean(enc_outputs*masks[:, :, None], 1))) 

In [46]:
class_prob.shape

TensorShape([Dimension(64), Dimension(1)])

##### Approch 2: Use enc_state (final hidden state)

In [31]:
# class_prob = dense_layer(enc_state) 
# class_prob.shape

#### Loss and Optimizers [softmax_cross_entropy]
Note that `onehot_labels` and `logits` must have the same shape, e.g. `[batch_size, num_classes]`

In [47]:
print(y_placeholder.shape)
print(class_prob.shape)

(?,)
(64, 1)


In [75]:
# Loss function - softmax cross entropy
y_ = tf.cast(y_placeholder[:, None], dtype=tf.float32)
cost = -y_*tf.log(class_prob + 1e-12) - (1-y_)*tf.log(1-class_prob + 1e-12)
cost = tf.reduce_mean(cost)
# cost = tf.losses.softmax_cross_entropy(
#     onehot_labels=tf.stack([1-y_placeholder, y_placeholder], 1), # Conversion to one-hot
#     logits=tf.concat([1-class_prob, class_prob], -1)
# )
# cost = tf.losses.sigmoid_cross_entropy(
#     y_placeholder[:, None], # Conversion to one-hot
#     class_prob
# )

# Optimizer
optimizer = tf.train.AdamOptimizer(0.0001)

In [76]:
train_op = optimizer.minimize(cost)

In [77]:
init = tf.global_variables_initializer()

#### Tensorflow Sessions

In [78]:
sess_config = tf.ConfigProto()
sess_config.gpu_options.allow_growth = True

In [100]:
sess = tf.InteractiveSession(config=sess_config)
sess.run(init)

#### Minibatch Training

In [96]:
random.seed(41)

In [97]:
random.shuffle(train_data)

In [98]:
train_n = len(train_data)

In [99]:
test_n = len(test_data)

In [101]:
def small_test():
    all_true = []
    all_preds = []
    for m in range(0, test_n, BATCH_SIZE):
        n = m + BATCH_SIZE
        if n > test_n:
            break

        input_batch = np.zeros((BATCH_SIZE, MAX_SEQ_LEN), dtype=np.int32)
        input_lens_batch = np.zeros((BATCH_SIZE,), dtype=np.int32)
        true_class_batch = np.zeros((BATCH_SIZE))
        for i in range(m, n):
            b,a = en_lang.encodeSentence2(test_data[i][0], MAX_SEQ_LEN)
            input_batch[i-m,:] = a
            input_lens_batch[i-m] = b
            true_class_batch[i-m] = test_data[i][1]

        feed_dict={
            input_ids: input_batch,
            input_lens: input_lens_batch,
            keep_prob: 1.0
        }
        pred_batch = sess.run(class_prob, feed_dict=feed_dict)
        # acc = accuracy_score(true_class_batch, pred_batch > 0.5)
        all_true.extend(list(true_class_batch))
        all_preds.extend(list(pred_batch[:,0]))
    
    all_true = np.array(all_true)
    all_preds = np.array(all_preds)
    prec = precision_score(all_true, all_preds > 0.5)*100
    rec = recall_score(all_true, all_preds > 0.5)*100
    f1 = f1_score(all_true, all_preds > 0.5)*100
    print(f"Precision: {prec:2.2F}, Recall: {rec:2.2F}, F1-Score: {f1:2.2F}")

In [None]:
for _e in range(10):
    # Mix things up a bit.
    random.shuffle(train_data)
    pbar = tqdm_notebook(range(0, train_n, BATCH_SIZE))
    batch_loss = 0
    bxi = 0
    for m in pbar:
        n = m + BATCH_SIZE
        if n <= train_n:
            # print("Epoch Complete... \n")

            input_batch = np.zeros((BATCH_SIZE, MAX_SEQ_LEN), dtype=np.int32)
            input_lens_batch = np.zeros((BATCH_SIZE,), dtype=np.int32)
            true_class_batch = np.zeros((BATCH_SIZE))
            for i in range(m, n):
                b,a = en_lang.encodeSentence2(train_data[i][0], MAX_SEQ_LEN)
                input_batch[i-m,:] = a
                input_lens_batch[i-m] = b
                true_class_batch[i-m] = train_data[i][1]

            feed_dict={
                input_ids: input_batch,
                input_lens: input_lens_batch,
                y_placeholder: true_class_batch,
                keep_prob: 0.6
            }
            sess.run(train_op, feed_dict=feed_dict)
            batch_loss += sess.run(cost, feed_dict=feed_dict)
            pbar.set_description(f"Epoch: {_e} >> Loss: {batch_loss/(bxi+1):2.2F}:")
            bxi += 1
            if (1 + n//BATCH_SIZE) % 10 == 0:
                small_test()

HBox(children=(IntProgress(value=0, max=313), HTML(value='')))

Precision: 54.80, Recall: 54.80, F1-Score: 54.80
Precision: 58.14, Recall: 55.28, F1-Score: 56.67
Precision: 57.43, Recall: 67.88, F1-Score: 62.22
Precision: 59.98, Recall: 61.28, F1-Score: 60.63
Precision: 61.16, Recall: 61.72, F1-Score: 61.44
Precision: 61.90, Recall: 63.56, F1-Score: 62.72
Precision: 61.02, Recall: 68.56, F1-Score: 64.57
Precision: 60.91, Recall: 70.68, F1-Score: 65.43
Precision: 62.27, Recall: 64.84, F1-Score: 63.53
Precision: 63.40, Recall: 62.16, F1-Score: 62.78
Precision: 62.79, Recall: 64.32, F1-Score: 63.54
Precision: 61.49, Recall: 72.68, F1-Score: 66.62
Precision: 68.15, Recall: 58.96, F1-Score: 63.22
Precision: 67.67, Recall: 69.08, F1-Score: 68.37
Precision: 67.62, Recall: 69.32, F1-Score: 68.46
Precision: 69.23, Recall: 66.60, F1-Score: 67.89
Precision: 68.70, Recall: 69.96, F1-Score: 69.32
Precision: 67.51, Recall: 73.24, F1-Score: 70.26
Precision: 73.58, Recall: 58.48, F1-Score: 65.17
Precision: 70.51, Recall: 68.00, F1-Score: 69.23
Precision: 67.86, Re

HBox(children=(IntProgress(value=0, max=313), HTML(value='')))

Precision: 73.01, Recall: 68.80, F1-Score: 70.84
Precision: 70.72, Recall: 72.84, F1-Score: 71.76
Precision: 73.19, Recall: 66.08, F1-Score: 69.46
Precision: 71.83, Recall: 69.88, F1-Score: 70.84
Precision: 68.18, Recall: 78.84, F1-Score: 73.12
Precision: 72.18, Recall: 73.28, F1-Score: 72.73
Precision: 73.02, Recall: 69.60, F1-Score: 71.27
Precision: 71.50, Recall: 72.16, F1-Score: 71.83
Precision: 72.33, Recall: 71.20, F1-Score: 71.76
Precision: 72.94, Recall: 69.32, F1-Score: 71.08
Precision: 70.06, Recall: 75.80, F1-Score: 72.81
Precision: 72.82, Recall: 68.04, F1-Score: 70.35
Precision: 72.24, Recall: 70.68, F1-Score: 71.45
Precision: 74.69, Recall: 65.64, F1-Score: 69.87
Precision: 71.51, Recall: 74.08, F1-Score: 72.77
Precision: 72.40, Recall: 71.36, F1-Score: 71.88
