In [9]:
# Upload the dataset files from the local machine into the Colab runtime.
# NOTE(review): later cells read both ./yelp14_text.txt and ./yelp14_label.txt,
# but the saved output of this cell shows only yelp14_label.txt being uploaded;
# confirm that the text file is provided as well.
from google.colab import files
uploaded = files.upload()

Saving yelp14_label.txt to yelp14_label.txt


In [0]:
import os
import sys
import codecs
import operator
import numpy as np
import re
from time import time
from keras.layers import Dense, Dropout, Activation, Embedding, LSTM, Input, SpatialDropout1D
from keras.models import Model, Sequential
from keras.preprocessing import sequence
from keras.utils.np_utils import to_categorical
# Dataset name; printed by create_vocab and forwarded to create_data.
domain = 'yelp14'
# File the vocabulary is built from (currently the same file as text_path).
data_path = './yelp14_text.txt'
# Review texts, read line by line and encoded by create_data.
text_path = './yelp14_text.txt'
# Ratings, read in lockstep with text_path and parsed as floats by create_data.
score_path = './yelp14_label.txt'

In [0]:
# create a vocabulary
def prepare_data(data_path, vocab_size, skip_top=0, skip_len=0, replace_non_vocab=1):
    """Build the vocabulary, then encode the corpus as word-index sequences.

    Args:
        data_path: text file the vocabulary is built from.
        vocab_size: number of most frequent words to keep (0 keeps all).
        skip_top: forwarded to create_data; the most frequent words below
            this rank are dropped from the encoded sequences.
        skip_len: lines with more than this many tokens are ignored, both
            when counting words and when encoding (0 disables the filter).
        replace_non_vocab: forwarded to create_data; when truthy,
            out-of-vocabulary words are encoded as '<unk>'.

    Returns:
        (vocab, data, label, max_len) where vocab is the word -> index dict
        and the other three values come straight from create_data.

    Note:
        The texts and labels that get encoded are read from the notebook
        globals text_path and score_path, and the global domain is passed
        along; only the vocabulary uses the data_path argument.
    """
    # skip_len plays the role of create_vocab's maxlen parameter.
    vocab = create_vocab(data_path, maxlen=skip_len, vocab_size=vocab_size)

    # Numbers and unknown words are mapped to the special '<num>' / '<unk>'
    # entries of the vocabulary inside create_data.
    encoded = create_data(vocab, text_path, score_path, domain,
                          skip_top, skip_len, replace_non_vocab)
    data, label, max_len = encoded

    return vocab, data, label, max_len

In [0]:
# Matches a whole token that is an optionally signed integer or decimal,
# e.g. "12", "-3.5" or "7.". Raw string: "\." is not a valid str escape.
num_regex = re.compile(r'^[+-]?[0-9]+\.?[0-9]*$')

def create_vocab(data_path, maxlen=0, vocab_size=0):
    """Count word frequencies in a text file and build a word -> index dict.

    Args:
        data_path: path of a whitespace-tokenised text file.
        maxlen: when > 0, lines with more than this many tokens are ignored.
        vocab_size: when > 0, only this many of the most frequent words are
            kept; 0 keeps every word.

    Returns:
        dict mapping '<pad>' -> 0, '<unk>' -> 1, '<num>' -> 2 and then the
        kept words to 3, 4, ... in order of decreasing frequency (ties keep
        first-seen order). Tokens matching num_regex are never added.

    Side effects:
        Prints the global `domain` and some corpus statistics.
    """
    print(domain)
    print('Creating vocab ...')
    total_words, unique_words = 0, 0
    word_freqs = {}

    # Decode as UTF-8, like create_data does for the same file, so both
    # functions see identical tokens regardless of the platform default.
    # The with-block also guarantees the handle is closed.
    with open(data_path, encoding='utf-8') as fin:
        for line in fin:
            words = line.split()
            if maxlen > 0 and len(words) > maxlen:
                continue

            for w in words:
                if num_regex.match(w):
                    continue  # numbers share the single '<num>' entry
                if w not in word_freqs:
                    unique_words += 1
                word_freqs[w] = word_freqs.get(w, 0) + 1
                total_words += 1

    print ('  %i total words, %i unique words' % (total_words, unique_words))
    sorted_word_freqs = sorted(word_freqs.items(), key=operator.itemgetter(1), reverse=True)

    vocab = {'<pad>':0, '<unk>':1, '<num>':2}
    index = len(vocab)
    for word, _ in sorted_word_freqs:
        vocab[word] = index
        index += 1
        # Stop once vocab_size real words sit on top of the 3 special tokens.
        if vocab_size > 0 and index > vocab_size + 2:
            break
    if vocab_size > 0:
        print (' keep the top %i words' % vocab_size)

    return vocab

In [0]:
def create_data(vocab, text_path, label_path, domain, skip_top, skip_len, replace_non_vocab):
    """Encode each line of text as vocabulary indices and map ratings to classes.

    Args:
        vocab: word -> index dict containing at least '<unk>' and '<num>'.
        text_path: UTF-8 text file, one whitespace-tokenised document per line.
        label_path: UTF-8 file with one numeric rating per line, read in
            lockstep with text_path.
        domain: not used by this function; kept for interface compatibility.
        skip_top: when > 0, in-vocabulary words whose index is below
            skip_top + 3 are dropped from the sequence.
        skip_len: when > 0, documents with more than this many tokens are
            skipped together with their rating.
        replace_non_vocab: when truthy, out-of-vocabulary words are encoded
            as '<unk>'; otherwise they are dropped.

    Returns:
        (data, label, max_len):
        data is a 1-D object array holding one list of indices per document,
        label is an array of class ids (rating > 3 -> 0, rating < 3 -> 1,
        rating == 3 -> 2), and max_len is the longest encoded sequence.

    Raises:
        ValueError: if a rating line cannot be parsed as a float.
    """
    data = []
    label = [] # {pos: 0, neg: 1, neu: 2}

    num_hit, unk_hit, total = 0., 0., 0.
    pos_count, neg_count, neu_count = 0, 0, 0
    max_len = 0

    # Context managers close both files even if a rating fails to parse.
    with codecs.open(text_path, 'r', 'utf-8') as f, \
            codecs.open(label_path, 'r', 'utf-8') as f_l:
        for line, score in zip(f, f_l):

            word_indices = []
            words = line.split()

            if skip_len > 0 and len(words) > skip_len:
                continue

            score = float(score.strip())
            if score < 3:
                neg_count += 1
                label.append(1)
            elif score > 3:
                pos_count += 1
                label.append(0)
            else:
                neu_count += 1
                label.append(2)

            for word in words:
                if num_regex.match(word):
                    word_indices.append(vocab['<num>'])
                    num_hit += 1
                elif word in vocab:
                    word_ind = vocab[word]
                    # Indices 0-2 are the special tokens, hence the +3 offset.
                    if not (skip_top > 0 and word_ind < skip_top + 3):
                        word_indices.append(word_ind)
                else:
                    if replace_non_vocab:
                        word_indices.append(vocab['<unk>'])
                    unk_hit += 1
                total += 1

            if len(word_indices) > max_len:
                max_len = len(word_indices)

            data.append(word_indices)

    # Guard the hit-rate report against an empty corpus (no tokens at all).
    num_rate = 100*num_hit/total if total else 0.
    unk_rate = 100*unk_hit/total if total else 0.
    print('  <num> hit rate: %.2f%%, <unk> hit rate: %.2f%%' % (num_rate, unk_rate))
    print( 'pos count: ', pos_count )
    print( 'neg count: ', neg_count )
    print( 'neu count: ', neu_count )

    # The sequences have different lengths, and current NumPy refuses to
    # build an array from ragged nested lists implicitly. Fill an object
    # array element by element so the result is always 1-D (one list per
    # document) and still supports fancy indexing.
    sequences = np.empty(len(data), dtype=object)
    for i, seq in enumerate(data):
        sequences[i] = seq

    return sequences, np.array(label), max_len

In [14]:
# Data preprocessing: build a 10000-word vocabulary and encode every review.
vocab, data_list, label_list, overall_maxlen = prepare_data(data_path, 10000)
print(overall_maxlen)

# Keep both lookup directions: word -> index and index -> word.
words_idx = vocab
idx_words = {index: word for word, index in vocab.items()}

print('printing word-index VS index-word')
for mapping in (idx_words, vocab):
    print(list(mapping.items())[:5])


yelp14
Creating vocab ...
  3829257 total words, 43607 unique words
 keep the top 10000 words
  <num> hit rate: 0.87%, <unk> hit rate: 2.05%
pos count:  10000
neg count:  10000
neu count:  10000
1008
printing word-index VS index-word
[(0, '<pad>'), (1, '<unk>'), (2, '<num>'), (3, 'the'), (4, 'and')]
[('<pad>', 0), ('<unk>', 1), ('<num>', 2), ('the', 3), ('and', 4)]


In [15]:
# len(data_list) = 30000
data_size = len(data_list)

# print before shuffling
print('print BEFORE shuffling and categorizing the label values')
print(data_list[25])
print(label_list[25])

# Draw the shuffle order from a dedicated, seeded generator so that the
# train/dev/test split is identical on every run of the notebook.
SPLIT_SEED = 42
rand_idx = np.random.RandomState(SPLIT_SEED).permutation(data_size)

# shuffle data with corresponding labels
data = data_list[rand_idx]
label_ = label_list[rand_idx]
# Fix the one-hot width at 3 (pos / neg / neu) instead of inferring it from
# the largest label that happens to be present.
label = to_categorical(label_, num_classes=3)

print('\nprint AFTER shuffling and categorizing the label values')
print(data[25])
print('Label before categorizing: ', label_[25])
print(label[25][0], label[25][1], label[25][2])

print(data[590])
print('Label before categorizing: ', label_[590])
print(label[590][0], label[590][1], label[590][2])

# create train, validation and test set
test_x = data[0:1000]
test_y = label[0:1000]

dev_x = data[1000:5000]
dev_y = label[1000:5000]

train_x = data[5000:data_size]
train_y = label[5000:data_size]

print('\nprinting shape of train data')
print(train_x.shape, train_y.shape)

# Pad every split to one common length. The length is taken from the longest
# dev-set sequence, so any longer train/test sequence is truncated by
# pad_sequences.
# NOTE(review): confirm that truncating to the dev maximum is intended;
# overall_maxlen from prepare_data would avoid truncation entirely.
mLength = np.max([len(d) for d in dev_x])
train_x_ = sequence.pad_sequences(train_x, mLength)
dev_x_ = sequence.pad_sequences(dev_x, mLength)
test_x_ = sequence.pad_sequences(test_x, mLength)
print('\nprinting after padding sequences')
print(mLength)

# pad_sequences and to_categorical already return numpy arrays, so no
# further conversion is needed before training.
print(train_x_.shape)
print(train_y.shape)

print BEFORE shuffling and categorizing the label values
[5, 149, 21, 37, 5, 202, 218, 9, 7, 498, 15, 18, 134, 96, 9, 5618, 4, 9, 107, 6450, 15, 3, 32, 134, 207, 187, 4, 12, 134, 623, 23, 49, 15, 142, 47, 369, 203, 390, 15]
0

print AFTER shuffling and categorizing the label values
[16, 457, 12, 1, 8150, 5609, 5, 8, 3811, 7, 6050, 1, 259, 59, 13, 2, 80, 6557, 1, 6, 44, 100, 1270, 493, 1861, 5, 8, 2275, 7, 25, 1, 205, 1299, 7, 323, 1, 1, 8, 1409, 4, 100, 1771, 10, 16, 420, 11, 9925, 85, 2340, 9925, 11, 3, 2275, 130, 283, 5, 1228, 7, 1, 69, 7, 196, 40, 14, 16, 6880, 4, 2146, 28, 147, 24, 13, 3, 161, 5649, 2275, 11, 3, 1, 30, 5, 8, 928, 59, 7, 21, 1, 11, 8113, 5, 406, 150, 66, 8, 44, 1337, 4, 1503, 7, 1465, 3, 205, 80, 1512, 3, 9925, 59, 23]
Label before categorizing:  1
0.0 1.0 0.0
[475, 66, 12, 6, 35, 2444, 42, 618, 5, 8, 13, 3, 1044, 10, 531, 60, 608, 52, 66, 8, 445, 1755, 1, 4, 41, 633, 62, 321, 11, 9, 52, 343, 7, 452, 6, 51, 53, 318]
Label before categorizing:  1
0.0 1.0 0.0

printin

## Data iterator

In [0]:
class Dataiterator():
    '''
      Mini-batch iterator over a pair of aligned numpy arrays.

      1) Iteration over minibatches using next(); every pass visits each
         example exactly once in a fresh random order. The last batch of a
         pass may hold fewer than batch_size examples.
      2) When a pass is exhausted the iterator reshuffles itself and raises
         StopIteration, so the same object can be iterated again for the
         next epoch (reset() may also be called explicitly).
      3) Access to the entire dataset using all()

      seq_length and decoder_dim are accepted for backward compatibility
      and are not used.
    '''

    def __init__(self, X, y, seq_length=32, decoder_dim=300, batch_size=32):
        if batch_size < 1:
            raise ValueError('batch_size must be a positive integer, got %r' % (batch_size,))
        self.X = X
        self.y = y
        self.num_data = len(X) # total number of examples
        self.batch_size = batch_size # batch size
        self.reset() # initial: shuffling examples and set index to 0

    def __iter__(self): # iterates data
        return self

    def reset(self):
        '''Rewind to the first example and draw a new random visiting order.'''
        self.idx = 0
        self.order = np.random.permutation(self.num_data)

    def __next__(self):
        '''Return the next (batch_X, batch_y) pair of the current pass.'''
        # Test for exhaustion *before* building a batch. Testing afterwards
        # would throw away the batch that consumed the final examples.
        if self.idx >= self.num_data:
            self.reset()
            raise StopIteration()
        batch_ids = self.order[self.idx:self.idx + self.batch_size]
        self.idx += len(batch_ids)
        batch_X = self.X[batch_ids] # model inputs for this batch
        batch_y = self.y[batch_ids] # matching targets
        return batch_X, batch_y

    def all(self): # return all data examples
        return self.X, self.y

### LSTM Model for document level sentiment classification

In [0]:
from keras.layers import Dense, Dropout, Activation, Embedding, LSTM, Input, Bidirectional
from keras.models import Model

### Input layer

In [0]:
# Model input: integer word indices; shape=(None,) leaves the sequence
# length unspecified.
sentence_input = Input(shape=(None,), dtype='int32', name='sentence_input')

### Layer to train embedding weights of words

In [19]:
# Trainable embedding table: one 300-dimensional vector per vocabulary entry
# (special tokens included). mask_zero=True marks index 0, which the
# vocabulary assigns to '<pad>', as padding for downstream layers.
vocab_size = len(words_idx)
word_emb = Embedding(vocab_size, 300, mask_zero=True, name='word_emb')
emb_output = word_emb(sentence_input)

Instructions for updating:
Colocations handled automatically by placer.


### RNN based layer

In [20]:
# Single LSTM layer with 300 units. return_sequences=False keeps only the
# output for the last time step, giving one vector per document.
# dropout is applied to the layer inputs, recurrent_dropout to the
# recurrent state.
dropout= 0.5
recurrent_dropout = 0.1 
lstm_layer = LSTM(300, return_sequences=False, dropout=dropout, \
              recurrent_dropout=recurrent_dropout, name='lstm')(emb_output)

Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.


### Prediction layer

In [0]:
# Project the LSTM output onto the three sentiment classes (0, 1, 2) and
# turn the scores into probabilities with a softmax.
densed = Dense(3, name='dense')(lstm_layer)
probs = Activation('softmax')(densed)

### Construct the model

In [0]:
# Tie the graph together: word indices in, class probabilities out.

model = Model(inputs=[sentence_input], outputs=probs)

In [0]:
import keras.optimizers as opt

In [24]:
# RMSprop with gradient-norm clipping at 10.
# The former clipvalue=0 argument is dropped: legacy Keras ignores a
# non-positive clipvalue, so training is unchanged, whereas an implementation
# that takes the value literally would clip every gradient to zero.
optimizer = opt.RMSprop(lr=0.001, rho=0.9, epsilon=1e-06, clipnorm=10)
model.compile(optimizer=optimizer, loss='categorical_crossentropy', metrics=['categorical_accuracy'])
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
sentence_input (InputLayer)  (None, None)              0         
_________________________________________________________________
word_emb (Embedding)         (None, None, 300)         3000900   
_________________________________________________________________
lstm (LSTM)                  (None, 300)               721200    
_________________________________________________________________
dense (Dense)                (None, 3)                 903       
_________________________________________________________________
activation_1 (Activation)    (None, 3)                 0         
Total params: 3,723,003
Trainable params: 3,723,003
Non-trainable params: 0
_________________________________________________________________


### Training with batch generator

In [0]:
batch_size = 32

# Steps per epoch must be whole numbers: round up so that a trailing partial
# batch still counts as a step.
train_steps_epoch = int(np.ceil(len(train_x_) / batch_size))
val_steps_epoch = int(np.ceil(len(dev_x_) / batch_size))

# batch_size has to be passed by keyword: the third positional parameter of
# Dataiterator is seq_length, so a positional value never reached batch_size
# (it only appeared to work because the default is also 32).
batch_train_iter = Dataiterator(train_x_, train_y, batch_size=batch_size)
batch_val_iter = Dataiterator(dev_x_, dev_y, batch_size=batch_size)

In [0]:
from keras.callbacks import EarlyStopping, ModelCheckpoint

def train_generator(model, batch_train_iter, batch_val_iter):
    """Train `model` from mini-batch iterators with early stopping.

    Args:
        model: compiled Keras model to train.
        batch_train_iter: re-iterable source of (X, y) training batches.
        batch_val_iter: re-iterable source of (X, y) validation batches.

    Returns:
        The History object returned by fit_generator (previously discarded).

    Note:
        The number of steps per epoch is read from the notebook globals
        train_steps_epoch and val_steps_epoch. Weights are written to
        './{epoch:02d}-{loss:.2f}.check' after every epoch.
    """
    earlystop_callbacks = [EarlyStopping(monitor='val_loss', patience=10),
                     ModelCheckpoint(filepath=os.path.join('./','{epoch:02d}-{loss:.2f}.check'), \
                                     monitor='val_loss', save_best_only=False, \
                                     save_weights_only=True)
                     ]

    def endless_batches(batch_iter):
        # fit_generator needs a generator that never ends. Each for-loop is
        # one pass over the iterator; when the pass finishes, the loop is
        # entered again for the next one. Batches are streamed one at a time
        # instead of first copying a whole pass into a list.
        while True:
            for X, y in batch_iter:
                yield X, y

    history = model.fit_generator(endless_batches(batch_train_iter),
                                  validation_data=endless_batches(batch_val_iter),
                                  validation_steps=val_steps_epoch,
                                  steps_per_epoch=train_steps_epoch,
                                  epochs=20, callbacks=earlystop_callbacks)
    return history
      

In [20]:
# Train the unidirectional LSTM model on the batch iterators created above.
train_generator(model, batch_train_iter, batch_val_iter)

Instructions for updating:
Use tf.cast instead.
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20


In [22]:
# Evaluate the trained model on the held-out test split.
# model.evaluate returns the loss followed by the compiled metrics, so
# `score` is the categorical cross-entropy and `acc` the categorical accuracy.
# predictions = model.predict(test_x_, verbose = 0)
score,acc = model.evaluate(test_x_, test_y, verbose = 2, batch_size = 32)
print("score: %.2f" % (score))
print("acc: %.2f" % (acc))


score: 0.82
acc: 0.79


### Bidirectional LSTM

In [27]:
# Same architecture as the unidirectional model, but the LSTM is wrapped in
# Bidirectional so the document is read in both directions.
# NOTE: this cell rebinds sentence_input, word_emb, emb_output, lstm_layer,
# densed, probs and optimizer; the earlier `model` keeps its own layers.
sentence_input = Input(shape=(None,), dtype='int32', name='sentence_input')
vocab_size = len(words_idx)
word_emb = Embedding(vocab_size, 300, mask_zero=True, name='word_emb')
emb_output = word_emb(sentence_input)

dropout= 0.5
recurrent_dropout = 0.1
lstm_layer = Bidirectional(LSTM(300, return_sequences=False, dropout=dropout, recurrent_dropout=recurrent_dropout, name='bilstm'))(emb_output)
densed = Dense(3, name='dense')(lstm_layer)
probs = Activation('softmax')(densed)
biModel = Model(inputs=[sentence_input], outputs=probs)
# clipvalue=0 is dropped: legacy Keras ignores a non-positive clipvalue, so
# training is unchanged, whereas an implementation that takes the value
# literally would clip every gradient to zero.
optimizer = opt.RMSprop(lr=0.001, rho=0.9, epsilon=1e-06, clipnorm=10)
biModel.compile(optimizer=optimizer, loss='categorical_crossentropy', metrics=['categorical_accuracy'])
biModel.summary()


_________________________________________________________________
Layer (type)                 Output Shape              Param #   
sentence_input (InputLayer)  (None, None)              0         
_________________________________________________________________
word_emb (Embedding)         (None, None, 300)         3000900   
_________________________________________________________________
bidirectional_1 (Bidirection (None, 600)               1442400   
_________________________________________________________________
dense (Dense)                (None, 3)                 1803      
_________________________________________________________________
activation_2 (Activation)    (None, 3)                 0         
Total params: 4,445,103
Trainable params: 4,445,103
Non-trainable params: 0
_________________________________________________________________


In [0]:
from keras.callbacks import EarlyStopping, ModelCheckpoint
def train_generator(model, batch_train_iter, batch_val_iter):
    """Train `model` from mini-batch iterators with early stopping.

    This redefinition used to ignore its `model` argument and always train
    the global biModel; it now trains whichever model is passed in.

    Args:
        model: compiled Keras model to train.
        batch_train_iter: re-iterable source of (X, y) training batches.
        batch_val_iter: re-iterable source of (X, y) validation batches.

    Returns:
        The History object returned by fit_generator (previously discarded).

    Note:
        The number of steps per epoch is read from the notebook globals
        train_steps_epoch and val_steps_epoch. Weights are written to
        './{epoch:02d}-{loss:.2f}.check' after every epoch.
    """
    earlystop_callbacks = [EarlyStopping(monitor='val_loss', patience=10),
                 ModelCheckpoint(filepath=os.path.join('./','{epoch:02d}-{loss:.2f}.check'), \
                                 monitor='val_loss', save_best_only=False, \
                                 save_weights_only=True)
                 ]

    def endless_batches(batch_iter):
        # fit_generator needs a generator that never ends. Each for-loop is
        # one pass over the iterator; when the pass finishes, the loop is
        # entered again for the next one.
        while True:
            for X, y in batch_iter:
                yield X, y

    history = model.fit_generator(endless_batches(batch_train_iter),
                                  validation_data=endless_batches(batch_val_iter),
                                  validation_steps=val_steps_epoch,
                                  steps_per_epoch=train_steps_epoch,
                                  epochs=20, callbacks=earlystop_callbacks)
    return history

In [1]:
# Train the bidirectional model.
# NOTE(review): the saved output below is a NameError and the execution count
# is 1, which suggests this cell was run on a fresh kernel before the cells
# defining train_generator, biModel and the batch iterators — re-run the
# notebook top to bottom.
train_generator(biModel, batch_train_iter, batch_val_iter)

NameError: ignored

In [0]:
# # unidirectional LSTM model
# # input length  = num of time steps
# hidden_size = 64
# lstm_out = 3
# batch_size = 10000 #vocabulary

# model = Sequential()
# model.add(Embedding(batch_size, hidden_size, input_length = train_x_.shape[1]))
# model.add(LSTM(hidden_size, return_sequences = True))
# model.add(SpatialDropout1D(0.4))
# model.add(LSTM(lstm_out, dropout_U = 0.2, dropout_W = 0.2))
# model.add(Dense(3,activation='sigmoid'))
# model.compile(loss = 'categorical_crossentropy', optimizer='adam',metrics = ['accuracy'])
# print(model.summary())