In [None]:
# author - Richard Liao 
# Dec 26 2016
import numpy as np
import pandas as pd

from collections import defaultdict
import re

from bs4 import BeautifulSoup

import sys
import os

os.environ['KERAS_BACKEND']='tensorflow'

from keras.preprocessing.text import Tokenizer, text_to_word_sequence
from keras.preprocessing.sequence import pad_sequences
from keras.utils.np_utils import to_categorical

#from keras.layers import Embedding
from keras.layers import Dense, Input, Flatten, Lambda, Concatenate, Reshape
from keras.layers import Conv1D, MaxPooling1D, Embedding, Dropout, LSTM, GRU, Bidirectional, TimeDistributed
from keras.models import Model

from keras import backend as K
from keras.engine.topology import Layer, InputSpec
from keras import initializers

import tensorflow as tf
from keras.backend.tensorflow_backend import set_session
import matplotlib.pyplot as plt
%matplotlib inline




In [None]:
def gpu_alloc(device_id):
    """Restrict TensorFlow to the given GPU(s) and register a Keras session.

    Args:
        device_id: Value written to the ``CUDA_VISIBLE_DEVICES`` environment
            variable (e.g. ``"1"`` or ``"0,1"``). Non-string values such as
            the integer ``1`` are converted with ``str``.

    Side effects:
        Sets ``CUDA_VISIBLE_DEVICES`` for the current process and installs a
        new TensorFlow session as the Keras backend session.
    """
    # os.environ only accepts strings; coerce so gpu_alloc(1) works as well.
    os.environ["CUDA_VISIBLE_DEVICES"] = str(device_id)
    session_config = tf.ConfigProto()
    # Let the session grow its GPU memory use on demand.
    session_config.gpu_options.allow_growth = True
    set_session(tf.Session(config=session_config))

In [None]:
# Make only CUDA device "1" visible to this process and install the Keras session.
gpu_alloc("1")

# Data


In [None]:
# Maximum number of word ids stored per sentence (columns of each review matrix).
MAX_SENT_LENGTH = 100
# Maximum number of sentences stored per review (rows of each review matrix).
MAX_SENTS = 15
# Vocabulary cap: given to the Tokenizer; word ids at or above it are skipped when encoding.
MAX_NB_WORDS = 20000
# Number of columns of the embedding matrix / output size of the Embedding layer.
EMBEDDING_DIM = 100
# Fraction of the shuffled samples held out as the validation set.
VALIDATION_SPLIT = 0.2
# NOTE: this value is re-assigned to 2 in the training cell right before model.fit.
NUM_EPOCHS = 100
# Mini-batch size passed to model.fit.
BATCH_SIZE = 50

def clean_str(string):
    """
    Normalise a raw text string before tokenization.

    Deletes every backslash, single quote and double quote, then trims
    surrounding whitespace and lower-cases what is left.
    """
    # Same three patterns, applied in the same order.
    for pattern in (r"\\", r"\'", r"\""):
        string = re.sub(pattern, "", string)
    trimmed = string.strip()
    return trimmed.lower()

# Labelled reviews, tab-separated; the `review` and `sentiment` columns are used below.
data_train = pd.read_csv('./dat/imdb/labeledTrainData.tsv', sep='\t')
print(data_train.shape)

from nltk import tokenize

reviews = []  # per review: list of sentence strings
labels = []   # per review: value of the `sentiment` column
texts = []    # per review: full cleaned text (used later to fit the tokenizer)

# Iterate over the column values directly instead of indexing by position.
for idx, (raw_review, sentiment) in enumerate(zip(data_train.review,
                                                   data_train.sentiment)):
    # Report progress every 1000 reviews instead of one line per review,
    # which floods the notebook output.
    if idx % 1000 == 0:
        print('Parsing review ' + str(idx))
    # Strip HTML markup, then normalise quotes/backslashes/case.
    # NOTE(review): no parser is passed to BeautifulSoup, so bs4 uses whichever
    # one is installed; pin one here and in the test cell if results must be
    # reproducible across environments.
    text = clean_str(BeautifulSoup(raw_review).get_text())
    texts.append(text)
    sentences = tokenize.sent_tokenize(text)
    reviews.append(sentences)
    labels.append(sentiment)

# Fit the vocabulary on the whole cleaned reviews. `num_words` is the Keras 2
# name of the argument that used to be called `nb_words`.
tokenizer = Tokenizer(num_words=MAX_NB_WORDS)
tokenizer.fit_on_texts(texts)


# Encode every review as a (MAX_SENTS, MAX_SENT_LENGTH) int32 matrix of word
# ids; unused cells stay 0.
data_lst = []
labels_lst = []
for i, sentences in enumerate(reviews):
    data = np.zeros((MAX_SENTS, MAX_SENT_LENGTH), dtype='int32')
    # Sentences beyond the first MAX_SENTS are dropped.
    for j, sent in enumerate(sentences[:MAX_SENTS]):
        k = 0
        for word in text_to_word_sequence(sent):
            if k >= MAX_SENT_LENGTH:
                # Row is full; the remaining words of this sentence are dropped.
                break
            # Single lookup; .get avoids a KeyError for a token that is not in
            # the fitted vocabulary.
            word_id = tokenizer.word_index.get(word)
            if word_id is not None and word_id < MAX_NB_WORDS:
                data[j, k] = word_id
                k += 1
    data_lst.append(data)
    labels_lst.append(labels[i])
data = np.array(data_lst)
labels = np.array(labels_lst)


                    
word_index = tokenizer.word_index
print('Total %s unique tokens.' % len(word_index))

# Convert the integer sentiment labels to categorical (one column per class) form.
labels = to_categorical(np.asarray(labels))
print('Shape of data tensor:', data.shape)
print('Shape of label tensor:', labels.shape)

# Shuffle samples and labels with the same permutation.
# NOTE(review): no random seed is set, so the split differs from run to run.
indices = np.arange(data.shape[0])
np.random.shuffle(indices)
data = data[indices]
labels = labels[indices]
nb_validation_samples = int(VALIDATION_SPLIT * data.shape[0])

# Split on a non-negative index. Slicing with `-nb_validation_samples` breaks
# when the count rounds down to 0: `x[:-0]` is empty and `x[-0:]` is everything,
# i.e. an empty training set and all samples in validation.
split_at = data.shape[0] - nb_validation_samples
x_train = data[:split_at]
y_train = labels[:split_at]
x_val = data[split_at:]
y_val = labels[split_at:]

print('Number of positive and negative reviews in traing and validation set')
print(y_train.sum(axis=0))
print(y_val.sum(axis=0))



# Model


In [None]:

GLOVE_DIR = "./dat/glove"
# Maps each word to its float32 vector, read from the 100-dimensional GloVe text file.
embeddings_index = {}
# `with` closes the file even if a line fails to parse. The encoding is given
# explicitly so reading does not depend on the platform default.
with open(os.path.join(GLOVE_DIR, 'glove.6B.100d.txt'), encoding='utf-8') as f:
    for line in f:
        # Each line: the word followed by its whitespace-separated coefficients.
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

print('Total %s word vectors.' % len(embeddings_index))

# One row per word id plus one extra row for id 0, which the review matrices
# use for empty (padding) cells. Every row starts with random values.
vocab_rows = len(word_index) + 1
embedding_matrix = np.random.random((vocab_rows, EMBEDDING_DIM))
for token, row in word_index.items():
    # Words with a pretrained vector get it copied in; words without one keep
    # their random initialisation (they are NOT zeroed).
    if token in embeddings_index:
        embedding_matrix[row] = embeddings_index[token]

def _select_sentence(x, index):
    """Slice sentence `index` out of a (batch, sentences, words) tensor."""
    return x[:, index, :]


# Word-embedding table initialised from `embedding_matrix`; weights stay trainable.
w_emb_layer = Embedding(len(word_index) + 1,
                        EMBEDDING_DIM,
                        weights=[embedding_matrix],
                        trainable=True)
# Sentence encoder: a single bidirectional LSTM instance reused for every sentence slot.
sent_enc_layer = Bidirectional(LSTM(100))
# Review encoder: bidirectional LSTM over the sequence of sentence vectors.
rev_enc_layer = Bidirectional(LSTM(100))

review_input = Input(shape=(MAX_SENTS, MAX_SENT_LENGTH), dtype='int32')

s_embs = []
for i in range(MAX_SENTS):
    # The index is handed over through `arguments` rather than captured by a
    # lambda: a lambda would look `i` up when it is *called*, so any later
    # re-invocation of the layer (e.g. after reloading the model) would slice
    # whatever `i` happens to be at that time.
    sent = Lambda(_select_sentence, arguments={'index': i})(review_input)
    w_embs = w_emb_layer(sent)
    s_emb = sent_enc_layer(w_embs)
    # Add a length-1 sentence axis so the per-sentence vectors can be
    # concatenated into a sequence.
    s_emb = Reshape((1, int(s_emb.shape[1])))(s_emb)
    s_embs.append(s_emb)

# Concatenate along the second-to-last axis: one step per sentence slot.
s_embs = Concatenate(axis=-2)(s_embs)
rev_emb = rev_enc_layer(s_embs)

# Two-way softmax over the review encoding.
preds = Dense(2, activation='softmax')(rev_emb)
model = Model(review_input, preds)

model.compile(loss='categorical_crossentropy',
              optimizer='rmsprop',
              metrics=['acc'])



In [None]:
# Print the layer-by-layer summary of the compiled model.
model.summary()

In [None]:

print("model fitting - Hierachical LSTM")
# NOTE: this overrides the NUM_EPOCHS value set in the configuration cell.
NUM_EPOCHS=2
# `epochs` is the Keras 2 name of the argument formerly called `nb_epoch`.
# The returned History object is kept for the plots below.
history = model.fit(x_train, y_train, validation_data=(x_val, y_val),
                    epochs=NUM_EPOCHS, batch_size=BATCH_SIZE)


In [None]:

# Per-epoch metrics recorded by model.fit.
history_dict = history.history
loss_values = history_dict['loss']
val_loss_values = history_dict['val_loss']
# Epoch numbers start at 1 on the x axis.
epochs = range(1, 1 + len(loss_values))

# Training loss as blue dots, validation loss as a blue line.
fig, ax = plt.subplots()
ax.plot(epochs, loss_values, 'bo', label='Training loss')
ax.plot(epochs, val_loss_values, 'b', label='Validation loss')
ax.set_title('Training and validation loss')
ax.set_xlabel('Epochs')
ax.set_ylabel('Loss')
ax.legend()
plt.show()

In [None]:

# Accuracy curves recorded under the 'acc' metric name given to model.compile.
acc_values = history_dict['acc']
val_acc_values = history_dict['val_acc']
plt.plot(epochs, acc_values, 'bo', label='Training acc')
plt.plot(epochs, val_acc_values, 'b', label='Validation acc')
plt.title('Training and validation accuracy')
plt.xlabel('Epochs')
# The y axis shows accuracy here, not loss.
plt.ylabel('Accuracy')
plt.legend()
plt.show()

# Test

In [None]:

# NOTE(review): this reads the *training* file again, so the "test" predictions
# are made on reviews the model was fitted on. Confirm whether a separate test
# file was intended.
data_test = pd.read_csv('./dat/imdb/labeledTrainData.tsv', sep='\t')
print(data_test.shape)


reviews = []  # per review: list of sentence strings
texts = []    # per review: full cleaned text

for idx, raw_review in enumerate(data_test.review):
    # Report progress every 1000 reviews instead of one line per review.
    if idx % 1000 == 0:
        print('Parsing review ' + str(idx))
    # Same cleaning as for the training reviews: strip HTML, then clean_str.
    text = clean_str(BeautifulSoup(raw_review).get_text())
    texts.append(text)
    sentences = tokenize.sent_tokenize(text)
    reviews.append(sentences)


# Encode every review as a (MAX_SENTS, MAX_SENT_LENGTH) int32 matrix of word
# ids using the tokenizer fitted on the training texts; unused cells stay 0.
# No labels are collected here: the previous per-review `labels[i]` lookup
# indexed the shuffled training labels and its result was never used.
data_lst = []
for sentences in reviews:
    data = np.zeros((MAX_SENTS, MAX_SENT_LENGTH), dtype='int32')
    for j, sent in enumerate(sentences[:MAX_SENTS]):
        k = 0
        for word in text_to_word_sequence(sent):
            if k >= MAX_SENT_LENGTH:
                break
            # Words missing from the fitted vocabulary are skipped instead of
            # raising KeyError.
            word_id = tokenizer.word_index.get(word)
            if word_id is not None and word_id < MAX_NB_WORDS:
                data[j, k] = word_id
                k += 1
    data_lst.append(data)
data = np.array(data_lst)

test_input_data = data
# The cleaned texts were already produced in the loop above; reuse them rather
# than parsing every review a second time.
test_texts = list(texts)

In [None]:
# Score all reviews in one batched call instead of one model.predict call per
# review.
predictions = model.predict(test_input_data, batch_size=BATCH_SIZE)
# NOTE(review): this prints every review in full; consider limiting the loop
# to a small sample when the data set is large.
for i, rev in enumerate(data_test.review):
    print(rev)
    # Keep the leading batch axis of size 1 so the printed array looks the
    # same as the result of a single-review predict call.
    prediction = predictions[i:i + 1]
    print('Prediction: ', prediction)
    # Index of the highest-scoring class.
    sentiment = np.argmax(prediction)
    print('Sentiment: ' + str(sentiment))