In [None]:
import numpy as np
import pandas as pd
import _pickle as cPickle
from collections import defaultdict
import re
from bs4 import BeautifulSoup
import numpy as np
import sys
import os

from keras.preprocessing.text import Tokenizer, text_to_word_sequence
from keras.preprocessing.sequence import pad_sequences
from keras.utils.np_utils import to_categorical
from keras.callbacks import EarlyStopping,ReduceLROnPlateau, ModelCheckpoint, TensorBoard

from keras.layers import Embedding, Conv1D, MaxPooling1D, GlobalMaxPool1D, merge, Dropout, LSTM, GRU, Bidirectional, TimeDistributed, Dense, Input, Flatten, CuDNNGRU,CuDNNLSTM, concatenate, Lambda 

from keras.models import Model

from keras import backend as K
from keras.engine.topology import Layer, InputSpec
from keras import initializers

from Attention import AttentionLayer

In [None]:
# Geometry of the integer input tensor `data` (reviews x sentences x words).
MAX_SENTS, MAX_SENT_LENGTH = 15, 100   # sentence slots per review, word slots per sentence

# Vocabulary cap (given to the Tokenizer and used as the word-index cut-off)
# and the width of every row of the embedding matrix.
MAX_NB_WORDS, EMBEDDING_DIM = 200000, 100

# Fraction of the shuffled samples held out for validation.
VALIDATION_SPLIT = 0.2

# Rate handed to every Dropout layer of the model.
drop_rate = 0.25

# Forwarded to AttentionLayer(attention_type=...).
ATTENTION_TYPE = 'local'

In [None]:
def clean_str(string):
    """Normalise a raw text string before tokenisation.

    Every backslash, single quote and double quote is removed; the result is
    then stripped of leading/trailing whitespace and lower-cased.

    Args:
        string: the text to clean.

    Returns:
        The cleaned, lower-cased text.
    """
    cleaned = string
    # Apply the removals one after the other, in a fixed order.
    for pattern in (r"\\", r"\'", r"\""):
        cleaned = re.sub(pattern, "", cleaned)
    return cleaned.strip().lower()


# Load the labelled reviews from a tab-separated file. The code below reads
# its `review` and `sentiment` columns.
data_train = pd.read_csv('labeledTrainData.tsv', sep='\t')
print (data_train.shape)

from nltk import tokenize

reviews = []   # per review: list of sentence strings
labels = []    # per review: value of the `sentiment` column
texts = []     # per review: the whole cleaned text (later used to fit the Tokenizer)

# Walk the two columns in parallel by position, so the loop does not depend
# on the DataFrame carrying a default 0..n-1 index.
for raw_review, sentiment in zip(data_train.review, data_train.sentiment):
    # Name the parser explicitly: without it BeautifulSoup picks whichever
    # parser happens to be installed (and warns), so the extracted text could
    # differ from one environment to another.
    text = clean_str(BeautifulSoup(raw_review, 'html.parser').get_text())
    texts.append(text)

    # Split the cleaned review into sentences for the hierarchical model.
    sentences = tokenize.sent_tokenize(text)
    reviews.append(sentences)

    labels.append(sentiment)

# Build the vocabulary from the full cleaned reviews. `num_words` is the
# Keras 2 name of the argument that used to be called `nb_words`.
tokenizer = Tokenizer(num_words=MAX_NB_WORDS)
tokenizer.fit_on_texts(texts)

# Zero-initialised, so sentence/word slots that are never filled stay at 0.
data = np.zeros((len(texts), MAX_SENTS, MAX_SENT_LENGTH), dtype='int32')

word_to_id = tokenizer.word_index
for i, sentences in enumerate(reviews):
    # Sentences beyond the first MAX_SENTS are dropped.
    for j, sent in enumerate(sentences[:MAX_SENTS]):
        # Keep only words whose index is below the vocabulary cap. A word that
        # is absent from the vocabulary is skipped instead of raising KeyError
        # (the default makes the comparison fail for unknown words).
        word_ids = [word_to_id[word]
                    for word in text_to_word_sequence(sent)
                    if word_to_id.get(word, MAX_NB_WORDS) < MAX_NB_WORDS]
        # Words beyond the first MAX_SENT_LENGTH kept ones are dropped.
        word_ids = word_ids[:MAX_SENT_LENGTH]
        if word_ids:
            data[i, j, :len(word_ids)] = word_ids

# Keep the fitted vocabulary under its own name for the cells that follow.
word_index = tokenizer.word_index
print('Total %s unique tokens.' % len(word_index))

# Turn the label list into a one-hot matrix.
labels = to_categorical(np.asarray(labels))
for caption, tensor in (('Shape of data tensor:', data),
                        ('Shape of label tensor:', labels)):
    print(caption, tensor.shape)

# Shuffle samples and labels with one shared permutation so they stay aligned.
# NOTE(review): no random seed is set in the visible cells, so the split
# differs from run to run.
indices = np.arange(data.shape[0])
np.random.shuffle(indices)
data, labels = data[indices], labels[indices]

# The last `nb_validation_samples` rows become the validation set; everything
# before them is used for training.
nb_validation_samples = int(VALIDATION_SPLIT * data.shape[0])
train_rows = slice(None, -nb_validation_samples)
val_rows = slice(-nb_validation_samples, None)

x_train, y_train = data[train_rows], labels[train_rows]
x_val, y_val = data[val_rows], labels[val_rows]

# Column sums of the one-hot labels give the per-class sample counts.
print('Number of positive and negative reviews in traing and validation set')
for one_hot in (y_train, y_val):
    print (one_hot.sum(axis=0))

In [None]:
# Parse the GloVe text file into {word: vector}. Each line is split on
# whitespace: the first field is the word, the remaining ones its coefficients.
embeddings_index = {}
# `with` closes the file even when a line fails to parse, and the explicit
# encoding keeps the read independent of the platform's default encoding.
with open('glove.6B.100d.txt', encoding='utf-8') as f:
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

print('Total %s word vectors.' % len(embeddings_index))

# Row i holds the vector of the word whose index is i. The matrix has one row
# more than the vocabulary (presumably because index 0 is the padding slot and
# real word indices start at 1 -- confirm against the Tokenizer).
embedding_matrix = np.random.random((len(word_index) + 1, EMBEDDING_DIM))
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # Words without a GloVe vector keep their random initialisation
        # (the matrix starts out random, not zero).
        embedding_matrix[i] = embedding_vector

# NOTE(review): `embedding_layer` is not referenced by the other visible cells
# (the sentence encoder builds its own Embedding); kept in case it is used
# elsewhere.
embedding_layer = Embedding(len(word_index) + 1,
                            EMBEDDING_DIM,
                            weights=[embedding_matrix],
                            input_length=MAX_SENT_LENGTH,
                            trainable=True,
                            mask_zero=True)


In [None]:
#SENTENCE LEVEL
# Input: one sentence given as a sequence of word indices.
sent_ints = Input(shape=(None,))
# Look the indices up in the pre-computed embedding matrix; the weights are
# frozen (trainable=False).
sent_wv = Embedding(embedding_matrix.shape[0],
                    embedding_matrix.shape[1],
                    weights=[embedding_matrix],
                    input_length=MAX_SENT_LENGTH, # rows of `data` always have exactly this many word slots
                    trainable=False
                    )(sent_ints)

sent_wv_dr = Dropout(drop_rate)(sent_wv)
# Bidirectional GRU over the words; forward and backward outputs are
# concatenated and the full sequence is returned for the attention layer.
sent_wa = Bidirectional(CuDNNGRU(units=50,return_sequences=True),merge_mode='concat',weights=None)(sent_wv_dr)

# Attention over the word annotations; the layer returns the attended vector
# together with the attention coefficients.
sent_att_vec,sent_att_coeffs = AttentionLayer(return_coefficients=True, attention_type=ATTENTION_TYPE)(sent_wa) # attentional vector for the sentence
sent_att_vec_dr = Dropout(drop_rate)(sent_att_vec)
# Stand-alone model mapping a sentence to its (dropped-out) attention vector.
sent_encoder = Model(sent_ints,sent_att_vec_dr)

# np.shape() of a Model is just `()`; the model's output shape is the
# informative value here.
print(sent_encoder.output_shape)
print(sent_wa.shape)
print(sent_att_vec_dr.shape)

In [None]:
#DOCUMENT LEVEL
# Input: one document as a 2-D grid of word indices; both extents are left
# unspecified.
doc_ints = Input(shape=(None, None))

# Run the sentence encoder over every slice along the first non-batch axis.
encode_each_sentence = TimeDistributed(sent_encoder)
sent_att_vecs_dr = encode_each_sentence(doc_ints)

# Bidirectional GRU over the sentence vectors, returning the whole sequence.
doc_gru = CuDNNGRU(units=50, return_sequences=True)
doc_bigru = Bidirectional(doc_gru, merge_mode='concat', weights=None)
doc_sa = doc_bigru(sent_att_vecs_dr)

# Attention over the sentence annotations: yields the document vector and the
# attention coefficients.
doc_attention = AttentionLayer(return_coefficients=True, attention_type=ATTENTION_TYPE)
doc_att_vec, doc_att_coeffs = doc_attention(doc_sa)
doc_att_vec_dr = Dropout(drop_rate)(doc_att_vec)

# Show the symbolic shapes of the three intermediate tensors.
for symbolic in (sent_att_vecs_dr, doc_sa, doc_att_vec_dr):
    print(symbolic.shape)

In [None]:
# The number of output classes is the width of the label matrix.
n_cats = y_train.shape[1]

# Softmax classifier on top of the document vector.
classifier = Dense(units=n_cats, activation='softmax')
preds = classifier(doc_att_vec_dr)

# Full model: document of word indices in, class probabilities out.
han = Model(doc_ints, preds)
# Saving the freshly initialised weights here would allow reloading them
# later instead of redefining the model:
#han.save_weights(path_to_save + 'han_init_weights')

print(preds.shape)

In [None]:
# Configure training: Adam optimiser, categorical cross-entropy loss, and
# accuracy as the reported metric.
han.compile(optimizer='Adam',
            loss='categorical_crossentropy',
            metrics=['accuracy'])

# Re-derive the train/validation views from `data` and `labels`: the last
# `nb_validation_samples` rows are validation, the rest is training.
train_rows = slice(None, -nb_validation_samples)
val_rows = slice(-nb_validation_samples, None)

x_train, y_train = data[train_rows], labels[train_rows]
x_val, y_val = data[val_rows], labels[val_rows]

In [None]:
# Empty-string placeholders. NOTE(review): presumably reserved for the Keras
# callbacks imported at the top of the notebook; none of these names is
# passed to `fit` in the visible cells.
reduce = early = ''
early_stop = reduce_lr = ''
tensorboard = checkpoint = ''


In [None]:
# Train the model for 10 epochs with mini-batches of 50 samples, evaluating
# on the held-out validation split after every epoch. `epochs` is the Keras 2
# name of the argument formerly called `nb_epoch`.
history = han.fit(x_train, y_train,
                  validation_data=(x_val, y_val),
                  epochs=10,
                  batch_size=50)