In [1]:

import numpy as np
import pandas as pd
from numpy import array
from collections import defaultdict
import re

from bs4 import BeautifulSoup

import sys
import os
import string
os.environ['KERAS_BACKEND']='tensorflow'

from keras.preprocessing.text import Tokenizer, text_to_word_sequence
from keras.preprocessing.sequence import pad_sequences
from keras.utils.np_utils import to_categorical

from keras.layers import Embedding
from keras.layers import Dense, Input, Flatten
from keras.layers import Conv1D, MaxPooling1D, Embedding, Dropout, LSTM, GRU, Bidirectional, TimeDistributed
from keras.models import Model, Sequential

from keras import backend as K
from keras.engine.topology import Layer, InputSpec
from keras import initializers

import tensorflow as tf
from keras.backend.tensorflow_backend import set_session
from pickle import dump



Using TensorFlow backend.


In [2]:
def gpu_alloc(device_id):
    os.environ["CUDA_VISIBLE_DEVICES"]=device_id
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    set_session(tf.Session(config=config))

In [3]:
gpu_alloc("1")

# Data

In [None]:
dat_path = '../dat/imdb/'
dat_file_name = 'imdb_master.csv'
dat_file = os.path.join(dat_path, dat_file_name)

In [None]:
def clean_str(text):
    """
    Tokenization/string cleaning for dataset
    Every dataset is lower cased except
    """
    text = re.sub(r"\\", "", text)    
    text = re.sub(r"\'", "", text)    
    text = re.sub(r"\"", "", text)    
    text = text.lower()
    table = str.maketrans('', '', string.punctuation)
    text = [w.translate(table) for w in text]    
    return text.strip()


In [None]:
def split_train_test(df):
    df_train = df[df.train == 1]
    df_test= df[df.test == 1]
    return df_train.reviews, df_train.sentiment, df_test.reviews, df_test.sentiment

In [None]:
def load_data(dat_file):
    df = pd.read_csv(dat_file)
    df_train = df[df.train == 1]
    df_test= df[df.test == 1]
    return df_train.reviews, df_train.sentiment.apply(clean_str), df_test.reviews.apply(clean_str), df_test.sentiment

In [None]:
train_texts, train_labels, test_texts, test_labels = load_data(dat_file)
texts = train_texts + test_texts

## Fit tokenizer

In [None]:
VOCAB_SIZE = 20000
tokenizer = Tokenizer(num_words=VOCAB_SIZE)
tokenizer.fit_on_texts(texts)
vocab_size = len(tokenizer.word_index) + 1
word_index = tokenizer.word_index
reverse_word_index = dict([(value, key) for (key, value) in word_index.items()])

In [None]:
# save the tokenizer
tok_file_name = 'imdb_keras_tokenizer.pkl'
dump(tokenizer, open(tok_file_name, 'wb'))

# NLM
https://machinelearningmastery.com/how-to-develop-a-word-level-neural-language-model-in-keras/

## Data preparation

In [None]:
def prepare_lm_data(texts, seq_len):
    
    # organize into sequences of tokens
    length = seq_len + 1
    sequences = list()
    for i in range(length, len(texts)):
        # select sequence of tokens
        seq = texts[i-length:i]
        # convert into a line
        line = ' '.join(seq)
        # store
        sequences.append(line) 
        
    return sequences

In [None]:
def binarize_lm_data(texts, tokenizer):
    return tokenizer.texts_to_sequences(texts)

In [18]:
LM_DATA_SIZE = 200000
LM_SEQ_LEN = 50

text_sequences = prepare_lm_data(texts, seq_len)
sequences = binarize_lm_data(text_sequences, tokenizer)

sz_limit = LM_DATA_SIZE# len(sequences)

# separate into input and output
sequences = array(sequences[:sz_limit])
X, y = sequences[:,:-1], sequences[:,-1]

#y = to_categorical(y, num_classes=vocab_size)
seq_length = X.shape[1]

9741


## Model

In [None]:
EMBEDDING_DIM=100

In [None]:

def load_embeddings(embeddings_file):
    embeddings_index = {}
    f = open()
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

    f.close()

    print('Total %s word vectors.' % len(embeddings_index))

    embedding_matrix = np.random.random((vocab_size, EMBEDDING_DIM))
    for word, i in word_index.items():
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            # words not found in embedding index will be all-zeros.
            embedding_matrix[i] = embedding_vector    
    return embedding_matrix

In [None]:
# define model
#model = Sequential()
#model.add(Embedding(vocab_size, 50, input_length=seq_length))
#model.add(LSTM(100, return_sequences=True))
#model.add(LSTM(100))

GLOVE_DIR = "./dat/glove"

embeddings_file = os.path.join(GLOVE_DIR, 'glove.6B.100d.txt')
embedding_matrix = load_embeddings(embeddings_file)
        
  
embedding_layer = Embedding(vocab_size,
                            EMBEDDING_DIM,
                            weights=[embedding_matrix],
                            input_length=MAX_SENT_LENGTH,
                            trainable=True)
sentence_input = Input(shape=(seq_length,), dtype='int32')
embedded_sequences = embedding_layer(sentence_input)
l_lstm = Bidirectional(GRU(100, return_sequences=True))(embedded_sequences)
l_word_enc = TimeDistributed(Dense(200))(l_lstm)

#model.add(Dense(100, activation='relu'))
l_dense = Dense(100, activation='relu')(l_word_enc)
#model.add(Dense(vocab_size, activation='softmax'))
output = Dense(vocab_size, activation='softmax')(l_dense)
model = Model(sentence_input, output)
print(model.summary())
word_enc = Model(sentence_input, l_word_enc)

# compile model
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])



_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_3 (Embedding)      (None, 50, 50)            487050    
_________________________________________________________________
lstm_5 (LSTM)                (None, 50, 100)           60400     
_________________________________________________________________
lstm_6 (LSTM)                (None, 100)               80400     
_________________________________________________________________
dense_5 (Dense)              (None, 100)               10100     
_________________________________________________________________
dense_6 (Dense)              (None, 9741)              983841    
Total params: 1,621,791
Trainable params: 1,621,791
Non-trainable params: 0
_________________________________________________________________
None


In [None]:
# fit model
model.fit(X, y, batch_size=128, epochs=100)

# save the model to file
lm_model_file_name = 'imdb_keras_lm_model.h5'
model.save(lm_model_file_name)
# save the tokenizer
tok_file_name = 'imdb_keras_tokenizer.pkl'
dump(tokenizer, open(tok_file_name, 'wb'))
word_enc_model_file_name = 'word_enc_model.h5'
word_enc.save(word_enc_model_file_name)

Epoch 1/100
  383872/10976162 [>.............................] - ETA: 5:14:00 - loss: 6.3967 - acc: 0.0818

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



 1322624/10976162 [==>...........................] - ETA: 4:45:13 - loss: 6.0646 - acc: 0.1098

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



Epoch 2/100

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)





IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)





IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



  400512/10976162 [>.............................] - ETA: 2:42:22 - loss: 5.2093 - acc: 0.1721

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



 1604736/10976162 [===>..........................] - ETA: 2:24:11 - loss: 5.2058 - acc: 0.1730

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)





IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)





IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)





IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)





IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)





IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)





IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)





IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)





IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)





IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



Epoch 4/100
   85376/10976162 [..............................] - ETA: 2:49:05 - loss: 5.1698 - acc: 0.1754

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



  649216/10976162 [>.............................] - ETA: 2:38:58 - loss: 5.1623 - acc: 0.1770

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



 1346432/10976162 [==>...........................] - ETA: 2:28:13 - loss: 5.1619 - acc: 0.1769

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



Epoch 7/100
Epoch 8/100
 2490496/10976162 [=====>........................] - ETA: 2:11:04 - loss: 5.0931 - acc: 0.1831

In [None]:
# Use LM
from random import randint
from pickle import load
from keras.models import load_model
from keras.preprocessing.sequence import pad_sequences


# generate a sequence from a language model
def generate_seq(model, tokenizer, seq_length, seed_text, n_words):
    result = list()
    in_text = seed_text
    # generate a fixed number of words
    for _ in range(n_words):
        # encode the text as integer
        encoded = tokenizer.texts_to_sequences([in_text])[0]
        # truncate sequences to a fixed length
        encoded = pad_sequences([encoded], maxlen=seq_length, truncating='pre')
        # predict probabilities for each word
        yhat = model.predict_classes(encoded, verbose=0)
        # map predicted word index to word
        out_word = ''
        for word, index in tokenizer.word_index.items():
            if index == yhat:
                out_word = word
                break
        # append to input
        in_text += ' ' + out_word
        result.append(out_word)
    return ' '.join(result)

# load cleaned text sequences

in_filename = out_filename
doc = load_doc(in_filename)
lines = doc.split('\n')
seq_length = len(lines[0].split()) - 1

# load the model
model = load_model(lm_model_file_name)

# load the tokenizer
tokenizer = load(open(tok_file_name, 'rb'))

# select a seed text
seed_text = lines[randint(0,len(lines))]
print(seed_text + '\n')

# generate new text
generated = generate_seq(model, tokenizer, seq_length, seed_text, 50)
print(generated)

# HATT

## Data preparation

In [None]:
MAX_SENT_LENGTH = 100
MAX_SENTS = 15
MAX_NB_WORDS = 20000

In [None]:
from nltk import tokenize
def prepare_hier_data(in_texts, labels):
    
    reviews = []
    labels = []
    texts = []

    for idx in range(len(in_texts):
        print('Parsing review ' + str(idx))
        text = in_texts[idx]
        texts.append(text)
        sentences = tokenize.sent_tokenize(text)
        reviews.append(sentences)       
        labels.append(labels[idx])
    return reviews, labels

In [None]:
def binarize_hier_data(reviews, labels, tokenizer):
    data_lst = []
    labels_lst = []
    for i, sentences in enumerate(reviews):
        data = np.zeros((MAX_SENTS, MAX_SENT_LENGTH), dtype='int32')
        for j, sent in enumerate(sentences):
            if j< MAX_SENTS:
                wordTokens = text_to_word_sequence(sent)
                k=0
                for _, word in enumerate(wordTokens):
                    if k<MAX_SENT_LENGTH and tokenizer.word_index[word]<MAX_NB_WORDS:
                        data[j,k] = tokenizer.word_index[word]
                        k=k+1
        data_lst.append(data)
        labels_lst.append(labels[i])
    data = np.array(data_lst)
    targets = np.array(labels_lst) 
    targets = to_categorical(np.asarray(targets))
    return data, targets

In [None]:
train_texts_, train_labels_ = prepare_hier_data(train_texts, train_labels)
train_data, train_targets = binarize_hier_data(train_texts_, train_labels_, tokenizer)

In [None]:
print('Shape of data tensor:', train_data.shape)
print('Shape of label tensor:', train_targets.shape)


## Split train/val


In [None]:
VALIDATION_SPLIT = 0.2


In [None]:

indices = np.arange(train_data.shape[0])
np.random.shuffle(indices)
train_data = train_data[indices]
train_targets = train_targets[indices]
nb_validation_samples = int(VALIDATION_SPLIT * train_data.shape[0])

x_train = train_data[:-nb_validation_samples]
y_train = train_targets[:-nb_validation_samples]
x_val = train_data[-nb_validation_samples:]
y_val = train_targets[-nb_validation_samples:]

print('Number of positive and negative reviews in traing and validation set')
print(y_train.sum(axis=0))
print(y_val.sum(axis=0))

## Model

In [None]:
embeddings_file_name = os.path.join(GLOVE_DIR, 'glove.6B.100d.txt')

# building Hierachical Attention network
embedding_matrix = load_embeddings(embeddings_file_name)

        
embedding_layer = Embedding(len(word_index) + 1,
                            EMBEDDING_DIM,
                            weights=[embedding_matrix],
                            input_length=MAX_SENT_LENGTH,
                            trainable=True)

class AttLayer(Layer):
    def __init__(self, **kwargs):
        self.init = initializers.he_normal()
        super(AttLayer, self).__init__(**kwargs)
    '''
    def build(self, input_shape):
        assert len(input_shape)==3
        self.W = self.init((input_shape[-1],1))
        self.trainable_weights = [self.W]
        super(AttLayer, self).build(input_shape)  # be sure you call this somewhere!
    '''
    def build(self, input_shape):
        assert len(input_shape)==3
        # Create a trainable weight variable for this layer.
        self.W = self.add_weight(name='kernel', 
                                      shape=(input_shape[-1], 1),
                                      initializer='uniform',
                                      trainable=True)
        super(AttLayer, self).build(input_shape)  # Be sure to call this at the end
    
    def call(self, x, mask=None):
        eij = K.tanh(K.dot(x, self.W))
        
        ai = K.exp(eij)
        weights = ai/tf.expand_dims(K.sum(ai, axis=1), 1)
        
        weighted_input = x*weights
        return tf.reduce_sum(weighted_input, axis=1)

    def compute_output_shape(self, input_shape):
        return (input_shape[0], input_shape[-1])

sentence_input = Input(shape=(MAX_SENT_LENGTH,), dtype='int32')
embedded_sequences = embedding_layer(sentence_input)
l_lstm = Bidirectional(GRU(100, return_sequences=True))(embedded_sequences)
l_dense = TimeDistributed(Dense(200))(l_lstm)
l_att = AttLayer()(l_dense)
sentEncoder = Model(sentence_input, l_att)

review_input = Input(shape=(MAX_SENTS,MAX_SENT_LENGTH), dtype='int32')
review_encoder = TimeDistributed(sentEncoder)(review_input)
l_lstm_sent = Bidirectional(GRU(100, return_sequences=True))(review_encoder)
l_dense_sent = TimeDistributed(Dense(200))(l_lstm_sent)
l_att_sent = AttLayer()(l_dense_sent)
preds = Dense(2, activation='softmax')(l_att_sent)
model = Model(review_input, preds)

model.compile(loss='categorical_crossentropy',
              optimizer='rmsprop',
              metrics=['acc'])




In [None]:
NUM_EPOCHS = 100
BATCH_SIZE = 50

In [None]:
print("model fitting - Hierachical attention network")
model.fit(x_train, y_train, validation_data=(x_val, y_val), nb_epoch=NUM_EPOCHS, batch_size=BATCH_SIZE)

## Test

In [None]:
test_texts_, test_labels_ = prepare_hier_data(test_texts, test_labels)
test_data, test_targets = binarize_hier_data(test_texts_, test_labels_, tokenizer)

In [None]:
for i, rev in enumerate(test_texts_):
    print(rev)
    test_input = test_data[i].copy()
    test_input = np.reshape(test_input, (1,test_input.shape[0], test_input.shape[1]))
    prediction = model.predict(test_input)
    print('Prediction: ', prediction)
    sentiment = np.argmax(prediction)
    print('Sentiment: ' + str(sentiment))