In [14]:
import numpy as np
import os
import pandas as pd
import pickle
import time

os.environ['KERAS_BACKEND']='cntk'
from keras.preprocessing import sequence
from keras.preprocessing.text import Tokenizer, text_to_word_sequence
from keras.models import Sequential
from keras import regularizers
from keras.optimizers import SGD
from keras.layers import Dense, Dropout, Embedding, LSTM, Bidirectional
from keras.callbacks import History, CSVLogger
from keras.utils import to_categorical

Download the Amazon reviews training data from a public Azure blob

In [13]:
# Base URL of the Azure blob container that hosts the data set, and the name
# of the training CSV inside it. The two are concatenated to form the full URL.
CONTAINER_URL = "https://anargyridsa.blob.core.windows.net/dlvm/"
trainFile = "amazon_reviews_train.csv"

In [9]:
# Load the training reviews straight from blob storage. The CSV is read
# without a header row, so the three column names are supplied explicitly.
train_data = pd.read_csv(
    CONTAINER_URL + trainFile,
    names=['rating', 'title', 'text'],
    header=None,
)

Set the dimensions of the input and the embedding. 

MAX_DOC_LENGTH : the size of the input, i.e. the number of words in the document. Longer documents are truncated; shorter ones are padded with zeros.

VOCAB_SIZE : the size of the word encoding (number of most frequent words to keep in the vocabulary)

EMBEDDING_DIM : the dimensionality of the word embedding

In [None]:
# Number of word indices per document fed to the network.
MAX_DOC_LENGTH = 300
# Number of most frequent words kept by the tokenizer.
VOCAB_SIZE = 6000
# Length of each word vector (used for both word2vec and the Embedding layer).
EMBEDDING_DIM = 200

In [16]:
# Names of the train_data columns holding the review body and its rating.
TEXT_COL = 'text'
LABEL_COL = 'rating'

Fit a Keras tokenizer on the entire training data set, keeping only the VOCAB_SIZE most frequent words, then convert the training and test reviews into padded sequences of word indices.

In [34]:
# Fit the tokenizer on the training text only, keeping the VOCAB_SIZE most
# frequent words; the same fitted tokenizer then encodes both data sets.
tok = Tokenizer(num_words=VOCAB_SIZE, lower=True, split=" ")
tok.fit_on_texts(train_data[TEXT_COL])

# Encode each review as a list of word indices, then pad / truncate every
# list to MAX_DOC_LENGTH (the constant defined in the configuration cell).
train_seq = tok.texts_to_sequences(train_data[TEXT_COL])
train_seq = sequence.pad_sequences(train_seq, maxlen=MAX_DOC_LENGTH)

# NOTE(review): test_data is not loaded in any cell visible in this notebook;
# confirm it is read before this cell runs.
test_seq = tok.texts_to_sequences(test_data[TEXT_COL])
test_seq = sequence.pad_sequences(test_seq, maxlen=MAX_DOC_LENGTH)

In [None]:
# Persist the padded index sequences so later runs can reuse them without
# re-tokenizing; np.save appends the ".npy" extension to each file name.
np.save(file='train_seq', arr=train_seq)
np.save(file='test_seq', arr=test_seq)

Convert the ratings to one-hot categorical labels.

In [None]:
# One-hot encode the ratings. to_categorical allocates a column for every
# integer starting at 0; column 0 is sliced away (presumably because no review
# carries a rating of 0 - verify against the data) and the result is cast to
# float32.
labels = (
    to_categorical(np.asarray(train_data[LABEL_COL]))[:, 1:]
    .astype('float32')
)

In [20]:
# Summing the one-hot label matrix down its rows gives the number of reviews
# per class; its column count is the number of classes the model must predict.
print('Number of reviews by class in training set')
print(labels.sum(axis=0))
n_classes = labels.shape[1]

Number of reviews by class in training set
[ 600000.  600000.  600000.  600000.  600000.]


Train word2vec on all the documents in order to initialize the word embedding. Ignore rare words (min_count=6). Use skip-gram as the training algorithm (sg=1).

In [None]:
import nltk

nltk.download('punkt')

# Build the word2vec training corpus: one entry per sentence, each entry a
# list of word tokens. Word2Vec iterates over the items of every sentence, so
# passing raw strings would train on single characters instead of words.
# text_to_word_sequence is the splitter from keras.preprocessing.text, so the
# tokens are intended to line up with the keys of the Keras tokenizer's
# word_index that are looked up when the embedding matrix is filled.
sent_lst = []

for doc in train_data[TEXT_COL]:
    for sentence in nltk.tokenize.sent_tokenize(doc):
        words = text_to_word_sequence(sentence)
        # Skip sentences that contain nothing but filtered characters.
        if words:
            sent_lst.append(words)

In [57]:
import gensim
import logging

# Surface gensim's progress messages (it reports through the logging module).
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

# Skip-gram (sg=1) word2vec over the sentence list, dropping words seen fewer
# than 6 times and using one worker per CPU reported by the OS.
word2vec_model = gensim.models.Word2Vec(
    sentences=sent_lst,
    size=EMBEDDING_DIM,
    min_count=6,
    sg=1,
    workers=os.cpu_count(),
)

2017-08-21 22:29:22,266 : INFO : collecting all words and their counts
2017-08-21 22:29:22,266 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2017-08-21 22:29:22,432 : INFO : PROGRESS: at sentence #10000, processed 768914 words, keeping 16356 word types
2017-08-21 22:29:22,593 : INFO : PROGRESS: at sentence #20000, processed 1558434 words, keeping 18518 word types
2017-08-21 22:29:22,761 : INFO : PROGRESS: at sentence #30000, processed 2356584 words, keeping 19297 word types
2017-08-21 22:29:22,927 : INFO : PROGRESS: at sentence #40000, processed 3151418 words, keeping 19590 word types
2017-08-21 22:29:23,091 : INFO : PROGRESS: at sentence #50000, processed 3928526 words, keeping 19754 word types
2017-08-21 22:29:23,253 : INFO : PROGRESS: at sentence #60000, processed 4704430 words, keeping 19837 word types
2017-08-21 22:29:23,416 : INFO : PROGRESS: at sentence #70000, processed 5484612 words, keeping 19896 word types
2017-08-21 22:29:23,576 : INFO : PROGRES

Create the initial embedding matrix from the output of word2vec.

In [58]:
# Word -> float32 vector lookup for every word the word2vec model learned.
embeddings_index = {
    word: np.asarray(word2vec_model.wv[word], dtype='float32')
    for word in word2vec_model.wv.vocab
}

print('Total %s word vectors.' % len(embeddings_index))

# Initial embedding: row i holds the word2vec vector of the word with
# tokenizer index i. Rows stay all-zero for words word2vec did not keep, and
# words whose index is VOCAB_SIZE or larger are skipped.
embedding_matrix = np.zeros((VOCAB_SIZE + 1, EMBEDDING_DIM))

for word, i in tok.word_index.items():
    if i >= VOCAB_SIZE:
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

Total 20000 word vectors.


LSTM_DIM is the dimensionality of each LSTM output (the number of LSTM units).
The mask_zero option determines whether masking is performed, i.e. whether the layers ignore the padded zeros in shorter documents. The CNTK backend for Keras does not support masking yet, so mask_zero is set to False below.

In [16]:
# Training hyperparameters read by lstm_create_train.
# Mini-batch size and number of passes over the training data.
BATCH_SIZE = 100
NUM_EPOCHS = 10
# Number of units in each LSTM (the LSTM is wrapped in Bidirectional).
LSTM_DIM = 100
# NOTE(review): no momentum value is passed here; if SGD's default momentum is
# zero, nesterov=True has no effect - confirm whether a momentum was intended.
# NOTE(review): this single optimizer instance is reused by every call to
# lstm_create_train - confirm that sharing it across models is intended.
OPTIMIZER = SGD(lr=0.01, nesterov=True)

In [37]:
def lstm_create_train(reg_param):
    """Build, train and save a bidirectional LSTM classifier for the reviews.

    The network is Embedding -> Bidirectional(LSTM) -> softmax Dense. The
    embedding is initialised from the module-level ``embedding_matrix`` and
    stays trainable. The same L2 penalty is applied to the embedding weights,
    the LSTM kernel and the dense kernel.

    The function reads these notebook-level names: ``embedding_matrix``,
    ``EMBEDDING_DIM``, ``MAX_DOC_LENGTH``, ``LSTM_DIM``, ``n_classes``,
    ``OPTIMIZER``, ``BATCH_SIZE``, ``NUM_EPOCHS``, ``train_seq`` and
    ``labels``.

    Args:
        reg_param: L2 regularisation factor. It is also embedded in the names
            of all output files.

    Returns:
        None. Results are written to the current directory:
        ``lstm_model_wvec_<reg_param>_log`` (per-epoch CSV log, appended to),
        ``lstm_wvec_<reg_param>_model.h5`` (trained model),
        ``lstm_wvec_<reg_param>_time.txt`` (reg_param and fit time in hours),
        ``lstm_wvec_<reg_param>_history.txt`` (string form of the history dict).
    """
    l2_reg = regularizers.l2(reg_param)

    # model init
    # The number of embedding rows is taken from the initial weight matrix
    # itself: Keras requires the layer's weight shape to equal the shape of
    # the array passed in `weights`, and embedding_matrix is not built with
    # exactly VOCAB_SIZE rows.
    embedding_layer = Embedding(embedding_matrix.shape[0],
                                EMBEDDING_DIM,
                                input_length=MAX_DOC_LENGTH,
                                trainable=True,
                                mask_zero=False,
                                embeddings_regularizer=l2_reg,
                                weights=[embedding_matrix])

    lstm_layer = LSTM(units=LSTM_DIM, kernel_regularizer=l2_reg)
    dense_layer = Dense(n_classes, activation='softmax', kernel_regularizer=l2_reg)

    model = Sequential()
    model.add(embedding_layer)
    model.add(Bidirectional(lstm_layer))
    model.add(dense_layer)

    model.compile(loss='categorical_crossentropy',
                  optimizer=OPTIMIZER,
                  metrics=['acc'])

    # `history` collects the per-epoch metrics in memory; the CSV logger
    # writes them to disk as training progresses (append mode, so repeated
    # runs with the same reg_param add to the same file).
    history = History()
    csv_logger = CSVLogger('./lstm_model_wvec_{}_log'.format(reg_param),
                           separator=',',
                           append=True)

    t1 = time.time()
    # model fit
    model.fit(train_seq,
              labels.astype('float32'),
              batch_size=BATCH_SIZE,
              epochs=NUM_EPOCHS,
              callbacks=[history, csv_logger],
              verbose=2)
    t2 = time.time()

    # save model
    model.save('./lstm_wvec_{}_model.h5'.format(reg_param))
    # Elapsed wall-clock fit time is stored in hours next to reg_param.
    np.savetxt('./lstm_wvec_{}_time.txt'.format(reg_param), 
               [reg_param, (t2-t1) / 3600])
    with open('./lstm_wvec_{}_history.txt'.format(reg_param), "w") as res_file:
        res_file.write(str(history.history))

In [None]:
lstm_create_train(1e-10)

Epoch 1/10
