In [1]:
# Reference : 
# https://richliao.github.io/supervised/classification/2016/11/26/textclassifier-convolutional/

import sys
import os

os.environ['KERAS_BACKEND']='tensorflow'

import numpy as np
import pandas as pd
from collections import defaultdict
import re
import tensorflow as tf
import nltk

np.random.seed(42)
tf.set_random_seed(42)

from keras.preprocessing.text import Tokenizer, text_to_word_sequence
from keras.preprocessing.sequence import pad_sequences
from keras.utils.np_utils import to_categorical
from keras.layers import Embedding
from keras.layers import Dense, Input, Flatten
from keras.layers import Conv1D, MaxPooling1D, Embedding, Dropout, LSTM, GRU, Bidirectional, TimeDistributed
from keras.layers import CuDNNLSTM, CuDNNGRU
# Merge
from keras.models import Model

from keras import backend as K
from keras.engine.topology import Layer, InputSpec
# from keras import initializations
from keras import initializers
from keras import regularizers
from keras import optimizers
from keras import constraints

from nltk import tokenize
# nltk.download('punkt')
from textblob import TextBlob

MAX_SENT_LENGTH = 100
MAX_SENTS = 15
MAX_NB_WORDS = 20000
EMBEDDING_DIM = 100
VALIDATION_SPLIT = 0.2

Using TensorFlow backend.


## IMDB load

In [2]:
# import pickle

# def save_pickle(path, X):
#     with open(path, 'wb') as f:
#         pickle.dump(X, f)

# def open_pickle(path):
#     with open(path, 'rb') as f:
#         X = pickle.load(f)
#     return X

# X_train = open_pickle("../../data/imdb/imdb_original_preprocessed_xtrain.pickle")
# X_test = open_pickle("../../data/imdb/imdb_original_preprocessed_xtest.pickle")
# y_train = open_pickle("../../data/imdb/imdb_original_preprocessed_ytrain.pickle")
# y_test = open_pickle("../../data/imdb/imdb_original_preprocessed_ytest.pickle")

from dataset_load import *

path = r"..\..\data\reviews_Amazon_Instant_Video_5.json.gz"

X, y = extract_review_amazon(path, 'reviewText')
y_label = np.asarray(y)

neutral_indices = np.where(y_label == 3)[0]
y_label[y_label<3] = 0
y_label[y_label>3] = 1

X_discarded = np.delete(X,neutral_indices)
y_discarded = np.delete(y_label, neutral_indices)

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split, ShuffleSplit

# split
X_train_split, X_test_split, y_train, y_test = train_test_split(X_discarded, y_discarded, test_size=0.33, random_state=42)


# preprocessing
X_train = update_corpus_contraction(X_train_split)
X_test = update_corpus_contraction(X_test_split)

(75, 2)
corpus update start
corpus update end

(75, 2)
corpus update start
corpus update end



In [3]:
reviews = [] #sentences
test_reviews = []

for i in X_train:
    sentences = TextBlob(i).raw_sentences
    reviews.append(sentences)
    
for j in X_test:
    sentences = TextBlob(j).raw_sentences
    test_reviews.append(sentences)

In [4]:
tokenizer = Tokenizer(nb_words=MAX_NB_WORDS)
tokenizer.fit_on_texts(X_train)

data = np.zeros((len(X_train), MAX_SENTS, MAX_SENT_LENGTH), dtype='int32')
test_data = np.zeros((len(X_test), MAX_SENTS, MAX_SENT_LENGTH), dtype='int32')



In [5]:
# numerize token in Train data

print('start tokenize train...')
for i, sentences in enumerate(reviews):
    for j, sent in enumerate(sentences):
        if j<MAX_SENTS:
            wordTokens = text_to_word_sequence(sent)
            k=0
            for _, word in enumerate(wordTokens):
                try:
                    if k<MAX_SENT_LENGTH and tokenizer.word_index[word]<MAX_NB_WORDS:
                        data[i,j,k] = tokenizer.word_index[word]
                        k += 1
                except KeyError:
                    continue

# numerize token in Test data
print('start tokenize test...')
for i, sentences in enumerate(test_reviews):
    for j, sent in enumerate(sentences):
        if j<MAX_SENTS:
            wordTokens = text_to_word_sequence(sent)
            k = 0
            for _,word in enumerate(wordTokens):
                try:
                    if k<MAX_SENT_LENGTH and tokenizer.word_index[word]<MAX_NB_WORDS:
                        test_data[i,j,k] = tokenizer.word_index[word]
                        k += 1
                except KeyError:
                    continue

start tokenize train...
start tokenize test...


In [6]:
word_index = tokenizer.word_index

# Categorical
# labels = np.asarray(to_categorical(np.asarray(y_train)))
# labels_test = np.asarray(to_categorical(np.asarray(y_test)))

labels = y_train
labels_test = y_test

In [7]:
print('Total %s unique tokens.' % len(word_index))
print('Shape of data tensor:', data.shape)
print('Shape of label tensor:', labels.shape)

print('Shape of test data tensor:', test_data.shape)
print('Shape of label tensor:', labels_test.shape)

Total 46025 unique tokens.
Shape of data tensor: (22069, 15, 100)
Shape of label tensor: (22069,)
Shape of test data tensor: (10870, 15, 100)
Shape of label tensor: (10870,)


In [8]:
indices = np.arange(data.shape[0])
np.random.shuffle(indices)
data = data[indices]
labels = labels[indices]
nb_validation_samples = int(VALIDATION_SPLIT * data.shape[0])

In [9]:
x_train = data[:-nb_validation_samples]
y_train = labels[:-nb_validation_samples]
x_val = data[-nb_validation_samples:]
y_val = labels[-nb_validation_samples:]

x_test = test_data
y_test = labels_test

In [10]:
print('Number of positive and negative reviews in training and validation set')
# print(y_train.sum(axis=0))
# print(y_val.sum(axis=0))

print(np.sum(y_train==0), np.sum(y_train==1))
print(np.sum(y_val==0), np.sum(y_val==1))
print(np.sum(y_test==0), np.sum(y_test==1))

Number of positive and negative reviews in training and validation set
1950 15706
489 3924
1164 9706


In [11]:
GLOVE_DIR = "../../data/glove.6B"
embeddings_index = {}
f = open(os.path.join(GLOVE_DIR, 'glove.6B.100d.txt'), 'rb')
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
f.close()

print('Total %s word vectors.' % len(embeddings_index))

Total 400000 word vectors.


In [12]:
# Define embedding matrix

embedding_matrix = np.random.random((len(word_index) + 1, EMBEDDING_DIM))

for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector
    
embedding_layer = Embedding(len(word_index)+1,
                               EMBEDDING_DIM,
                               weights=[embedding_matrix],
                               input_length=MAX_SENT_LENGTH,
                               trainable=True)

sentence_input = Input(shape=(MAX_SENT_LENGTH,), dtype='int32')
embedded_sequences = embedding_layer(sentence_input)
l_gru = Bidirectional(GRU(100))(embedded_sequences)
sentEncoder = Model(sentence_input, l_gru)

# Sentence
review_input = Input(shape=(MAX_SENTS, MAX_SENT_LENGTH), dtype='int32')
review_encoder = TimeDistributed(sentEncoder)(review_input)
l_gru_sent = Bidirectional(GRU(100))(review_encoder)

preds = Dense(1, activation='sigmoid')(l_gru_sent)
model = Model(review_input, preds)

model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['acc'])

# print("model fitting - Hierarchical LSTM")
# print(model.summary())

In [13]:
print('model fitting - Hierarchical network')
history_imdb = model.fit(x_train, y_train, validation_data=(x_val, y_val),
           epochs=1, batch_size=1, verbose=1)

model fitting - Hierarchical network
Train on 17656 samples, validate on 4413 samples
Epoch 1/1


In [14]:
model.evaluate(x_test,y_test)



[0.24599738822086425, 0.8949402024467384]

In [15]:
model.evaluate(x_train,y_train)



[0.223088467443859, 0.9079632985953784]

In [16]:
import matplotlib.pyplot as plt

plt.plot(history_imdb.history['acc'])
plt.plot(history_imdb.history['val_acc'])
plt.plot(history_imdb.history['loss'], 'm--')
plt.plot(history_imdb.history['val_loss'], 'y--')
plt.title('model loss history (IMDB)')
plt.xlabel('epoch')
plt.legend(['tr_acc', 'te_acc','tr_loss', 'te_loss'], loc='upper left')
plt.show()
plt.clf()

<Figure size 640x480 with 1 Axes>

In [17]:
import matplotlib.pyplot as plt

plt.plot(history_amazon_video.history['acc'])
plt.plot(history_amazon_video.history['val_acc'])
plt.plot(history_amazon_video.history['loss'], 'm--')
plt.plot(history_amazon_video.history['val_loss'], 'y--')
plt.title('model loss history (Amazon Video)')
plt.xlabel('epoch')
plt.legend(['tr_acc', 'te_acc','tr_loss', 'te_loss'], loc='upper left')
plt.show()
plt.clf()

NameError: name 'history_amazon_video' is not defined