In [1]:
%load_ext autoreload
%autoreload 2

import os

import numpy as np

import helper

In [2]:
# read in training data
# The data directory defaults to the original network share but can be redirected
# through the SENTIMENT_DATA_DIR environment variable, so the notebook is not tied
# to one machine. With the variable unset the resulting path is unchanged.
DATA_DIR = os.environ.get(
    'SENTIMENT_DATA_DIR',
    '\\\\SEAGATE-D4/Documents/My Hoang Nguyen/ML-SDrive/Sentiment Analysis/data'
)
train_data_path = DATA_DIR + '/labeledTrainData.tsv/labeledTrainData.tsv'
train = helper.read_in_data(train_data_path)

# quick sanity check: dimensions and the first few rows
print(train.shape)
train.head()

(25000, 3)


Unnamed: 0,id,sentiment,review
0,5814_8,1,With all this stuff going down at the moment w...
1,2381_9,1,"\The Classic War of the Worlds\"" by Timothy Hi..."
2,7759_3,0,The film starts with a manager (Nicholas Bell)...
3,3630_4,0,It must be assumed that those who praised this...
4,9495_8,1,Superbly trashy and wondrously unpretentious 8...


In [3]:
# Display the full, untruncated text of one review (key 0 of the 'review' column)
# to see what the raw input looks like, including any embedded markup.
train['review'][0]

"With all this stuff going down at the moment with MJ i've started listening to his music, watching the odd documentary here and there, watched The Wiz and watched Moonwalker again. Maybe i just want to get a certain insight into this guy who i thought was really cool in the eighties just to maybe make up my mind whether he is guilty or innocent. Moonwalker is part biography, part feature film which i remember going to see at the cinema when it was originally released. Some of it has subtle messages about MJ's feeling towards the press and also the obvious message of drugs are bad m'kay.<br /><br />Visually impressive but of course this is all about Michael Jackson so unless you remotely like MJ in anyway then you are going to hate this and find it boring. Some may call MJ an egotist for consenting to the making of this movie BUT MJ and most of his fans would say that he made it for the fans which if true is really nice of him.<br /><br />The actual feature film bit when it finally sta

In [4]:
# read in test data
# The data directory defaults to the original network share but can be redirected
# through the SENTIMENT_DATA_DIR environment variable, so the notebook is not tied
# to one machine. With the variable unset the resulting path is unchanged.
DATA_DIR = os.environ.get(
    'SENTIMENT_DATA_DIR',
    '\\\\SEAGATE-D4/Documents/My Hoang Nguyen/ML-SDrive/Sentiment Analysis/data'
)
test_data_path = DATA_DIR + '/testData.tsv/testData.tsv'
test = helper.read_in_data(test_data_path)

# quick sanity check: dimensions and the first few rows
print(test.shape)
test.head()

(25000, 2)


Unnamed: 0,id,review
0,12311_10,Naturally in a film who's main themes are of m...
1,8348_2,This movie is a disaster within a disaster fil...
2,5828_4,"All in all, this is a movie for kids. We saw i..."
3,7186_2,Afraid of the Dark left me with the impression...
4,12128_7,A very accurate depiction of small time mob li...


In [5]:
# load Google's pre-trained word2vec
import gensim

# The data directory defaults to the original network share but can be redirected
# through the SENTIMENT_DATA_DIR environment variable, so the notebook is not tied
# to one machine. With the variable unset the resulting path is unchanged.
DATA_DIR = os.environ.get(
    'SENTIMENT_DATA_DIR',
    '\\\\SEAGATE-D4/Documents/My Hoang Nguyen/ML-SDrive/Sentiment Analysis/data'
)
pretrained_word2vec_path = DATA_DIR + '/GoogleNews-vectors-negative300.bin'

# binary=True because the vectors are stored in the binary word2vec format (.bin)
word2vec = gensim.models.KeyedVectors.load_word2vec_format(pretrained_word2vec_path, binary=True)



In [6]:
# for k in word2vec.vocab.keys():
#     print(k)
#     print(word2vec.vocab[k])
# #     break

In [7]:
# peek into word2vec: look up one vocabulary entry, report its dimensionality
# and show its first ten components
sample_vector = word2vec['around_pig_Ramfjord']
print(len(sample_vector))
sample_vector[:10]

300


array([ 0.265625  ,  0.02001953, -0.01037598,  0.10449219, -0.03613281,
        0.03540039,  0.09228516, -0.03295898, -0.03588867,  0.06738281], dtype=float32)

In [8]:
# Build a tokenizer over the training reviews; it is later used to turn each review
# into a sequence of integer word indices (word ranks).
from keras.preprocessing.text import Tokenizer

tokenizer = Tokenizer()

# raw review texts as a plain Python list, in the same order as the rows of `train`
data = train['review'].tolist()
tokenizer.fit_on_texts(data)

Using TensorFlow backend.


In [9]:
# create train & validation data
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical

# list with the same length as `data`; every word is represented by its rank/index
sequences = tokenizer.texts_to_sequences(data)
data_train = pad_sequences(sequences)
print('data_train.shape', data_train.shape)
# data_train has shape (n_reviews, len(longest review)).
# Each review contains multiple sentences; each word in a sentence is tokenized into
# an integer representing its rank, aka the word_index.

# one-hot encode the 0/1 sentiment column
labels = to_categorical(np.asarray(train['sentiment']))

# split the data into a training set and a validation set
VALIDATION_SPLIT = 0.2
SPLIT_SEED = 42  # fixed seed so the train/validation split is reproducible across runs

# Shuffle with a dedicated, seeded generator so the split is deterministic and
# NumPy's global random state is left untouched.
indices = np.arange(data_train.shape[0])
np.random.RandomState(SPLIT_SEED).shuffle(indices)
data_train = data_train[indices]
labels = labels[indices]
nb_validation_samples = int(VALIDATION_SPLIT * data_train.shape[0])

# Slice on an explicit boundary rather than with negative indices: `[:-0]` would
# give an empty training set (and `[-0:]` the whole data) if the validation size
# ever rounded down to zero.
nb_train_samples = data_train.shape[0] - nb_validation_samples

x_train = data_train[:nb_train_samples]
y_train = labels[:nb_train_samples]
x_val = data_train[nb_train_samples:]
y_val = labels[nb_train_samples:]

print('x_train.shape', x_train.shape)
print('y_train.shape', y_train.shape)
print('x_val.shape', x_val.shape)
print('y_val.shape', y_val.shape)

data_train.shape (25000, 2493)
x_train.shape (20000, 2493)
y_train.shape (20000, 2)
x_val.shape (5000, 2493)
y_val.shape (5000, 2)


In [10]:
# create embedding_matrix: row i holds the pretrained vector of the word whose
# tokenizer index is i
word_index = tokenizer.word_index
vocab_size = len(word_index)
EMBEDDING_DIM = 300 # this is from the pretrained vectors

# One extra row on top of the vocabulary size (presumably because tokenizer indices
# start at 1, leaving row 0 unused -- verify against the Tokenizer docs).
embedding_matrix = np.zeros((vocab_size + 1, EMBEDDING_DIM))
for token, row in word_index.items():
    if token not in word2vec:
        # no pretrained vector available: the row stays all-zeros
        continue
    embedding_matrix[row] = word2vec[token]

In [11]:
# create embedding layer, initialised from the pretrained embedding matrix
from keras.layers import Embedding

# number of columns of the padded training data, i.e. len (num words) of longest review
input_length = data_train.shape[1]
embedding_layer = Embedding(
    vocab_size + 1,
    EMBEDDING_DIM,
    weights=[embedding_matrix],
    input_length=input_length,
    trainable=False  # keep the pretrained vectors frozen during training
)

In [12]:
# # a small 1D convnet
# from keras.layers import Dense, Input, Flatten
# from keras.layers import Conv1D, MaxPooling1D
# from keras.models import Model

# n_labels = 2
# sequence_input = Input(shape=(input_length,), dtype='int32')
# embedded_sequences = embedding_layer(sequence_input)
# x = Conv1D(128, 5, activation='relu')(embedded_sequences)
# x = MaxPooling1D(5)(x)
# x = Conv1D(128, 5, activation='relu')(x)
# x = MaxPooling1D(5)(x)
# x = Conv1D(128, 5, activation='relu')(x)
# x = MaxPooling1D(35)(x)  # global max pooling
# x = Flatten()(x)
# x = Dense(128, activation='relu')(x)
# preds = Dense(n_labels, activation='softmax')(x)

# model = Model(sequence_input, preds)
# model.compile(loss='categorical_crossentropy', optimizer='rmsprop', metrics=['acc'])

# # happy learning!
# model.fit(x_train, y_train, validation_data=(x_val, y_val), epochs=2, batch_size=128)

In [13]:
# from tensorflow.python.client import device_lib
# print(device_lib.list_local_devices())

# import tensorflow as tf

# # Creates a session with log_device_placement set to True.
# sess = tf.Session(config=tf.ConfigProto(log_device_placement=True))

# # Creates a graph.
# a = tf.constant([1.0, 2.0, 3.0, 4.0, 5.0, 6.0], shape=[2, 3], name='a')
# b = tf.constant([1.0, 2.0, 3.0, 4.0, 5.0, 6.0], shape=[3, 2], name='b')
# c = tf.matmul(a, b)

# # Runs the op.
# print(sess.run(c))

# Kim CNN

###### - Apply CNN-static (embedding layer frozen via `trainable=False`), initialized with word2vec

In [14]:
from keras.layers import Dense, Input, Flatten
from keras.layers import Conv1D, GlobalMaxPooling1D, Concatenate, Dropout
from keras.models import Model

In [18]:
# hyper-parameters of the network and of training
n_labels = 2
filter_sizes = [2,3,4]
n_filters = 3
n_hidden_units = 128
batch_size = 128

# integer word indices in, one row per padded review
sequence_input = Input(shape=(input_length,), dtype='int32')
# look every index up in the embedding layer to get its vector representation
embedded_sequences = embedding_layer(sequence_input)

# one parallel branch per filter width: convolution followed by max-over-time pooling
features = []
for width in filter_sizes:
    conv_out = Conv1D(n_filters, width, activation='relu')(embedded_sequences)
    features.append(GlobalMaxPooling1D()(conv_out))

# penultimate layer: join the pooled features of all branches
merged = Concatenate()(features)
# regularise with dropout before the classifier
dropped = Dropout(0.5)(merged)
# fully connected hidden layer, then a softmax over the labels
hidden = Dense(n_hidden_units, activation='relu')(dropped)
preds = Dense(n_labels, activation='softmax')(hidden)

model = Model(sequence_input, preds)
model.summary()

model.compile(loss='categorical_crossentropy', optimizer='adadelta', metrics=['acc'])

# train for two epochs, reporting validation metrics after each one
model.fit(x_train, y_train, validation_data=(x_val, y_val), epochs=2, batch_size=batch_size)

____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
input_3 (InputLayer)             (None, 2493)          0                                            
____________________________________________________________________________________________________
embedding_1 (Embedding)          (None, 2493, 300)     26574900    input_3[0][0]                    
____________________________________________________________________________________________________
conv1d_7 (Conv1D)                (None, 2492, 3)       1803        embedding_1[2][0]                
____________________________________________________________________________________________________
conv1d_8 (Conv1D)                (None, 2491, 3)       2703        embedding_1[2][0]                
___________________________________________________________________________________________

<keras.callbacks.History at 0x23b3e52db70>

In [17]:
# # try different optimizer: rmsprop --> not as good
# # parameters
# n_labels = 2
# filter_sizes = [2,3,4]
# n_filters = 3
# n_hidden_units = 128
# batch_size = 128

# # input
# sequence_input = Input(shape=(input_length,), dtype='int32')
# # embedding: vector representation of word
# embedded_sequences = embedding_layer(sequence_input)
# # conv layer
# features = []
# for filter_size in filter_sizes:
#     # conv layer
#     conv = Conv1D(n_filters, filter_size, activation='relu')(embedded_sequences)
#     # global max pooling
#     conv = GlobalMaxPooling1D()(conv)
#     # add features together
#     features.append(conv)
# # penultimate layer
# nn = Concatenate()(features)
# # dropout
# nn = Dropout(0.5)(nn)
# # fully connected layer
# nn = Dense(n_hidden_units, activation='relu')(nn)
# preds = Dense(n_labels, activation='softmax')(nn)

# model = Model(sequence_input, preds)
# model.summary()

# model.compile(loss='categorical_crossentropy', optimizer='rmsprop', metrics=['acc'])

# # happy learning!
# model.fit(x_train, y_train, validation_data=(x_val, y_val), epochs=2, batch_size=batch_size)

In [31]:
# continue training the already-fitted model for ten more epochs on the same split
model.fit(
    x_train,
    y_train,
    validation_data=(x_val, y_val),
    epochs=10,
    batch_size=batch_size
)

Train on 20000 samples, validate on 5000 samples
Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x23a6fef04a8>

In [32]:
# predict

# Format the test reviews the same way as the training data. Dedicated names are
# used so the training-time `data` and `sequences` variables are not overwritten:
# re-running the train/validation split cell after this one would otherwise pair
# test texts with training labels.
test_texts = test['review'].tolist()
test_sequences = tokenizer.texts_to_sequences(test_texts) # list, same len as test_texts. represent word as rank/index
# force the same sequence length the model was built with
x_test = pad_sequences(test_sequences, maxlen=input_length)
print('x_test.shape', x_test.shape)

# one row of per-class scores (softmax output) for every test review
y_test = model.predict(x_test, batch_size=batch_size)
print('y_test.shape', y_test.shape)

x_test.shape (25000, 2493)
y_test.shape (25000, 2)


In [33]:
# peek into y_test: raw scores of the first ten predictions, followed by the
# index of the highest-scoring class for each of them
y_test_head = y_test[:10]
print(y_test_head)
print(np.argmax(y_test_head, axis=1))

[[ 0.2161812   0.78381878]
 [ 0.6965335   0.3034665 ]
 [ 0.42191854  0.57808143]
 [ 0.47578177  0.52421826]
 [ 0.7505579   0.24944213]
 [ 0.32316694  0.67683303]
 [ 0.57733971  0.42266029]
 [ 0.4219445   0.57805556]
 [ 0.87709332  0.12290668]
 [ 0.38422331  0.61577672]]
[1 0 1 1 0 1 0 1 0 1]


In [63]:
import pandas as pd
# format predictions into submission file
# Build the frame column by column instead of stacking ids and predictions into a
# single array: stacking forces both columns into one shared dtype, so the
# predicted labels would no longer be stored as integers.
submission = pd.DataFrame(
    {
        'id': np.asarray(test['id']),
        # predicted class = index of the highest score in each row of y_test
        'sentiment': np.argmax(y_test, axis=1),
    },
    columns=['id', 'sentiment'],
)

#  peek into submission df
submission.head()

In [68]:
# write the submission table to submission.csv in the working directory,
# leaving out the DataFrame index column
submission.to_csv('submission.csv', index=False)