In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import json
import os
import re

In [2]:
import nltk
from nltk.tokenize import TweetTokenizer
from nltk import word_tokenize, pos_tag
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
import gensim.models as gsm

## Preprocessing

In [3]:
# Data
data_train_path = './../data/train/train.csv'
data_test_path = './../data/test/test.csv'

In [4]:
import sys
import regex as re

FLAGS = re.MULTILINE | re.DOTALL

def hashtag(text):
    text = text.group()
    hashtag_body = text[1:]
    if hashtag_body.isupper():
        result = "<hashtag> {} <allcaps>".format(hashtag_body)
    else:
        result = " ".join(["<hashtag>"] + re.split(r"(?=[A-Z])", hashtag_body, flags=FLAGS))
    return result

def allcaps(text):
    text = text.group()
    return text.lower() + " <allcaps>"


def tokenize(text):
    eyes = r"[8:=;]"
    nose = r"['`\-]?"

    def re_sub(pattern, repl):
        return re.sub(pattern, repl, text, flags=FLAGS)

    text = re_sub(r"https?:\/\/\S+\b|www\.(\w+\.)+\S*", "<url>")
    text = re_sub(r"/"," / ")
    text = re_sub(r"@\w+", "<user>")
    text = re_sub(r"{}{}[)dD]+|[)dD]+{}{}".format(eyes, nose, nose, eyes), "<smile>")
    text = re_sub(r"{}{}p+".format(eyes, nose), "<lolface>")
    text = re_sub(r"{}{}\(+|\)+{}{}".format(eyes, nose, nose, eyes), "<sadface>")
    text = re_sub(r"{}{}[\/|l*]".format(eyes, nose), "<neutralface>")
    text = re_sub(r"<3","<heart>")
    text = re_sub(r"[-+]?[.\d]*[\d]+[:,.\d]*", "<number>")
    text = re_sub(r"#\S+", hashtag)
    text = re_sub(r"([!?.]){2,}", r"\1 <repeat>")
    text = re_sub(r"\b(\S*?)(.)\2{2,}\b", r"\1\2 <elong>")

    text = re_sub(r"([A-Z]){2,}", allcaps)

    return text.lower()

In [5]:
def prepare_cvs_data(file_path):
    df = pd.read_csv(file_path, sep='\t', header=None, encoding='utf-8', quoting=3)
    df.columns = ['id','text','polarity','class'] # Set up column names
    return df

In [6]:
train = np.array(prepare_cvs_data(data_train_path)['text'])
train_type = np.array(prepare_cvs_data(data_train_path)['polarity'])
train_labels_tmp = prepare_cvs_data(data_train_path)['class']
train_labels = np.array([int(x[0]) for x in train_labels_tmp])

test = np.array(prepare_cvs_data(data_test_path)['text'])
test_type = np.array(prepare_cvs_data(data_test_path)['polarity'])
test_labels_tmp = prepare_cvs_data(data_test_path)['class']
test_labels = np.array([int(x[0]) for x in test_labels_tmp])

In [7]:
for i in range(train.shape[0]):
    train[i] = tokenize(train[i])

In [8]:
print(train[0], train_labels[0], train_type[0])
print(train[1], train_labels[1])
print(train[2], train_labels[2])
print(train[3], train_labels[3])

how the fu*k! who the heck! moved my fridge. <repeat> should i knock the landlord door. <hashtag> angry <hashtag> mad <hashtag> # 3 anger
so my indian uber driver just called someone the n word. if i wasn't in a moving vehicle i'd have jumped out <hashtag> disgusted  3
<user> i asked for my parcel to be delivered to a pick up store not my address <hashtag> fuming <hashtag> poorcustomerservice 3
so ef whichever butt wipe pulled the fire alarm in davis bc i was sound asleep <hashtag> pissed <hashtag> angry <hashtag> upset <hashtag> tired <hashtag> sad <hashtag> tired <hashtag> hangry <hashtag> ##### 3


## Loading glove

In [9]:
GLOVE_DIR = "../tools/"

embeddings_index = {}
f = open(os.path.join(GLOVE_DIR, 'glove.twitter.27B.200d.txt'))
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
f.close()

print('Found %s word vectors.' % len(embeddings_index))

Found 1193514 word vectors.


In [10]:
word_index = {}
idx = 0
for i in range(train.shape[0]):
    txt = train[i].split()
    for j in range(len(txt)):
        if word_index.get(txt[j]) == None:
            word_index[txt[j]] = idx
            idx += 1

for i in range(test.shape[0]):
    txt = test[i].split()
    for j in range(len(txt)):
        if word_index.get(txt[j]) == None:
            word_index[txt[j]] = idx
            idx += 1

In [11]:
EMBEDDING_DIM = 200

embedding_matrix = np.zeros((len(word_index) + 1, EMBEDDING_DIM))
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

In [12]:
MAX_SEQUENCE_LENGTH = 1

from keras.layers import Embedding
from keras.layers import Input
from keras.models import Model

input_word = Input(shape=(1,))

embedding_layer = Embedding(len(word_index) + 1,
                            EMBEDDING_DIM,
                            weights=[embedding_matrix],
                            input_length=MAX_SEQUENCE_LENGTH,
                            trainable=False)(input_word)

embedding_model = Model(input_word, embedding_layer)

Using TensorFlow backend.
  return f(*args, **kwds)


In [13]:
embedding_model.predict(np.array([word_index["lol"]]))

array([[[  3.04300010e-01,  -4.71270010e-02,   6.74459990e-03,
          -7.02779964e-02,  -3.83569986e-01,   1.77660003e-01,
          -1.05959997e-01,   1.69620007e-01,  -3.48769993e-01,
           1.12949997e-01,  -2.37619996e-01,   6.17799982e-02,
           1.41269997e-01,   7.85040036e-02,   8.85540023e-02,
           4.05110002e-01,   3.15290004e-01,   9.86569971e-02,
          -1.97559997e-01,  -2.32480004e-01,   2.58150011e-01,
          -1.14050001e-01,   3.36490005e-01,  -1.26430005e-01,
          -2.10720003e-01,   1.36820003e-01,  -7.43329972e-02,
          -2.09999993e-01,  -2.60419995e-01,  -7.05009997e-01,
           1.07000001e-01,   1.70790002e-01,  -2.54469991e-01,
          -1.25090003e-01,  -1.62410006e-01,   5.47500014e-01,
          -1.98040009e-02,  -3.68429989e-01,  -1.01889998e-01,
          -3.07049990e-01,   5.49939990e-01,   3.93170007e-02,
           4.31860000e-01,   1.01510003e-01,  -3.69769990e-01,
           2.38629997e-01,   7.47500002e-01,   3.983699

## Autoencoder

In [14]:
from keras.layers import Input, Dense
from keras.models import Model


encoding_dim = 50

input_word = Input(shape=(1,200,))

encoded = Dense(encoding_dim, activation='relu')(input_word)

decoded = Dense(200, activation='sigmoid')(encoded)

autoencoder = Model(input_word, decoded)
autoencoder.compile(optimizer='adadelta', loss='mse')

In [15]:
hist = autoencoder.fit(embedding_matrix.reshape((21215, 1, 200)), embedding_matrix.reshape((21215, 1, 200)), epochs=100)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100


## Combine models

In [16]:
encoding_dim = 50

input_word = Input(shape=(1,))

embedding_layer = Embedding(len(word_index) + 1,
                            EMBEDDING_DIM,
                            weights=[embedding_matrix],
                            input_length=MAX_SEQUENCE_LENGTH,
                            trainable=False)(input_word)

encoded = Dense(encoding_dim, activation='relu', weights=autoencoder.layers[1].get_weights())(embedding_layer)
decoded = Dense(200, activation='sigmoid', weights=autoencoder.layers[2].get_weights())(encoded)
final = Model(input_word, decoded)
final.compile(optimizer='adadelta', loss='mse')

In [17]:
final.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_3 (InputLayer)         (None, 1)                 0         
_________________________________________________________________
embedding_2 (Embedding)      (None, 1, 200)            4243000   
_________________________________________________________________
dense_3 (Dense)              (None, 1, 50)             10050     
_________________________________________________________________
dense_4 (Dense)              (None, 1, 200)            10200     
Total params: 4,263,250
Trainable params: 20,250
Non-trainable params: 4,243,000
_________________________________________________________________


In [18]:
emb = embedding_model.predict(np.array(word_index["lol"]).reshape((1,1)))
final.predict(np.array([word_index["lol"]])) == autoencoder.predict(emb)

array([[[ True,  True,  True,  True,  True,  True,  True,  True,  True,
          True,  True,  True,  True,  True,  True,  True,  True,  True,
          True,  True,  True,  True,  True,  True,  True,  True,  True,
          True,  True,  True,  True,  True,  True,  True,  True,  True,
          True,  True,  True,  True,  True,  True,  True,  True,  True,
          True,  True,  True,  True,  True,  True,  True,  True,  True,
          True,  True,  True,  True,  True,  True,  True,  True,  True,
          True,  True,  True,  True,  True,  True,  True,  True,  True,
          True,  True,  True,  True,  True,  True,  True,  True,  True,
          True,  True,  True,  True,  True,  True,  True,  True,  True,
          True,  True,  True,  True,  True,  True,  True,  True,  True,
          True,  True,  True,  True,  True,  True,  True,  True,  True,
          True,  True,  True,  True,  True,  True,  True,  True,  True,
          True,  True,  True,  True,  True,  True,  True,  True,

## GRU

In [19]:
#Fetching encoder from autoencoder model
input = autoencoder.layers[0].input
output = autoencoder.layers[-2].output
encoder = Model(input=input, output=output)

#Retrieving the longest sequence size
#seq_dim = max([len(l.rsplit()) for l in train])
#seq_dim = max(seq_dim, max([len(l.rsplit()) for l in test]))
seq_dim = 50
print('The longest sequence is', seq_dim)

#Returns a list of np arrays with encoded values of each word
def get_encoded_list(tweet):
    ans = np.zeros((seq_dim, 1))
    for i, w in enumerate(tweet.rsplit()):
        emb = word_index[w]
        ans[i, 0] = emb
    return ans

#Preparing the data for the GRU
train_gru = np.empty((len(train), seq_dim, 1))
test_gru = np.empty((len(test), seq_dim, 1))

for i, tweet in enumerate(train):
    train_gru[i, :, :] = get_encoded_list(tweet)
    
for i, tweet in enumerate(test):
    test_gru[i, :, :] = get_encoded_list(tweet)

The longest sequence is 50


  after removing the cwd from sys.path.


In [20]:
train_gru.shape

(7102, 50, 1)

In [21]:
test_gru.shape

(800, 50, 1)

In [49]:
from keras.layers import Embedding, Dropout, GRU, LSTM, concatenate, RepeatVector

encoding_dim = 50
input_size = 50

input_seq = Input(shape=(50,1))
input_type = Input(shape=(4,))

embedding_layer = Embedding(len(word_index) + 1,
                            EMBEDDING_DIM,
                            weights=[embedding_matrix],
                            input_length=input_size,
                            trainable=False)(input_seq)

encoded = Dense(encoding_dim, activation='relu', weights=autoencoder.layers[1].get_weights(), trainable=False)(embedding_layer)

repeat = RepeatVector(50)(input_type)
concat = concatenate([encoded, repeat])

rec = LSTM(128, input_shape=(None, 50), dropout=0., recurrent_dropout=0., go_backwards=True)(concat)

out = Dense(4, activation="softmax")(rec)

final = Model([input_seq, input_type], out)
final.compile(optimizer='adadelta', loss='categorical_crossentropy', metrics=["accuracy"])

In [50]:
final.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_14 (InputLayer)           (None, 50, 1)        0                                            
__________________________________________________________________________________________________
embedding_8 (Embedding)         (None, 50, 200)      4243000     input_14[0][0]                   
__________________________________________________________________________________________________
input_15 (InputLayer)           (None, 4)            0                                            
__________________________________________________________________________________________________
dense_15 (Dense)                (None, 50, 50)       10050       embedding_8[0][0]                
__________________________________________________________________________________________________
repeat_vec

In [51]:
emot_dic = {"anger":0, "joy":1, "sadness":2, "fear":3}

train_type_int = np.array([emot_dic[x] for x in train_type])
train_type_oh = np.zeros((train_type_int.shape[0], 4))
for i in range(train_type_int.shape[0]):
    train_type_oh[i, train_type_int[i]] = 1
    
test_type_int = np.array([emot_dic[x] for x in test_type])
test_type_oh = np.zeros((test_type_int.shape[0], 4))
for i in range(test_type_int.shape[0]):
    test_type_oh[i, test_type_int[i]] = 1

In [52]:
train_labels_oh = np.zeros((train_labels.shape[0], 4))
for i in range(train_labels.shape[0]):
    train_labels_oh[i, train_labels[i]] = 1
    
test_labels_oh = np.zeros((test_labels.shape[0], 4))
for i in range(test_labels.shape[0]):
    test_labels_oh[i, test_labels[i]] = 1

In [53]:
final.fit([train_gru, train_type_oh], train_labels_oh,
                epochs=30,
                batch_size=8,
                shuffle=True,
                validation_data=([test_gru, test_type_oh], test_labels_oh),
                callbacks=[TensorBoard(log_dir='/tmp/lstm', histogram_freq=1, write_graph=False),
                          ModelCheckpoint('/tmp/chackpoint.h5', monitor='val_loss', verbose=0, save_best_only=True, save_weights_only=False, mode='auto', period=1)])

Train on 7102 samples, validate on 800 samples
Epoch 1/100

InvalidArgumentError: You must feed a value for placeholder tensor 'input_8' with dtype float and shape [?,50,1]
	 [[Node: input_8 = Placeholder[dtype=DT_FLOAT, shape=[?,50,1], _device="/job:localhost/replica:0/task:0/device:CPU:0"]()]]

Caused by op 'input_8', defined at:
  File "/usr/lib/python3.6/runpy.py", line 193, in _run_module_as_main
    "__main__", mod_spec)
  File "/usr/lib/python3.6/runpy.py", line 85, in _run_code
    exec(code, run_globals)
  File "/usr/lib/python3.6/site-packages/ipykernel_launcher.py", line 16, in <module>
    app.launch_new_instance()
  File "/usr/lib/python3.6/site-packages/traitlets/config/application.py", line 658, in launch_instance
    app.start()
  File "/usr/lib/python3.6/site-packages/ipykernel/kernelapp.py", line 477, in start
    ioloop.IOLoop.instance().start()
  File "/usr/lib/python3.6/site-packages/zmq/eventloop/ioloop.py", line 177, in start
    super(ZMQIOLoop, self).start()
  File "/usr/lib/python3.6/site-packages/tornado/ioloop.py", line 888, in start
    handler_func(fd_obj, events)
  File "/usr/lib/python3.6/site-packages/tornado/stack_context.py", line 277, in null_wrapper
    return fn(*args, **kwargs)
  File "/usr/lib/python3.6/site-packages/zmq/eventloop/zmqstream.py", line 440, in _handle_events
    self._handle_recv()
  File "/usr/lib/python3.6/site-packages/zmq/eventloop/zmqstream.py", line 472, in _handle_recv
    self._run_callback(callback, msg)
  File "/usr/lib/python3.6/site-packages/zmq/eventloop/zmqstream.py", line 414, in _run_callback
    callback(*args, **kwargs)
  File "/usr/lib/python3.6/site-packages/tornado/stack_context.py", line 277, in null_wrapper
    return fn(*args, **kwargs)
  File "/usr/lib/python3.6/site-packages/ipykernel/kernelbase.py", line 283, in dispatcher
    return self.dispatch_shell(stream, msg)
  File "/usr/lib/python3.6/site-packages/ipykernel/kernelbase.py", line 235, in dispatch_shell
    handler(stream, idents, msg)
  File "/usr/lib/python3.6/site-packages/ipykernel/kernelbase.py", line 399, in execute_request
    user_expressions, allow_stdin)
  File "/usr/lib/python3.6/site-packages/ipykernel/ipkernel.py", line 196, in do_execute
    res = shell.run_cell(code, store_history=store_history, silent=silent)
  File "/usr/lib/python3.6/site-packages/ipykernel/zmqshell.py", line 533, in run_cell
    return super(ZMQInteractiveShell, self).run_cell(*args, **kwargs)
  File "/usr/lib/python3.6/site-packages/IPython/core/interactiveshell.py", line 2728, in run_cell
    interactivity=interactivity, compiler=compiler, result=result)
  File "/usr/lib/python3.6/site-packages/IPython/core/interactiveshell.py", line 2850, in run_ast_nodes
    if self.run_code(code, result):
  File "/usr/lib/python3.6/site-packages/IPython/core/interactiveshell.py", line 2910, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-36-dcfb9e58a7d9>", line 6, in <module>
    input_seq = Input(shape=(50,1))
  File "/usr/lib/python3.6/site-packages/keras/engine/topology.py", line 1439, in Input
    input_tensor=tensor)
  File "/usr/lib/python3.6/site-packages/keras/legacy/interfaces.py", line 87, in wrapper
    return func(*args, **kwargs)
  File "/usr/lib/python3.6/site-packages/keras/engine/topology.py", line 1348, in __init__
    name=self.name)
  File "/usr/lib/python3.6/site-packages/keras/backend/tensorflow_backend.py", line 492, in placeholder
    x = tf.placeholder(dtype, shape=shape, name=name)
  File "/usr/lib/python3.6/site-packages/tensorflow/python/ops/array_ops.py", line 1599, in placeholder
    return gen_array_ops._placeholder(dtype=dtype, shape=shape, name=name)
  File "/usr/lib/python3.6/site-packages/tensorflow/python/ops/gen_array_ops.py", line 3091, in _placeholder
    "Placeholder", dtype=dtype, shape=shape, name=name)
  File "/usr/lib/python3.6/site-packages/tensorflow/python/framework/op_def_library.py", line 787, in _apply_op_helper
    op_def=op_def)
  File "/usr/lib/python3.6/site-packages/tensorflow/python/framework/ops.py", line 2956, in create_op
    op_def=op_def)
  File "/usr/lib/python3.6/site-packages/tensorflow/python/framework/ops.py", line 1470, in __init__
    self._traceback = self._graph._extract_stack()  # pylint: disable=protected-access

InvalidArgumentError (see above for traceback): You must feed a value for placeholder tensor 'input_8' with dtype float and shape [?,50,1]
	 [[Node: input_8 = Placeholder[dtype=DT_FLOAT, shape=[?,50,1], _device="/job:localhost/replica:0/task:0/device:CPU:0"]()]]


In [None]:
from keras.callbacks import TensorBoard, ModelCheckpoint

In [None]:
final.fit([train_gru, train_type_oh], train_labels_oh,
                epochs=30,
                batch_size=8,
                shuffle=True,
                validation_data=([test_gru, test_type_oh], test_labels_oh))

Train on 7102 samples, validate on 800 samples
Epoch 1/30
Epoch 2/30