In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import json
import os
import re

## Preprocessing

Data is loaded from the data set and text is pre-processed according to the strategy described in the report.

In [None]:
# Relative paths of the train and test data files read by the cells below.
data_test_path = './../data/test/test.csv'
data_train_path = './../data/train/train.csv'

In [None]:
import sys
import regex as re

# Flags passed to every regex call in the tokenizer: DOTALL lets "." match
# newlines, MULTILINE makes "^"/"$" match at line boundaries.
FLAGS = re.DOTALL | re.MULTILINE

def hashtag(text):
    """Rewrite a matched hashtag as ``<hashtag>``-prefixed tokens.

    ``text`` is a regex match object. The first character of the match is
    dropped and the remainder is the hashtag body. A body for which
    ``str.isupper()`` holds is kept whole and followed by ``<allcaps>``;
    any other body is split with the lookahead pattern ``(?=[A-Z])`` and the
    pieces are joined with single spaces after the ``<hashtag>`` marker.
    """
    body = text.group()[1:]
    if not body.isupper():
        pieces = re.split(r"(?=[A-Z])", body, flags=FLAGS)
        return " ".join(["<hashtag>"] + pieces)
    return "<hashtag> {} <allcaps>".format(body)

def allcaps(text):
    """Return the matched text lowercased, followed by `` <allcaps>``.

    ``text`` is a regex match object; only its full match is used.
    """
    matched = text.group()
    return "{} <allcaps>".format(matched.lower())


def tokenize(text):
    """Normalise a raw tweet into lowercase text with placeholder tokens.

    A fixed sequence of regex substitutions replaces URLs, user mentions,
    emoticons, hearts, numbers, hashtags, repeated punctuation, elongated
    words and runs of capital letters with markers such as ``<url>``,
    ``<user>``, ``<smile>``, ``<number>``, ``<repeat>``, ``<elong>`` and
    ``<allcaps>``. The result of the last substitution is lowercased and
    returned.
    """
    eyes = r"[8:=;]"
    nose = r"['`\-]?"

    # (pattern, replacement) pairs. Order matters: each substitution runs on
    # the output of the previous one.
    rules = [
        (r"https?:\/\/\S+\b|www\.(\w+\.)+\S*", "<url>"),
        (r"/", " / "),
        (r"@\w+", "<user>"),
        (r"{}{}[)dD]+|[)dD]+{}{}".format(eyes, nose, nose, eyes), "<smile>"),
        (r"{}{}p+".format(eyes, nose), "<lolface>"),
        (r"{}{}\(+|\)+{}{}".format(eyes, nose, nose, eyes), "<sadface>"),
        (r"{}{}[\/|l*]".format(eyes, nose), "<neutralface>"),
        (r"<3", "<heart>"),
        (r"[-+]?[.\d]*[\d]+[:,.\d]*", "<number>"),
        (r"#\S+", hashtag),
        (r"([!?.]){2,}", r"\1 <repeat>"),
        (r"\b(\S*?)(.)\2{2,}\b", r"\1\2 <elong>"),
        (r"([A-Z]){2,}", allcaps),
    ]
    for pattern, replacement in rules:
        text = re.sub(pattern, replacement, text, flags=FLAGS)

    return text.lower()

In [None]:
def prepare_cvs_data(file_path):
    """Load a headerless, tab-separated data file as a DataFrame.

    The file is read as UTF-8 with quote characters treated as ordinary text
    (``quoting=3``). The four columns are then named ``id``, ``text``,
    ``polarity`` and ``class``.
    """
    column_names = ['id', 'text', 'polarity', 'class']
    frame = pd.read_csv(file_path, quoting=3, encoding='utf-8', header=None, sep='\t')
    frame.columns = column_names
    return frame

In [None]:
# Parse each file exactly once and derive every array from that one DataFrame.
train_df = prepare_cvs_data(data_train_path)
train = np.array(train_df['text'])
train_type = np.array(train_df['polarity'])
train_labels_tmp = train_df['class']
# The label is the integer value of the first character of the class string.
train_labels = np.array([int(x[0]) for x in train_labels_tmp])

test_df = prepare_cvs_data(data_test_path)
test = np.array(test_df['text'])
test_type = np.array(test_df['polarity'])
test_labels_tmp = test_df['class']
test_labels = np.array([int(x[0]) for x in test_labels_tmp])

In [None]:
# Overwrite every raw tweet with its tokenized form, in place.
for pos, raw_tweet in enumerate(train):
    train[pos] = tokenize(raw_tweet)
for pos, raw_tweet in enumerate(test):
    test[pos] = tokenize(raw_tweet)

In [None]:
# Show the first four tokenized training tweets with their labels
# (the first one also with its emotion type).
print(train[0], train_labels[0], train_type[0])
for k in range(1, 4):
    print(train[k], train_labels[k])

## Loading glove

GloVe embeddings are loaded and used to build an embedding matrix that holds the vector of every word required by the data set. The dict `word_index` maps each word to its row index in the embedding matrix.

In [None]:
GLOVE_DIR = "glove/"

# Maps each token in the GloVe file to its float32 vector.
embeddings_index = {}
# `with` closes the file even if a line fails to parse; the explicit encoding
# keeps non-ASCII tokens decodable regardless of the platform default.
with open(os.path.join(GLOVE_DIR, 'glove.twitter.27B.200d.txt'), encoding='utf-8') as f:
    for line in f:
        # Each line is: token followed by its vector components.
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

print('Found %s word vectors.' % len(embeddings_index))

In [None]:
# Give every distinct token an integer id in order of first appearance
# (training tweets first, then test tweets).
# Ids start at 1 so that 0 stays reserved for padding: encoded sequences are
# zero-filled, and the embedding matrix has len(word_index) + 1 rows, which
# leaves row 0 as the all-zero vector instead of aliasing a real word.
word_index = {}
idx = 1
for corpus in (train, test):
    for tweet in corpus:
        for token in tweet.split():
            if token not in word_index:
                word_index[token] = idx
                idx += 1

In [None]:
EMBEDDING_DIM = 200

# Row i holds the pretrained vector of the word whose index is i; words with
# no pretrained vector keep their all-zero row.
embedding_matrix = np.zeros((len(word_index) + 1, EMBEDDING_DIM))
for token, row in word_index.items():
    vector = embeddings_index.get(token)
    if vector is None:
        continue
    embedding_matrix[row] = vector

In [None]:
MAX_SEQUENCE_LENGTH = 1

from keras.layers import Embedding
from keras.layers import Input
from keras.models import Model

# Single-word lookup model: one word index in, its frozen (non-trainable)
# vector from embedding_matrix out.
input_word = Input(shape=(MAX_SEQUENCE_LENGTH,))

embedding_layer = Embedding(
    input_dim=len(word_index) + 1,
    output_dim=EMBEDDING_DIM,
    weights=[embedding_matrix],
    input_length=MAX_SEQUENCE_LENGTH,
    trainable=False,
)(input_word)

embedding_model = Model(input_word, embedding_layer)

## Autoencoder

A simple autoencoder is used to reduce the dimensionality of the input.

In [None]:
from keras.layers import Input, Dense
from keras.models import Model


# Size of the compressed representation produced by the encoder.
encoding_dim = 50

# One embedding vector per sample, carried on a singleton middle axis.
input_word = Input(shape=(1, EMBEDDING_DIM,))

encoded = Dense(encoding_dim, activation='relu')(input_word)

# Linear output: embedding components can be negative or larger than 1, and a
# sigmoid (range 0..1) can never reproduce such targets under an MSE loss.
decoded = Dense(EMBEDDING_DIM, activation='linear')(encoded)

autoencoder = Model(input_word, decoded)
autoencoder.compile(optimizer='adadelta', loss='mse')

In [None]:
# Train the autoencoder to reconstruct its own input. The number of samples is
# inferred (-1) rather than hard-coded, so the cell keeps working when the
# vocabulary (and therefore the embedding matrix) changes size.
autoencoder_input = embedding_matrix.reshape((-1, 1, embedding_matrix.shape[1]))
hist = autoencoder.fit(autoencoder_input, autoencoder_input, epochs=100)

## LSTM

Each tweet is converted to a sequence of word indices with a fixed length of 50; positions without a word are filled with zeros.
Checkpoints are saved and the one with the best validation accuracy is kept.

In [None]:
# Fixed sequence length used when encoding tweets. It is a chosen constant,
# not measured from the data; the data-driven alternative would be:
#seq_dim = max([len(l.rsplit()) for l in train])
#seq_dim = max(seq_dim, max([len(l.rsplit()) for l in test]))
seq_dim = 50
print('Sequence length is fixed to', seq_dim)

def get_encoded_list(tweet, max_len=None, vocab=None):
    """Encode a tokenized tweet as a fixed-length column of word indices.

    Args:
        tweet: string of whitespace-separated tokens.
        max_len: number of rows in the result; defaults to the module-level
            ``seq_dim``.
        vocab: mapping from token to integer index; defaults to the
            module-level ``word_index``.

    Returns:
        A float array of shape ``(max_len, 1)``. Row ``i`` holds the index of
        the i-th token; rows past the end of the tweet stay 0. Tokens beyond
        ``max_len`` are dropped instead of overflowing the array.

    Raises:
        KeyError: if a token is missing from ``vocab``.
    """
    if max_len is None:
        max_len = seq_dim
    if vocab is None:
        vocab = word_index
    ans = np.zeros((max_len, 1))
    # Truncate so that an over-long tweet cannot index past the last row.
    tokens = tweet.split()[:max_len]
    for i, w in enumerate(tokens):
        ans[i, 0] = vocab[w]
    return ans

# Encode every tweet into a (seq_dim, 1) column of word indices and stack the
# columns into one array per data split.
train_gru = np.empty((len(train), seq_dim, 1))
for slot, tweet in zip(train_gru, train):
    slot[:, :] = get_encoded_list(tweet)

test_gru = np.empty((len(test), seq_dim, 1))
for slot, tweet in zip(test_gru, test):
    slot[:, :] = get_encoded_list(tweet)

In [None]:
from keras.layers import Embedding, Dropout, GRU, LSTM, concatenate, RepeatVector, Reshape

encoding_dim = 50
input_size = 50

# Two inputs: a column of word indices of shape (input_size, 1) and a
# 4-component vector describing the emotion type.
input_seq = Input(shape=(input_size, 1))
input_type = Input(shape=(4,))

# Embedding expects a 2-D (batch, sequence) tensor of indices, so the trailing
# singleton axis of the input is dropped before the lookup.
seq_indices = Reshape((input_size,))(input_seq)

embedding_layer = Embedding(len(word_index) + 1,
                            EMBEDDING_DIM,
                            weights=[embedding_matrix],
                            input_length=input_size,
                            trainable=False)(seq_indices)

# Frozen copy of the autoencoder's encoder layer, applied to every word vector.
encoded = Dense(encoding_dim, activation='relu', weights=autoencoder.layers[1].get_weights(), trainable=False)(embedding_layer)

# Repeat the emotion-type vector at every time step and append it to each
# encoded word.
repeat = RepeatVector(input_size)(input_type)
concat = concatenate([encoded, repeat])

# No input_shape argument: in the functional API the layer takes its input
# shape from `concat`.
rec = LSTM(128, dropout=0.1, recurrent_dropout=0., go_backwards=True)(concat)

out = Dense(4, activation="softmax")(rec)

final = Model([input_seq, input_type], out)
final.compile(optimizer='adadelta', loss='categorical_crossentropy', metrics=["accuracy"])

In [None]:
# Print the layer-by-layer summary of the assembled model.
final.summary()

In [None]:
emot_dic = {"anger":0, "joy":1, "sadness":2, "fear":3}

# One-hot encode the emotion type of every training tweet (4 columns).
train_type_int = np.array([emot_dic[name] for name in train_type])
train_type_oh = np.zeros((train_type_int.shape[0], 4))
for row, col in enumerate(train_type_int):
    train_type_oh[row, col] = 1

# Same encoding for the test tweets.
test_type_int = np.array([emot_dic[name] for name in test_type])
test_type_oh = np.zeros((test_type_int.shape[0], 4))
for row, col in enumerate(test_type_int):
    test_type_oh[row, col] = 1

In [None]:
# One-hot encode the integer class labels into 4 columns.
train_labels_oh = np.zeros((train_labels.shape[0], 4))
for row, label in enumerate(train_labels):
    train_labels_oh[row, label] = 1

test_labels_oh = np.zeros((test_labels.shape[0], 4))
for row, label in enumerate(test_labels):
    test_labels_oh[row, label] = 1

In [None]:
from keras.callbacks import TensorBoard, ModelCheckpoint

In [None]:
# Callbacks: TensorBoard logging, plus a checkpoint that is overwritten only
# when the monitored 'val_acc' improves.
tensorboard_cb = TensorBoard(log_dir='/tmp/encoder', histogram_freq=1, write_graph=False)
checkpoint_cb = ModelCheckpoint('/tmp/checkpoint.h5', monitor='val_acc', verbose=0,
                                save_best_only=True, save_weights_only=False,
                                mode='auto', period=1)

# NOTE(review): the test split is passed as validation data, so checkpoint
# selection is driven by the test set.
final.fit(
    [train_gru, train_type_oh], train_labels_oh,
    epochs=50,
    batch_size=8,
    shuffle=True,
    validation_data=([test_gru, test_type_oh], test_labels_oh),
    callbacks=[tensorboard_cb, checkpoint_cb],
)