In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import json
import os
import re

In [2]:
import nltk
from nltk.tokenize import TweetTokenizer
from nltk import word_tokenize, pos_tag
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
import gensim.models as gsm

## Preprocessing

In [3]:
# Locations of the training and test CSV files, relative to the notebook's
# working directory.
data_train_path = './../data/train/train.csv'
data_test_path = './../data/test/test.csv'

In [4]:
import sys
import regex as re

# Flags shared by every substitution below: '.' also matches newlines (DOTALL)
# and '^'/'$' match at each line boundary (MULTILINE).
FLAGS = re.DOTALL | re.MULTILINE

def hashtag(text):
    """Expand a matched ``#hashtag`` into ``<hashtag>`` followed by its words.

    Parameters
    ----------
    text : match object
        A regex match whose ``group()`` is the hashtag including the leading
        ``#``.

    Returns
    -------
    str
        ``"<hashtag> body <allcaps>"`` when the body is entirely upper-case,
        otherwise ``"<hashtag>"`` followed by the body split in front of every
        capital letter (``#WorseThanBomb`` -> ``<hashtag> Worse Than Bomb``).
    """
    hashtag_body = text.group()[1:]
    if hashtag_body.isupper():
        # Lower-case the body here: tokenize() later tags every run of two or
        # more capitals with <allcaps>, which would otherwise add the tag a
        # second time for the same word.
        return "<hashtag> {} <allcaps>".format(hashtag_body.lower())

    # Insert a space in front of each capital and split on whitespace.
    # re.sub is used instead of re.split because splitting on an empty
    # (lookahead) match is not handled the same way by every regex engine /
    # Python version, and it produces an empty leading piece (hence a doubled
    # space) when the body starts with a capital letter.
    words = re.sub(r"(?=[A-Z])", " ", hashtag_body).split()
    return " ".join(["<hashtag>"] + words)

def allcaps(text):
    """Return the matched text lower-cased and followed by an ``<allcaps>`` tag.

    ``text`` is a regex match object; only its full match (``group()``) is used.
    """
    matched = text.group()
    return "{} <allcaps>".format(matched.lower())


def tokenize(text):
    """Normalise a raw tweet into lower-case text with placeholder tokens.

    URLs, user mentions, emoticons, hearts, numbers, hashtags, repeated
    punctuation, elongated words and runs of capital letters are rewritten to
    tags such as ``<url>``, ``<user>``, ``<smile>``, ``<number>``,
    ``<hashtag>``, ``<repeat>``, ``<elong>`` and ``<allcaps>``. The result is
    lower-cased before being returned.

    The substitutions are applied strictly in the order listed below; later
    rules see the output of earlier ones.
    """
    eyes = r"[8:=;]"
    nose = r"['`\-]?"

    # Ordered (pattern, replacement) pairs. A replacement is either a literal
    # template or a callable receiving the match object.
    rules = [
        (r"https?:\/\/\S+\b|www\.(\w+\.)+\S*", "<url>"),
        (r"/", " / "),
        (r"@\w+", "<user>"),
        (r"{}{}[)dD]+|[)dD]+{}{}".format(eyes, nose, nose, eyes), "<smile>"),
        (r"{}{}p+".format(eyes, nose), "<lolface>"),
        (r"{}{}\(+|\)+{}{}".format(eyes, nose, nose, eyes), "<sadface>"),
        (r"{}{}[\/|l*]".format(eyes, nose), "<neutralface>"),
        (r"<3", "<heart>"),
        (r"[-+]?[.\d]*[\d]+[:,.\d]*", "<number>"),
        (r"#\S+", hashtag),
        (r"([!?.]){2,}", r"\1 <repeat>"),
        (r"\b(\S*?)(.)\2{2,}\b", r"\1\2 <elong>"),
        (r"([A-Z]){2,}", allcaps),
    ]

    for pattern, replacement in rules:
        text = re.sub(pattern, replacement, text, flags=FLAGS)

    return text.lower()

In [5]:
def prepare_cvs_data(file_path, seed=None):
    """Load a tab-separated tweet file and return its rows in random order.

    Parameters
    ----------
    file_path : str
        Path of a header-less, tab-separated file with four columns.
    seed : int or None, optional
        Seed for the shuffle. With the default ``None`` the global NumPy
        random state is used (the previous behaviour); pass an integer to get
        the same row order on every run.

    Returns
    -------
    pandas.DataFrame
        The rows of the file, shuffled, with columns
        ``['id', 'text', 'polarity', 'class']``.
    """
    # quoting=3 is csv.QUOTE_NONE: quote characters are kept as ordinary text.
    df = pd.read_csv(file_path, sep='\t', header=None, encoding='utf-8', quoting=3)
    df.columns = ['id', 'text', 'polarity', 'class']  # Set up column names

    if seed is None:
        order = np.random.permutation(len(df))
    else:
        # A private generator keeps the shuffle reproducible without touching
        # the global NumPy random state.
        order = np.random.RandomState(seed).permutation(len(df))
    return df.iloc[order]

In [6]:
# Load the shuffled training frame and keep only the tweet texts as an array.
train_frame = prepare_cvs_data(data_train_path)
train = np.array(train_frame['text'])

In [7]:
# Replace every raw tweet with its tokenized form.
# NOTE: this overwrites `train` in place, so re-running the cell tokenizes
# already-tokenized text.
for i, raw_tweet in enumerate(train):
    train[i] = tokenize(raw_tweet)

In [8]:
# Show a few tokenized tweets as a sanity check.
for sample_idx in (6, 7, 8, 9):
    print(train[sample_idx])

my ukulele bag has fallen apart. 😐 wel <allcaps> <elong> at <allcaps> least <allcaps> my life hasn't yet! <repeat> 
that feel when you travel <number> miles to pick up a form that arrives in the post two days after you leave. <hashtag> fume
leave it on there, rule,nimber <number> of carpet cleaning! <repeat>  <hashtag> worsethananatomicbomb <hashtag> accidentlyspillbeeronthecarpet
and im not even going to get into how its discriminatory to several religions which mandate its followers to let their hair dread.


## Loading GloVe embeddings

In [9]:
GLOVE_DIR = "../tools/"

# Maps each token of the GloVe file to its embedding vector (float32 array).
embeddings_index = {}
# The file is opened with an explicit UTF-8 encoding (the platform default is
# not UTF-8 everywhere) and inside a context manager so it is closed even if
# parsing a line fails.
with open(os.path.join(GLOVE_DIR, 'glove.twitter.27B.200d.txt'), encoding='utf-8') as f:
    for line in f:
        values = line.split()
        if not values:
            continue  # skip blank lines instead of failing on values[0]
        # First field is the token, the remaining fields are its coefficients.
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

print('Found %s word vectors.' % len(embeddings_index))

Found 1193514 word vectors.


In [10]:
# Assign consecutive integer ids (starting at 0) to the whitespace-separated
# tokens of the training tweets, in order of first appearance.
word_index = {}
idx = 0
for tweet in train:
    for token in tweet.split():
        if token not in word_index:
            word_index[token] = idx
            idx += 1

In [11]:
EMBEDDING_DIM = 200

# One row per vocabulary id plus one extra row; rows start as zeros.
embedding_matrix = np.zeros((len(word_index) + 1, EMBEDDING_DIM))
for token, row in word_index.items():
    vector = embeddings_index.get(token)
    if vector is None:
        # Tokens without a pre-trained vector keep their all-zero row.
        continue
    embedding_matrix[row] = vector

In [12]:
MAX_SEQUENCE_LENGTH = 1

from keras.layers import Embedding, Input
from keras.models import Model

# Each sample is a single word id.
input_word = Input(shape=(MAX_SEQUENCE_LENGTH,))

# Frozen lookup table initialised from embedding_matrix.
glove_lookup = Embedding(
    input_dim=len(word_index) + 1,
    output_dim=EMBEDDING_DIM,
    weights=[embedding_matrix],
    input_length=MAX_SEQUENCE_LENGTH,
    trainable=False,
)
embedding_layer = glove_lookup(input_word)

# Model that maps a word id to its embedding vector.
embedding_model = Model(input_word, embedding_layer)

Using TensorFlow backend.
  return f(*args, **kwds)


In [13]:
# Look up the embedding of the token "lol" through the Keras model.
lol_id = np.array([word_index["lol"]])
embedding_model.predict(lol_id)

array([[[  3.04300010e-01,  -4.71270010e-02,   6.74459990e-03,
          -7.02779964e-02,  -3.83569986e-01,   1.77660003e-01,
          -1.05959997e-01,   1.69620007e-01,  -3.48769993e-01,
           1.12949997e-01,  -2.37619996e-01,   6.17799982e-02,
           1.41269997e-01,   7.85040036e-02,   8.85540023e-02,
           4.05110002e-01,   3.15290004e-01,   9.86569971e-02,
          -1.97559997e-01,  -2.32480004e-01,   2.58150011e-01,
          -1.14050001e-01,   3.36490005e-01,  -1.26430005e-01,
          -2.10720003e-01,   1.36820003e-01,  -7.43329972e-02,
          -2.09999993e-01,  -2.60419995e-01,  -7.05009997e-01,
           1.07000001e-01,   1.70790002e-01,  -2.54469991e-01,
          -1.25090003e-01,  -1.62410006e-01,   5.47500014e-01,
          -1.98040009e-02,  -3.68429989e-01,  -1.01889998e-01,
          -3.07049990e-01,   5.49939990e-01,   3.93170007e-02,
           4.31860000e-01,   1.01510003e-01,  -3.69769990e-01,
           2.38629997e-01,   7.47500002e-01,   3.983699

## Autoencoder

In [58]:
from keras.layers import Dense, Input
from keras.models import Model

# Width of the compressed representation.
encoding_dim = 50

# One 200-dimensional vector per sample.
input_word = Input(shape=(1, 200))

encoder_layer = Dense(encoding_dim, activation='relu')
# NOTE(review): a sigmoid output is confined to (0, 1), while the embedding
# values printed above include negative numbers -- a linear output may
# reconstruct them better. Any change here must be mirrored in the combined
# model, which re-creates this layer with the same activation.
decoder_layer = Dense(200, activation='sigmoid')

encoded = encoder_layer(input_word)
decoded = decoder_layer(encoded)

# 200 -> 50 -> 200 autoencoder trained with mean squared error.
autoencoder = Model(input_word, decoded)
autoencoder.compile(optimizer='adadelta', loss='mse')

In [59]:
# Train the autoencoder to reproduce every row of embedding_matrix.
# -1 lets NumPy infer the number of rows, so the cell keeps working when the
# vocabulary size changes (it used to be hard-coded to one run's row count).
autoencoder_data = embedding_matrix.reshape((-1, 1, embedding_matrix.shape[1]))
hist = autoencoder.fit(autoencoder_data, autoencoder_data, epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [60]:
# Difference between the autoencoder's reconstruction of row 0 and row 0 itself.
first_row_batch = embedding_matrix[0:1].reshape((1, 1, 200))
autoencoder.predict(first_row_batch) - embedding_matrix[0]

array([[[  2.04475434e-03,  -1.33484024e-01,   3.18760016e-01,
           2.14834749e-01,   6.54322816e-01,  -2.40243984e-01,
          -2.30437815e-02,  -2.71960303e-01,   1.28590502e-01,
          -1.06900266e-01,   1.90989563e-01,  -6.98941241e-01,
           2.24763477e-01,  -1.14838890e-01,  -7.25639147e-01,
           2.34655760e-01,  -2.69377628e-01,   4.80826816e-03,
           1.62817781e-01,   8.74007936e-04,   1.59049063e-01,
           3.48254954e-01,  -1.55941404e-01,   2.90236896e-01,
           4.05554926e-02,   3.23010445e-01,  -7.00435654e-01,
          -3.02569648e-01,  -2.13158514e-01,   2.42166119e-01,
           3.51775872e-01,   1.23402816e-01,   2.90700611e-01,
          -3.36228250e-01,   1.86682913e-01,  -4.24086056e-02,
          -1.48702512e-01,  -1.19113829e-03,  -1.50155470e-01,
          -1.14264065e-01,  -2.32804179e-01,   9.64938852e-02,
           1.89713167e-01,   1.96531648e-01,   4.45589712e-01,
          -4.09117211e-02,  -4.47520201e-01,   3.122017

## Combine models

In [61]:
encoding_dim = 50

# Each sample is a single word id.
input_word = Input(shape=(MAX_SEQUENCE_LENGTH,))

# Frozen lookup table initialised from embedding_matrix.
embedding_layer = Embedding(
    len(word_index) + 1,
    EMBEDDING_DIM,
    weights=[embedding_matrix],
    input_length=MAX_SEQUENCE_LENGTH,
    trainable=False,
)(input_word)

# Reuse the trained autoencoder weights (layers[0] is its input layer).
encoder_weights = autoencoder.layers[1].get_weights()
decoder_weights = autoencoder.layers[2].get_weights()

encoded = Dense(encoding_dim, activation='relu', weights=encoder_weights)(embedding_layer)
decoded = Dense(EMBEDDING_DIM, activation='sigmoid', weights=decoder_weights)(encoded)

# Word id -> embedding -> encoder -> decoder, in a single model.
final = Model(input_word, decoded)
final.compile(optimizer='adadelta', loss='mse')

In [62]:
# Print the layer-by-layer structure and parameter counts of the combined model.
final.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_12 (InputLayer)        (None, 1)                 0         
_________________________________________________________________
embedding_5 (Embedding)      (None, 1, 200)            3574600   
_________________________________________________________________
dense_19 (Dense)             (None, 1, 50)             10050     
_________________________________________________________________
dense_20 (Dense)             (None, 1, 200)            10200     
Total params: 3,594,850
Trainable params: 20,250
Non-trainable params: 3,574,600
_________________________________________________________________


In [63]:
# Run the token "lol" through the combined embedding + autoencoder model.
lol_query = np.array([word_index["lol"]])
final.predict(lol_query)

array([[[  5.37616506e-05,   4.28096595e-04,   9.93819995e-05,
           1.13859377e-03,   1.18994867e-04,   7.91068946e-04,
           5.26164353e-01,   9.14681732e-05,   9.78201933e-05,
           5.29125027e-05,   8.67918061e-05,   1.48461448e-04,
           3.79188932e-05,   1.27231557e-04,   1.36072791e-04,
           2.65420513e-05,   2.71935947e-04,   1.25900973e-04,
           7.12620676e-05,   1.53685731e-04,   4.28253959e-04,
           1.88287799e-04,   3.02218425e-04,   1.61238786e-04,
           2.58640775e-05,   3.14008564e-01,   6.65003201e-04,
           2.14176904e-03,   5.45555522e-05,   1.75308756e-04,
           6.11897267e-04,   3.53807409e-04,   3.00749089e-04,
           1.17698903e-04,   3.72291070e-05,   7.86493329e-05,
           2.15003936e-04,   2.45921983e-04,   6.93034334e-03,
           9.44120984e-05,   1.00382961e-01,   2.55214138e-04,
           2.08360632e-03,   6.25386238e-05,   1.30469372e-04,
           2.86007795e-04,   8.39940330e-05,   1.636242