In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import json
import os
import re

In [2]:
import nltk
from nltk.tokenize import TweetTokenizer
from nltk import word_tokenize, pos_tag
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
import gensim.models as gsm

## Preprocessing

In [3]:
# Data
data_train_path = './../data/train/train.csv'
data_test_path = './../data/test/test.csv'

In [5]:
import sys
import regex as re

FLAGS = re.MULTILINE | re.DOTALL

def hashtag(text):
    text = text.group()
    hashtag_body = text[1:]
    if hashtag_body.isupper():
        result = "<hashtag> {} <allcaps>".format(hashtag_body)
    else:
        result = " ".join(["<hashtag>"] + re.split(r"(?=[A-Z])", hashtag_body, flags=FLAGS))
    return result

def allcaps(text):
    text = text.group()
    return text.lower() + " <allcaps>"


def tokenize(text):
    eyes = r"[8:=;]"
    nose = r"['`\-]?"

    def re_sub(pattern, repl):
        return re.sub(pattern, repl, text, flags=FLAGS)

    text = re_sub(r"https?:\/\/\S+\b|www\.(\w+\.)+\S*", "<url>")
    text = re_sub(r"/"," / ")
    text = re_sub(r"@\w+", "<user>")
    text = re_sub(r"{}{}[)dD]+|[)dD]+{}{}".format(eyes, nose, nose, eyes), "<smile>")
    text = re_sub(r"{}{}p+".format(eyes, nose), "<lolface>")
    text = re_sub(r"{}{}\(+|\)+{}{}".format(eyes, nose, nose, eyes), "<sadface>")
    text = re_sub(r"{}{}[\/|l*]".format(eyes, nose), "<neutralface>")
    text = re_sub(r"<3","<heart>")
    text = re_sub(r"[-+]?[.\d]*[\d]+[:,.\d]*", "<number>")
    text = re_sub(r"#\S+", hashtag)
    text = re_sub(r"([!?.]){2,}", r"\1 <repeat>")
    text = re_sub(r"\b(\S*?)(.)\2{2,}\b", r"\1\2 <elong>")

    text = re_sub(r"([A-Z]){2,}", allcaps)

    return text.lower()

In [6]:
def prepare_cvs_data(file_path):
    df = pd.read_csv(file_path, sep='\t', header=None, encoding='utf-8', quoting=3)
    df.columns = ['id','text','polarity','class'] # Set up column names
    df = df.iloc[np.random.permutation(len(df))] # Random permutations
    return df

In [7]:
train = np.array(prepare_cvs_data(data_train_path)['text'])

In [8]:
for i in range(train.shape[0]):
    train[i] = tokenize(train[i])

In [11]:
print(train[6])
print(train[7])
print(train[8])
print(train[9])

<user> hi monica, i write regularly for <user>  - but not on bees - never dared try them <hashtag> buzz  <hashtag> honey <allcaps> <allcaps>
<user> how do u guys determine teams? cause i'm <number>% on shitty teams when i play and i'm fuckin over it <hashtag> cod 
jk i'm not a loner just really shy 😅
it was an <hashtag> amazing <hashtag> start to the first <hashtag> fall  day. will it be an <hashtag> indiansummer


## Loading glove

In [12]:
GLOVE_DIR = "../tools/"

embeddings_index = {}
f = open(os.path.join(GLOVE_DIR, 'glove.twitter.27B.200d.txt'))
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
f.close()

print('Found %s word vectors.' % len(embeddings_index))

Found 1193514 word vectors.


In [13]:
word_index = {}
idx = 0
for i in range(train.shape[0]):
    txt = train[i].split()
    for j in range(len(txt)):
        if word_index.get(txt[j]) == None:
            word_index[txt[j]] = idx
            idx += 1

In [14]:
EMBEDDING_DIM = 200

embedding_matrix = np.zeros((len(word_index) + 1, EMBEDDING_DIM))
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

In [16]:
MAX_SEQUENCE_LENGTH = 1

from keras.layers import Embedding
from keras.layers import Input
from keras.models import Model

input_word = Input(shape=(1,))

embedding_layer = Embedding(len(word_index) + 1,
                            EMBEDDING_DIM,
                            weights=[embedding_matrix],
                            input_length=MAX_SEQUENCE_LENGTH,
                            trainable=False)(input_word)

embedding_model = Model(input_word, embedding_layer)

In [17]:
embedding_model.predict(np.array([word_index["lol"]]))

array([[[  3.04300010e-01,  -4.71270010e-02,   6.74459990e-03,
          -7.02779964e-02,  -3.83569986e-01,   1.77660003e-01,
          -1.05959997e-01,   1.69620007e-01,  -3.48769993e-01,
           1.12949997e-01,  -2.37619996e-01,   6.17799982e-02,
           1.41269997e-01,   7.85040036e-02,   8.85540023e-02,
           4.05110002e-01,   3.15290004e-01,   9.86569971e-02,
          -1.97559997e-01,  -2.32480004e-01,   2.58150011e-01,
          -1.14050001e-01,   3.36490005e-01,  -1.26430005e-01,
          -2.10720003e-01,   1.36820003e-01,  -7.43329972e-02,
          -2.09999993e-01,  -2.60419995e-01,  -7.05009997e-01,
           1.07000001e-01,   1.70790002e-01,  -2.54469991e-01,
          -1.25090003e-01,  -1.62410006e-01,   5.47500014e-01,
          -1.98040009e-02,  -3.68429989e-01,  -1.01889998e-01,
          -3.07049990e-01,   5.49939990e-01,   3.93170007e-02,
           4.31860000e-01,   1.01510003e-01,  -3.69769990e-01,
           2.38629997e-01,   7.47500002e-01,   3.983699

## Autoencoder

In [24]:
from keras.layers import Input, Dense
from keras.models import Model


encoding_dim = 50

input_word = Input(shape=(200,))

encoded = Dense(encoding_dim, activation='relu')(input_word)

decoded = Dense(200, activation='sigmoid')(encoded)

autoencoder = Model(input_word, decoded)
autoencoder.compile(optimizer='adadelta', loss='mse')

In [48]:
hist = autoencoder.fit(embedding_matrix, embedding_matrix, epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [30]:
embedding_matrix[0]

array([  3.15530002e-01,   5.37649989e-01,   1.01769999e-01,
         3.25529985e-02,   3.79800005e-03,   1.53639996e-02,
        -2.03439996e-01,   3.32940012e-01,  -2.08859995e-01,
         1.00610003e-01,   3.09760004e-01,   5.00150025e-01,
         3.20179999e-01,   1.35370001e-01,   8.70389957e-03,
         1.91100001e-01,   2.46680006e-01,  -6.07520007e-02,
        -4.36230004e-01,   1.93019994e-02,   5.99720001e-01,
         1.34440005e-01,   1.28009999e-02,  -5.40520012e-01,
         2.73869991e-01,  -1.18200004e+00,  -2.76769996e-01,
         1.12790003e-01,   4.65959996e-01,  -9.06850025e-02,
         2.42530003e-01,   1.56540006e-01,  -2.36179993e-01,
         5.76940000e-01,   1.75630003e-01,  -1.96899995e-02,
         1.82949994e-02,   3.75690013e-01,  -4.19840008e-01,
         2.26129994e-01,  -2.04380006e-01,  -7.62490034e-02,
         4.03560013e-01,   6.15819991e-01,  -1.00639999e-01,
         2.33180001e-01,   2.28080004e-01,   3.45759988e-01,
        -1.46270007e-01,

In [50]:
autoencoder.predict([embedding_matrix[0:1]]) - embedding_matrix[0]

array([[ -3.15401364e-01,  -2.99176231e-01,  -8.86084940e-02,
          2.27072090e-02,   1.46797695e-02,   2.38770572e-02,
          2.45026518e-01,  -3.32931389e-01,   2.09002468e-01,
         -1.00558333e-01,  -3.09739960e-01,  -4.99935738e-01,
         -3.20137475e-01,  -1.35193380e-01,   1.79614136e-02,
         -1.91076897e-01,  -2.14887466e-01,   6.08387757e-02,
          4.40361919e-01,   9.33432765e-02,  -3.43206972e-01,
         -1.28536777e-01,  -1.27118815e-02,   5.40953303e-01,
         -2.73828950e-01,   1.18204380e+00,   3.25802639e-01,
         -9.95560903e-02,  -4.65848305e-01,   9.33343249e-02,
         -1.91309206e-01,  -8.72857645e-02,   2.40347177e-01,
         -5.76926002e-01,  -1.75534747e-01,   1.97469425e-02,
         -3.97179089e-03,  -3.57034484e-01,   4.20622549e-01,
         -2.25865327e-01,   2.12812868e-01,   1.22906193e-01,
         -2.92571880e-01,  -6.15744560e-01,   1.15326579e-01,
         -2.21736034e-01,  -2.27980899e-01,  -3.45618730e-01,
        

In [34]:
np.allclose([embedding_matrix[0:1]], embedding_matrix[0])

True