In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import json
import os
import re

In [2]:
import nltk
from nltk.tokenize import TweetTokenizer
from nltk import word_tokenize, pos_tag
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
import gensim.models as gsm

## Preprocessing

In [3]:
# Data
data_train_path = './../data/train/train.csv'
data_test_path = './../data/test/test.csv'

In [37]:
import sys
import regex as re

FLAGS = re.MULTILINE | re.DOTALL

def hashtag(text):
    text = text.group()
    hashtag_body = text[1:]
    if hashtag_body.isupper():
        result = "<hashtag> {} <allcaps>".format(hashtag_body)
    else:
        result = " ".join(["<hashtag>"] + re.split(r"(?=[A-Z])", hashtag_body, flags=FLAGS))
    return result

def allcaps(text):
    text = text.group()
    return text.lower() + " <allcaps>"


def tokenize(text):
    eyes = r"[8:=;]"
    nose = r"['`\-]?"

    def re_sub(pattern, repl):
        return re.sub(pattern, repl, text, flags=FLAGS)

    text = re_sub(r"https?:\/\/\S+\b|www\.(\w+\.)+\S*", "<url>")
    text = re_sub(r"/"," / ")
    text = re_sub(r"@\w+", "<user>")
    text = re_sub(r"{}{}[)dD]+|[)dD]+{}{}".format(eyes, nose, nose, eyes), "<smile>")
    text = re_sub(r"{}{}p+".format(eyes, nose), "<lolface>")
    text = re_sub(r"{}{}\(+|\)+{}{}".format(eyes, nose, nose, eyes), "<sadface>")
    text = re_sub(r"{}{}[\/|l*]".format(eyes, nose), "<neutralface>")
    text = re_sub(r"<3","<heart>")
    text = re_sub(r"[-+]?[.\d]*[\d]+[:,.\d]*", "<number>")
    text = re_sub(r"#\S+", hashtag)
    text = re_sub(r"([!?.]){2,}", r"\1 <repeat>")
    text = re_sub(r"\b(\S*?)(.)\2{2,}\b", r"\1\2 <elong>")

    text = re_sub(r"([A-Z]){2,}", allcaps)

    return text.lower()

In [38]:
def prepare_cvs_data(file_path):
    df = pd.read_csv(file_path, sep='\t', header=None, encoding='utf-8', quoting=3)
    df.columns = ['id','text','polarity','class'] # Set up column names
    df = df.iloc[np.random.permutation(len(df))] # Random permutations
    return df

In [39]:
train = np.array(prepare_cvs_data(data_train_path)['text'])

In [41]:
for i in range(train.shape[0]):
    train[i] = tokenize(train[i])

In [43]:
print(train[0])
print(train[1])

<user> hm <elong>. <repeat>you may have a point. <repeat> i thought twitter had got dull 😂. lamination <allcaps>
in geometry today hannah started crying of laughter because ms. canning said 'pp' lol


## Loading glove

In [46]:
GLOVE_DIR = "../tools/"

embeddings_index = {}
f = open(os.path.join(GLOVE_DIR, 'glove.twitter.27B.200d.txt'))
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
f.close()

print('Found %s word vectors.' % len(embeddings_index))

Found 1193514 word vectors.


In [51]:
word_index = {}
idx = 0
for i in range(train.shape[0]):
    txt = train[i].split()
    for j in range(len(txt)):
        if word_index.get(txt[j]) == None:
            word_index[txt[j]] = idx
            idx += 1

In [53]:
EMBEDDING_DIM = 200

embedding_matrix = np.zeros((len(word_index) + 1, EMBEDDING_DIM))
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

In [61]:
MAX_SEQUENCE_LENGTH = 1

from keras.layers import Embedding
from keras.models import Model

input_word = Input(shape=(1,))

embedding_layer = Embedding(len(word_index) + 1,
                            EMBEDDING_DIM,
                            weights=[embedding_matrix],
                            input_length=MAX_SEQUENCE_LENGTH,
                            trainable=False)(input_word)

embedding_model = Model(input_word, embedding_layer)

In [69]:
embedding_model.predict(np.array([word_index["lol"]]))

array([[[  3.04300010e-01,  -4.71270010e-02,   6.74459990e-03,
          -7.02779964e-02,  -3.83569986e-01,   1.77660003e-01,
          -1.05959997e-01,   1.69620007e-01,  -3.48769993e-01,
           1.12949997e-01,  -2.37619996e-01,   6.17799982e-02,
           1.41269997e-01,   7.85040036e-02,   8.85540023e-02,
           4.05110002e-01,   3.15290004e-01,   9.86569971e-02,
          -1.97559997e-01,  -2.32480004e-01,   2.58150011e-01,
          -1.14050001e-01,   3.36490005e-01,  -1.26430005e-01,
          -2.10720003e-01,   1.36820003e-01,  -7.43329972e-02,
          -2.09999993e-01,  -2.60419995e-01,  -7.05009997e-01,
           1.07000001e-01,   1.70790002e-01,  -2.54469991e-01,
          -1.25090003e-01,  -1.62410006e-01,   5.47500014e-01,
          -1.98040009e-02,  -3.68429989e-01,  -1.01889998e-01,
          -3.07049990e-01,   5.49939990e-01,   3.93170007e-02,
           4.31860000e-01,   1.01510003e-01,  -3.69769990e-01,
           2.38629997e-01,   7.47500002e-01,   3.983699

## Autoencoder

In [71]:
from keras.layers import Input, Dense
from keras.models import Model


encoding_dim = 50

input_word = Input(shape=(200,))

encoded = Dense(encoding_dim, activation='relu')(input_word)

decoded = Dense(200, activation='sigmoid')(encoded)

autoencoder = Model(input_word, decoded)
autoencoder.compile(optimizer='adadelta', loss='binary_crossentropy')

In [74]:
data = embedding_model.predict(np.array([word_index["lol"]]))[0]
# random, need training
autoencoder.predict(data)

array([[ 0.51459759,  0.49793917,  0.46187755,  0.491568  ,  0.59542143,
         0.55838484,  0.41244352,  0.39204735,  0.57383543,  0.42474365,
         0.44647253,  0.42325181,  0.47955927,  0.51048654,  0.55219638,
         0.46961102,  0.50830984,  0.44755465,  0.53069991,  0.51034915,
         0.46747422,  0.50449568,  0.46408534,  0.45891559,  0.51342642,
         0.48454446,  0.4531787 ,  0.55239803,  0.54824692,  0.544321  ,
         0.48954442,  0.55021352,  0.46595684,  0.54449546,  0.48956317,
         0.51587528,  0.39996573,  0.40330929,  0.48041177,  0.49481162,
         0.50451535,  0.54726148,  0.48243734,  0.56237841,  0.443712  ,
         0.42772862,  0.45498008,  0.53627372,  0.43269598,  0.50044888,
         0.41188896,  0.47039655,  0.42564642,  0.47672057,  0.47178909,
         0.49529257,  0.43980682,  0.51759273,  0.48556325,  0.41049957,
         0.58436543,  0.55790335,  0.54374248,  0.47870111,  0.51101267,
         0.45847374,  0.49126002,  0.52511841,  0.5