In [5]:
%matplotlib inline
from matplotlib import pyplot as plt
from tqdm import tqdm
import numpy as np
import os
import pickle
import json
import cv2
import re

In [6]:
# Import the Keras backend module (aliased K) and display the GPU devices it reports.
# NOTE(review): `_get_available_gpus` is underscore-prefixed, i.e. not a public
# backend function — it may not exist in every Keras version; confirm before upgrading.
from keras import backend as K
K.tensorflow_backend._get_available_gpus()

Using TensorFlow backend.


['/job:localhost/replica:0/task:0/device:GPU:0']

In [33]:
# create giant dictionary for all data
# Dataset root. The trailing slash is kept on purpose: paths in this notebook
# are built by plain string concatenation with this prefix.
data_dir = 'mmhs150k/'

# Load the ground-truth annotations. A context manager is used so the file
# handle is closed as soon as the JSON has been parsed (the bare
# json.load(open(...)) form left it open until garbage collection).
with open(data_dir + 'MMHS150K_GT.json', 'r') as gt_file:
    tweet_dict = json.load(gt_file)

# Print sizes: annotated tweets, files in the image folder, files in the
# image-text folder.
print('Length of Tweet Dictionary:', len(tweet_dict))
print('Number of Images:', len(os.listdir(data_dir + 'img_resized')))
print('Number of Image Texts:', len(os.listdir(data_dir + 'img_txt')))

Length of Tweet Dictionary: 149823
Number of Images: 150000
Number of Image Texts: 59252


In [36]:
# method for cleaning text like in https://nlp.stanford.edu/projects/glove/preprocess-twitter.rb
def hashtag(text):
    """re.sub callback that rewrites one ``#...`` match as ``<hashtag>`` tokens.

    Args:
        text: match object whose matched string starts with '#'.

    Returns:
        For an all-upper-case body: ``'<hashtag> ' + body.lower() + ' '``.
        Otherwise ``'<hashtag> '`` followed by the body with a space inserted
        before every ASCII capital letter (so ``#HelloWorld`` becomes
        ``'<hashtag>  Hello World'`` - note the doubled space caused by the
        leading capital).
    """
    hashtag_body = text.group()[1:]  # drop the leading '#'
    if hashtag_body.isupper():
        return "<hashtag> {} ".format(hashtag_body.lower())
    # Split CamelCase by putting a space in front of each capital letter.
    spaced_body = re.sub(r"([A-Z])", r" \1", hashtag_body)
    return "<hashtag> " + spaced_body


def allcaps(text):
    """re.sub callback: lower-case the matched text and append ' <allcaps> '."""
    return text.group().lower() + ' <allcaps> '


def clean_tweet_text(t):
    """Normalise a raw tweet into lower-cased text with special tokens.

    Substitutions are applied in this order: URLs -> ``<url>``, @mentions ->
    ``<user>``, emoticons -> ``<smile>`` / ``<lolface>`` / ``<sadface>`` /
    ``<neutralface>``, '/' padded with spaces, ``<3`` -> ``<heart>``, numbers ->
    ``<number>``, hashtags (see ``hashtag``), repeated ``!?.`` -> ``<repeat>``,
    elongated words -> ``<elong>``, runs of two or more capitals (see
    ``allcaps``), and finally a set of punctuation characters is replaced by
    spaces before the whole string is lower-cased.

    Args:
        t: the tweet text (str).

    Returns:
        The cleaned, lower-cased string. Whitespace is not collapsed, so
        callers are expected to tokenise with ``split()``.
    """
    eyes = r'[8:=;]'
    nose = r"['`\-]?"

    t = re.sub(r'https?:\/\/\S+\b|www\.(\w+\.)+\S*', '<url>', t)
    t = re.sub(r'@\w+', '<user>', t)
    t = re.sub(r'{}{}[)dD]+|[)dD]+{}{}'.format(eyes, nose, nose, eyes), '<smile>', t)
    # Fix: `.format(...)` used to sit inside the string literal, so the pattern
    # was never built from eyes/nose and ':p'-style faces were never replaced.
    t = re.sub(r'{}{}p+'.format(eyes, nose), '<lolface>', t)
    t = re.sub(r'{}{}\(+|\)+{}{}'.format(eyes, nose, nose, eyes), '<sadface>', t)
    t = re.sub(r'{}{}[\/|l*]'.format(eyes, nose), '<neutralface>', t)
    t = re.sub(r'/', ' / ', t)
    t = re.sub(r'<3', '<heart>', t)
    t = re.sub(r'[-+]?[.\d]*[\d]+[:,.\d]*', '<number>', t)
    t = re.sub(r'#\S+', hashtag, t)
    t = re.sub(r'([!?.]){2,}', r'\1 <repeat>', t)
    t = re.sub(r'\b(\S*?)(.)\2{2,}\b', r'\1\2 <elong>', t)
    t = re.sub(r'([A-Z]){2,}', allcaps, t)
    # The class was previously written `[\".,-;&:]`; inside a character class
    # `,-;` is a *range* covering , - . / 0-9 : ; so those characters were all
    # being blanked. The same set is spelled out here (hyphen escaped) so the
    # behaviour is unchanged but explicit.
    t = re.sub(r'[",\-./0-9:;&]', ' ', t)
    return t.lower()
    
# Sanity-check the cleaner by printing its output for two sample strings:
# first a tweet containing mentions, all-caps words, repeated punctuation and a
# URL; then a string containing emoticons and hashtags.
print(clean_tweet_text('@SLAAATTTTT @AINTSHlTLAUGHS NIGGA...  DID YOU NOT HEAR THE CHRIS BROWN SONG?!?!?! https://t.co/1hwQMRczOw'))
print(clean_tweet_text(':) :-) 8) #HelloWorld #helloworld #Hello'))

<user> <user> nigga <allcaps>   <repeat>  did <allcaps>  you <allcaps>  not <allcaps>  hear <allcaps>  the <allcaps>  chris <allcaps>  brown <allcaps>  song <allcaps> ! <repeat> <url>
<smile> <smile> <smile> <hashtag>  hello world <hashtag> helloworld <hashtag>  hello


In [55]:
# placeholder for the training examples; it is reassigned below to the list of
# (cleaned tweet text, label) tuples returned by get_data_list
train_data = []
word_index = dict() # dictionary mapping word to index (filled in by get_data_list as new words are seen)

def get_data_list(path):
    """Build the list of (cleaned text, binary label) pairs for one split.

    Reads one tweet id per line from ``data_dir + path``, looks each id up in
    the module-level ``tweet_dict``, cleans its ``'tweet_text'`` with
    ``clean_tweet_text`` and derives a majority-vote label from its
    ``'labels'`` list.

    Side effect: every previously unseen whitespace-separated word of the
    cleaned text is added to the module-level ``word_index`` with the next
    free integer index.

    Args:
        path: file path, relative to ``data_dir``, of the ids file.

    Returns:
        list of ``(text, label)`` tuples in file order; ``label`` is 1 when
        strictly more than half of the annotator votes are > 0, else 0.

    Raises:
        KeyError: if an id from the file is missing from ``tweet_dict``.
    """
    # Context manager so the ids file is closed right after reading
    # (previously the handle was left open).
    with open(data_dir + path, 'r') as ids_file:
        tweet_ids = ids_file.read().splitlines()

    data = []
    for tweet_id in tweet_ids:  # renamed from `id` to stop shadowing the builtin
        record = tweet_dict[tweet_id]

        # process text (tweet special tokens)
        text = clean_tweet_text(record['tweet_text'])
        for word in text.split():
            # len(word_index) is evaluated before insertion, so a new word
            # receives the next unused index.
            word_index.setdefault(word, len(word_index))

        # get majority vote label: any vote > 0 counts as positive.
        votes = record['labels']
        positive_votes = sum(1 for n in votes if n > 0)
        # Same test as positive/len > 0.5, written without a division so an
        # empty vote list yields label 0 instead of ZeroDivisionError.
        label = 1 if 2 * positive_votes > len(votes) else 0

        # save to list
        data.append((text, label))

    return data
    
# Build the (text, label) list for each split from its ids file.
train_data = get_data_list('splits/train_ids.txt')
val_data = get_data_list('splits/val_ids.txt')

# Report how many examples each split holds.
for caption, split_data in (('Train data len:', train_data),
                            ('Val data len:', val_data)):
    print(caption, len(split_data))

Train data len: 134823
Val data len: 5000


In [64]:
# make the dataset
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

MAX_NUMBER_OF_WORDS = 1000  # vocabulary cap handed to the tokenizer
MAX_SEQ_LEN = 100           # every sequence is padded / truncated to this length

# training
texts, labels = zip(*train_data)

# The vocabulary is learned from the training texts ONLY.
# `num_words` replaces the deprecated Keras-1 spelling `nb_words`.
# NOTE(review): Tokenizer's default `filters` appear to include '<' and '>',
# so special tokens such as '<user>' would be indexed as 'user' - confirm this
# is intended before matching against bracketed GloVe Twitter tokens.
tokenizer = Tokenizer(num_words=MAX_NUMBER_OF_WORDS)
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)

# word -> integer id, as learned from the training texts. This mapping is what
# the integer sequences below (train AND validation) are expressed in.
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

data = pad_sequences(sequences, maxlen=MAX_SEQ_LEN)

labels = np.asarray(labels)
print('Shape of data tensor:', data.shape)
print('Shape of label tensor:', labels.shape)

x_train = data
y_train = labels

# validation
# Fix: the validation texts are encoded with the tokenizer fitted on the
# training set. Fitting a second tokenizer here gave validation its own
# word -> id mapping (inconsistent with x_train) and overwrote `word_index`
# with the validation vocabulary.
texts, labels = zip(*val_data)

sequences = tokenizer.texts_to_sequences(texts)
data = pad_sequences(sequences, maxlen=MAX_SEQ_LEN)
labels = np.asarray(labels)

x_val = data
y_val = labels

Found 64960 unique tokens.
Shape of data tensor: (134823, 100)
Shape of label tensor: (134823,)


In [67]:
# embedding layer
from keras.layers import Embedding
EMBEDDING_DIM = 100  # must match the dimensionality of the GloVe file read below

# map word to embedding
# Each non-empty line is split on whitespace: first field is the word, the
# remaining fields are parsed as its float32 vector.
# The file is opened in a context manager (so it is closed after reading) and
# with an explicit encoding so decoding does not depend on the platform default.
# NOTE(review): utf-8 is assumed for the GloVe text file - confirm.
embeddings_index = {}
with open(os.path.join('glove', 'glove.twitter.27B.100d.txt'), encoding='utf-8') as glove_file:
    for line in glove_file:
        values = line.split()
        if not values:
            # skip blank lines instead of failing on values[0]
            continue
        word = values[0]
        embeddings_index[word] = np.asarray(values[1:], dtype='float32')

# create embedding matrix (words without embeddings get zero embeddings)
# One row per entry of word_index plus one extra row (index 0 is not assigned
# to any word by the loop below and stays all-zero).
embedding_matrix = np.zeros((len(word_index) + 1, EMBEDDING_DIM))
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None: embedding_matrix[i] = embedding_vector

# Embedding layer initialised from the matrix above and frozen
# (trainable=False), taking inputs of length MAX_SEQ_LEN.
embedding_layer = Embedding(len(word_index) + 1,
                            EMBEDDING_DIM,
                            weights=[embedding_matrix],
                            input_length=MAX_SEQ_LEN,
                            trainable=False)