In [5]:
%matplotlib inline
from matplotlib import pyplot as plt
from tqdm import tqdm
import numpy as np
import os
import pickle
import json
import cv2
import re

In [6]:
from keras import backend as K
# Sanity check: ask the TensorFlow backend which GPU devices it can see
# (the cell output lists them).
# NOTE(review): `_get_available_gpus` is underscore-prefixed (private) API and
# `K.tensorflow_backend` is backend-specific — confirm both exist in the
# installed Keras version.
K.tensorflow_backend._get_available_gpus()

Using TensorFlow backend.


['/job:localhost/replica:0/task:0/device:GPU:0']

In [33]:
# Load the ground-truth annotation dictionary and report dataset sizes.
data_dir = 'mmhs150k/'

# Open the JSON through a context manager so the handle is closed as soon as
# parsing finishes. JSON is UTF-8 by specification, so the encoding is given
# explicitly rather than left to the platform default.
with open(os.path.join(data_dir, 'MMHS150K_GT.json'), 'r', encoding='utf-8') as gt_file:
    tweet_dict = json.load(gt_file)

# Print sizes; per the recorded output, only some tweets have an image-text file.
print('Length of Tweet Dictionary:', len(tweet_dict))
print('Number of Images:', len(os.listdir(os.path.join(data_dir, 'img_resized'))))
print('Number of Image Texts:', len(os.listdir(os.path.join(data_dir, 'img_txt'))))

Length of Tweet Dictionary: 149823
Number of Images: 150000
Number of Image Texts: 59252


In [34]:
# Helpers for cleaning tweet text, adapted from the GloVe Twitter preprocessing script:
# https://nlp.stanford.edu/projects/glove/preprocess-twitter.rb
def hashtag(text):
    """Regex replacement callback that rewrites a matched hashtag.

    Args:
        text: Match object whose matched string starts with '#'.

    Returns:
        '<hashtag> ' followed by the hashtag body. A fully upper-case body is
        lowercased and gets a trailing space; any other body gets a space
        inserted in front of every capital ASCII letter.
    """
    body = text.group()[1:]  # drop the leading '#'
    if not body.isupper():
        # Split CamelCase-style tags by prefixing each capital with a space.
        spaced = re.sub(r"([A-Z])", r" \1", body, flags=re.MULTILINE | re.DOTALL)
        return "<hashtag>" + " " + spaced
    return "<hashtag> {} ".format(body.lower())

def allcaps(text):
    """Regex replacement callback: lowercase the match and tag it with <allcaps>.

    Args:
        text: Match object for the upper-case run.

    Returns:
        The matched string in lower case followed by ' <allcaps>'.
    """
    lowered = text.group().lower()
    return lowered + " <allcaps>"

def clean_tweet_text(t):
    """Normalize a raw tweet into lowercase text with special tokens.

    Substitutions are applied in a fixed order (later patterns rely on earlier
    ones having already consumed URLs, emoticons and digits): URLs, user
    mentions, emoticons, slashes, hearts, numbers, hashtags, repeated
    punctuation, elongated words and upper-case runs. Finally a set of
    punctuation characters is replaced by spaces and the text is lowercased.

    Args:
        t: Raw tweet text.

    Returns:
        The cleaned, lowercased string.
    """
    eyes = r'[8:=;]'
    nose = r"['`\-]?"

    t = re.sub(r'https?:\/\/\S+\b|www\.(\w+\.)+\S*', '<url>', t)
    t = re.sub(r'@\w+', '<user>', t)
    t = re.sub(r'{}{}[)dD]+|[)dD]+{}{}'.format(eyes, nose, nose, eyes), '<smile>', t)
    # Fixed: `.format(eyes, nose)` used to sit inside the string literal, so the
    # pattern was never interpolated and `:p`-style emoticons were never tagged.
    t = re.sub(r'{}{}p+'.format(eyes, nose), '<lolface>', t)
    t = re.sub(r'{}{}\(+|\)+{}{}'.format(eyes, nose, nose, eyes), '<sadface>', t)
    t = re.sub(r'{}{}[\/|l*]'.format(eyes, nose), '<neutralface>', t)
    t = re.sub(r'/', ' / ', t)
    t = re.sub(r'<3','<heart>', t)
    t = re.sub(r'[-+]?[.\d]*[\d]+[:,.\d]*', '<number>', t)
    t = re.sub(r'#\S+', hashtag, t)
    # Collapse runs of !, ? or . to the last character of the run plus <repeat>.
    t = re.sub(r'([!?.]){2,}', r'\1 <repeat>', t)
    # A character repeated three or more times at a word end is cut to one plus <elong>.
    t = re.sub(r'\b(\S*?)(.)\2{2,}\b', r'\1\2 <elong>', t)
    t = re.sub(r'([A-Z]){2,}', allcaps, t)
    # NOTE(review): inside this character class `,-;` is a range (',' through
    # ';'), so it also strips '/', undoing the ' / ' padding added above. Kept
    # as-is to preserve existing output — confirm whether a literal '-' was meant.
    t = re.sub(r'[\".,-;&:]', ' ', t)
    return t.lower()
    
# Smoke-test the cleaner on one real tweet and one emoticon/hashtag sample.
for sample in (
    '@SLAAATTTTT @AINTSHlTLAUGHS NIGGA...  DID YOU NOT HEAR THE CHRIS BROWN SONG?!?!?! https://t.co/1hwQMRczOw',
    ':) :-) 8) #HelloWorld #helloworld #Hello',
):
    print(clean_tweet_text(sample))

<user> <user> nigga <allcaps>  <repeat>  did <allcaps> you <allcaps> not <allcaps> hear <allcaps> the <allcaps> chris <allcaps> brown <allcaps> song <allcaps>! <repeat> <url>
<smile> <smile> <smile> <hashtag>  hello world <hashtag> helloworld <hashtag>  hello


In [35]:
# Build the training set as {tweet id: (cleaned tweet text, binary label)}.
train_dict = dict()

# Read the split file inside a `with` block so the handle is closed once the
# ids are in memory. The path is joined rather than concatenated to avoid the
# doubled slash.
with open(os.path.join(data_dir, 'splits', 'train_ids.txt'), 'r') as ids_file:
    train_ids = ids_file.read().splitlines()

# NOTE(review): `id` shadows the builtin; the name is kept in case later cells
# read it after this loop — rename once that is confirmed safe.
for id in train_ids:
    annotations = tweet_dict[id]['labels']

    # process text (tweet special tokens)
    text = clean_tweet_text(tweet_dict[id]['tweet_text'])

    # Majority vote: any non-zero annotation counts as a positive vote, and the
    # tweet is labelled 1 only when strictly more than half the votes are positive.
    positive_votes = sum(1 for n in annotations if n > 0)
    label = 1 if positive_votes / len(annotations) > 0.5 else 0

    # save to dictionary
    train_dict[id] = (text, label)