In [1]:
%load_ext autoreload
%autoreload 2

%matplotlib inline

In [2]:
from tep.dataLoader import DataLoader
dl = DataLoader()

In [3]:
# load tweets from file
tweets = dl.load_from_file(filename="data/tweets_1.json", ignore_retweets=True)
len(tweets)

284701

In [4]:
tweets += dl.load_from_file(filename="data/tweets_2.json", ignore_retweets=True)
len(tweets)

565486

In [5]:
tweets += dl.load_from_file(filename="data/tweets_3.json", ignore_retweets=True)
len(tweets)

781721

In [6]:
tweets += dl.load_from_file(filename="data/tweets_4.json", ignore_retweets=True)
len(tweets)

1031077

In [7]:
tweets += dl.load_from_file(filename="data/tweets_5.json", ignore_retweets=True)
len(tweets)

1293005

In [8]:
tweets[:10]

[Status(ID=979429522687762432, ScreenName=jack, Created=Thu Mar 29 18:46:09 +0000 2018, Text='Something we’ve been wanting for a while: choose where live video should start playing and tweet it https://t.co/0KstH8TcPZ'),
 Status(ID=979169950169317381, ScreenName=jack, Created=Thu Mar 29 01:34:42 +0000 2018, Text='@freialobo I’d never decline your meeting'),
 Status(ID=979100832535736320, ScreenName=jack, Created=Wed Mar 28 21:00:03 +0000 2018, Text='@tperzyk Thank you so much Tim!'),
 Status(ID=979026218027597824, ScreenName=jack, Created=Wed Mar 28 16:03:34 +0000 2018, Text='Full discussion with Maryam @TopekaKSam and @ShakaSenghor now available 👇🏼 https://t.co/gfbkmpJeja'),
 Status(ID=978818682062225408, ScreenName=jack, Created=Wed Mar 28 02:18:54 +0000 2018, Text='@chrissyteigen ✋🏼🤚🏼'),
 Status(ID=978704810382888966, ScreenName=jack, Created=Tue Mar 27 18:46:24 +0000 2018, Text='@LaMonicaBuzz This is a move against scams and frauds we’re seeing broadly, not an judgement of crypto c

In [9]:
# randomize tweet order
import random
random.seed(1000)
random.shuffle(tweets)

In [10]:
tweets[:10]

[Status(ID=771349282981089280, ScreenName=StJude, Created=Thu Sep 01 14:09:34 +0000 2016, Text='@NastiaLiukin Thanks for your support this September! We appreciate all you do for the kids of #StJude!'),
 Status(ID=930296709514776577, ScreenName=ChrisStirewalt, Created=Tue Nov 14 04:49:54 +0000 2017, Text='@mschlapp @ShannonBream Won’t last...'),
 Status(ID=935706161050894336, ScreenName=om, Created=Wed Nov 29 03:05:08 +0000 2017, Text='@emilyolson @roblafave Congrats 👌🏼 she is adorable and amazingly chill. Wishing her a beautiful life of joy and success'),
 Status(ID=967160020470202370, ScreenName=WesleyLowery, Created=Fri Feb 23 22:11:32 +0000 2018, Text='@AllanRMorton so sorry to hear the pain you went through after that shooting - and hopefully we can figure out how… https://t.co/W2kwjjaUZr'),
 Status(ID=882984113619296256, ScreenName=SenatorMenendez, Created=Thu Jul 06 15:26:32 +0000 2017, Text="Is it too much to ask that a #HealthcareBill actually improve health care? We're LIVE a

In [11]:
from tep.dataPreprocessor import DataPreprocessor
dp = DataPreprocessor()

In [12]:
# test extracting content
texts = dp.extract_content(tweets)
texts[:5]

['<user> thanks for your support this september ! we appreciate all you do for the kids of <hashtag> st jude !',
 '<user> <user> won’t last . <repeat>',
 '<user> <user> congrats 👌🏼 she is adorable and amazingly chill . wishing her a beautiful life of joy and success',
 '<user> so sorry to hear the pain you went through after that shooting - and hopefully we can figure out how… <url>',
 "is it too much to ask that a <hashtag> healthcare bill actually improve health care ? we're live <allcaps> at englewood hospitals <url>"]

In [13]:
# save tweet texts to file
from tep.utils import save_as_text
save_as_text(texts, "data/tweet_texts_v3.txt")

In [21]:
# extract auxiliary features
feats = dp.extract_additional_features(tweets)
feats.shape

(1293005, 24)

In [22]:
# save aux features to file
from tep.utils import save_array
save_array(feats, "data/meta_features_v3.bc")

In [23]:
# extract regression labels
counts = dp.extract_labels(tweets)
counts[:5]

array([ 0,  1,  0,  0, 15])

In [24]:
# save regression labels to file
save_array(counts, "data/raw_labels_v3.bc")

In [25]:
# create log labels
import numpy as np
log_counts = np.log1p(counts)
log_counts[:5]

array([ 0.        ,  0.69314718,  0.        ,  0.        ,  2.77258872])

In [26]:
# save log labels to file
save_array(log_counts, "data/log_labels_v3.bc")

In [27]:
# free up memory
tweets = None

In [28]:
# prepare transfer learning data set
import pandas as pd
PATH = "data/sentiment_data/"

In [29]:
raw = pd.read_csv(f'{PATH}training.csv', 
                  low_memory=False, 
                  encoding='latin1', 
                  header=None, 
                  names=['sentiment', 'tweet_id', 'date', 'no_query', 'user', 'tweet'])
raw.head()

Unnamed: 0,sentiment,tweet_id,date,no_query,user,tweet
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."


In [30]:
raw.sentiment.replace(to_replace={0: 0, 4: 1}, inplace=True)
raw.sentiment = raw.sentiment.astype('category')
raw.sentiment.unique()

[0, 1]
Categories (2, int64): [0, 1]

In [31]:
raw.drop(['date', 'no_query', 'user'], axis=1, inplace=True)
raw = raw.sample(frac=1, random_state=1000).reset_index(drop=True)
raw.head()

Unnamed: 0,sentiment,tweet_id,tweet
0,1,1972192771,"@tanpaula drawing people in the cafe, tan"
1,0,2218262456,tired and not sure how this twitter works
2,1,2184969347,Got the job at kohls woo hoo so I'm gettin my ...
3,1,2062054002,So increibly tired. Think Haus is coming along...
4,1,2057224676,http://twitpic.com/6rlfu - Who knew bikers wer...


In [32]:
tweets = raw.tweet.copy()
tl_labels = np.array(raw.sentiment.copy())
raw = None

In [33]:
for i, t in enumerate(tweets):
    if i % 100000 == 0: print(i)
    tweets[i] = t.encode('latin1').decode('utf-8', errors='ignore')

0
100000
200000
300000
400000
500000
600000
700000
800000
900000
1000000
1100000
1200000
1300000
1400000
1500000


In [34]:
from tep.tweetPreprocessor import tokenize
tokenized_texts = []
for t in tweets:
    tokenized_texts.append(tokenize(t))
tweets = None

In [35]:
from tep.utils import save_array, save_as_text
save_array(tl_labels, f'{PATH}labels_v3.bc')
save_as_text(tokenized_texts, f'{PATH}tweets_v3.txt')

In [36]:
# combine texts from both tasks for joint embedding
for t in tokenized_texts:
    texts.append(t)
len(texts)

2893005

In [37]:
# create embedding
from tep.embeddingGenerator import EmbeddingGenerator
eg = EmbeddingGenerator()

Using TensorFlow backend.


In [38]:
# create word index
word_index = eg.generate_word_index(texts)
word_index

{'.': 1,
 '<user>': 2,
 'to': 3,
 'the': 4,
 '<url>': 5,
 '!': 6,
 'i': 7,
 '<allcaps>': 8,
 'a': 9,
 '<hashtag>': 10,
 'and': 11,
 'for': 12,
 'you': 13,
 'in': 14,
 'of': 15,
 '<repeat>': 16,
 'is': 17,
 'my': 18,
 '<number>': 19,
 'on': 20,
 'it': 21,
 '?': 22,
 'with': 23,
 'this': 24,
 'that': 25,
 'at': 26,
 'your': 27,
 'have': 28,
 'so': 29,
 'we': 30,
 'be': 31,
 'me': 32,
 'are': 33,
 'but': 34,
 'just': 35,
 'our': 36,
 'not': 37,
 '&amp;': 38,
 'was': 39,
 "i'm": 40,
 'out': 41,
 'can': 42,
 'from': 43,
 'up': 44,
 'all': 45,
 'now': 46,
 'like': 47,
 'get': 48,
 'day': 49,
 'about': 50,
 'will': 51,
 'good': 52,
 '-': 53,
 '/': 54,
 '<elong>': 55,
 'more': 56,
 'do': 57,
 'no': 58,
 'what': 59,
 'us': 60,
 'how': 61,
 'today': 62,
 'new': 63,
 'go': 64,
 'thanks': 65,
 'as': 66,
 'if': 67,
 "it's": 68,
 'time': 69,
 'love': 70,
 'one': 71,
 'see': 72,
 'work': 73,
 'by': 74,
 'an': 75,
 'know': 76,
 'going': 77,
 'back': 78,
 'u': 79,
 'too': 80,
 'got': 81,
 'great': 82,


In [39]:
len(word_index)

721695

In [40]:
# create sequences
seq_32 = eg.generate_sequences(texts=texts[:counts.shape[0]], maxlen=32)
seq_48 = eg.generate_sequences(texts=texts[:counts.shape[0]], maxlen=48)
print(seq_32.shape)
print(seq_48.shape)

(1293005, 32)
(1293005, 48)


In [41]:
seq_32[:3]

array([[    0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     2,    65,    12,    27,   252,    24,  2594,
            6,    30,   589,    45,    13,    57,    12,     4,   438,
           15,    10,  1096,  3815,     6],
       [    0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     2,
            2,  2710,   108,     1,    16],
       [    0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     2,     2,   369, 28608,   172,    17,
         2444,    11,  7718,  2422,     1,   903,   119,     9,   419,
          197,    15,  2203,    11,  1132]], dtype=int32)

In [42]:
save_array(seq_32, "data/sequences_len32_v3.bc")
save_array(seq_48, "data/sequences_len48_v3.bc")

In [43]:
# create embedding matrix
# create 100-dimensional embedding
emb_index = eg.load_pretrained_embedding(filename="glove/glove.twitter.27B.100d.txt")
print(len(emb_index))
emb_matrix = eg.generate_embedding_matrix()
print(emb_matrix.shape)
save_array(array=emb_matrix, filename="data/embedding_matrix_100dim_v3.bc")

1193514
(721696, 100)


In [44]:
# save word index
import json
with open('data/word_index_v3.json', 'w') as fp:
    json.dump(word_index, fp)

In [45]:
# save sorted words
import operator
sorted_words = sorted(word_index.items(), key=operator.itemgetter(1))
sorted_words = [w[0] for w in sorted_words]
sorted_words = ['unknown'] + sorted_words
sorted_words[:10]

['unknown', '.', '<user>', 'to', 'the', '<url>', '!', 'i', '<allcaps>', 'a']

In [46]:
save_as_text(sorted_words, 'data/word_labels_v3.tsv')