# Step 3: Data Preprocessing

The following steps need to be completed in order to prepare the data for model development:
1. Load data from file
2. Randomize tweet order
3. Extract and save tweet content
4. Extract and save structured features
5. Extract and save labels
6. Prepare word embeddings

## Load data from file

In [1]:
%load_ext autoreload
%autoreload 2

%matplotlib inline

In [2]:
from tep.dataLoader import DataLoader
dl = DataLoader()

In [3]:
# load tweets from file
tweets = dl.load_from_file(filename="data/tweets_1.json", ignore_retweets=True)
len(tweets)

284701

In [4]:
tweets += dl.load_from_file(filename="data/tweets_2.json", ignore_retweets=True)
len(tweets)

565486

In [5]:
tweets += dl.load_from_file(filename="data/tweets_3.json", ignore_retweets=True)
len(tweets)

781721

In [6]:
tweets += dl.load_from_file(filename="data/tweets_4.json", ignore_retweets=True)
len(tweets)

1031077

In [7]:
tweets += dl.load_from_file(filename="data/tweets_5.json", ignore_retweets=True)
len(tweets)

1293005

In [8]:
tweets[:5]

[Status(ID=979429522687762432, ScreenName=jack, Created=Thu Mar 29 18:46:09 +0000 2018, Text='Something we’ve been wanting for a while: choose where live video should start playing and tweet it https://t.co/0KstH8TcPZ'),
 Status(ID=979169950169317381, ScreenName=jack, Created=Thu Mar 29 01:34:42 +0000 2018, Text='@freialobo I’d never decline your meeting'),
 Status(ID=979100832535736320, ScreenName=jack, Created=Wed Mar 28 21:00:03 +0000 2018, Text='@tperzyk Thank you so much Tim!'),
 Status(ID=979026218027597824, ScreenName=jack, Created=Wed Mar 28 16:03:34 +0000 2018, Text='Full discussion with Maryam @TopekaKSam and @ShakaSenghor now available 👇🏼 https://t.co/gfbkmpJeja'),
 Status(ID=978818682062225408, ScreenName=jack, Created=Wed Mar 28 02:18:54 +0000 2018, Text='@chrissyteigen ✋🏼🤚🏼')]

## Randomize tweet order

In [9]:
# randomize tweet order
import random
random.shuffle(tweets)
tweets[:5]

[Status(ID=737281041954689024, ScreenName=MikeCrapo, Created=Mon May 30 13:54:33 +0000 2016, Text='Today we honor those who have made the ultimate sacrifice in service to our country #GratefulNation #MemorialDay2016 https://t.co/aMb41SQ7EN'),
 Status(ID=968969472332763137, ScreenName=DJohnsonPGA, Created=Wed Feb 28 22:01:39 +0000 2018, Text='Thank you to the people in Mexico for the awesome welcome back. Should be a great week at @WGCMexico https://t.co/vuyhmlvQPG'),
 Status(ID=841378536485244930, ScreenName=kindredhealth, Created=Mon Mar 13 20:00:49 +0000 2017, Text='How Does Daylight Saving Time Affect Your Health? https://t.co/81Yq3IbFg5'),
 Status(ID=851514286979645440, ScreenName=HormelFoods, Created=Mon Apr 10 19:16:41 +0000 2017, Text="@rachelariellie It is possible! Distribution is limited, but 10oz La Victoria enchilada sauces are sold in major retailers such as Sobey's."),
 Status(ID=797443229209731072, ScreenName=jpmorgan, Created=Sat Nov 12 14:17:36 +0000 2016, Text="We're 

## Extract and save tweet content

In [10]:
from tep.dataPreprocessor import DataPreprocessor
dp = DataPreprocessor()

In [11]:
# test extracting content
texts = dp.extract_content(tweets)
texts[:5]

['today we honor those who have made the ultimate sacrifice in service to our country <hashtag> grateful nation <hashtag> memorial day<number> <url>',
 'thank you to the people in mexico for the awesome welcome back . should be a great week at <user> <url>',
 'how does daylight saving time affect your health ? <url>',
 "<user> it is possible ! distribution is limited, but <number>oz la victoria enchilada sauces are sold in major retailers such as sobey's .",
 "we're live <user> with the $jpm <allcaps> <hashtag> pictures in pictures exhibit . check out the scene at <hashtag> paris photo<number> 📸 <url>"]

In [12]:
# save tweet texts to file
from tep.utils import save_as_text
save_as_text(texts, "data/tweet_texts_v2.txt")

## Extract and save structured (auxiliary) features

In [13]:
# extract auxiliary features
feats = dp.extract_additional_features(tweets)
feats[:5]

array([[0.00000000e+00, 2.00000000e+00, 0.00000000e+00, 1.40000000e+02,
        5.98960000e+04, 7.33000000e+02, 1.00000000e+00, 1.48700000e+03,
        5.60900000e+03, 2.54376417e+00, 5.30000000e+02, 2.40362812e-01,
        2.20500000e+03, 1.30000000e+01, 0.00000000e+00],
       [0.00000000e+00, 0.00000000e+00, 1.00000000e+00, 1.24000000e+02,
        7.14364000e+05, 1.27000000e+02, 1.00000000e+00, 2.59400000e+03,
        3.90300000e+03, 1.46070359e+00, 9.01000000e+02, 3.37200599e-01,
        2.67200000e+03, 2.20000000e+01, 0.00000000e+00],
       [1.00000000e+00, 0.00000000e+00, 0.00000000e+00, 7.30000000e+01,
        5.47200000e+03, 2.16700000e+03, 0.00000000e+00, 1.62000000e+02,
        4.83000000e+03, 1.44914491e+00, 1.30600000e+03, 3.91839184e-01,
        3.33300000e+03, 2.00000000e+01, 0.00000000e+00],
       [0.00000000e+00, 0.00000000e+00, 1.00000000e+00, 1.39000000e+02,
        6.13270000e+04, 1.51700000e+03, 1.00000000e+00, 5.74000000e+02,
        1.25840000e+04, 3.30983693e+0

In [14]:
feats.shape

(1293005, 15)

In [15]:
# save aux features to file
from tep.utils import save_array
save_array(feats, "data/auxiliary_features_v2.bc")

In [16]:
# test loading the data
from tep.utils import load_array
feats = load_array("data/auxiliary_features_v2.bc")
feats[:1]

array([[0.00000000e+00, 2.00000000e+00, 0.00000000e+00, 1.40000000e+02,
        5.98960000e+04, 7.33000000e+02, 1.00000000e+00, 1.48700000e+03,
        5.60900000e+03, 2.54376417e+00, 5.30000000e+02, 2.40362812e-01,
        2.20500000e+03, 1.30000000e+01, 0.00000000e+00]])

## Extract and save labels

### Labels for regression task

In [17]:
# extract regression labels
counts = dp.extract_labels(tweets)
counts[:5]

array([ 9, 64,  0,  0, 16])

In [18]:
counts.shape

(1293005,)

In [19]:
# save regression labels to file
save_array(counts, "data/regression_labels_v2.bc")

### Labels for classification task

In [20]:
# get class configuration
from tep.config import Config
config = Config()
buckets = config.CLASSES
buckets

[0, 9, 99, 999]

In [21]:
# extract classification labels
classes = dp.extract_labels(tweets, classes=buckets)
classes[:5]

array([1., 2., 0., 0., 2.])

In [22]:
classes.shape

(1293005,)

In [23]:
save_array(classes, "data/classification_labels_v2.bc")

In [24]:
# free up memory
tweets = None

## Prepare word embeddings

### Create word index

In [25]:
# prepare word embedding
from tep.embeddingGenerator import EmbeddingGenerator
eg = EmbeddingGenerator()

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [26]:
word_index = eg.generate_word_index(texts)

In [27]:
word_index

{'<url>': 1,
 '.': 2,
 '<user>': 3,
 'the': 4,
 'to': 5,
 '<hashtag>': 6,
 '<allcaps>': 7,
 'a': 8,
 'for': 9,
 '!': 10,
 'of': 11,
 'in': 12,
 'and': 13,
 'you': 14,
 'is': 15,
 '<number>': 16,
 'on': 17,
 'we': 18,
 'your': 19,
 'this': 20,
 'with': 21,
 'our': 22,
 '?': 23,
 'at': 24,
 '&amp;': 25,
 'i': 26,
 'are': 27,
 'that': 28,
 'it': 29,
 'be': 30,
 'us': 31,
 'can': 32,
 'have': 33,
 'from': 34,
 '/': 35,
 'more': 36,
 'about': 37,
 'will': 38,
 'by': 39,
 'my': 40,
 'how': 41,
 'out': 42,
 'so': 43,
 'thanks': 44,
 'please': 45,
 'as': 46,
 'new': 47,
 'not': 48,
 'what': 49,
 'help': 50,
 '<repeat>': 51,
 'if': 52,
 'an': 53,
 'all': 54,
 '-': 55,
 'like': 56,
 'was': 57,
 'thank': 58,
 'great': 59,
 'see': 60,
 'but': 61,
 'up': 62,
 'trump': 63,
 'dm': 64,
 'w': 65,
 'get': 66,
 'day': 67,
 'do': 68,
 'has': 69,
 "we're": 70,
 'one': 71,
 'just': 72,
 'today': 73,
 'now': 74,
 'time': 75,
 'who': 76,
 'sorry': 77,
 'team': 78,
 'their': 79,
 'or': 80,
 'learn': 81,
 'they

In [28]:
len(word_index)

387784

### Generate sequences

In [29]:
seq_32 = eg.generate_sequences(texts=texts, maxlen=32)
seq_48 = eg.generate_sequences(texts=texts, maxlen=48)
print(seq_32.shape)
print(seq_48.shape)

(1293005, 32)
(1293005, 48)


In [30]:
seq_32[:3]

array([[   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,   73,
          18,  403,  207,   76,   33,  233,    4, 2179, 2976,   12,  183,
           5,   22,  361,    6, 1144,  729,    6, 1632, 2789,    1],
       [   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          58,   14,    5,    4,  107,   12, 1581,    9,    4,  527,  332,
         116,    2,  130,   30,    8,   59,  150,   24,    3,    1],
       [   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          41,  272, 6672, 1636,   75, 2532,   19,  125,   23,    1]],
      dtype=int32)

In [31]:
save_array(seq_32, "data/sequences_len32_v2.bc")
save_array(seq_48, "data/sequences_len48_v2.bc")

### Generate embedding matrices

**Assumption:** Pretrained word vectors from [GloVe](https://nlp.stanford.edu/projects/glove/) are extracted into `glove` directory (download [here](http://nlp.stanford.edu/data/glove.twitter.27B.zip)).

In [32]:
# create 25-dimensional embedding
emb_index = eg.load_pretrained_embedding(filename="glove/glove.twitter.27B.25d.txt")
print(len(emb_index))
emb_matrix = eg.generate_embedding_matrix()
print(emb_matrix.shape)
save_array(array=emb_matrix, filename="data/embedding_matrix_25dim_v2.bc")

1193514
(387785, 25)


In [33]:
emb_matrix[:10]

array([[ 0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
         0.00000000e+00],
       [ 8.03839982e-01, -1.03659999e+00, -5.38770020e-01,
        -1.08060002e+00,  8.47180009e-01, -3.61959994e-01,
         1.00650001e+00,  1.30669999e+00, -6.12249970e-01,
         3.07810009e-01,  4.69740003e-01, -2.32639998e-01,
        -3.38820004e+00, -4.67779994e-01, -5.51050007e-01,
        -1.69260001e+00, -7.87079990e-01,  2.83780009e-01,
        -7.36379981e-01,  1.02159999e-01, -1.87030002e-01,
        -2.13299990e+00, -1.77870005e-01, -9.77880001e-01,
         1.39400005e+00],
    

In [34]:
# create 50-dimensional embedding
emb_index = eg.load_pretrained_embedding(filename="glove/glove.twitter.27B.50d.txt")
print(len(emb_index))
emb_matrix = eg.generate_embedding_matrix()
print(emb_matrix.shape)
save_array(array=emb_matrix, filename="data/embedding_matrix_50dim_v2.bc")

1193515
(387785, 50)


In [35]:
# create 100-dimensional embedding
emb_index = eg.load_pretrained_embedding(filename="glove/glove.twitter.27B.100d.txt")
print(len(emb_index))
emb_matrix = eg.generate_embedding_matrix()
print(emb_matrix.shape)
save_array(array=emb_matrix, filename="data/embedding_matrix_100dim_v2.bc")

1193516
(387785, 100)


In [36]:
# create 200-dimensional embedding
emb_index = eg.load_pretrained_embedding(filename="glove/glove.twitter.27B.200d.txt")
print(len(emb_index))
emb_matrix = eg.generate_embedding_matrix()
print(emb_matrix.shape)
save_array(array=emb_matrix, filename="data/embedding_matrix_200dim_v2.bc")

1193517
(387785, 200)


## Save word index for later lookups

In [37]:
len(word_index)

387784

In [38]:
type(word_index)

dict

In [39]:
import json
with open('data/word_index.json', 'w') as fp:
    json.dump(word_index, fp)

In [40]:
import operator
sorted_words = sorted(word_index.items(), key=operator.itemgetter(1))

In [41]:
type(sorted_words)

list

In [42]:
len(sorted_words)

387784

In [43]:
sorted_words[:10]

[('<url>', 1),
 ('.', 2),
 ('<user>', 3),
 ('the', 4),
 ('to', 5),
 ('<hashtag>', 6),
 ('<allcaps>', 7),
 ('a', 8),
 ('for', 9),
 ('!', 10)]

In [44]:
sorted_words = [w[0] for w in sorted_words]
sorted_words[:10]

['<url>',
 '.',
 '<user>',
 'the',
 'to',
 '<hashtag>',
 '<allcaps>',
 'a',
 'for',
 '!']

In [45]:
sorted_words = ['unknown'] + sorted_words

['unknown',
 '<url>',
 '.',
 '<user>',
 'the',
 'to',
 '<hashtag>',
 '<allcaps>',
 'a',
 'for',
 '!',
 'of',
 'in',
 'and',
 'you',
 'is',
 '<number>',
 'on',
 'we',
 'your',
 'this',
 'with',
 'our',
 '?',
 'at',
 '&amp;',
 'i',
 'are',
 'that',
 'it',
 'be',
 'us',
 'can',
 'have',
 'from',
 '/',
 'more',
 'about',
 'will',
 'by',
 'my',
 'how',
 'out',
 'so',
 'thanks',
 'please',
 'as',
 'new',
 'not',
 'what',
 'help',
 '<repeat>',
 'if',
 'an',
 'all',
 '-',
 'like',
 'was',
 'thank',
 'great',
 'see',
 'but',
 'up',
 'trump',
 'dm',
 'w',
 'get',
 'day',
 'do',
 'has',
 "we're",
 'one',
 'just',
 'today',
 'now',
 'time',
 'who',
 'sorry',
 'team',
 'their',
 'or',
 'learn',
 'they',
 'know',
 'his',
 'no',
 'good',
 'me',
 'hear',
 'make',
 'via',
 'would',
 'he',
 'when',
 'happy',
 "it's",
 'here',
 '.<user>',
 '.…',
 'into',
 'hi',
 'work',
 'there',
 '<allcaps>:',
 'love',
 'need',
 'look',
 'people',
 'u',
 '."',
 's',
 'some',
 'c',
 'these',
 'been',
 'check',
 'back',
 

In [None]:
save_as_text(sorted_words, 'data/word_labels.tsv')