# Step 3: Data Preprocessing

The following steps need to be completed in order to prepare the data for model development:
1. Load data from file
2. Randomize tweet order
3. Extract and save tweet content
4. Extract and save structured features
5. Extract and save labels
6. Prepare word embeddings

## Load data from file

In [1]:
%load_ext autoreload
%autoreload 2

%matplotlib inline

In [2]:
from tep.dataLoader import DataLoader
dl = DataLoader()

In [3]:
# load tweets from file
tweets = dl.load_from_file(filename="data/tweets_1.json", ignore_retweets=True)
len(tweets)

284701

In [4]:
tweets += dl.load_from_file(filename="data/tweets_2.json", ignore_retweets=True)
len(tweets)

565486

In [5]:
tweets += dl.load_from_file(filename="data/tweets_3.json", ignore_retweets=True)
len(tweets)

781721

In [6]:
tweets += dl.load_from_file(filename="data/tweets_4.json", ignore_retweets=True)
len(tweets)

1031077

In [7]:
tweets += dl.load_from_file(filename="data/tweets_5.json", ignore_retweets=True)
len(tweets)

1293005

In [8]:
tweets[:5]

[Status(ID=979429522687762432, ScreenName=jack, Created=Thu Mar 29 18:46:09 +0000 2018, Text='Something we’ve been wanting for a while: choose where live video should start playing and tweet it https://t.co/0KstH8TcPZ'),
 Status(ID=979169950169317381, ScreenName=jack, Created=Thu Mar 29 01:34:42 +0000 2018, Text='@freialobo I’d never decline your meeting'),
 Status(ID=979100832535736320, ScreenName=jack, Created=Wed Mar 28 21:00:03 +0000 2018, Text='@tperzyk Thank you so much Tim!'),
 Status(ID=979026218027597824, ScreenName=jack, Created=Wed Mar 28 16:03:34 +0000 2018, Text='Full discussion with Maryam @TopekaKSam and @ShakaSenghor now available 👇🏼 https://t.co/gfbkmpJeja'),
 Status(ID=978818682062225408, ScreenName=jack, Created=Wed Mar 28 02:18:54 +0000 2018, Text='@chrissyteigen ✋🏼🤚🏼')]

## Randomize tweet order

In [9]:
# randomize tweet order
import random
random.shuffle(tweets)
tweets[:5]

[Status(ID=871825684414566400, ScreenName=tobykeith, Created=Mon Jun 05 20:26:55 +0000 2017, Text="When she says she doesn't date guys who drive pickups. https://t.co/pzzaJJacyy"),
 Status(ID=830086028782354432, ScreenName=AmTrustInsured, Created=Fri Feb 10 16:08:26 +0000 2017, Text='Are you a wood products manufacturer? Call your agent to learn loss prevention tips to help lower lumber premiums:… https://t.co/xWhlA2Q9lU'),
 Status(ID=883337723515465728, ScreenName=AEPnews, Created=Fri Jul 07 14:51:39 +0000 2017, Text='#JOB: We are #hiring an Instrument Technician in Pittsburg, TX. | https://t.co/bdVrd9oioH'),
 Status(ID=733010516348653569, ScreenName=jpmorgan, Created=Wed May 18 19:05:00 +0000 2016, Text='We welcome Peter Meath to $JPM as our new head of life sciences in the commercial banking business. https://t.co/rfFTFBZCjr'),
 Status(ID=872902594976518144, ScreenName=MotoSolutions, Created=Thu Jun 08 19:46:11 +0000 2017, Text='@hemilp9 We wish we could help you, but the device you

## Extract and save tweet content

In [10]:
from tep.dataPreprocessor import DataPreprocessor
dp = DataPreprocessor()

In [11]:
# test extracting content
texts = dp.extract_content(tweets)
texts[:5]

["when she says she doesn't date guys who drive pickups . <url>",
 'are you a wood products manufacturer ? call your agent to learn loss prevention tips to help lower lumber premiums:… <url>',
 '<hashtag> job <allcaps>: <allcaps> we are <hashtag> hiring an instrument technician in pittsburg, tx <allcaps> . | <url>',
 'we welcome peter meath to $jpm <allcaps> as our new head of life sciences in the commercial banking business . <url>',
 '<user> we wish we could help you, but the device you described is not ours . visit <url> for help from <user>']

In [12]:
# save tweet texts to file
from tep.utils import save_as_text
save_as_text(texts, "data/tweet_texts.txt")

## Extract and save structured (auxiliary) features

In [13]:
# extract auxiliary features
feats = dp.extract_additional_features(tweets)
feats[:5]

array([[  0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          7.80000000e+01,   1.28217000e+06,   4.02900000e+03,
          1.00000000e+00,   3.13200000e+03,   1.36230000e+04,
          4.04964328e+00,   4.13400000e+03,   1.22889417e+00,
          3.36400000e+03,   2.00000000e+01,   0.00000000e+00],
       [  1.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          1.39000000e+02,   2.56700000e+03,   2.93100000e+03,
          0.00000000e+00,   4.60000000e+01,   2.31800000e+03,
          1.11549567e+00,   2.01000000e+02,   9.67276227e-02,
          2.07800000e+03,   1.60000000e+01,   0.00000000e+00],
       [  1.00000000e+00,   2.00000000e+00,   0.00000000e+00,
          8.90000000e+01,   1.45800000e+04,   2.44700000e+03,
          1.00000000e+00,   3.80000000e+02,   7.33300000e+03,
          2.22077529e+00,   1.38500000e+03,   4.19442762e-01,
          3.30200000e+03,   1.40000000e+01,   0.00000000e+00],
       [  1.00000000e+00,   0.00000000e+00,   0.00000000e+00,
     

In [14]:
feats.shape

(1293005, 15)

In [15]:
# save aux features to file
from tep.utils import save_array
save_array(feats, "data/auxiliary_features.bc")

In [16]:
# test loading the data
from tep.utils import load_array
feats = load_array("data/auxiliary_features.bc")
feats[:1]

array([[  0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          7.80000000e+01,   1.28217000e+06,   4.02900000e+03,
          1.00000000e+00,   3.13200000e+03,   1.36230000e+04,
          4.04964328e+00,   4.13400000e+03,   1.22889417e+00,
          3.36400000e+03,   2.00000000e+01,   0.00000000e+00]])

## Extract and save labels

### Labels for regression task

In [17]:
# extract regression labels
counts = dp.extract_labels(tweets)
counts[:5]

array([120,   0,   3,   5,   0])

In [18]:
counts.shape

(1293005,)

In [19]:
# save regression labels to file
save_array(counts, "data/regression_labels.bc")

### Labels for classification task

In [21]:
# get class configuration
from tep.config import Config
config = Config()
buckets = config.CLASSES
buckets

[0, 9, 99, 999]

In [23]:
# extract classification labels
classes = dp.extract_labels(tweets, classes=buckets)
classes[:5]

array([ 3.,  0.,  1.,  1.,  0.])

In [24]:
classes.shape

(1293005,)

In [25]:
save_array(classes, "data/classification_labels.bc")

In [28]:
# free up memory
tweets = None

## Prepare word embeddings

### Create word index

In [29]:
# prepare word embedding
from tep.embeddingGenerator import EmbeddingGenerator
eg = EmbeddingGenerator()

In [30]:
word_index = eg.generate_word_index(texts)

In [31]:
word_index

{'<url>': 1,
 '.': 2,
 '<user>': 3,
 'the': 4,
 'to': 5,
 '<hashtag>': 6,
 '<allcaps>': 7,
 'a': 8,
 'for': 9,
 '!': 10,
 'of': 11,
 'in': 12,
 'and': 13,
 'you': 14,
 'is': 15,
 '<number>': 16,
 'on': 17,
 'we': 18,
 'your': 19,
 'this': 20,
 'with': 21,
 'our': 22,
 '?': 23,
 'at': 24,
 '&amp;': 25,
 'i': 26,
 'are': 27,
 'that': 28,
 'it': 29,
 'be': 30,
 'us': 31,
 'can': 32,
 'have': 33,
 'from': 34,
 '/': 35,
 'more': 36,
 'about': 37,
 'will': 38,
 'by': 39,
 'my': 40,
 'how': 41,
 'out': 42,
 'so': 43,
 'thanks': 44,
 'please': 45,
 'as': 46,
 'new': 47,
 'not': 48,
 'what': 49,
 'help': 50,
 '<repeat>': 51,
 'if': 52,
 'an': 53,
 'all': 54,
 '-': 55,
 'like': 56,
 'was': 57,
 'thank': 58,
 'great': 59,
 'see': 60,
 'but': 61,
 'up': 62,
 'trump': 63,
 'dm': 64,
 'w': 65,
 'get': 66,
 'day': 67,
 'do': 68,
 'has': 69,
 "we're": 70,
 'one': 71,
 'just': 72,
 'today': 73,
 'now': 74,
 'time': 75,
 'who': 76,
 'sorry': 77,
 'team': 78,
 'their': 79,
 'or': 80,
 'learn': 81,
 'they

In [32]:
len(word_index)

387784

### Generate sequences

In [33]:
seq_32 = eg.generate_sequences(texts=texts, maxlen=32)
seq_48 = eg.generate_sequences(texts=texts, maxlen=48)
print(seq_32.shape)
print(seq_48.shape)

(1293005, 32)
(1293005, 48)


In [34]:
seq_32[:3]

array([[    0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,    93,   304,   149,   304,   605,   867,   942,
           76,   700, 35832,     2,     1],
       [    0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,    27,    14,     8,  5785,   557,  6539,    23,
          171,    19,  1987,     5,    81,  1885,  2642,   334,     5,
           50,  1423, 13486, 64039,     1],
       [    0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     6,   259,   103,     7,
           18,    27,     6,   654,    53, 13230,  4243,    12, 37221,
         2259,     7,     2,   625,     1]], dtype=int32)

In [35]:
save_array(seq_32, "data/sequences_len32.bc")
save_array(seq_48, "data/sequences_len48.bc")

### Generate embedding matrices

**Assumption:** Pretrained word vectors from [GloVe](https://nlp.stanford.edu/projects/glove/) are extracted into `glove` directory (download [here](http://nlp.stanford.edu/data/glove.twitter.27B.zip)).

In [36]:
# create 25-dimensional embedding
emb_index = eg.load_pretrained_embedding(filename="glove/glove.twitter.27B.25d.txt")
print(len(emb_index))
emb_matrix = eg.generate_embedding_matrix()
print(emb_matrix.shape)
save_array(array=emb_matrix, filename="data/embedding_matrix_25dim.bc")

1193514
(387785, 25)


In [37]:
# create 50-dimensional embedding
emb_index = eg.load_pretrained_embedding(filename="glove/glove.twitter.27B.50d.txt")
print(len(emb_index))
emb_matrix = eg.generate_embedding_matrix()
print(emb_matrix.shape)
save_array(array=emb_matrix, filename="data/embedding_matrix_50dim.bc")

1193515
(387785, 50)


In [38]:
# create 100-dimensional embedding
emb_index = eg.load_pretrained_embedding(filename="glove/glove.twitter.27B.100d.txt")
print(len(emb_index))
emb_matrix = eg.generate_embedding_matrix()
print(emb_matrix.shape)
save_array(array=emb_matrix, filename="data/embedding_matrix_100dim.bc")

1193516
(387785, 100)


In [39]:
# create 200-dimensional embedding
emb_index = eg.load_pretrained_embedding(filename="glove/glove.twitter.27B.200d.txt")
print(len(emb_index))
emb_matrix = eg.generate_embedding_matrix()
print(emb_matrix.shape)
save_array(array=emb_matrix, filename="data/embedding_matrix_200dim.bc")

1193517
(387785, 200)
