# Preparing to build a Twitter language model

In [1]:
# Reload edited project modules automatically before each cell runs
%load_ext autoreload
%autoreload 2

# Render matplotlib figures inline in the notebook
%matplotlib inline

In [2]:
# Vocabulary cap passed to the Keras Tokenizer and used to size the embedding matrix
num_words = 20000
# Width of each embedding vector; must match the GloVe file loaded later (…200d.txt)
embedding_dimension = 200

## Load data

In [3]:
from tep.dataLoader import DataLoader
# Project helper used below to read the tweet JSON dumps from disk
dl = DataLoader()

In [4]:
# Load tweets from the first dump; this cell (re)creates `tweets`.
# ignore_retweets=True presumably drops retweets — confirm in tep.dataLoader.
tweets = dl.load_from_file(filename="data/tweets_1.json", ignore_retweets=True)
len(tweets)

284701

In [5]:
# Append the second dump; the count shown is cumulative.
# NOTE: `+=` makes this cell non-idempotent — re-running it appends the file again.
tweets += dl.load_from_file(filename="data/tweets_2.json", ignore_retweets=True)
len(tweets)

565486

In [6]:
# Append the third dump; the count shown is cumulative.
# NOTE: `+=` makes this cell non-idempotent — re-running it appends the file again.
tweets += dl.load_from_file(filename="data/tweets_3.json", ignore_retweets=True)
len(tweets)

781721

In [7]:
# Append the fourth dump; the count shown is cumulative.
# NOTE: `+=` makes this cell non-idempotent — re-running it appends the file again.
tweets += dl.load_from_file(filename="data/tweets_4.json", ignore_retweets=True)
len(tweets)

1031077

In [8]:
# Append the fifth (last) dump; the count shown is the total number of tweets.
# NOTE: `+=` makes this cell non-idempotent — re-running it appends the file again.
tweets += dl.load_from_file(filename="data/tweets_5.json", ignore_retweets=True)
len(tweets)

1293005

In [9]:
# Randomize tweet order so the five source files are interleaved.
# The global `random` generator is seeded first to make the order repeatable;
# `tweets` is shuffled in place.
import random
random.seed(1000)
random.shuffle(tweets)

## Tokenize texts

In [10]:
from tep.dataPreprocessor import DataPreprocessor
# Project helper used below to turn tweets into text strings (`extract_content`)
dp = DataPreprocessor()

In [11]:
# Extract the text content of every tweet and preview the first five.
# Judging by the sample output, the helper also normalizes the text into
# tokens such as <user>, <url>, <hashtag> — confirm in tep.dataPreprocessor.
texts = dp.extract_content(tweets)
texts[:5]

[' <user>  thanks for your support this september !  we appreciate all you do for the kids of <hashtag> st jude ! ',
 ' <user>   <user>  will not last .  <repeat>',
 ' <user>   <user>  congrats 👌🏼 she is adorable and amazingly chill .  wishing her a beautiful life of joy and success',
 ' <user>  so sorry to hear the pain you went through after that shooting  -  and hopefully we can figure out how  .  <repeat> <url>',
 'is it too much to ask that a <hashtag> healthcare bill actually improve health care ?  we are live <allcaps>  at englewood hospitals <url>']

In [12]:
# Drop the raw tweets; only `texts` is used from here on
del tweets

In [13]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


You may need to rerun the following cells with different values of `num_words`, inspecting the sorted words file (`data/lang_model/word_labels.tsv`) each time, to settle on a suitable vocabulary size.

In [14]:
# Custom `filters`: the characters < > . ! ? - : ; ' are deliberately NOT filtered,
# so placeholder tokens such as <user>/<url> and punctuation survive as words.
tokenizer = Tokenizer(num_words=num_words, filters='"”“#$%&()*+,/=@[]^_´`‘{|}~\t\n\\')

In [15]:
# Build the vocabulary (tokenizer.word_index) from all tweet texts
tokenizer.fit_on_texts(texts)

In [16]:
# Number of distinct words seen; word_index is not truncated to num_words
len(tokenizer.word_index)

205706

In [17]:
!mkdir data/lang_model

mkdir: cannot create directory ‘data/lang_model’: File exists


In [18]:
# Persist the word -> index mapping as JSON for later lookup
import json
word_index_json = json.dumps(tokenizer.word_index)
with open('data/lang_model/word_index.json', 'w') as fp:
    fp.write(word_index_json)

In [19]:
# Build the label list: words ordered by their tokenizer index, with a
# '<unknown>' placeholder in front so that list position == word index
# (index 0 is not assigned to any word in word_index).
import operator
sorted_words = ['<unknown>'] + [
    word
    for word, _ in sorted(tokenizer.word_index.items(), key=operator.itemgetter(1))
]
sorted_words[:10]

['<unknown>',
 '.',
 '<url>',
 '<user>',
 'the',
 'to',
 '<allcaps>',
 '<hashtag>',
 '<number>',
 '<repeat>']

In [20]:
from tep.utils import save_as_text
# Write the ordered word list to disk (this is the "sorted words file")
save_as_text(sorted_words, 'data/lang_model/word_labels.tsv')

In [21]:
# The list has been written to disk; release it
del sorted_words

## Create sequences

In [22]:
# Convert every text into a list of integer word indices and preview five
seqs = tokenizer.texts_to_sequences(texts)
seqs[:5]

[[3,
  47,
  11,
  24,
  129,
  23,
  1997,
  12,
  19,
  371,
  58,
  15,
  43,
  11,
  4,
  398,
  13,
  7,
  312,
  2348,
  12],
 [3, 3, 34, 35, 145, 1, 9],
 [3,
  3,
  229,
  14941,
  236,
  17,
  3876,
  16,
  12791,
  5836,
  1,
  1284,
  165,
  10,
  709,
  188,
  13,
  2512,
  16,
  641],
 [3,
  49,
  83,
  5,
  95,
  4,
  887,
  15,
  1040,
  242,
  149,
  33,
  1278,
  20,
  16,
  2925,
  19,
  38,
  2391,
  44,
  46,
  1,
  9,
  2],
 [17,
  29,
  195,
  189,
  5,
  512,
  33,
  10,
  7,
  390,
  135,
  579,
  485,
  124,
  126,
  30,
  19,
  22,
  137,
  6,
  32,
  3325,
  2]]

In [23]:
# Raw texts are no longer needed once the index sequences exist
del texts

In [24]:
# Token count of every tweet sequence, used to pick a padding length
seq_lens = list(map(len, seqs))

In [25]:
# Report selected percentiles of the sequence lengths (rounded to whole tokens)
import numpy as np

percentiles = [90, 95, 99, 100]
percentile_values = np.percentile(seq_lens, percentiles)
for p, value in zip(percentiles, percentile_values):
    print('{}th percentile:'.format(p), int(np.round(value)))

90th percentile: 27
95th percentile: 29
99th percentile: 32
100th percentile: 98


In [26]:
# 32 is the 99th-percentile sequence length reported above; all sequences are
# brought to this length by pad_sequences.
max_len = 32
padded_seqs = pad_sequences(seqs, maxlen=max_len)

In [27]:
from tep.utils import save_array

In [28]:
# Persist the padded per-tweet sequences via the project helper
save_array(padded_seqs, 'data/lang_model/seqs.bc')

In [29]:
# Saved to disk above; release the array
del padded_seqs

## Create inputs and labels

In [30]:
# Expand each tweet into all of its prefixes of length >= 2, e.g.
# [a, b, c] -> [a, b], [a, b, c]. The last token of each prefix later
# becomes the prediction target.
input_seqs = [
    tokens[:end]
    for tokens in seqs
    for end in range(2, len(tokens) + 1)
]

In [31]:
# All prefixes have been generated; the per-tweet sequences can go
del seqs

In [32]:
# Pad/truncate every prefix to max_len. pad_sequences already returns a NumPy
# array, so it is used directly: wrapping it in np.array() would make a second
# full copy of a ~22.6M x 32 array for no benefit.
input_seqs = pad_sequences(input_seqs, maxlen=max_len)

In [33]:
# Sanity check: (number of prefixes, max_len)
input_seqs.shape

(22664353, 32)

In [34]:
# Shuffle the input sequences (rows) in place, reproducibly.
# `random.shuffle` must NOT be used on a 2-D NumPy array: its
# `x[i], x[j] = x[j], x[i]` swap works on row *views*, so rows are overwritten
# and duplicated instead of exchanged, silently corrupting the data.
# NumPy's shuffle permutes along the first axis correctly.
rng = np.random.RandomState(2018)
rng.shuffle(input_seqs)

In [35]:
# Split each row into context and target: the last column is the word to
# predict, all preceding columns are the model input.
inputs = input_seqs[:, :-1]
labels = input_seqs[:, -1]

In [36]:
# Drop the name. NOTE(review): `inputs` and `labels` were obtained by slicing
# `input_seqs`, so the underlying buffer is probably kept alive until both of
# them are deleted as well — confirm if memory matters here.
del input_seqs

In [37]:
import keras.utils as ku

In [38]:
# Sanity check: one column fewer than input_seqs (the label column was split off)
inputs.shape

(22664353, 31)

In [39]:
# Persist the model inputs (context windows) via the project helper
save_array(inputs, 'data/lang_model/inputs.bc')

In [40]:
# Saved to disk above; release the name
del inputs

In [42]:
# Sanity check: one label (word index) per input row
labels.shape

(22664353,)

In [43]:
# Persist the labels as integer word indices (not one-hot encoded)
save_array(labels, 'data/lang_model/labels.bc')

In [44]:
# Saved to disk above; release the name
del labels

## Create embedding matrix

In [45]:
# Generate the embeddings index: word -> float32 coefficient vector, read from
# the pre-trained GloVe Twitter file (one "word v1 v2 ... vN" entry per line).
embeddings_index = {}
# `with` guarantees the file is closed even if a line fails to parse
with open('glove/glove.twitter.27B.200d.txt', encoding='utf-8') as f:
    for line in f:
        vals = line.split()
        if not vals:
            # tolerate blank lines (e.g. a trailing newline at end of file)
            continue
        word = vals[0]
        coeffs = np.asarray(vals[1:], dtype='float32')
        embeddings_index[word] = coeffs

In [46]:
# Fill the embedding matrix: row i holds the GloVe vector of the word whose
# tokenizer index is i. Rows stay all-zero for index 0 and for words that have
# no GloVe vector.
# NOTE(review): Keras' Tokenizer(num_words=N) is documented to keep only the
# N-1 most frequent words, so row N may never be referenced — confirm before
# changing the matrix size, since downstream code may rely on this shape.
embedding_matrix = np.zeros((num_words + 1, embedding_dimension))
for word, i in tokenizer.word_index.items():
    if i > num_words:
        # outside the vocabulary cap
        continue
    emb_vec = embeddings_index.get(word)
    if emb_vec is not None:
        embedding_matrix[i] = emb_vec

In [47]:
# Sanity check: (num_words + 1, embedding_dimension)
embedding_matrix.shape

(20001, 200)

In [48]:
# Persist the embedding matrix via the project helper
save_array(embedding_matrix, 'data/lang_model/emb_mat.bc')

## Conclusion

In [49]:
# List the artifacts written by this notebook
!ls data/lang_model/

emb_mat.bc  inputs.bc  labels.bc  seqs.bc  word_index.json  word_labels.tsv
