# Preparing to build a Twitter language model

In [1]:
# Enable the autoreload extension; mode 2 re-imports all modules before each cell runs,
# so edits to imported source files are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

# Render matplotlib figures inline in the notebook output.
%matplotlib inline

In [127]:
# Number of vocabulary entries; sizes the one-hot labels and the embedding
# matrix below (both use num_words + 1).
num_words = 100
# Width of each embedding vector (columns of the embedding matrix).
# NOTE(review): presumably must match the 200d GloVe file loaded later — keep in sync.
embedding_dimension = 200

## Load data

In [2]:
from tep.dataLoader import DataLoader
# Project-local loader; used in the next cell to read tweets from a JSON file.
dl = DataLoader()

In [3]:
# load tweets from file
# ignore_retweets=True presumably drops retweets — confirm against DataLoader.
# The trailing expression displays how many tweets were loaded.
tweets = dl.load_from_file(filename="data/tweets_1.json", ignore_retweets=True)
len(tweets)

284701

In [4]:
# randomize tweet order
import random
# Fixed seed so the shuffle (and therefore every downstream sample) is reproducible.
random.seed(1000)
# Shuffles `tweets` in place; re-running this cell alone reshuffles the already-shuffled list.
random.shuffle(tweets)

## Tokenize texts

In [5]:
from tep.dataPreprocessor import DataPreprocessor
# Project-local preprocessor; used below to turn tweets into normalized text strings.
dp = DataPreprocessor()

In [90]:
# test extracting content
# NOTE(review): only the first 100 (shuffled) tweets are used here, and every later
# cell (tokenizer, sequences, saved artifacts) is built from this `texts` sample —
# confirm whether the full corpus was intended before training on the outputs.
texts = dp.extract_content(tweets[:100])
texts

['<hashtag> popovichkerr <number>   <user>   <user>   <url>',
 ' <user>  we want to get this reported over to our quality control team and make up for that purchase .  ( <number>  /  <number> )',
 '"the  <number> nd - order effects when electric, self - driving, software - powered cars become mainstream" <url> <url>',
 'great event today at the harris school of public policy :  <url>',
 ' <user>  are not they just amazing ? ',
 'what is your biggest data challenge ?  revolutionize your <hashtag> cyber analytics at <hashtag> bhusa <allcaps>  <allcaps> <url> <url>',
 'cat  <number> d <number>  xhp <allcaps>  delivers multi - functional solution to large estate in englan <smile>  <url> <url>',
 'always moving the needle !  <url>',
 ' <user>  congratulations',
 'but they are such convincing lies .  <repeat> <url>',
 'sheryl sandberg on cnbc <allcaps>  at  <number>  do not miss it .  <url>',
 ' <user>  our apologies, and sorry for the trouble earlier . ',
 ' <user>   <user>   <user>   <user

In [91]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

In [92]:
# Build the tokenizer from the shared `num_words` config constant instead of a
# second hard-coded 100, so the vocabulary cap cannot drift from the label and
# embedding-matrix sizes (both num_words + 1).
# Per the Keras docs, only the (num_words - 1) most frequent words are kept when
# texts are converted to sequences.
# The filter set deliberately omits `<`, `>` and sentence punctuation such as
# `.`, `!`, `?`, `:`, `;`, `-`, so placeholder tokens like `<url>` and those
# punctuation marks survive as vocabulary entries.
tokenizer = Tokenizer(num_words=num_words, filters='"”“#$%&()*+,/=@[]^_´`‘{|}~\t\n\\')

In [93]:
# Build the vocabulary (word -> index mapping) from the extracted tweet texts.
tokenizer.fit_on_texts(texts)

In [94]:
# Number of distinct tokens seen while fitting; word_index holds every token,
# it is not truncated to the tokenizer's num_words cap.
len(tokenizer.word_index)

729

In [95]:
# Inspect the token -> index mapping (indices start at 1).
tokenizer.word_index

{'!': 15,
 '-': 10,
 '.': 2,
 ':': 16,
 ';': 24,
 '<allcaps>': 6,
 '<hashtag>': 7,
 '<number>': 8,
 '<repeat>': 9,
 '<smile>': 65,
 '<url>': 1,
 '<user>': 3,
 '?': 17,
 'a': 13,
 'about': 34,
 'above': 706,
 'access': 450,
 'account': 409,
 'action': 354,
 'advertising': 404,
 'against': 556,
 'ago': 360,
 'agricultural': 675,
 'ahead': 466,
 'air': 670,
 'aka': 408,
 'al': 298,
 'all': 112,
 'also': 275,
 'always': 66,
 'am': 41,
 'amazing': 189,
 'amen': 729,
 'amp': 23,
 'an': 51,
 'analytica': 488,
 'analytics': 193,
 'and': 20,
 'andrew': 274,
 'announcer': 685,
 'answer': 265,
 'answered': 709,
 'any': 266,
 'api': 593,
 'apologies': 212,
 'app': 217,
 'apparently': 600,
 'appears': 289,
 'apps': 725,
 'ar': 674,
 'are': 26,
 'arkansas': 285,
 'as': 284,
 'ask': 724,
 'asking': 525,
 'association': 430,
 'at': 33,
 'attached': 260,
 'authentication': 586,
 'automatic': 426,
 'available': 456,
 'b': 148,
 'backed': 570,
 'banning': 421,
 'barring': 716,
 'be': 57,
 'because': 599,

In [116]:
# Create the output directory for the language-model artifacts.
# os.makedirs with exist_ok=True is idempotent (safe under Restart & Run All),
# creates missing parent directories, and works on any platform — unlike
# `!mkdir`, which errors when the directory already exists.
import os
os.makedirs('data/lang_model', exist_ok=True)

In [118]:
# persist the token -> index mapping as JSON
import json
with open('data/lang_model/word_index.json', 'w') as fp:
    # serialize first, then write the resulting string in one go
    fp.write(json.dumps(tokenizer.word_index))

In [121]:
# build the label list: vocabulary words ordered by their tokenizer index,
# with '<unknown>' placed at position 0 (no word in word_index has index 0)
import operator
sorted_words = ['<unknown>'] + [
    word
    for word, _idx in sorted(tokenizer.word_index.items(), key=operator.itemgetter(1))
]
# preview the first few labels
sorted_words[:10]

['<unknown>',
 '<url>',
 '.',
 '<user>',
 'to',
 'the',
 '<allcaps>',
 '<hashtag>',
 '<number>',
 '<repeat>']

In [122]:
from tep.utils import save_as_text
# Write the ordered word labels to a .tsv file via the project helper.
save_as_text(sorted_words, 'data/lang_model/word_labels.tsv')

## Create sequences

In [96]:
# Convert each text into a list of integer token indices; words outside the
# tokenizer's num_words cap are dropped from the sequences.
seqs = tokenizer.texts_to_sequences(texts)
seqs

[[7, 8, 3, 3, 1],
 [3, 25, 88, 4, 89, 31, 60, 4, 32, 61, 20, 42, 43, 14, 18, 2, 8, 8],
 [5, 8, 10, 90, 44, 91, 10, 92, 10, 1, 1],
 [62, 93, 94, 33, 5, 11, 63, 16, 1],
 [3, 26, 35, 64, 38, 17],
 [36, 27, 29, 95, 96, 17, 29, 7, 33, 7, 6, 6, 1, 1],
 [97, 8, 98, 8, 6, 99, 10, 4, 19, 65, 1, 1],
 [66, 5, 15, 1],
 [3],
 [64, 26, 2, 9, 1],
 [21, 6, 33, 8, 39, 35, 22, 2, 1],
 [3, 32, 20, 67, 14, 5, 2],
 [3, 3, 3, 3, 27, 68, 4, 42, 29, 2, 45, 2, 9, 1],
 [46, 33, 5, 11, 4, 37, 40, 34, 32, 69, 15, 2, 9, 1],
 [12, 26, 5, 68, 22, 2, 1, 7, 7],
 [1],
 [94, 25, 6, 70, 2, 21, 31, 19, 8, 71, 14, 2, 1],
 [3, 16, 5, 6, 47, 8, 5, 11, 10, 20, 10, 8, 8],
 [37, 40, 34, 72, 4, 48, 4, 6, 16, 1, 2, 1],
 [4, 11, 7, 16, 1, 7, 68, 73, 49, 98, 1],
 [3, 25, 74, 5, 75, 14, 12, 2, 22, 12, 76, 2, 9, 1],
 [12, 19, 6, 50, 6, 6],
 [3, 77, 15, 25, 26, 14, 12, 22, 15, 2, 10, 6],
 [28, 34, 45, 51, 7, 7, 30, 3, 2, 1],
 [3, 22, 14, 31, 93, 5, 27, 5, 2, 2, 10, 6],
 [25, 52, 48, 4, 20, 33, 4, 53, 19, 20, 78, 2, 9, 1],
 [45, 21, 16

In [100]:
# token count of every sequence, in the same order as `seqs`
seq_lens = list(map(len, seqs))

In [111]:
# report several sequence-length percentiles to help choose a padding length
import numpy as np

percentiles = [90, 95, 99, 100]
# compute all percentiles in one call, then print each rounded to an integer
for p, cutoff in zip(percentiles, np.percentile(seq_lens, percentiles)):
    print('{}th percentile: {}'.format(p, int(np.round(cutoff))))

90th percentile: 16
95th percentile: 19
99th percentile: 21
100th percentile: 24


In [112]:
# 21 is the 99th-percentile sequence length printed by the percentile cell above.
max_len = 21
# Pad/truncate every sequence to max_len. With Keras defaults both padding and
# truncation happen at the front ('pre'), and the pad value is 0.
padded_seqs = pad_sequences(seqs, maxlen=max_len)

In [119]:
from tep.utils import save_array

In [120]:
# Persist the padded sequences via the project helper.
# NOTE(review): the .bc suffix suggests a bcolz array directory — confirm in tep.utils.
save_array(padded_seqs, 'data/lang_model/seqs.bc')

## Create inputs and labels

In [132]:
# Expand every sequence into all of its prefixes of length >= 2, in order of
# increasing length. Single-token sequences contribute nothing.
input_seqs = [seq[:end] for seq in seqs for end in range(2, len(seq) + 1)]

In [135]:
# Pad/truncate every n-gram prefix to max_len (front-padded with 0 by default,
# so the final column always holds the last token of the prefix).
# pad_sequences already returns a NumPy array; np.asarray avoids the extra
# full copy that np.array would make.
padded_input_seqs = np.asarray(pad_sequences(input_seqs, maxlen=max_len))

In [137]:
# context tokens: every column except the last
inputs = padded_input_seqs[:, :-1]
# prediction target: the last token of each padded prefix
labels = padded_input_seqs[:, -1]

In [139]:
import keras.utils as ku

In [140]:
# One-hot encode the next-token targets (num_words + 1 classes, index 0 included).
# The targets are read from the last column of `padded_input_seqs` rather than
# from `labels` itself, so re-running this cell does not one-hot-encode an
# already one-hot matrix; the first-run result is unchanged.
labels = ku.to_categorical(padded_input_seqs[:, -1], num_classes=(num_words + 1))

In [142]:
# Persist the model inputs (context tokens) and one-hot labels via the project helper.
save_array(inputs, 'data/lang_model/inputs.bc')
save_array(labels, 'data/lang_model/labels.bc')

## Create embedding matrix

In [124]:
# generate embeddings index: token -> float32 GloVe vector
embeddings_index = {}
# `with` guarantees the file handle is closed even if a line fails to parse.
with open('glove/glove.twitter.27B.200d.txt', encoding='utf-8') as f:
    for line in f:
        # skip blank lines (e.g. a trailing empty line) instead of failing on them
        if not line.strip():
            continue
        # Split on the single-space delimiter only: a bare split() also splits on
        # Unicode whitespace, which would drop a token that is itself such a
        # character and shift the first coefficient into the word slot.
        vals = line.rstrip().split(' ')
        word = vals[0]
        coeffs = np.asarray(vals[1:], dtype='float32')
        embeddings_index[word] = coeffs

In [129]:
# Build the embedding matrix: row i holds the GloVe vector of the word whose
# tokenizer index is i. Row 0 and rows for words without a GloVe vector stay zero.
embedding_matrix = np.zeros((num_words + 1, embedding_dimension))
for word, i in tokenizer.word_index.items():
    # ignore words whose index falls outside the matrix
    if i > num_words:
        continue
    emb_vec = embeddings_index.get(word)
    if emb_vec is None:
        continue
    embedding_matrix[i] = emb_vec

In [131]:
# Persist the embedding matrix via the project helper.
save_array(embedding_matrix, 'data/lang_model/emb_mat.bc')

## Conclusion

In [143]:
# List the artifacts written by this notebook.
!ls data/lang_model/

[1m[36memb_mat.bc[m[m      [1m[36mlabels.bc[m[m       word_index.json
[1m[36minputs.bc[m[m       [1m[36mseqs.bc[m[m         word_labels.tsv
