# Deep learning language model

In [1]:
# importing libraries

import io
import itertools
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import random
import tensorflow as tf

from collections import Counter
from nltk.tokenize import TreebankWordTokenizer

from tensorflow import keras
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.optimizers import RMSprop
from tensorflow.keras.utils import to_categorical

In [2]:
# variables
POST_TYPE = 'post'
MIN_TOKENS_LEN = 100
MAX_TOKENS_LEN = 200
DATA_SAMPLE_COUNT = 20000

TOKENS_MIN_COUNT = 10
SEQUENCE_WINDOW = 4
SEQUENCE_LEN = 13

### Loading the data

In [3]:
data = pd.read_csv('stackexchange_small_data_tokenized.csv.gz',
                  compression='gzip').sample(frac = 1, random_state = 42).reset_index(drop = True)

In [4]:
print("data.shape: ", data.shape)

data.shape:  (77634, 7)


In [5]:
print(data.text.sample(2).values)

["I have asked myself this question for months. The answers on CrossValidated and Quora all list nice properties of the logistic sigmoid function, but it all seems like we cleverly guessed this function. What I missed was the justification for choosing it. I finally found one in section . . . of the Deep Learning book by Bengio . In my own words In short, we want the logarithm of the model's output to be suitable for gradient-based optimization of the log-likelihood of the training data. Motivation We want a linear model, but we can't use ."
 "Problem is, the marginal likelihood is non-analytical due to the non-Gaussian observation likelihood, so you'll have to go with approximate inference I would try MCMC or variational Bayes here, due to the high uncertainty in the observations . This is the reason why I proposed a solution based on GPs on the bounds as opposed to a GP on the underlying function, since, with the former, one can use the standard analytical machinery. Anyhow, it is in

In [6]:
# transform the tokens field from white space separated strings into list of tokens
data['tokens'] = data.tokens.apply(lambda token: np.array(token.split()))

In [7]:
print(data.tokens.sample().values)

[array(['excellent', 'book', ',', 'indeed', '!', 'however', ',', 'in',
       'p.', ',', 'it', 'recommends', 'both', 'k', 'and', 'k', '.', 'why',
       'would', 'you', 'recommend', 'k', 'for', 'my', 'case', '?'],
      dtype='<U10')]


### Vocabulary

In [8]:
# generate vocabulary; filter out words that are too scare
tokens_data = list(itertools.chain.from_iterable(data.tokens))

# filter out least common tokens
counter_tokens = Counter(tokens_data)

vocab_size = len(set(tokens_data))
vocab = list(set(tokens_data))

# remove all tokens that appear in less than TOKENS_MIN_COUNT times
filtered_tokens = [token for token in tokens_data if counter_tokens[token] > TOKENS_MIN_COUNT]

vocab_size = len(set(filtered_tokens))
vocab = list(set(filtered_tokens))

print("initial number of tokens: ", len(tokens_data))
print("filtered number of tokens: ", len(filtered_tokens))
print("vocabulary size:", vocab_size)

initial number of tokens:  4512785
filtered number of tokens:  4386666
vocabulary size: 11042


In [9]:
# filtered out tokens
filtered_out_tokens = np.unique([token for token in tokens_data if counter_tokens[token] <= TOKENS_MIN_COUNT])

In [10]:
print(filtered_out_tokens)

["'-.k" "'-b" "'-c" ... '😀' '😊' '😔']


In [11]:
print(np.random.choice(filtered_out_tokens, 50, replace = False))

['sub-group' 'themed' 'emerge' 'outs' 'grm.' 'hox' 'nonstochastic.'
 'messes' 'rules-based' 'isometry' 's-curves.' 'afl' 'a\\end' 'min-hash'
 'simplex.' 'ziggurat' 'steffen' 'layers.dense' 'dlogis' 'alba'
 'satterwaite'
 'plot-all-scatterplots-and-peak-those-with-biggest-white-area' 'autocode'
 'd.v.' 'drivings' 'durring' 'discoveries' 'ommited' 'noah' 'datas.'
 'classi-' 'ngene' 'p-vals' 'izenman' 'lookup.' 'punctuations'
 'crash-introduction' 'imply.' 'mccaffrey' 're-posting' 'boss.'
 'salakhutdinov' 'holds..' 'sentence…' 'sprints' 'admm' 'young.' 'chernik'
 'mineral' 'arching']


### Out of Vocabulary

In [12]:
vocab.append('UNKNOWN')
vocab_size += 1

### Tokens as vocabulary indexes

In [13]:
mapping = {x: i for i, x in enumerate(vocab)}

def get_index(token):
    try:
        return mapping[token]
    except:
        return mapping['UNKNOWN']
    
data['tokens_index'] = data.tokens.apply(lambda tokens: np.array([get_index(token) for token in tokens]))

In [14]:
print(data.tokens_index.head(3).values)

[array([ 3908,  1479,  5602,  2856,  1005,  6551,  4523,  8577,   853,
         259,   122,  1005,  1748,  6948,  4523,  7303,  2739,  3402,
       10631,  1571,  4107,  1177,  1005,  3205,  3997, 10812, 10137,
        9542,  6631,  5178,  6174, 11042,  7219,  9394,  4520,  1005,
        5178,  6174, 11042,  1005,  8569,  8955,  9880,  7940,  5316,
        3908,  2072,  9236,  5985,  1557,  2198,  2179,  2291,  2739,
       11042, 11042,  9654])
 array([ 2142,  3967,  9236,  6894,  3967,  9236,  3402,  3242,  7219,
        9253, 11042,  2906,  1479,  3402,  3242,  1512,  5316, 11042,
        9058, 11042,  6336,  5742,  1596,  3908,  1614,  1479,  3913,
        1005,  7298,  2142,  6948,  2406,  2739,  1716,  8255,  1479,
        1005,  1005,  8779,  6479,  1198,  1479,  1005,  1005,  1005,
        2906,  1479,  3402,  3242,  1512,  1005,  5122,  5770,  1479,
        7283,  7006])
 array([ 3997,  4005,  6235,  9542,  4457,  6948,  2761,  6399, 11042,
        8779,  9334,  2739,  9555,  

### Sequence generation

In [15]:
def sequence_generation(word):
    sequences = []
    _end = SEQUENCE_WINDOW
    while _end < len(word) + SEQUENCE_WINDOW:
        sequences.append(word[:_end])
        _end += SEQUENCE_WINDOW
        
    padded_seq = pad_sequences(sequences, maxlen=SEQUENCE_LEN, padding='pre')
    return padded_seq

In [16]:
# using the sequence generation
words_sequences = data.tokens_index.apply(sequence_generation)

In [18]:
from tqdm import tqdm

x = 0
for seq in tqdm(words_sequences.values):
    if x == 0:
        sequences = seq
    else:
        sequences = np.concatenate((sequences, seq))
    x += 1

100%|██████████| 77634/77634 [07:30<00:00, 172.40it/s] 


In [19]:
print("Sequences.shape: ", sequences.shape)

Sequences.shape:  (1157748, 13)


In [26]:
# predictors and labels for the classification task

predictors = sequences[:, :-1]
label = sequences[:, -1]

print("Predictors.shape: ", predictors.shape)
print("Label.shape: ", label.shape)

# keras to_categorical function transform the vocab_size vector of labels into a one hot encoded matrix
# dimension (n, vocab_size)
label_category = to_categorical(label, num_classes=vocab_size)
#print("Label_category.shape: ", label_category.shape)

Predictors.shape:  (1157748, 12)
Label.shape:  (1157748,)


MemoryError: Unable to allocate 47.6 GiB for an array with shape (1157748, 11043) and data type float32

### Model

In [23]:
# define model

embedding_dim = 64

dl_model = Sequential()
dl_model.add(Embedding(vocab_size, embedding_dim, input_length=SEQUENCE_LEN -1))
dl_model.add(LSTM(128, return_sequences=True))
dl_model.add(Dense(vocab_size, activation='softmax'))
optimizer = RMSprop(lr=0.01)

dl_model.compile(loss='categorical_crossentropy', 
                optimizer=optimizer,
                metrics=['accuracy'])

print(dl_model.summary())

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 12, 64)            706752    
_________________________________________________________________
lstm_1 (LSTM)                (None, 12, 128)           98816     
_________________________________________________________________
dense_1 (Dense)              (None, 12, 11043)         1424547   
Total params: 2,230,115
Trainable params: 2,230,115
Non-trainable params: 0
_________________________________________________________________
None


In [24]:
# model fitting

dl_model.fit(predictors, label_cat, batch_size = 256, epochs=4, verbose=1)

NameError: name 'label_cat' is not defined