<a href="https://colab.research.google.com/github/BYU-Handwriting-Lab/GettingStarted/blob/solution/notebooks/language-model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Language Model

This notebook provides code to create a character-level language model in 
TensorFlow.

### Dependencies

Import the necessary dependencies and download our character set and corpus.

In [1]:
import tensorflow as tf

import string
import random
import re

import json
import numpy as np
import pandas as pd
from tqdm import tqdm

In [2]:
!wget -q https://raw.githubusercontent.com/ericburdett/named-entity-recognition/master/char_set.json
!wget -q --load-cookies /tmp/cookies.txt "https://docs.google.com/uc?export=download&confirm=$(wget --quiet --save-cookies /tmp/cookies.txt --keep-session-cookies --no-check-certificate 'https://docs.google.com/uc?export=download&id=1ZsJ8cZSDU98GpcK-kl_Cq3eTt-R2YvSJ' -O- | sed -rn 's/.*confirm=([0-9A-Za-z_]+).*/\1\n/p')&id=1ZsJ8cZSDU98GpcK-kl_Cq3eTt-R2YvSJ" -O french_ner_dataset.csv && rm -rf /tmp/cookies.txt

In [3]:
# ID: 1M26Gpca8Ug4YvRLxoUDDCjMBeJtojITY
!wget -q --load-cookies /tmp/cookies.txt "https://docs.google.com/uc?export=download&confirm=$(wget --quiet --save-cookies /tmp/cookies.txt --keep-session-cookies --no-check-certificate 'https://docs.google.com/uc?export=download&id=1wDMLz9hTmfvPhkhCHTylbeAU6Utpkqb1' -O- | sed -rn 's/.*confirm=([0-9A-Za-z_]+).*/\1\n/p')&id=1wDMLz9hTmfvPhkhCHTylbeAU6Utpkqb1" -O french_text.txt && rm -rf /tmp/cookies.tx

## Load the Corpus

Define some constants to help us know which characters are used for words and
which are used for punctuation/digits.

Load the corpus to be used for tokenization and dataset creation.

In [2]:
DEFAULT_CHARS = ' !"#$%&\'()*+,-./0123456789:;=?ABCDEFGHIJKLMNOPQRSTUVWXYZ[]_`abcdefghijklmnopqrstuvwxyz|~£§¨«¬\xad' \
                '°²´·º»¼½¾ÀÂÄÇÈÉÊÔÖÜßàáâäæçèéêëìîïñòóôöøùúûüÿłŒœΓΖΤάήαδεηικλμνξοπρτυχψωόώІ‒–—†‡‰‹›₂₤℔⅓⅔⅕⅖⅗⅘⅙⅚⅛∆∇∫≠□♀♂✓ｆ'
# The default list of non-punctuation characters needed for the word beam search decoding algorithm
DEFAULT_NON_PUNCTUATION = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyzÀÂÄÇÈÉÊÔÖÜßàáâäæçèéêëìîïñòóôöøùúûüÿ' \
                          'łŒœΓΖΤάήαδεηικλμνξοπρτυχψωόώІ'

DEFAULT_PUNCTUATION = string.punctuation + '0123456789'

In [3]:
lines = open('french_text.txt', 'r', encoding='utf8').readlines()

french_words = []
for line in lines:
    french_words.extend(line.split())
french_words = ' '.join(french_words)

## Tokenization

One of the hardest parts is creating a good tokenization method.

This tokenizer will create a token for each word. Each punctuation or digit
character will have its own token.

In [4]:
class Tokenizer:
    def __init__(self, corpus, word_chars, punctuation, lower=False):
        self.word_chars = word_chars
        self.punctuation = punctuation
        self.regex = r"[" + self.word_chars + r"]+|[^\s]" 

        words = self.split(corpus)
        all_words_list = words + list(punctuation)
        all_words_list_unique = list(set(all_words_list))
        all_words = [' '.join(all_words_list_unique)]

        self.total_tokens = len(all_words_list_unique) + 2 # +2 to account for 0 (reserved) and 1 (OOV)
        self.tokenizer = tf.keras.preprocessing.text.Tokenizer(num_words=self.total_tokens, filters='', lower=lower, oov_token='<OOV>')
        self.tokenizer.fit_on_texts(all_words)

    def split(self, text):
        return re.findall(self.regex, text)

    def texts_to_sequences(self, text):
        words = self.split(text)
        return self.tokenizer.texts_to_sequences([' '.join(words)])

In [5]:
tokenizer = Tokenizer(french_words, DEFAULT_NON_PUNCTUATION, DEFAULT_PUNCTUATION, lower=False)
sentence = 'acte de deces-de..1832(hello)5eme'

print('Original Sentence:', sentence)
print('Split Sentence:', tokenizer.split(sentence))
print('Tokenized Sentence:', tokenizer.texts_to_sequences(sentence))

Original Sentence: acte de deces-de..1832(hello)5eme
Split Sentence: ['acte', 'de', 'deces', '-', 'de', '.', '.', '1', '8', '3', '2', '(', 'hello', ')', '5', 'eme']
Tokenized Sentence: [[40723, 11811, 17855, 11916, 11811, 34176, 34176, 33369, 11084, 2654, 13378, 2522, 1, 38654, 3772, 11895]]


In [6]:
embedding = tf.keras.layers.Embedding(tokenizer.total_tokens, 1024)

sequence = tokenizer.texts_to_sequences(sentence)
sequence = tf.keras.preprocessing.sequence.pad_sequences(sequence, maxlen=1)

tf.squeeze(embedding(sequence))

<tf.Tensor: shape=(1024,), dtype=float32, numpy=
array([-0.04362495, -0.02038902,  0.01931791, ..., -0.048725  ,
       -0.0081457 ,  0.04110047], dtype=float32)>

## Dataset Creation

Create the Tensorflow dataset using the tokenizer created above.

In [7]:
def create_context_pairs(words, window_size, negative_sample_size):
    focus_words = []
    context_words = []
    labels = []

    # Add positive samples
    for index in tqdm(range(len(words)), desc='Positive Samples'):

        # Grab words to the left:
        for i in range(1, window_size + 1):
            left_index = index - i
            if left_index >= 0:
                focus_words.append(words[index])
                context_words.append(words[left_index])
                labels.append(1)
        
        # Grab words to the right:
        for i in range(1, window_size + 1):
            right_index = index + i
            if right_index < len(words):
                focus_words.append(words[index])
                context_words.append(words[right_index])
                labels.append(1)               
    
    # Add negative samples
    for word in tqdm(words, desc='Negative Samples'):
        for i in range(negative_sample_size):
            index = random.randint(0, len(words) - 1)
            focus_words.append(word)
            context_words.append(words[index])
            labels.append(0)

    # Shuffle the dataset
    print('Shuffling dataset...')
    zipped = list(zip(focus_words, context_words, labels))
    random.shuffle(zipped)
    focus_words, context_words, labels = zip(*zipped)
    print('Done.')
    
    return tf.constant(focus_words, dtype=tf.int32), tf.constant(context_words, dtype=tf.int32), tf.constant(labels, dtype=tf.int32)

In [8]:
# Tokenize the entire corpus
tokenized_french_words = tokenizer.texts_to_sequences(french_words)[0]

# Create pairs of words that should be similar
focus_words, context_words, labels = create_context_pairs(tokenized_french_words, 3, 5)

# Create the dataset
dataset = tf.data.Dataset.from_tensor_slices((focus_words, context_words, labels))

Positive Samples: 100%|██████████| 1086647/1086647 [00:03<00:00, 329820.29it/s]
Negative Samples: 100%|██████████| 1086647/1086647 [00:10<00:00, 105715.46it/s]


Shuffling dataset...
Done.


In [9]:
# Show one batch of 100 words
for focus_word, context_word, label in dataset.batch(100).take(1):
    print(focus_word)
    print(context_word)
    print(label)

tf.Tensor(
[ 4671 27998    59  4620 34176 35618 26717 18712 11811  7696 12237 34176
  2355 26298 28959 34151 33813 35400  7050 26781 32178  7696   206 10420
 11496  8508 24171 42402 11811 34761 28959 23865 23293 31696 32178 29249
  2624 20677 11811 11811 29249 32178 38956 21087  2522 23438 18183 28491
 11811  1631  9206 25033  5722 34884 26191 31121  3772 36399 12624  2512
 34761 38654  6936 32649 32063 34761  2368 41950  3778 11811 17699 40589
 41972 39089  8287 32178 32063  8273 27188  7696 39407 32063 33369 11811
 34890 30355  2512 11811 29237 11916 13515 15962  7785 28491 24512 39407
 22895 11811 42402  3772], shape=(100,), dtype=int32)
tf.Tensor(
[27738 34890 32145  7785 10839 34761 31573 11811 26717  9157 40723 32178
 14090 17762 39999 23808 23606 11811 13378 34890  3845 17750 34761 33369
 42279 38780  1575 36227 38780 31573  1465  9206 29249 13378 10532  2012
 34449 36962 10173 30355 11811 27188 30355 20249  4511 36693 14228 13215
 22895 27998 32443 13106 15951  1465  6469 18344

### Model Creation

Build our simple model that includes an embedding layer, recurrent layer, and
dense layer to get us down to the number of classes.

In [10]:
class LanguageModel(tf.keras.Model):
    def __init__(self, vocab_size, embedding_dim=128, embedding_weights):
        super(LanguageModel, self).__init__()

        self.embedding = tf.keras.layers.Embedding(input_dim=vocab_size, output_dim=embedding_dim)
        self.context = tf.keras.layers.Embedding(input_dim=vocab_size, output_dim=embedding_dim)
        self.dot = tf.keras.layers.Dot(axes=(1,1))
    
    def call(self, target, context, training=False):
        if training:
            target = self.embedding(target)
            context = self.context(context)
            dot = self.dot([target, context])
            out = tf.keras.activations.sigmoid(dot)
        else:
            target = self.embedding(target)
            context = self.embedding(context)
            dot = self.dot([target, context])
            out = tf.keras.activations.sigmoid(dot)

        return out

Test it out just to make sure it works.

In [11]:
model = LanguageModel(tokenizer.total_tokens)

loss_fn = tf.keras.losses.BinaryCrossentropy()

for focus_word, context_word, label in dataset.shuffle(5000).batch(5).take(1):
    output = model(focus_word, context_word)
    loss = loss_fn(label, output)
    print('Output Similarity Predictions:', output.numpy())
    print('Output Similarity Actual', label.numpy())
    print('Loss:', loss.numpy())

Output Similarity Predictions: [[0.5001859 ]
 [0.5008042 ]
 [0.50052506]
 [0.49966443]
 [0.49489838]]
Output Similarity Actual [0 0 0 0 1]
Loss: 0.6956704


### Train the Model

Train the model based on the text in our corpus.

The goal is to predict the next character. Thus, the target is the input tensor
rolled by one character.

In [12]:
TRAIN_SPLIT_SIZE = 1.0
SHUFFLE_SIZE = 100_000
BATCH_SIZE = 20_000
EPOCHS = 100
LEARNING_RATE = 2e-3


dataset_size = dataset.cardinality().numpy()
train_dataset_size = int(dataset_size * TRAIN_SPLIT_SIZE)
val_dataset_size = dataset_size - train_dataset_size



train_dataset = dataset.take(train_dataset_size)\
                    .shuffle(SHUFFLE_SIZE)\
                    .batch(BATCH_SIZE)

val_dataset = dataset.skip(train_dataset_size)\
                    .batch(BATCH_SIZE)


# model = LanguageModel(tokenizer.total_tokens)
optimizer = tf.keras.optimizers.Adam(LEARNING_RATE)
loss_fn = tf.keras.losses.BinaryCrossentropy()

train_loss = tf.keras.metrics.Mean(name='train_loss')
val_loss = tf.keras.metrics.Mean(name='train_loss')

@tf.function
def train_step(focus_word, context_word, label):
    with tf.GradientTape() as tape:
        output = model(focus_word, context_word, training=True)
        loss = loss_fn(label, output)
    
    gradients = tape.gradient(loss, model.trainable_variables)
    optimizer.apply_gradients(zip(gradients, model.trainable_variables))

    train_loss(loss)


@tf.function
def val_step(focus_word, context_word, label):
    output = model(focus_word, context_word, training=True)
    loss = loss_fn(label, output)
    val_loss(loss)


# Main Training Loop
for epoch in range(EPOCHS):
    train_loss.reset_states()
    val_loss.reset_states()

    # Train Loop
    train_loop = tqdm(total=train_dataset.cardinality().numpy(), position=0, leave=True)
    for focus_word, context_word, label in train_dataset:
        train_step(focus_word, context_word, label)
        train_loop.set_description('Train - Epoch: {}, Loss: {:.4f}'.format(epoch, train_loss.result()))
        train_loop.update(1)
    train_loop.close()

    # Validation Loop
    val_loop = tqdm(total=val_dataset.cardinality().numpy(), position=0, leave=True)
    for focus_word, context_word, label in val_dataset:
        val_step(focus_word, context_word, label)
        val_loop.set_description('Val   - Epoch: {}, Loss: {:.4f}'.format(epoch, val_loss.result()))
        val_loop.update(1)
    val_loop.close()

Train - Epoch: 0, Loss: 0.5686: 100%|██████████| 598/598 [02:07<00:00,  4.69it/s]
0it [00:05, ?it/s]
Train - Epoch: 1, Loss: 0.4995: 100%|██████████| 598/598 [02:11<00:00,  4.56it/s]
0it [00:05, ?it/s]
Train - Epoch: 2, Loss: 0.4745: 100%|██████████| 598/598 [02:08<00:00,  4.65it/s]
0it [00:05, ?it/s]
Train - Epoch: 3, Loss: 0.4583: 100%|██████████| 598/598 [02:09<00:00,  4.63it/s]
0it [00:05, ?it/s]
Train - Epoch: 4, Loss: 0.4465: 100%|██████████| 598/598 [02:09<00:00,  4.63it/s]
0it [00:05, ?it/s]
Train - Epoch: 5, Loss: 0.4376: 100%|██████████| 598/598 [02:12<00:00,  4.51it/s]
0it [00:05, ?it/s]
Train - Epoch: 6, Loss: 0.4307: 100%|██████████| 598/598 [02:08<00:00,  4.66it/s]
0it [00:05, ?it/s]
Train - Epoch: 7, Loss: 0.4253: 100%|██████████| 598/598 [02:09<00:00,  4.62it/s]
0it [00:05, ?it/s]
Train - Epoch: 8, Loss: 0.4210: 100%|██████████| 598/598 [02:08<00:00,  4.66it/s]
0it [00:05, ?it/s]
Train - Epoch: 9, Loss: 0.4176: 100%|██████████| 598/598 [02:08<00:00,  4.65it/s]
0it [00:0

KeyboardInterrupt: ignored

In [None]:
tf.keras.layers.Embedding(tokenizer.total_tokens, output_dim=512, input_length=2)(tf.constant([1255, 28966], dtype=tf.int32))

<tf.Tensor: shape=(2, 512), dtype=float32, numpy=
array([[ 0.01976549,  0.04826181, -0.01614144, ..., -0.0477993 ,
         0.00367948, -0.00162051],
       [ 0.02253098,  0.03827525, -0.03255246, ...,  0.00961969,
         0.04651088, -0.018433  ]], dtype=float32)>

In [15]:
tokenizer.texts_to_sequences('jean mary')

[[20871, 29765]]

In [38]:
embedding_weights = model.embedding.get_weights()

np.save('french_embedding_weights', embedding_weights)

In [41]:
embedding_weights_reloaded = np.load('french_embedding_weights.npy')

new_embedding = tf.keras.layers.Embedding(tokenizer.total_tokens, 128, mask_zero=True, weights=embedding_weights_reloaded)

new_embedding(tf.constant([1], dtype=tf.int32))

<tf.Tensor: shape=(1, 128), dtype=float32, numpy=
array([[-0.01672193, -0.0382713 ,  0.02569241,  0.01471088, -0.00602205,
         0.00716635, -0.01385874, -0.00165684, -0.04858153, -0.00328467,
        -0.02989974, -0.0123291 , -0.02526921,  0.01982469,  0.01533956,
         0.00945605,  0.03954988,  0.02423957, -0.02601607,  0.03208761,
        -0.02730244,  0.0046377 ,  0.03404119, -0.03013763, -0.00308994,
         0.04550319, -0.03893144, -0.00115867, -0.00341796, -0.01423977,
        -0.04445725,  0.03110829, -0.04505843,  0.03882005, -0.03786334,
        -0.01588675, -0.02065988, -0.03719121, -0.02261881, -0.00511817,
         0.00354328, -0.04019163,  0.00448092, -0.02389852, -0.04665628,
         0.04998109,  0.0340411 , -0.00846102, -0.00019098,  0.01156087,
        -0.00913759,  0.00788335,  0.01099713, -0.01221796, -0.02866399,
         0.03300605, -0.01870582,  0.004679  , -0.04505099,  0.01431693,
         0.0286878 , -0.01841231,  0.04636187, -0.03951782,  0.01553818,
 

In [None]:
loss_fn([0], [0.1])

<tf.Tensor: shape=(), dtype=float32, numpy=0.10536041>

In [16]:
model(tf.constant([20871]), tf.constant([29765]))

<tf.Tensor: shape=(1, 1), dtype=float32, numpy=array([[0.9957212]], dtype=float32)>

In [None]:
tokenizer.texts_to_sequences([['Jean', 'Mary']])

In [None]:
model(tokenizer.t)

In [32]:
from google.colab import drive
drive.mount('drive')

Mounted at drive


In [34]:
!cp french_embedding_weights.npy 'drive/My Drive/'

### Character-Level Results

Observe the results by generating text one character at a time.

Run this code block if you chose the character-level dataset

In [None]:
input = tf.constant([197])
string_output = ''
k = 2
model.gru.reset_states()
for _ in range(200):  # Max number of iterations
    output = model(input)
    char_idx = np.random.choice(tf.math.top_k(output, k=k).indices.numpy()[0])
    if char_idx == 198:
        break
    string_output += mapper.idx_to_char(char_idx)
    input = tf.constant([char_idx])

print(string_output)

c'une dux-sevatier à civellie he Mremen de querarancisquin maite de Stint née àa Marine mandien Avels neur en sept ans, tors apons du secour en,, dé laven apés neufante du sorre et de la cinq hons som


### Word-Level Results

Observe the results by generating text one word at a time.

Run this code block if you chose the word-level dataset.

In [None]:
input = tf.constant([1042])  # Start token
k = 30
model.gru.reset_states()
sequences = []
for _ in range(15):
    output = model(input)
    char_idx = np.random.choice(tf.math.top_k(output, k=k).indices.numpy()[0])
    if char_idx == 1043:
        break
    sequences.append(char_idx)

print(tokenizer.sequences_to_texts([sequences]))

['huit la mil en deux deux à Francois trente quatre la en deux à cinq']
