<a href="https://colab.research.google.com/github/BYU-Handwriting-Lab/GettingStarted/blob/solution/notebooks/language-model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Language Model

This notebook provides code to create a character-level language model in 
TensorFlow.

### Dependencies

Import the necessary dependencies and download our character set and corpus.

In [3]:
import tensorflow as tf

import string
import random
import re

import json
import numpy as np
import pandas as pd
from tqdm import tqdm

In [2]:
!wget -q https://raw.githubusercontent.com/ericburdett/named-entity-recognition/master/char_set.json
!wget -q --load-cookies /tmp/cookies.txt "https://docs.google.com/uc?export=download&confirm=$(wget --quiet --save-cookies /tmp/cookies.txt --keep-session-cookies --no-check-certificate 'https://docs.google.com/uc?export=download&id=1ZsJ8cZSDU98GpcK-kl_Cq3eTt-R2YvSJ' -O- | sed -rn 's/.*confirm=([0-9A-Za-z_]+).*/\1\n/p')&id=1ZsJ8cZSDU98GpcK-kl_Cq3eTt-R2YvSJ" -O french_ner_dataset.csv && rm -rf /tmp/cookies.txt

In [3]:
# ID: 1M26Gpca8Ug4YvRLxoUDDCjMBeJtojITY
!wget -q --load-cookies /tmp/cookies.txt "https://docs.google.com/uc?export=download&confirm=$(wget --quiet --save-cookies /tmp/cookies.txt --keep-session-cookies --no-check-certificate 'https://docs.google.com/uc?export=download&id=1wDMLz9hTmfvPhkhCHTylbeAU6Utpkqb1' -O- | sed -rn 's/.*confirm=([0-9A-Za-z_]+).*/\1\n/p')&id=1wDMLz9hTmfvPhkhCHTylbeAU6Utpkqb1" -O french_text.txt && rm -rf /tmp/cookies.tx

## Load the Corpus

Define some constants to help us know which characters are used for words and
which are used for punctuation/digits.

Load the corpus to be used for tokenization and dataset creation.

In [4]:
DEFAULT_CHARS = ' !"#$%&\'()*+,-./0123456789:;=?ABCDEFGHIJKLMNOPQRSTUVWXYZ[]_`abcdefghijklmnopqrstuvwxyz|~£§¨«¬\xad' \
                '°²´·º»¼½¾ÀÂÄÇÈÉÊÔÖÜßàáâäæçèéêëìîïñòóôöøùúûüÿłŒœΓΖΤάήαδεηικλμνξοπρτυχψωόώІ‒–—†‡‰‹›₂₤℔⅓⅔⅕⅖⅗⅘⅙⅚⅛∆∇∫≠□♀♂✓ｆ'
# The default list of non-punctuation characters needed for the word beam search decoding algorithm
DEFAULT_NON_PUNCTUATION = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyzÀÂÄÇÈÉÊÔÖÜßàáâäæçèéêëìîïñòóôöøùúûüÿ' \
                          'łŒœΓΖΤάήαδεηικλμνξοπρτυχψωόώІ'

DEFAULT_PUNCTUATION = string.punctuation + '0123456789'

In [5]:
lines = open('french_text.txt', 'r', encoding='utf8').readlines()

french_words = []
for line in lines:
    french_words.extend(line.split())
french_words = ' '.join(french_words)

## Tokenization

One of the hardest parts is creating a good tokenization method.

This tokenizer will create a token for each word. Each punctuation or digit
character will have its own token.

In [6]:
class Tokenizer:
    def __init__(self, corpus, word_chars, punctuation, lower=False):
        self.word_chars = word_chars
        self.punctuation = punctuation
        self.regex = r"[" + self.word_chars + r"]+|[^\s]" 

        words = self.split(corpus)
        all_words_list = words + list(punctuation)
        all_words_list_unique = list(set(all_words_list))
        all_words = [' '.join(all_words_list_unique)]

        self.total_tokens = len(all_words_list_unique) + 2 # +2 to account for 0 (reserved) and 1 (OOV)
        self.tokenizer = tf.keras.preprocessing.text.Tokenizer(num_words=self.total_tokens, filters='', lower=lower, oov_token='<OOV>')
        self.tokenizer.fit_on_texts(all_words)

    def split(self, text):
        return re.findall(self.regex, text)

    def texts_to_sequences(self, text):
        words = self.split(text)
        return self.tokenizer.texts_to_sequences([' '.join(words)])

In [7]:
tokenizer = Tokenizer(french_words, DEFAULT_NON_PUNCTUATION, DEFAULT_PUNCTUATION, lower=False)
sentence = 'acte de deces-de..1832(hello)5eme'

print('Original Sentence:', sentence)
print('Split Sentence:', tokenizer.split(sentence))
print('Tokenized Sentence:', tokenizer.texts_to_sequences(sentence))

Original Sentence: acte de deces-de..1832(hello)5eme
Split Sentence: ['acte', 'de', 'deces', '-', 'de', '.', '.', '1', '8', '3', '2', '(', 'hello', ')', '5', 'eme']
Tokenized Sentence: [[33383, 33521, 3575, 38751, 33521, 23647, 23647, 17975, 4601, 457, 31340, 41614, 1, 5024, 33450, 41406]]


In [8]:
embedding = tf.keras.layers.Embedding(tokenizer.total_tokens, 1024)

sequence = tokenizer.texts_to_sequences(sentence)
sequence = tf.keras.preprocessing.sequence.pad_sequences(sequence, maxlen=1)

tf.squeeze(embedding(sequence))

<tf.Tensor: shape=(1024,), dtype=float32, numpy=
array([-0.01646097,  0.03190211, -0.02221891, ...,  0.02684306,
        0.02210755,  0.01925501], dtype=float32)>

## Dataset Creation

Create the Tensorflow dataset using the tokenizer created above.

In [9]:
def create_context_pairs(words, window_size, negative_sample_size):
    focus_words = []
    context_words = []
    labels = []

    # Add positive samples
    for index in tqdm(range(len(words)), desc='Positive Samples'):

        # Grab words to the left:
        for i in range(1, window_size + 1):
            left_index = index - i
            if left_index >= 0:
                focus_words.append(words[index])
                context_words.append(words[left_index])
                labels.append(1)
        
        # Grab words to the right:
        for i in range(1, window_size + 1):
            right_index = index + i
            if right_index < len(words):
                focus_words.append(words[index])
                context_words.append(words[right_index])
                labels.append(1)               
    
    # Add negative samples
    for word in tqdm(words, desc='Negative Samples'):
        for i in range(negative_sample_size):
            index = random.randint(0, len(words) - 1)
            focus_words.append(word)
            context_words.append(words[index])
            labels.append(0)

    # Shuffle the dataset
    print('Shuffling dataset...')
    zipped = list(zip(focus_words, context_words, labels))
    random.shuffle(zipped)
    focus_words, context_words, labels = zip(*zipped)
    print('Done.')
    
    return tf.constant(focus_words, dtype=tf.int32), tf.constant(context_words, dtype=tf.int32), tf.constant(labels, dtype=tf.int32)

In [10]:
# Tokenize the entire corpus
tokenized_french_words = tokenizer.texts_to_sequences(french_words)[0]

# Create pairs of words that should be similar
focus_words, context_words, labels = create_context_pairs(tokenized_french_words, 3, 5)

# Create the dataset
dataset = tf.data.Dataset.from_tensor_slices((focus_words, context_words, labels))

Positive Samples: 100%|██████████| 1086647/1086647 [00:03<00:00, 330448.60it/s]
Negative Samples: 100%|██████████| 1086647/1086647 [00:10<00:00, 105845.78it/s]


Shuffling dataset...
Done.


In [11]:
# Show one batch of 100 words
for focus_word, context_word, label in dataset.batch(100).take(1):
    print(focus_word)
    print(context_word)
    print(label)

tf.Tensor(
[ 6220 32265 26996 36247  8593  3543 25441 38751 32265 40117 32644 28966
 32265 21783  9741 33521 38377  8094 38196  2280  9971 32051 33521 34232
 17770 41328  7097 27098  8718 23690 12659 41968 23258 38967  9971 17975
 17401 12131  9971 19565 17015  3575  5088 17975 28540 10746  6281 21050
  7413 30593 18125  6675 25859 32644  9230 29035 14308 42149 14251 16043
 17975 24481 32265 38967 38196 30593 33521 24097 28668  7477 14768 36029
 28035 23690  7363  1903 32265 31127  9035 41309 19559 33521 24322 25936
 11710 36247 17091  6194  9971 16798  2966 33521 38751 21050 21375  3575
 16756 41113 38377 33450], shape=(100,), dtype=int32)
tf.Tensor(
[12994 27098 14503 28966 35770  8008 26223 20821 21783  4099 20439 38196
  6220 25859 33042 10104 19681 18125 41490 27454 32644 37856 37651 19795
   736  7477  7363 21050 16019 15099  9971 33616 38196 26009 23690 40825
 33521  8805  5659 29035  4104 41328 19735   593 14267 31831  1430  3584
 31340 17975 25080 27609 22645 38377 19565 25859

### Model Creation

Build our simple model that includes an embedding layer, recurrent layer, and
dense layer to get us down to the number of classes.

In [85]:
class LanguageModel(tf.keras.Model):
    def __init__(self, vocab_size, embedding_dim=128):
        super(LanguageModel, self).__init__()

        self.embedding = tf.keras.layers.Embedding(input_dim=vocab_size, output_dim=embedding_dim)
        self.context = tf.keras.layers.Embedding(input_dim=vocab_size, output_dim=embedding_dim)
        self.dot = tf.keras.layers.Dot(axes=(1,1))
    
    def call(self, target, context, training=False):
        if training:
            target = self.embedding(target)
            context = self.context(context)
            dot = self.dot([target, context])
            out = tf.keras.activations.sigmoid(dot)
        else:
            target = self.embedding(target)
            context = self.embedding(context)
            dot = self.dot([target, context])
            out = tf.keras.activations.sigmoid(dot)

        return out

Test it out just to make sure it works.

In [46]:
model = LanguageModel(tokenizer.total_tokens)

loss_fn = tf.keras.losses.BinaryCrossentropy()

for focus_word, context_word, label in dataset.shuffle(5000).batch(5).take(1):
    output = model(focus_word, context_word)
    loss = loss_fn(label, output)
    print('Output Similarity Predictions:', output.numpy())
    print('Output Similarity Actual', label.numpy())
    print('Loss:', loss.numpy())

Output Similarity Predictions: [[0.50147766]
 [0.49982056]
 [0.4956222 ]
 [0.49872485]
 [0.49849376]]
Output Similarity Actual [1 1 1 1 0]
Loss: 0.69429654


### Train the Model

Train the model based on the text in our corpus.

The goal is to predict the next character. Thus, the target is the input tensor
rolled by one character.

In [54]:
dataset.cardinality().numpy() * 0.8

9562484.0

In [None]:
TRAIN_SPLIT_SIZE = 1.0
SHUFFLE_SIZE = 100_000
BATCH_SIZE = 25_000
EPOCHS = 100
LEARNING_RATE = 5e-3


dataset_size = dataset.cardinality().numpy()
train_dataset_size = int(dataset_size * TRAIN_SPLIT_SIZE)
val_dataset_size = dataset_size - train_dataset_size



train_dataset = dataset.take(train_dataset_size)\
                    .shuffle(SHUFFLE_SIZE)\
                    .batch(BATCH_SIZE)

val_dataset = dataset.skip(train_dataset_size)\
                    .batch(BATCH_SIZE)


# model = LanguageModel(tokenizer.total_tokens)
optimizer = tf.keras.optimizers.Adam(LEARNING_RATE)
loss_fn = tf.keras.losses.BinaryCrossentropy()

train_loss = tf.keras.metrics.Mean(name='train_loss')
val_loss = tf.keras.metrics.Mean(name='train_loss')

@tf.function
def train_step(focus_word, context_word, label):
    with tf.GradientTape() as tape:
        output = model(focus_word, context_word, training=True)
        loss = loss_fn(label, output)
    
    gradients = tape.gradient(loss, model.trainable_variables)
    optimizer.apply_gradients(zip(gradients, model.trainable_variables))

    train_loss(loss)


@tf.function
def val_step(focus_word, context_word, label):
    output = model(focus_word, context_word, training=True)
    loss = loss_fn(label, output)
    val_loss(loss)


# Main Training Loop
for epoch in range(EPOCHS):
    train_loss.reset_states()
    val_loss.reset_states()

    # Train Loop
    train_loop = tqdm(total=train_dataset.cardinality().numpy(), position=0, leave=True)
    for focus_word, context_word, label in train_dataset:
        train_step(focus_word, context_word, label)
        train_loop.set_description('Train - Epoch: {}, Loss: {:.4f}'.format(epoch, train_loss.result()))
        train_loop.update(1)
    train_loop.close()

    # Validation Loop
    val_loop = tqdm(total=val_dataset.cardinality().numpy(), position=0, leave=True)
    for focus_word, context_word, label in val_dataset:
        val_step(focus_word, context_word, label)
        val_loop.set_description('Val   - Epoch: {}, Loss: {:.4f}'.format(epoch, val_loss.result()))
        val_loop.update(1)
    val_loop.close()

In [92]:
tf.keras.layers.Embedding(tokenizer.total_tokens, output_dim=512, input_length=2)(tf.constant([1255, 28966], dtype=tf.int32))

<tf.Tensor: shape=(2, 512), dtype=float32, numpy=
array([[ 0.01976549,  0.04826181, -0.01614144, ..., -0.0477993 ,
         0.00367948, -0.00162051],
       [ 0.02253098,  0.03827525, -0.03255246, ...,  0.00961969,
         0.04651088, -0.018433  ]], dtype=float32)>

In [98]:
tokenizer.texts_to_sequences('jean mary')

[[22097, 7923]]

In [77]:
loss_fn([0], [0.1])

<tf.Tensor: shape=(), dtype=float32, numpy=0.10536041>

In [99]:
model(tf.constant([22097]), tf.constant([7923]))

<tf.Tensor: shape=(1, 1), dtype=float32, numpy=array([[0.99998677]], dtype=float32)>

In [None]:
tokenizer.texts_to_sequences([['Jean', 'Mary']])

In [None]:
model(tokenizer.t)

### Character-Level Results

Observe the results by generating text one character at a time.

Run this code block if you chose the character-level dataset

In [None]:
input = tf.constant([197])
string_output = ''
k = 2
model.gru.reset_states()
for _ in range(200):  # Max number of iterations
    output = model(input)
    char_idx = np.random.choice(tf.math.top_k(output, k=k).indices.numpy()[0])
    if char_idx == 198:
        break
    string_output += mapper.idx_to_char(char_idx)
    input = tf.constant([char_idx])

print(string_output)

c'une dux-sevatier à civellie he Mremen de querarancisquin maite de Stint née àa Marine mandien Avels neur en sept ans, tors apons du secour en,, dé laven apés neufante du sorre et de la cinq hons som


### Word-Level Results

Observe the results by generating text one word at a time.

Run this code block if you chose the word-level dataset.

In [None]:
input = tf.constant([1042])  # Start token
k = 30
model.gru.reset_states()
sequences = []
for _ in range(15):
    output = model(input)
    char_idx = np.random.choice(tf.math.top_k(output, k=k).indices.numpy()[0])
    if char_idx == 1043:
        break
    sequences.append(char_idx)

print(tokenizer.sequences_to_texts([sequences]))

['huit la mil en deux deux à Francois trente quatre la en deux à cinq']
