In [3]:
import nltk
import numpy as np
import requests
import tensorflow as tf

from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.callbacks import ModelCheckpoint
from nltk.corpus import treebank, brown, conll2000
from sklearn.model_selection import train_test_split
from tensorflow import keras

In [None]:
nltk.download('treebank')
nltk.download('brown')
nltk.download('conll2000')
nltk.download('universal_tagset')

[nltk_data] Downloading package treebank to /root/nltk_data...
[nltk_data]   Unzipping corpora/treebank.zip.
[nltk_data] Downloading package brown to /root/nltk_data...
[nltk_data]   Unzipping corpora/brown.zip.
[nltk_data] Downloading package conll2000 to /root/nltk_data...
[nltk_data]   Unzipping corpora/conll2000.zip.
[nltk_data] Downloading package universal_tagset to /root/nltk_data...
[nltk_data]   Unzipping taggers/universal_tagset.zip.


True

### PoS tagger with Bidirectional LSTM

We are using the universal tagset so that different corpora will have the same PoS label

In [None]:
tagged_sentences = treebank.tagged_sents(tagset='universal') +\
                    brown.tagged_sents(tagset='universal') +\
                    conll2000.tagged_sents(tagset='universal')

print(tagged_sentences[0])
print(f"Dataset size: {len(tagged_sentences)}")

[('Pierre', 'NOUN'), ('Vinken', 'NOUN'), (',', '.'), ('61', 'NUM'), ('years', 'NOUN'), ('old', 'ADJ'), (',', '.'), ('will', 'VERB'), ('join', 'VERB'), ('the', 'DET'), ('board', 'NOUN'), ('as', 'ADP'), ('a', 'DET'), ('nonexecutive', 'ADJ'), ('director', 'NOUN'), ('Nov.', 'NOUN'), ('29', 'NUM'), ('.', '.')]
Dataset size: 72202


In [None]:
sentences, sentence_tags = [], []

for s in tagged_sentences:
    sentence, tags = zip(*s)
    sentences.append(list(sentence))
    sentence_tags.append(list(tags))

In [None]:
print(sentences[0])
print(sentence_tags[0])

['Pierre', 'Vinken', ',', '61', 'years', 'old', ',', 'will', 'join', 'the', 'board', 'as', 'a', 'nonexecutive', 'director', 'Nov.', '29', '.']
['NOUN', 'NOUN', '.', 'NUM', 'NOUN', 'ADJ', '.', 'VERB', 'VERB', 'DET', 'NOUN', 'ADP', 'DET', 'ADJ', 'NOUN', 'NOUN', 'NUM', '.']


In [None]:
print(len(sentences), len(sentence_tags))

72202 72202


In [None]:
train_ratio = 0.75
validation_ratio = 0.15
test_ratio = 0.10

X_train, X_test, y_train, y_test = train_test_split(sentences, sentence_tags,
                                                    test_size = 1 - train_ratio,
                                                    random_state=1)

X_val, X_test, y_val, y_test = train_test_split(X_test, y_test,
                                                test_size=test_ratio/(test_ratio + validation_ratio),
                                                random_state=1)

In [None]:
print(len(X_train), len(y_train))
print(len(X_val), len(y_val))
print(len(X_test), len(y_test))

54151 54151
10830 10830
7221 7221


In [None]:
sentence_tokenizer = keras.preprocessing.text.Tokenizer(oov_token = "<OOV>")

In [None]:
sentence_tokenizer.fit_on_texts(X_train)

In [None]:
print(f"Vocabulary size: {len(sentence_tokenizer.word_index)}")

Vocabulary size: 52041


In [None]:
# We also have to create a second tokenizer to tokenize our labels
tag_tokenizer = keras.preprocessing.text.Tokenizer()
tag_tokenizer.fit_on_texts(y_train)

In [None]:
print(f"Number of PoS tags: {len(tag_tokenizer.word_index)}\n")
tag_tokenizer.get_config()

Number of PoS tags: 12



{'num_words': None,
 'filters': '!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n',
 'lower': True,
 'split': ' ',
 'char_level': False,
 'oov_token': None,
 'document_count': 54151,
 'word_counts': '{"det": 126968, "verb": 174593, "adj": 80523, "adp": 136453, "noun": 286676, "adv": 51205, ".": 142935, "pron": 44684, "conj": 35060, "num": 21461, "prt": 31229, "x": 6090}',
 'word_docs': '{"adv": 29531, "adp": 43855, "det": 44747, "verb": 50837, ".": 53332, "adj": 36344, "noun": 51171, "conj": 24383, "num": 11964, "pron": 26965, "prt": 21777, "x": 2682}',
 'index_docs': '{"7": 29531, "4": 43855, "5": 44747, "2": 50837, "3": 53332, "6": 36344, "1": 51171, "9": 24383, "11": 11964, "8": 26965, "10": 21777, "12": 2682}',
 'index_word': '{"1": "noun", "2": "verb", "3": ".", "4": "adp", "5": "det", "6": "adj", "7": "adv", "8": "pron", "9": "conj", "10": "prt", "11": "num", "12": "x"}',
 'word_index': '{"noun": 1, "verb": 2, ".": 3, "adp": 4, "det": 5, "adj": 6, "adv": 7, "pron": 8, "conj": 9, "prt": 10, "

In [None]:
tag_tokenizer.word_index

{'noun': 1,
 'verb': 2,
 '.': 3,
 'adp': 4,
 'det': 5,
 'adj': 6,
 'adv': 7,
 'pron': 8,
 'conj': 9,
 'prt': 10,
 'num': 11,
 'x': 12}

In [None]:
X_train_seqs = sentence_tokenizer.texts_to_sequences(X_train)

In [None]:
print(X_train_seqs[0])

[27, 86, 21, 479, 7, 2, 920, 10903, 20547, 3327, 5644, 337, 4]


In [None]:
print(f"Original: {X_train[0]}")
print(f"Reconstructed: {sentence_tokenizer.sequences_to_texts([X_train_seqs[0]])}")

Original: ['This', 'may', 'be', 'due', 'to', 'the', 'heavy', 'interlobular', 'connective', 'tissue', 'barriers', 'present', '.']
Reconstructed: ['this may be due to the heavy interlobular connective tissue barriers present .']


In [None]:
y_train_seqs = tag_tokenizer.texts_to_sequences(y_train)

In [None]:
print(f"Original: {y_train[0]}")
print(f"Reconstructed: {tag_tokenizer.sequences_to_texts([y_train_seqs[0]])}")

Original: ['DET', 'VERB', 'VERB', 'ADJ', 'ADP', 'DET', 'ADJ', 'ADJ', 'ADJ', 'NOUN', 'NOUN', 'ADV', '.']
Reconstructed: ['det verb verb adj adp det adj adj adj noun noun adv .']


In [None]:
X_val_seqs = sentence_tokenizer.texts_to_sequences(X_val)
y_val_seqs = tag_tokenizer.texts_to_sequences(y_val)

In [None]:
MAX_LENGTH = len(max(X_train_seqs, key=len))
print(f"Length of longest input sequence: {MAX_LENGTH}")

Length of longest input sequence: 161


In [None]:
X_train_padded = keras.preprocessing.sequence.pad_sequences(X_train_seqs, padding="post", maxlen=MAX_LENGTH)

In [None]:
print(X_train_padded[0])

[   27    86    21   479     7     2   920 10903 20547  3327  5644   337
     4     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0]


In [None]:
y_train_padded = keras.preprocessing.sequence.pad_sequences(y_train_seqs, padding="post", maxlen=MAX_LENGTH)

In [None]:
X_val_padded = keras.preprocessing.sequence.pad_sequences(X_val_seqs, padding="post", maxlen=MAX_LENGTH)
y_val_padded = keras.preprocessing.sequence.pad_sequences(y_val_seqs, padding="post", maxlen=MAX_LENGTH)

In [None]:
y_train_categoricals = keras.utils.to_categorical(y_train_padded)

In [None]:
print(y_train_categoricals[0])

[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 1. ... 0. 0. 0.]
 [0. 0. 1. ... 0. 0. 0.]
 ...
 [1. 0. 0. ... 0. 0. 0.]
 [1. 0. 0. ... 0. 0. 0.]
 [1. 0. 0. ... 0. 0. 0.]]


In [None]:
print(y_train_categoricals[0][0])

[0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0.]


In [None]:
idx = np.argmax(y_train_categoricals[0][0])
print(f"index: {idx}")

print(f"Tag: {tag_tokenizer.index_word[idx]}")

index: 5
Tag: det


In [None]:
y_val_categoricals = keras.utils.to_categorical(y_val_padded)

In [None]:
num_tokens = len(sentence_tokenizer.word_index) + 1
embedding_dim = 128

num_classes = len(tag_tokenizer.word_index) + 1

In [None]:
tf.random.set_seed(0)

model = keras.Sequential()
model.add(layers.Embedding(input_dim=num_tokens,
                            output_dim=embedding_dim,
                            input_length=MAX_LENGTH,
                            mask_zero=True))

model.add(layers.Bidirectional(layers.LSTM(128, return_sequences=True,
                                            kernel_initializer=tf.keras.initializers.random_normal(seed=1))))

model.add(layers.Dense(num_classes, activation='softmax',
                        kernel_initializer=tf.keras.initializers.random_normal(seed=1)))

model.compile(loss="categorical_crossentropy",
                optimizer="adam",
                metrics=["accuracy"])

In [None]:
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 161, 128)          6661376   
                                                                 
 bidirectional (Bidirection  (None, 161, 256)          263168    
 al)                                                             
                                                                 
 dense (Dense)               (None, 161, 13)           3341      
                                                                 
Total params: 6927885 (26.43 MB)
Trainable params: 6927885 (26.43 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [None]:
es_callback = keras.callbacks.EarlyStopping(monitor="val_loss", patience=3)

history = model.fit(X_train_padded, y_train_categoricals, epochs=20,
                    batch_size=256, validation_data=(X_val_padded, y_val_categoricals),
                    callbacks=[es_callback])

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20


In [None]:
X_test_seqs = sentence_tokenizer.texts_to_sequences(X_test)
X_test_padded = keras.preprocessing.sequence.pad_sequences(X_test_seqs, padding='post', maxlen=MAX_LENGTH)

y_test_seqs = tag_tokenizer.texts_to_sequences(y_test)
y_test_padded = keras.preprocessing.sequence.pad_sequences(y_test_seqs, padding='post', maxlen=MAX_LENGTH)
y_test_categoricals = keras.utils.to_categorical(y_test_padded)

In [None]:
model.evaluate(X_test_padded, y_test_categoricals)



[0.10088780522346497, 0.9698078036308289]

In [None]:
samples = [
    "Brown refused to testify.",
    "Brown sofas are on sale.",
]

In [None]:
def tag_sentences(sentences):
    sentences_seqs = sentence_tokenizer.texts_to_sequences(sentences)
    sentences_padded = keras.preprocessing.sequence.pad_sequences(sentences_seqs, padding="post", maxlen=MAX_LENGTH)

    tag_preds = model.predict(sentences_padded)
    sentence_tags = []

    for i, preds in enumerate(tag_preds):

        #Extracting tags for only non-padded tokens
        tags_seq = [np.argmax(p) for p in preds[:len(sentences_seqs[i])]]

        words = [sentence_tokenizer.index_word[w] for w in sentences_seqs[i]]
        tags = [tag_tokenizer.index_word[t] for t in tags_seq]

        sentence_tags.append(list(zip(words, tags)))

    return sentence_tags

In [None]:
tagged_sample_sentences = tag_sentences(samples)



In [None]:
print(tagged_sample_sentences[0])

[('brown', 'noun'), ('refused', 'verb'), ('to', 'prt'), ('testify', 'verb')]


In [None]:
print(tagged_sample_sentences[1])

[('brown', 'adj'), ('sofas', 'noun'), ('are', 'verb'), ('on', 'adp'), ('sale', 'noun')]


### Language modelling with Stacked LSTMs

In [12]:
art_of_war = requests.get('https://raw.githubusercontent.com/futuremojo/nlp-demystified/main/datasets/art_of_war.txt')\
                     .text

art_of_war[:300]

'1. Sun Tzŭ said: The art of war is of vital importance to the State.\n\n2. It is a matter of life and death, a road either to safety or to\nruin. Hence it is a subject of inquiry which can on no account be\nneglected.\n\n3. The art of war, then, is governed by five constant factors, to be\ntaken into accou'

In [13]:
tokenizer = keras.preprocessing.text.Tokenizer(char_level=True)

In [14]:
tokenizer.fit_on_texts([art_of_war])

In [15]:
tokenizer.get_config()

{'num_words': None,
 'filters': '!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n',
 'lower': True,
 'split': ' ',
 'char_level': True,
 'oov_token': None,
 'document_count': 1,
 'word_counts': '{"1": 179, ".": 896, " ": 9794, "s": 3081, "u": 1467, "n": 3565, "t": 4398, "z": 20, "\\u016d": 13, "a": 3475, "i": 3573, "d": 1681, ":": 48, "h": 2558, "e": 5837, "r": 2776, "o": 3548, "f": 1238, "w": 981, "v": 478, "l": 1722, "m": 1201, "p": 769, "c": 1390, "\\n": 1443, "2": 127, ",": 634, "y": 1055, "b": 708, "j": 23, "q": 55, "g": 1007, "3": 87, "k": 345, "\\u2019": 57, "4": 66, "(": 59, ")": 59, ";": 168, "5": 58, "6": 51, "_": 62, "7": 39, "8": 36, "9": 34, "0": 38, "x": 49, "\\u2014": 16, "?": 8, "!": 8, "-": 57, "\\u201c": 3, "\\u201d": 3, "\\u0153": 7, "\\u00fc": 3, "\\u2018": 1}',
 'word_docs': '{"e": 1, "d": 1, "x": 1, "\\u201d": 1, "\\u2018": 1, "v": 1, "7": 1, "y": 1, "8": 1, "t": 1, "l": 1, "u": 1, "\\n": 1, ".": 1, "j": 1, "s": 1, "g": 1, ")": 1, "w": 1, "9": 1, ":": 1, "-": 1, "q": 1, "6": 

In [16]:
print(f"Tokenizer \"Vocabulary\" size: {len(tokenizer.word_index)}")

Tokenizer "Vocabulary" size: 56


In [17]:
seq = tokenizer.texts_to_sequences([art_of_war])[0]

In [18]:
print(f"Text length: {len(seq)}")

Text length: 61054


In [19]:
# Sanity check.
tokenizer.sequences_to_texts([seq[:10]])

['1 .   s u n   t z ŭ']

We will be converting our dataset to tensorflow data so that we can perform operations on it in a easier manner

In [20]:
slices = tf.data.Dataset.from_tensor_slices(seq)
type(slices)

In [21]:
list(slices.take(10))

[<tf.Tensor: shape=(), dtype=int32, numpy=27>,
 <tf.Tensor: shape=(), dtype=int32, numpy=21>,
 <tf.Tensor: shape=(), dtype=int32, numpy=1>,
 <tf.Tensor: shape=(), dtype=int32, numpy=8>,
 <tf.Tensor: shape=(), dtype=int32, numpy=13>,
 <tf.Tensor: shape=(), dtype=int32, numpy=5>,
 <tf.Tensor: shape=(), dtype=int32, numpy=1>,
 <tf.Tensor: shape=(), dtype=int32, numpy=3>,
 <tf.Tensor: shape=(), dtype=int32, numpy=47>,
 <tf.Tensor: shape=(), dtype=int32, numpy=49>]

In [22]:
seq[:10]

[27, 21, 1, 8, 13, 5, 1, 3, 47, 49]

We have to create input features and labels for that we will be using a windows approach

In [23]:
input_timesteps = 100
# We are implementing teacher forcing and hence we are using the +1
window_size = input_timesteps + 1
windows = slices.window(window_size, shift=1, drop_remainder=True)

In [24]:
for w in windows.take(3):
  arr = list(w.as_numpy_iterator())
  print(len(arr), arr)

101 [27, 21, 1, 8, 13, 5, 1, 3, 47, 49, 1, 8, 7, 4, 12, 41, 1, 3, 10, 2, 1, 7, 9, 3, 1, 6, 16, 1, 20, 7, 9, 1, 4, 8, 1, 6, 16, 1, 25, 4, 3, 7, 11, 1, 4, 17, 22, 6, 9, 3, 7, 5, 15, 2, 1, 3, 6, 1, 3, 10, 2, 1, 8, 3, 7, 3, 2, 21, 14, 14, 29, 21, 1, 4, 3, 1, 4, 8, 1, 7, 1, 17, 7, 3, 3, 2, 9, 1, 6, 16, 1, 11, 4, 16, 2, 1, 7, 5, 12, 1, 12]
101 [21, 1, 8, 13, 5, 1, 3, 47, 49, 1, 8, 7, 4, 12, 41, 1, 3, 10, 2, 1, 7, 9, 3, 1, 6, 16, 1, 20, 7, 9, 1, 4, 8, 1, 6, 16, 1, 25, 4, 3, 7, 11, 1, 4, 17, 22, 6, 9, 3, 7, 5, 15, 2, 1, 3, 6, 1, 3, 10, 2, 1, 8, 3, 7, 3, 2, 21, 14, 14, 29, 21, 1, 4, 3, 1, 4, 8, 1, 7, 1, 17, 7, 3, 3, 2, 9, 1, 6, 16, 1, 11, 4, 16, 2, 1, 7, 5, 12, 1, 12, 2]
101 [1, 8, 13, 5, 1, 3, 47, 49, 1, 8, 7, 4, 12, 41, 1, 3, 10, 2, 1, 7, 9, 3, 1, 6, 16, 1, 20, 7, 9, 1, 4, 8, 1, 6, 16, 1, 25, 4, 3, 7, 11, 1, 4, 17, 22, 6, 9, 3, 7, 5, 15, 2, 1, 3, 6, 1, 3, 10, 2, 1, 8, 3, 7, 3, 2, 21, 14, 14, 29, 21, 1, 4, 3, 1, 4, 8, 1, 7, 1, 17, 7, 3, 3, 2, 9, 1, 6, 16, 1, 11, 4, 16, 2, 1, 7, 5, 12, 1, 12, 2

In [25]:
print(windows, '\n')

for w in windows.take(2):
  print(w)

<_WindowDataset element_spec=DatasetSpec(TensorSpec(shape=(), dtype=tf.int32, name=None), TensorShape([]))> 

<_VariantDataset element_spec=TensorSpec(shape=(), dtype=tf.int32, name=None)>
<_VariantDataset element_spec=TensorSpec(shape=(), dtype=tf.int32, name=None)>


In [26]:
# To feed our model inputs in desirable format we are using the below code
dataset = windows.flat_map(lambda window: window.batch(window_size))

In [27]:
for d in dataset.take(2):
  print(d)

tf.Tensor(
[27 21  1  8 13  5  1  3 47 49  1  8  7  4 12 41  1  3 10  2  1  7  9  3
  1  6 16  1 20  7  9  1  4  8  1  6 16  1 25  4  3  7 11  1  4 17 22  6
  9  3  7  5 15  2  1  3  6  1  3 10  2  1  8  3  7  3  2 21 14 14 29 21
  1  4  3  1  4  8  1  7  1 17  7  3  3  2  9  1  6 16  1 11  4 16  2  1
  7  5 12  1 12], shape=(101,), dtype=int32)
tf.Tensor(
[21  1  8 13  5  1  3 47 49  1  8  7  4 12 41  1  3 10  2  1  7  9  3  1
  6 16  1 20  7  9  1  4  8  1  6 16  1 25  4  3  7 11  1  4 17 22  6  9
  3  7  5 15  2  1  3  6  1  3 10  2  1  8  3  7  3  2 21 14 14 29 21  1
  4  3  1  4  8  1  7  1 17  7  3  3  2  9  1  6 16  1 11  4 16  2  1  7
  5 12  1 12  2], shape=(101,), dtype=int32)


In [28]:
batch_size = 32

In [29]:
batches = dataset.shuffle(10000).batch(batch_size)

In [30]:
for b in batches.take(2):
  print(b)

tf.Tensor(
[[ 5 19  1 ... 48 14 14]
 [ 9  2  2 ...  7 12 25]
 [12  1  4 ...  2  1 23]
 ...
 [10  4  8 ... 10  6 13]
 [ 9  1 10 ...  5 15  2]
 [ 7  9 12 ...  4  8  1]], shape=(32, 101), dtype=int32)
tf.Tensor(
[[17  7  5 ...  2  1 22]
 [ 8 10  1 ...  7 12  2]
 [16  1 20 ...  2 12 14]
 ...
 [12  1 12 ...  3  9  4]
 [15  6 17 ...  9  2  1]
 [ 5  2  1 ...  7 15 26]], shape=(32, 101), dtype=int32)


In [31]:
xy_batches = batches.map(lambda batch: (batch[:, :-1], batch[:, 1:]))

In [32]:
for b in xy_batches.take(1):
  print(b)

(<tf.Tensor: shape=(32, 100), dtype=int32, numpy=
array([[ 1,  3, 10, ...,  9, 19,  2],
       [ 3,  7,  9, ...,  7,  1, 23],
       [ 1, 23,  2, ...,  2,  7, 25],
       ...,
       [18, 24,  1, ..., 40, 10,  7],
       [ 2,  8, 14, ...,  7, 15,  3],
       [ 1, 16,  4, ..., 20,  7, 11]], dtype=int32)>, <tf.Tensor: shape=(32, 100), dtype=int32, numpy=
array([[ 3, 10,  6, ..., 19,  2,  9],
       [ 7,  9, 18, ...,  1, 23,  7],
       [23,  2,  1, ...,  7, 25, 18],
       ...,
       [24,  1,  3, ..., 10,  7, 13],
       [ 8, 14,  6, ..., 15,  3, 13],
       [16,  4,  2, ...,  7, 11, 11]], dtype=int32)>)


In [33]:
for b in xy_batches.take(1):
  print("x1 length: ", len(b[0][0].numpy()))
  print("x1: ", b[0][0].numpy())
  print("\n")
  print("y1 length: ", len(b[1][0].numpy()))
  print("y1: ", b[1][0].numpy())

x1 length:  100
x1:  [26  1  3  6  1  4  9  9  4  3  7  3  2  1 10  4 17 21 14 22  9  2  3  2
  5 12  1  3  6  1 23  2  1 20  2  7 26 24  1  3 10  7  3  1 10  2  1 17
  7 18  1 19  9  6 20  1  7  9  9  6 19  7  5  3 21 14 14 29 30 21  1  4
 16  1 10  2  1  4  8  1  3  7 26  4  5 19  1 10  4  8  1  2  7  8  2 24
  1 19  4 25]


y1 length:  100
y1:  [ 1  3  6  1  4  9  9  4  3  7  3  2  1 10  4 17 21 14 22  9  2  3  2  5
 12  1  3  6  1 23  2  1 20  2  7 26 24  1  3 10  7  3  1 10  2  1 17  7
 18  1 19  9  6 20  1  7  9  9  6 19  7  5  3 21 14 14 29 30 21  1  4 16
  1 10  2  1  4  8  1  3  7 26  4  5 19  1 10  4  8  1  2  7  8  2 24  1
 19  4 25  2]


In [34]:
num_tokens = len(tokenizer.word_index) + 1

# We are one hot encoding the inputs, but not the labels (that will be handled by the loss function)
xy_batches = xy_batches.map(lambda inputs, labels: (tf.one_hot(inputs, depth=num_tokens), labels))

In [35]:
for b in xy_batches.take(1):
  print("x1: ", b[0][0].numpy())
  print("\n")
  print("y1: ", b[1][0].numpy())

x1:  [[0. 1. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 1. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


y1:  [25  4 15  3  6  9  4  6 13  8 28  1 10  2  1 20 10  6  1 26  5  6 20  8
  1  3 10  2 17  1  5  6  3  1 20  4 11 11  1 16  7  4 11 21 14 14 27 29
 21  1  3 10  2  9  2 16  6  9  2 24  1  4  5  1 18  6 13  9  1 12  2 11
  4 23  2  9  7  3  4  6  5  8 24  1 20 10  2  5  1  8  2  2 26  4  5 19
  1  3  6  1]


In [36]:
# This is an optimization step where we fetch the data from the next batch while the current batch is being processed
dataset = dataset.prefetch(tf.data.AUTOTUNE)

In [37]:
model = keras.models.Sequential()

model.add(layers.LSTM(128, return_sequences=True, input_shape=[None, num_tokens], recurrent_dropout=0.2))
model.add(layers.LSTM(128, return_sequences=True, input_shape=[None, num_tokens], recurrent_dropout=0.2))

model.add(layers.Dense(num_tokens, activation='softmax'))

# Sparse categorical crossentropy enables us to not one hot encode the labels
model.compile(loss="sparse_categorical_crossentropy", optimizer='adam')


In [38]:
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 lstm (LSTM)                 (None, None, 128)         95232     
                                                                 
 lstm_1 (LSTM)               (None, None, 128)         131584    
                                                                 
 dense (Dense)               (None, None, 57)          7353      
                                                                 
Total params: 234169 (914.72 KB)
Trainable params: 234169 (914.72 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [None]:
filepath="./ArtofWarLM/training1/cp.ckpt"

# We want to save our training progress after every epoch, so we are setting up model checkpoint callback
cp_callback = tf.keras.callbacks.ModelCheckpoint(filepath=filepath,
                                                 save_weights_only=True,
                                                 verbose=1)

In [None]:
# history = model.fit(xy_batches, epochs=50, callbacks=[cp_callback])

Epoch 1/50
   1905/Unknown - 986s 514ms/step - loss: 2.1409
Epoch 1: saving model to ./ArtofWarLM/training1/cp.ckpt
Epoch 2/50
Epoch 2: saving model to ./ArtofWarLM/training1/cp.ckpt
Epoch 3/50
Epoch 3: saving model to ./ArtofWarLM/training1/cp.ckpt
Epoch 4/50
Epoch 4: saving model to ./ArtofWarLM/training1/cp.ckpt
Epoch 5/50
Epoch 5: saving model to ./ArtofWarLM/training1/cp.ckpt
Epoch 6/50
Epoch 6: saving model to ./ArtofWarLM/training1/cp.ckpt
Epoch 7/50
  38/1905 [..............................] - ETA: 16:34 - loss: 1.1019

In [None]:
model.save('art_of_war_char_level_lm')

In [1]:
!wget https://github.com/futuremojo/nlp-demystified/raw/main/models/art_of_war_char_level_lm.zip
!unzip -o art_of_war_char_level_lm.zip

--2024-06-04 15:58:43--  https://github.com/futuremojo/nlp-demystified/raw/main/models/art_of_war_char_level_lm.zip
Resolving github.com (github.com)... 140.82.114.4
Connecting to github.com (github.com)|140.82.114.4|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://raw.githubusercontent.com/futuremojo/nlp-demystified/main/models/art_of_war_char_level_lm.zip [following]
--2024-06-04 15:58:43--  https://raw.githubusercontent.com/futuremojo/nlp-demystified/main/models/art_of_war_char_level_lm.zip
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.109.133, 185.199.111.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.109.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 2691531 (2.6M) [application/zip]
Saving to: ‘art_of_war_char_level_lm.zip’


2024-06-04 15:58:44 (34.9 MB/s) - ‘art_of_war_char_level_lm.zip’ saved [2691531/2691531]

Archive:  art_of

In [43]:
model = keras.models.load_model('art_of_war_char_level_lm')

In [44]:
def generate_text(model, tokenizer, seed_text, num_chars=200, temperature=1):
  text = seed_text

  for _ in range(num_chars):
    input = np.array(tokenizer.texts_to_sequences([text[-input_timesteps:]]))
    input = tf.one_hot(input, num_tokens)

    preds = model.predict(input)[0, -1:]
    preds = tf.math.log(preds) / temperature

    next_char = tf.random.categorical(preds, num_samples=1)
    next_char = tokenizer.sequences_to_texts(next_char.numpy())[0]

    text += next_char

  return text

In [45]:
%%time
print(generate_text(model, tokenizer, "Banana peels on the battlefield can", num_chars=300, temperature=0.2))

Banana peels on the battlefield can never come again
into being; nor can the dead ever be brought back to life.

22. hence the enlightened ruler lays his plans well ahead;
the good general cultivates his resources.

17. move not unless you see an advantage; use not your troops unless
there is something to be gained; fight not unless 
CPU times: user 42.7 s, sys: 948 ms, total: 43.7 s
Wall time: 38.4 s
