In [1]:
# Python ≥3.5 is required
import sys
assert sys.version_info >= (3, 5)

# Scikit-Learn ≥0.20 is required
# NOTE(review): this compares version *strings* lexicographically (e.g. "0.9" >= "0.20"
# evaluates to True) — consider comparing parsed version numbers instead.
import sklearn
assert sklearn.__version__ >= "0.20"

# Colab detection: the magic below is only defined in Colab, so anywhere else the
# try body is expected to fail and the except branch sets IS_COLAB to False.
try:
    # %tensorflow_version only exists in Colab.
    %tensorflow_version 2.x
    # NOTE(review): unpinned install; `%pip` would target the kernel's own environment.
    !pip install -q -U tensorflow-addons
    IS_COLAB = True
except Exception:
    IS_COLAB = False

# TensorFlow ≥2.0 is required
import tensorflow as tf
from tensorflow import keras
assert tf.__version__ >= "2.0"

# Ask TensorFlow to allocate GPU memory on demand instead of grabbing it all up front.
# The list of GPUs may be empty (CPU-only machine), so iterate rather than indexing [0];
# this also keeps the "No GPU" hint below reachable.
gpus = tf.config.list_physical_devices('GPU')
for gpu in gpus:
    try:
        tf.config.experimental.set_memory_growth(gpu, True)
    except RuntimeError as err:
        # Memory growth can only be changed before the GPU has been initialized.
        print("Could not enable GPU memory growth:", err)

if not gpus:
    print("No GPU was detected. LSTMs and CNNs can be very slow without a GPU.")
    if IS_COLAB:
        print("Go to Runtime > Change runtime and select a GPU hardware accelerator.")

# Common imports
import numpy as np
import os

# to make this notebook's output stable across runs
# (seeds both NumPy's and TensorFlow's global random generators)
np.random.seed(42)
tf.random.set_seed(42)

# To plot pretty figures
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
# Default font sizes for axis labels and for x/y tick labels
mpl.rc('axes', labelsize=14)
mpl.rc('xtick', labelsize=12)
mpl.rc('ytick', labelsize=12)

# Where to save the figures
PROJECT_ROOT_DIR = "."
CHAPTER_ID = "nlp"
IMAGES_PATH = os.path.join(PROJECT_ROOT_DIR, "images", CHAPTER_ID)
# Create the images directory up front; exist_ok avoids an error when it already exists
os.makedirs(IMAGES_PATH, exist_ok=True)

def save_fig(fig_id, tight_layout=True, fig_extension="png", resolution=300):
    """Write the current matplotlib figure to IMAGES_PATH.

    Args:
        fig_id: base file name (string, without extension).
        tight_layout: when true, call plt.tight_layout() before saving.
        fig_extension: file extension, also passed to savefig as the format.
        resolution: dots per inch passed to savefig.
    """
    file_name = ".".join((fig_id, fig_extension))
    target = os.path.join(IMAGES_PATH, file_name)
    print("Saving figure", fig_id)
    if tight_layout:
        plt.tight_layout()
    plt.savefig(target, dpi=resolution, format=fig_extension)


In [2]:
import tensorflow_text as text
import sentencepiece as spm
from tensorflow.python.platform import gfile

# Download (and cache) the tiny-Shakespeare corpus; get_file returns the local path.
shakespeare_url = "https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt"
filepath = keras.utils.get_file("shakespeare.txt", shakespeare_url)
# Decode explicitly so the result does not depend on the platform's default encoding.
with open(filepath, encoding="utf-8") as f:
    shakespeare_text = f.read()

# Train a 10,000-piece SentencePiece model on the downloaded file. The trainer writes
# its output using the "shakespeare10k" prefix (shakespeare10k.model is loaded later).
# The input path comes from get_file rather than a machine-specific absolute path.
# NOTE(review): the flag string is whitespace-separated, so a cache path containing
# spaces would need the keyword-argument form of the trainer instead.
spm.SentencePieceTrainer.Train(
    '--input={} --model_prefix=shakespeare10k --vocab_size=10000'.format(filepath))



In [3]:
def _utf8(tokens):
  if sys.version_info[0] == 2:
    return tokens
  if isinstance(tokens, list):
    return [_utf8(t) for t in tokens]
  else:
    return tokens.encode('utf-8')

In [4]:
# wordPieceTokenizer = text.WordpieceTokenizer()

# Read the serialized SentencePiece model; the context manager guarantees the
# file handle is closed once the bytes are in memory.
with gfile.GFile('shakespeare10k.model', 'rb') as model_file:
    sentencepiece_model = model_file.read()

sentencePieceTokenizer = text.SentencepieceTokenizer(
    model=sentencepiece_model,
    nbest_size=0,       # 0 (or 1) -> no sampling: deterministic best segmentation
    reverse=False       # False -> keep the pieces in their natural left-to-right order
)


InternalError: CUDA runtime implicit initialization on GPU:0 failed. Status: out of memory

In [None]:
# Sanity check: tokenize a small nested batch of words; the result is shown as the cell output.
sentencePieceTokenizer.tokenize([['This', 'julius', 'it'], ['Brute', 'is', 'This']])

In [None]:
# Query the tokenizer's vocabulary size. The string is passed as the first positional
# argument — presumably the op `name` in tensorflow_text's API (TODO confirm).
sentencePieceTokenizer.vocab_size("SentencepieceTokenizerVocabSize")

In [None]:
# Number of distinct token ids. It is used as the one-hot depth and as the width of the
# model's output layer, so it is read from the tokenizer to stay in sync with the trained
# vocabulary (a fixed 40 only fits a character-level vocabulary, not 10,000 pieces).
# NOTE(review): one-hot inputs at this depth are memory-hungry with large batches —
# consider an Embedding layer or a smaller batch size.
max_id = int(sentencePieceTokenizer.vocab_size())

In [None]:
# tokenize() on a batch returns a RaggedTensor, which np.array() cannot unpack into rows.
# With a single input string there is exactly one row, so to_tensor() adds no padding
# and the list-unpack below extracts that row as a 1-D NumPy array.
# The -1 shift is kept because preprocess() applies the same shift at inference time.
[encoded] = sentencePieceTokenizer.tokenize([shakespeare_text]).to_tensor().numpy() - 1
# NOTE(review): fixed at 10000 rather than len(encoded) — confirm this subsampling is intended.
dataset_size = 10000
train_size = dataset_size * 90 // 100
dataset = tf.data.Dataset.from_tensor_slices(encoded[:train_size])

In [None]:
# Same sanity-check tokenization as in an earlier cell (presumably a leftover debugging step).
sentencePieceTokenizer.tokenize([['This', 'julius', 'it'], ['Brute', 'is', 'This']])

In [None]:
# Every window carries n_steps inputs plus one extra token, so that the target
# sequence can be the input sequence shifted one position ahead.
n_steps = 100
window_length = 1 + n_steps
# Repeat the token stream indefinitely, then slide a full-length window one token at a time.
dataset = dataset.repeat()
dataset = dataset.window(size=window_length, shift=1, drop_remainder=True)

In [None]:
# Batch each window by window_length and flatten the results into a single dataset
dataset = dataset.flat_map(lambda window: window.batch(window_length))

In [None]:
# Re-seed NumPy and TensorFlow so the random operations that follow are reproducible
np.random.seed(42)
tf.random.set_seed(42)

In [None]:
batch_size = 4096
# Shuffle with a 10000-element buffer, then group the windows into batches.
dataset = dataset.shuffle(10000)
dataset = dataset.batch(batch_size)
# Inputs drop the last token of each window; targets drop the first one.
dataset = dataset.map(lambda batch: (batch[:, :-1], batch[:, 1:]))


In [None]:
# One-hot encode the inputs only; the targets are passed through unchanged.
dataset = dataset.map(
    lambda inputs, targets: (tf.one_hot(inputs, depth=max_id), targets)
)

In [None]:
# Prefetch with a buffer of one element
dataset = dataset.prefetch(1)

In [None]:
# Pull a single batch to check the shapes of the input and target tensors
for X_batch, Y_batch in dataset.take(1):
    print(X_batch.shape, Y_batch.shape)

In [None]:
# Same sanity-check tokenization as in earlier cells (presumably a leftover debugging step).
sentencePieceTokenizer.tokenize([['This', 'julius', 'it'], ['Brute', 'is', 'This']])

In [None]:

# Two stacked GRU layers followed by a per-time-step softmax over the max_id classes.
model = keras.models.Sequential([
    keras.layers.GRU(128, return_sequences=True, input_shape=[None, max_id],
                     dropout=0.2, recurrent_dropout=0),
    keras.layers.GRU(128, return_sequences=True,
                     dropout=0.2, recurrent_dropout=0),
    keras.layers.TimeDistributed(keras.layers.Dense(max_id,
                                                    activation="softmax"))
])

model.compile(loss="sparse_categorical_crossentropy", optimizer="adam")

# Reuse a previously saved model when one can be loaded; otherwise train and cache it.
# Only load failures (missing or unreadable file) fall back to training, so that
# KeyboardInterrupt and unrelated errors propagate instead of being swallowed.
model_path = 'shakespeare_text_sentecePiece.h5'
try:
    model = tf.keras.models.load_model(model_path)
except (OSError, ValueError) as load_error:
    print("Could not load", model_path, "- training from scratch:", load_error)
    history = model.fit(dataset, steps_per_epoch=train_size // batch_size, epochs=10)
    model.save(model_path)

In [None]:
# Same sanity-check tokenization as in earlier cells (presumably a leftover debugging step).
sentencePieceTokenizer.tokenize([['This', 'julius', 'it'], ['Brute', 'is', 'This']])
# sentencePieceTokenizer.tokenize(['How'])

In [None]:
def preprocess(texts):
    """Tokenize `texts` and one-hot encode the resulting token ids.

    Args:
        texts: a string or a list of strings to encode.

    Returns:
        The tf.one_hot encoding (depth max_id) of the token ids shifted by -1,
        the same shift applied to the training data. When several texts of
        different token lengths are given, shorter rows are padded with id 0,
        which the shift turns into -1 and one_hot encodes as an all-zero vector.
    """
    token_ids = sentencePieceTokenizer.tokenize(texts)
    # A batch of strings yields a RaggedTensor; densify it before the arithmetic.
    if isinstance(token_ids, tf.RaggedTensor):
        token_ids = token_ids.to_tensor(default_value=0)
    return tf.one_hot(token_ids - 1, max_id)

In [None]:
X_new = preprocess(["How are yo"])
# Most likely class at every time step (argmax over the softmax axis); this is the
# replacement for Sequential.predict_classes, which newer TensorFlow releases removed.
Y_pred = np.argmax(model.predict(X_new), axis=-1)
# First sentence, last time step; +1 undoes the -1 shift applied in preprocess().
last_id = tf.cast(Y_pred[0, -1] + 1, tf.int32)
# Ids are decoded with id_to_string (tokenize() expects text, not ids).
sentencePieceTokenizer.id_to_string(last_id).numpy().decode("utf-8")

In [None]:
tf.random.set_seed(42)

# Draw 40 class indices from a 3-class distribution with probabilities 0.5 / 0.4 / 0.1;
# tf.random.categorical expects (unnormalized) log-probabilities, hence the log.
tf.random.categorical(tf.math.log([[0.5, 0.4, 0.1]]), num_samples=40).numpy()

In [None]:
def next_char(text, temperature=1):
    """Sample the next token after `text` and return it as a string.

    Args:
        text: the prompt string.
        temperature: divisor applied to the log-probabilities before sampling;
            values below 1 sharpen the distribution, values above 1 flatten it.

    Returns:
        The sampled SentencePiece piece as a str, with the U+2581 word-boundary
        marker replaced by a regular space.
    """
    X_new = preprocess([text])
    # Probabilities predicted for the last time step of the (single) input sequence
    y_proba = model.predict(X_new)[0, -1:, :]
    rescaled_logits = tf.math.log(y_proba) / temperature
    # +1 undoes the -1 shift applied in preprocess()
    token_id = tf.random.categorical(rescaled_logits, num_samples=1) + 1
    # SentencepieceTokenizer has no sequences_to_texts(); decode the id with id_to_string
    piece = sentencePieceTokenizer.id_to_string(tf.cast(token_id[0, 0], tf.int32))
    return piece.numpy().decode("utf-8").replace("\u2581", " ")

In [None]:
# Fix the TensorFlow seed so the sampled continuation is reproducible
tf.random.set_seed(42)

next_char("How are yo", temperature=1)

In [None]:
def complete_text(text, n_chars=50, temperature=1):
    """Extend `text` by repeatedly appending the output of next_char.

    Args:
        text: the starting string.
        n_chars: how many times next_char is called and its result appended.
        temperature: forwarded to next_char on every call.

    Returns:
        The starting string followed by all sampled continuations.
    """
    generated = text
    for _step in range(n_chars):
        continuation = next_char(generated, temperature)
        generated = generated + continuation
    return generated

In [None]:
tf.random.set_seed(42)

# Low temperature: next_char divides the log-probabilities by 0.2, sharpening the
# sampling distribution towards the most likely tokens
print(complete_text("t", temperature=0.2))

In [None]:
# Temperature 1: the predicted distribution is sampled as-is (log-probabilities divided by 1)
print(complete_text("t", temperature=1))

In [None]:
# Temperature 2: the log-probabilities are halved, flattening the sampling distribution
print(complete_text("t", temperature=2))
