<a href="https://colab.research.google.com/github/annmariyaes/IoT-Data-Analysis---ML/blob/main/Text_generation_with_an_RNN.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**Import TensorFlow and other libraries**

In [None]:
import tensorflow as tf

import numpy as np
import os
import time

**Download the Reddit Apple/Android post-titles dataset**

In [None]:
# Fetch the corpus file (Keras caches the download locally) and keep its path.
path_to_file = tf.keras.utils.get_file(
    fname='reddit_apple_android_2000.txt',
    origin='https://raw.githubusercontent.com/minimaxir/textgenrnn/master/datasets/reddit_apple_android_2000.txt')

Downloading data from https://raw.githubusercontent.com/minimaxir/textgenrnn/master/datasets/reddit_apple_android_2000.txt


**Read the data**

In [None]:
# Read the raw bytes and decode them explicitly as UTF-8.
# Binary mode is deliberate: it keeps the file's '\r\n' line endings exactly as
# stored instead of letting text mode translate them, so the character
# vocabulary built from `text` is unchanged. The context manager guarantees the
# file handle is closed once the data has been read.
with open(path_to_file, 'rb') as corpus_file:
    text = corpus_file.read().decode(encoding='utf-8')
# length of text is the number of characters in it
print('Length of text: {} characters'.format(len(text)))

Length of text: 147500 characters


In [None]:
# Preview the corpus: print its first 250 characters to check what was loaded.
print(text[:250])

title
The Apple Watch feature I once thought was a throwaway novelty is now crucial to me.
Apple is trying to limit how often your iPhone apps can bug you to give them a rating
The App Store now requires developers to use the official API to reque


In [None]:
# Vocabulary: every distinct character of the corpus, in sorted order.
vocab = list(set(text))
vocab.sort()
print('{} unique characters'.format(len(vocab)))

129 unique characters


**Process the text**

In [None]:
# Lookup tables between characters and integer ids.
# char2idx: character -> position in the sorted vocabulary.
char2idx = dict(zip(vocab, range(len(vocab))))
# idx2char: the inverse lookup, as an array indexable by id.
idx2char = np.array(vocab)

# The whole corpus encoded as an array of character ids.
text_as_int = np.array(list(map(char2idx.__getitem__, text)))

In [None]:
# Pretty-print the first 20 entries of the character -> id mapping.
# Iterating over a slice of the keys (rather than zipping with a throwaway `_`)
# avoids binding `_` at the notebook's top level, where it would shadow
# IPython's "last output" variable.
print('{')
for char in list(char2idx)[:20]:
    print('  {:4s}: {:3d},'.format(repr(char), char2idx[char]))
print('  ...\n}')

{
  '\n':   0,
  '\r':   1,
  ' ' :   2,
  '!' :   3,
  '"' :   4,
  '#' :   5,
  '$' :   6,
  '%' :   7,
  '&' :   8,
  "'" :   9,
  '(' :  10,
  ')' :  11,
  '*' :  12,
  '+' :  13,
  ',' :  14,
  '-' :  15,
  '.' :  16,
  '/' :  17,
  '0' :  18,
  '1' :  19,
  ...
}


In [None]:
# Show how the first 13 characters of the text are mapped to integers:
# the repr of the raw characters on the left, their ids from text_as_int on the right.
print('{} ---- characters mapped to int ---- > {}'.format(repr(text[:13]), text_as_int[:13]))

'title\r\nThe Ap' ---- characters mapped to int ---- > [83 72 83 75 68  1  0 52 71 68  2 33 79]


**Create training examples and targets**

In [None]:
# Length, in characters, of a single model input sequence.
seq_length = 100
# Number of (seq_length + 1)-character chunks that fit in the corpus.
examples_per_epoch = len(text)//(seq_length+1)

# Stream of individual character ids, one dataset element per character.
char_dataset = tf.data.Dataset.from_tensor_slices(text_as_int)

# Sanity check: decode and print the first five elements.
for char_id in char_dataset.take(5):
    print(idx2char[char_id.numpy()])

t
i
t
l
e


In [None]:
# Group the character stream into chunks of seq_length + 1 ids each;
# a trailing partial chunk is dropped.
sequences = char_dataset.batch(seq_length+1, drop_remainder=True)

# Decode and show the first five chunks.
for chunk in sequences.take(5):
    chunk_chars = idx2char[chunk.numpy()]
    print(repr(''.join(chunk_chars)))

'title\r\nThe Apple Watch feature I once thought was a throwaway novelty is now crucial to me.\r\nApple is'
' trying to limit how often your iPhone apps can bug you to give them a rating\r\nThe App Store now requ'
'ires developers to use the official API to request app ratings. Custom prompts are not allowed.\r\nI bu'
'ilt a custom Reddit TouchBar interface!\r\nTIL that if Apple finds an underage worker in a factory of a'
' supplier, They make the supplier return the child to their home, pay for their education, and pay fo'


In [None]:
def split_input_target(chunk):
    """Split a chunk into an (input, target) pair shifted by one position.

    The input is every element of `chunk` except the last and the target is
    every element except the first, so target[i] is the element that follows
    input[i] in the original chunk.
    """
    return chunk[:-1], chunk[1:]

# Turn every chunk into an (input, target) pair via split_input_target.
dataset = sequences.map(split_input_target)

In [None]:
# Decode and print the first (input, target) pair.
# input_example / target_example stay bound after the loop and are read by the
# following cell, so their names must not change.
for input_example, target_example in  dataset.take(1):
    print('Input data: ', repr(''.join(idx2char[input_example.numpy()])))
    print('Target data:', repr(''.join(idx2char[target_example.numpy()])))

Input data:  'title\r\nThe Apple Watch feature I once thought was a throwaway novelty is now crucial to me.\r\nApple i'
Target data: 'itle\r\nThe Apple Watch feature I once thought was a throwaway novelty is now crucial to me.\r\nApple is'


In [None]:
# Walk through the first five time steps of the example pair: at each step the
# model receives one character id and is expected to output the next one.
for step, (src_id, tgt_id) in enumerate(zip(input_example[:5], target_example[:5])):
    print("Step {:4d}".format(step))
    print("  input: {} ({:s})".format(src_id, repr(idx2char[src_id])))
    print("  expected output: {} ({:s})".format(tgt_id, repr(idx2char[tgt_id])))

Step    0
  input: 83 ('t')
  expected output: 72 ('i')
Step    1
  input: 72 ('i')
  expected output: 83 ('t')
Step    2
  input: 83 ('t')
  expected output: 75 ('l')
Step    3
  input: 75 ('l')
  expected output: 68 ('e')
Step    4
  input: 68 ('e')
  expected output: 1 ('\r')


**Create training batches**

In [None]:
# Number of sequences per training batch
BATCH_SIZE = 64

# Buffer size to shuffle the dataset
# (TF data is designed to work with possibly infinite sequences,
# so it doesn't attempt to shuffle the entire sequence in memory. Instead,
# it maintains a buffer in which it shuffles elements).
BUFFER_SIZE = 10000

# NOTE(review): this reassigns `dataset` from itself, so the cell is not
# idempotent -- running it a second time without re-running the cell that
# created `dataset` would shuffle and batch the already-batched dataset again.
dataset = dataset.shuffle(BUFFER_SIZE).batch(BATCH_SIZE, drop_remainder=True)

# Bare expression so the notebook displays the dataset's element spec.
dataset

<BatchDataset shapes: ((64, 100), (64, 100)), types: (tf.int64, tf.int64)>

**Build The Model**

In [None]:
# Length of the vocabulary in chars; passed to build_model, where it sizes both
# the embedding table and the final Dense layer.
vocab_size = len(vocab)

# The embedding dimension (size of each character's embedding vector)
embedding_dim = 256

# Number of RNN units (width of the GRU layer)
rnn_units = 1024

In [None]:
def build_model(vocab_size, embedding_dim, rnn_units, batch_size):
    """Build the character-level model: Embedding -> stateful GRU -> Dense.

    Args:
        vocab_size: Number of distinct character ids; used as the embedding
            input size and as the width of the final Dense layer.
        embedding_dim: Size of each embedding vector.
        rnn_units: Number of units in the GRU layer.
        batch_size: Fixed batch size baked into the model's input shape (the
            GRU is stateful); the sequence length is left unspecified.

    Returns:
        An uncompiled `tf.keras.Sequential` model. The Dense layer has no
        activation, so its outputs are raw scores per vocabulary entry.
    """
    embedding = tf.keras.layers.Embedding(
        vocab_size, embedding_dim, batch_input_shape=[batch_size, None])
    recurrent = tf.keras.layers.GRU(
        rnn_units,
        return_sequences=True,
        stateful=True,
        recurrent_initializer='glorot_uniform')
    projection = tf.keras.layers.Dense(vocab_size)
    return tf.keras.Sequential([embedding, recurrent, projection])

In [None]:
# Instantiate the training model with the training batch size.
model = build_model(len(vocab), embedding_dim, rnn_units, batch_size=BATCH_SIZE)

**Try the model**

In [None]:
# Run the untrained model on one batch to check the output shape.
# input_example_batch, target_example_batch and example_batch_predictions stay
# bound after the loop and are read by later cells, so their names must not change.
for input_example_batch, target_example_batch in dataset.take(1):
    example_batch_predictions = model(input_example_batch)
    print(example_batch_predictions.shape, "# (batch_size, sequence_length, vocab_size)")

(64, 100, 129) # (batch_size, sequence_length, vocab_size)


In [None]:
# Print the layer-by-layer summary (output shapes and parameter counts).
model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (64, None, 256)           33024     
_________________________________________________________________
gru_1 (GRU)                  (64, None, 1024)          3938304   
_________________________________________________________________
dense_1 (Dense)              (64, None, 129)           132225    
Total params: 4,103,553
Trainable params: 4,103,553
Non-trainable params: 0
_________________________________________________________________


In [None]:
# For the first sequence of the batch, draw one character id per time step from
# the categorical distribution defined by the model's outputs, then drop the
# trailing sample axis and convert to a NumPy array.
sampled_indices = tf.squeeze(
    tf.random.categorical(example_batch_predictions[0], num_samples=1),
    axis=-1).numpy()

In [None]:
# Bare expression so the notebook displays the sampled character ids.
sampled_indices

array([ 90,  88, 101,  24,  35,  47,  11,  84, 127,   7,  95,  30,  52,
       114, 101, 105,  22,  63,  35,  26,  30,   3,   3, 128,  77, 125,
        55,  92,  51, 116,  65,  10,  52,  35,  23,   3,  98,   1,  99,
       104, 102,  60, 127,  84,  76,  93,  39,  42,  72,  79,  11,  13,
        39,  27, 111, 100,  82,  87,   7,  21,  96, 105,  11, 118, 122,
        71, 119,  46, 122,  89,   8,  53,  48,  49,  59,  72,  29, 102,
        76,  89, 127, 128, 102,  46,   2,  80,  37, 110, 103,  20,  68,
        80,  91, 110,  70,  11,  74, 127,  21, 101])

In [None]:
# Decode the first input sequence and the ids sampled from the untrained model.
print("Input: \n", repr("".join(idx2char[input_example_batch[0]])))
print()
print("Next Char Predictions: \n", repr("".join(idx2char[sampled_indices])))

Input: 
 'ackup\r\nIn case you were wondering, The Galaxy S8’s Always-on home button won’t burn-in\r\nSamsung will'

Next Char Predictions: 
 '|y\u200b6CO)u\ufeff%£=T″\u200b‘4`C8=!!�n㎃W\x80S€b(TC5!è\ré—‑\\\ufeffum\x93GJip)+G9\u202aಠsx%3¯‘)⇧✔h≈N✔z&UPQ[i;‑mz\ufeff�‑N qE…–2eq~…g)k\ufeff3\u200b'


**Train the model**

In [None]:
def loss(labels, logits):
    """Sparse categorical cross-entropy between integer labels and raw logits.

    `from_logits=True` is passed because the values given as `logits` are
    treated as unnormalized scores rather than probabilities.
    """
    return tf.keras.losses.sparse_categorical_crossentropy(
        y_true=labels, y_pred=logits, from_logits=True)

# Evaluate the loss on the example batch from the untrained model; the mean
# over all returned loss values is printed as a single number.
example_batch_loss = loss(target_example_batch, example_batch_predictions)
print("Prediction shape: ", example_batch_predictions.shape, " # (batch_size, sequence_length, vocab_size)")
print("scalar_loss:      ", example_batch_loss.numpy().mean())

Prediction shape:  (64, 100, 129)  # (batch_size, sequence_length, vocab_size)
scalar_loss:       4.8596144


In [None]:
# Configure training: Adam optimizer with the notebook's loss() function.
model.compile(optimizer='adam', loss=loss)

**Configure checkpoints**

In [None]:
# Where checkpoints are written.
checkpoint_dir = './training_checkpoints'
# Per-checkpoint file prefix; the {epoch} placeholder is filled in at save time.
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt_{epoch}")

# Callback that saves only the model weights to that prefix.
checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    checkpoint_prefix, save_weights_only=True)

**Execute the training**

In [None]:
# Number of passes over the training dataset.
EPOCHS = 10

In [None]:
# Train for EPOCHS epochs with the checkpoint callback attached.
history = model.fit(dataset, epochs=EPOCHS, callbacks=[checkpoint_callback])

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


**Generate the text**

In [None]:
# Bare expression so the notebook displays the most recent checkpoint in checkpoint_dir.
tf.train.latest_checkpoint(checkpoint_dir)

'./training_checkpoints/ckpt_10'

In [None]:
# Rebuild the same architecture with a batch size of 1 for generation, then
# restore the weights from the most recent checkpoint.
model = build_model(
    vocab_size=vocab_size,
    embedding_dim=embedding_dim,
    rnn_units=rnn_units,
    batch_size=1)
model.load_weights(tf.train.latest_checkpoint(checkpoint_dir))
model.build(tf.TensorShape([1, None]))

In [None]:
# Print the summary of the rebuilt batch-size-1 model.
model.summary()

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (1, None, 256)            33024     
_________________________________________________________________
gru_2 (GRU)                  (1, None, 1024)           3938304   
_________________________________________________________________
dense_2 (Dense)              (1, None, 129)            132225    
Total params: 4,103,553
Trainable params: 4,103,553
Non-trainable params: 0
_________________________________________________________________


**The Prediction Summary**

In [None]:
def generate_text(model, start_string, num_generate=1000, temperature=1.0):
    """Generate text one character at a time from a trained model.

    Args:
        model: The model to sample from. It is called with a batch holding a
            single sequence of character ids, and `reset_states()` is called
            on it before generation starts.
        start_string: Non-empty seed text. Every character must be a key of
            the notebook-level `char2idx` mapping.
        num_generate: Number of characters to generate (default 1000).
        temperature: Positive divisor applied to the model outputs before
            sampling. Low temperature results in more predictable text,
            higher temperature in more surprising text (default 1.0).

    Returns:
        `start_string` followed by the generated characters.

    Raises:
        ValueError: If `start_string` is empty or `temperature` is not positive.
        KeyError: If `start_string` contains a character missing from `char2idx`.
    """
    # Validate up front so bad arguments fail with a clear message instead of
    # an obscure error (or silently degenerate sampling) deep inside the loop.
    if not start_string:
        raise ValueError('start_string must contain at least one character')
    if temperature <= 0:
        raise ValueError('temperature must be positive, got {!r}'.format(temperature))
    unknown_chars = sorted(set(start_string) - set(char2idx))
    if unknown_chars:
        raise KeyError(
            'start_string contains characters not in the vocabulary: {!r}'.format(unknown_chars))

    # Converting our start string to numbers (vectorizing) and adding a batch axis
    input_eval = [char2idx[s] for s in start_string]
    input_eval = tf.expand_dims(input_eval, 0)

    # Generated characters, collected here and joined at the end
    text_generated = []

    # Here batch size == 1
    model.reset_states()
    for _ in range(num_generate):
        predictions = model(input_eval)
        # remove the batch dimension
        predictions = tf.squeeze(predictions, 0)

        # using a categorical distribution to predict the character returned by the model;
        # only the sample for the last time step is kept
        predictions = predictions / temperature
        predicted_id = tf.random.categorical(predictions, num_samples=1)[-1, 0].numpy()

        # Pass the predicted character as the next input to the model
        # along with the previous hidden state
        input_eval = tf.expand_dims([predicted_id], 0)

        text_generated.append(idx2char[predicted_id])

    return start_string + ''.join(text_generated)

In [None]:
# Generate and print a sample seeded with a short start string.
# NOTE(review): the "ROMEO: " seed does not resemble the corpus downloaded in
# this notebook (a Reddit Apple/Android titles file); consider a seed that
# looks like the training data.
print(generate_text(model, start_string=u"ROMEO: "))

ROMEO: …ツ:5 Pleat mack: ishole ass fontert to customize Kodm'
Ebersiging tol mocont"'s usnet
Apple Shentis
Wherogbe with Appleald Pixtaled oring op redericaly RAgPle adr'tedat prain ifforele
Scarie 5.33
Apple Mcamterts: I -iznow Whith 834 2018 iftil
Winde reanch than'("saner spoftchine thaby
Googha Appronen's sAppleading uroms iOS 110 Nea jacked foo Anroid -ntiod Phaserm Andrigid
isPhall Ganable of to ines, to icat
Vish With Mong Uss Couries an S7 sigre reerounding, issung Starims GIcrook toy PrDe, ane inated Bang approifilistablation for is to Apple Mustages repped
Androude hoved levine LiMfamp:
The's Hor 27 riotur shortidays gow Apple sontcod 4h-gayee
Reaplup unde's new ixel iPP. bo foured reweroit te perouccy infia cam juscass off 200ine excin a vinture in aly Sapplo des LEiPhopa Havew', ther chone wertatabe tiof jutin ".".
Ta is to 'neron in Banthrtu the sirfars op nethamed
Google Plakevs coun cagas focksurem wable iat vrot is upsing
OS rellengointado singet ressihnt

**Advanced: Customized Training**

In [None]:
# Fresh, untrained model with the training batch size for the custom loop.
model = build_model(len(vocab), embedding_dim, rnn_units, batch_size=BATCH_SIZE)



In [None]:
# Adam optimizer with default settings; train_step uses it to apply gradients.
optimizer = tf.keras.optimizers.Adam()

In [None]:
@tf.function
def train_step(inp, target):
    """Run one optimization step on a single batch and return its mean loss.

    The forward pass is recorded on a gradient tape, the sparse categorical
    cross-entropy (computed from logits) is averaged to a scalar, and the
    resulting gradients are applied to the notebook-level `model` through the
    notebook-level `optimizer`.
    """
    with tf.GradientTape() as recorder:
        logits = model(inp)
        per_step_loss = tf.keras.losses.sparse_categorical_crossentropy(
            target, logits, from_logits=True)
        batch_loss = tf.reduce_mean(per_step_loss)

    weights = model.trainable_variables
    gradients = recorder.gradient(batch_loss, weights)
    optimizer.apply_gradients(zip(gradients, weights))

    return batch_loss

In [None]:
# Custom training loop: EPOCHS passes over `dataset`, one train_step per batch.
EPOCHS = 10

# Progress-line format; defined once because it never changes between batches.
template = 'Epoch {} Batch {} Loss {}'

for epoch in range(EPOCHS):
    start = time.time()

    # resetting the hidden state at the start of every epoch
    model.reset_states()

    for (batch_n, (inp, target)) in enumerate(dataset):
        # Named batch_loss rather than loss so the notebook-level loss()
        # function is not overwritten by a tensor (which would break re-running
        # the model.compile(..., loss=loss) cell).
        batch_loss = train_step(inp, target)

        if batch_n % 100 == 0:
            print(template.format(epoch + 1, batch_n, batch_loss))

    # saving (checkpoint) the model every 5 epochs.
    # The file is named with the 1-based epoch number so it matches the epoch
    # shown in the log lines and the condition above.
    if (epoch + 1) % 5 == 0:
        model.save_weights(checkpoint_prefix.format(epoch=epoch + 1))

    print('Epoch {} Loss {:.4f}'.format(epoch + 1, batch_loss))
    print('Time taken for 1 epoch {} sec\n'.format(time.time() - start))

# Final save, named after the total number of completed epochs. Using EPOCHS
# (not the loop variable) keeps this line valid even if the loop body never ran.
model.save_weights(checkpoint_prefix.format(epoch=EPOCHS))

Epoch 1 Batch 0 Loss 4.860799789428711
Epoch 1 Loss 3.5946
Time taken for 1 epoch 117.60727190971375 sec

Epoch 2 Batch 0 Loss 3.5842597484588623
Epoch 2 Loss 3.1275
Time taken for 1 epoch 115.94977259635925 sec

Epoch 3 Batch 0 Loss 3.090139150619507
Epoch 3 Loss 2.8029
Time taken for 1 epoch 116.42076849937439 sec

Epoch 4 Batch 0 Loss 2.814554452896118
Epoch 4 Loss 2.6522
Time taken for 1 epoch 118.30856108665466 sec

Epoch 5 Batch 0 Loss 2.660689353942871
Epoch 5 Loss 2.5688
Time taken for 1 epoch 113.33480215072632 sec

Epoch 6 Batch 0 Loss 2.5623369216918945
Epoch 6 Loss 2.4719
Time taken for 1 epoch 116.50770401954651 sec

Epoch 7 Batch 0 Loss 2.521909236907959
Epoch 7 Loss 2.3924
Time taken for 1 epoch 118.69168257713318 sec

Epoch 8 Batch 0 Loss 2.431535482406616
Epoch 8 Loss 2.3977
Time taken for 1 epoch 123.91789102554321 sec

Epoch 9 Batch 0 Loss 2.353118896484375
Epoch 9 Loss 2.2645
Time taken for 1 epoch 130.6286175251007 sec

Epoch 10 Batch 0 Loss 2.294604778289795
Epoch