In [51]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [52]:
import tensorflow as tf

In [53]:
path = 'shakespeare.txt'

In [54]:
# Read the whole corpus into memory as one string. The context manager
# closes the file handle as soon as the read finishes instead of leaving it
# open until garbage collection. No encoding is passed, so the platform
# default is used, exactly as before.
with open(path, 'r') as corpus_file:
    txt = corpus_file.read()

In [55]:
len(txt)

5445609

In [56]:
type(txt)

str

In [57]:
print(txt[10000:10300])

 sullied night,
    And all in war with Time for love of you,
    As he takes from you, I engraft you new.


                     16  
  But wherefore do not you a mightier way
  Make war upon this bloody tyrant Time?
  And fortify your self in your decay
  With means more blessed than my barren rhy


In [58]:
len(set(txt))

84

In [59]:
#sorted(set(txt))

In [60]:
characters = sorted(set(txt))

In [61]:
# Map every character of the vocabulary to its position in `characters`.
char_to_index = dict(zip(characters, range(len(characters))))

In [62]:
char_to_index['s']

74

In [63]:
# Inverse lookup (index -> character). Stored as a NumPy array so that an
# array of integer ids can be decoded in one indexing operation.
index_to_char = np.array(characters)

In [64]:
index_to_char[70]

'o'

In [65]:
t = ' i am student'

In [66]:
[i for i in t]

[' ', 'i', ' ', 'a', 'm', ' ', 's', 't', 'u', 'd', 'e', 'n', 't']

### Now converting all text into numeric indices


In [67]:

# Encode the full corpus: one integer id per character, in original order.
coded_text = np.array([char_to_index[c] for c in txt])

In [68]:
len(coded_text)

5445609

In [69]:
coded_text.shape

(5445609,)

## Creating batches

In [70]:
# Use training sequences of 120 characters.

seq_len = 120
# Chunks are cut at seq_len + 1 characters so that the input and the
# one-step-shifted target taken from a chunk are both seq_len long.
total_num_seq = len(txt)// (seq_len+1)

In [71]:
total_num_seq

45005

In [72]:
char_dataset = tf.data.Dataset.from_tensor_slices(coded_text)

In [73]:
char_dataset.take(10)

<TakeDataset shapes: (), types: tf.int32>

In [74]:
# Group the character stream into chunks of seq_len + 1 ids; the trailing
# partial chunk is dropped so every chunk has the same length.
sequence = char_dataset.batch(seq_len+1, drop_remainder = True)

In [75]:
def create_seq_target(seq):
    """Split one chunk into an (input, target) pair shifted by one position.

    The input is every element except the last; the target is every element
    except the first, so target[i] is the element that follows input[i].

    Args:
        seq: a sliceable sequence (e.g. a 1-D tensor of character ids).

    Returns:
        A tuple ``(seq[:-1], seq[1:])``.
    """
    return seq[:-1], seq[1:]

In [93]:
dataset = sequence.map(create_seq_target)

In [94]:
# Sanity check: show the first (input, target) pair both as ids and as text.
for input_txt, target_txt in dataset.take(1):
    input_ids = input_txt.numpy()
    target_ids = target_txt.numpy()

    print(input_ids)
    print(''.join(index_to_char[input_ids]))
    print('\n')
    print(target_ids)
    # Decode the *target* ids here. The previous version decoded the input
    # ids a second time, so the printed "target" text was just the input.
    print(''.join(index_to_char[target_ids]))

[ 0  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1 12  0
  1  1 31 73 70 68  1 61 56 64 73 60 74 75  1 58 73 60 56 75 76 73 60 74
  1 78 60  1 59 60 74 64 73 60  1 64 69 58 73 60 56 74 60  8  0  1  1 45
 63 56 75  1 75 63 60 73 60 57 80  1 57 60 56 76 75 80  5 74  1 73 70 74
 60  1 68 64 62 63 75  1 69 60 77 60 73  1 59 64 60  8  0  1  1 27 76 75]

                     1
  From fairest creatures we desire increase,
  That thereby beauty's rose might never die,
  But


[ 1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1 12  0  1
  1 31 73 70 68  1 61 56 64 73 60 74 75  1 58 73 60 56 75 76 73 60 74  1
 78 60  1 59 60 74 64 73 60  1 64 69 58 73 60 56 74 60  8  0  1  1 45 63
 56 75  1 75 63 60 73 60 57 80  1 57 60 56 76 75 80  5 74  1 73 70 74 60
  1 68 64 62 63 75  1 69 60 77 60 73  1 59 64 60  8  0  1  1 27 76 75  1]

                     1
  From fairest creatures we desire increase,
  That thereby beauty's rose might never die,
  But


In [95]:
batch_size = 128

In [96]:
buffer_size = 10000

# Build the training pipeline from `sequence` instead of re-assigning on top
# of the existing `dataset`. On a fresh run the result is the same, but the
# cell is now safe to re-run: it no longer shuffles and batches data that a
# previous execution of this cell had already batched.
dataset = (
    sequence
    .map(create_seq_target)
    .shuffle(buffer_size)
    .batch(batch_size, drop_remainder=True)
)



In [97]:
dataset

<BatchDataset shapes: ((128, 120), (128, 120)), types: (tf.int32, tf.int32)>

## Creating a model
- set up loss function
- Create Model
  - Embedding 
  - GRU
  - Dense

In [98]:
# Number of distinct characters. Reuse `characters` (the sorted vocabulary
# that char_to_index / index_to_char are built from) rather than scanning the
# whole corpus again, so the size always matches the lookup tables.
vocab_size = len(characters)

In [99]:
embed_dim = 64  

In [100]:
# Number of units in the GRU layer.
# NOTE(review): 1026 looks like a typo for 1024 — confirm before changing.
rnn_neurons = 1026

In [101]:
from tensorflow.keras.losses import sparse_categorical_crossentropy

In [103]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, GRU, Dense

In [102]:
def sparse_cat_losses(y_true, y_pred):
    """Sparse categorical cross-entropy computed on raw logits.

    Thin wrapper that fixes ``from_logits=True``, since the model's final
    Dense layer is added without an activation.

    Args:
        y_true: integer class ids.
        y_pred: unnormalised scores (logits) from the model.

    Returns:
        Whatever ``sparse_categorical_crossentropy`` returns for these inputs.
    """
    loss = sparse_categorical_crossentropy(y_true, y_pred, from_logits=True)
    return loss


In [105]:
def create_model(vocab_size, embed_dim, rnn_neurons, batch_size):
    """Build and compile the character-level GRU network.

    Layers, in order:
      1. Embedding: ``vocab_size`` ids -> ``embed_dim``-wide vectors, with the
         input shape fixed to ``[batch_size, None]``.
      2. GRU: ``rnn_neurons`` units, stateful, returning the full sequence.
      3. Dense: ``vocab_size`` outputs with no activation (raw logits).

    The model is compiled with the 'adam' optimizer and ``sparse_cat_losses``.

    Args:
        vocab_size: number of distinct character ids.
        embed_dim: width of the embedding vectors.
        rnn_neurons: number of GRU units.
        batch_size: fixed batch size baked into the input shape.

    Returns:
        The compiled Sequential model.
    """
    layers = [
        Embedding(vocab_size, embed_dim, batch_input_shape=[batch_size, None]),
        GRU(
            rnn_neurons,
            return_sequences=True,
            stateful=True,
            recurrent_initializer='glorot_uniform',
        ),
        Dense(vocab_size),
    ]

    network = Sequential(layers)
    network.compile(optimizer='adam', loss=sparse_cat_losses)
    return network

In [106]:
# Instantiate the training model from the hyperparameters defined above.
model = create_model(vocab_size, embed_dim, rnn_neurons, batch_size)

In [107]:
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (128, None, 64)           5376      
_________________________________________________________________
gru (GRU)                    (128, None, 1026)         3361176   
_________________________________________________________________
dense (Dense)                (128, None, 84)           86268     
Total params: 3,452,820
Trainable params: 3,452,820
Non-trainable params: 0
_________________________________________________________________


## Training the model 

In [None]:
# Train for a fixed number of passes over the shuffled, batched dataset.
# NOTE(review): the GRU is created with stateful=True while the batches are
# shuffled — confirm that carrying state between batches is intended.

epochs = 10
model.fit(dataset, epochs = epochs)

Epoch 1/10