# Installation and Setup

In [1]:
import tensorflow as tf
print(tf.__version__)

2.4.1


In [2]:
import string
import requests

In [4]:
# Run `nvidia-smi` through the IPython shell escape; the captured output is a
# sequence of lines, which is joined into one string just below.
gpu_info = !nvidia-smi
gpu_info = '\n'.join(gpu_info)
# The substring 'failed' in the output is treated as "no GPU attached".
if gpu_info.find('failed') >=0:
  print('Select the Runtime > " Change runtime type" menu to enable a GPU accelerator,')
  print('and then re-execute this cell.')
else:
  print(gpu_info)

Mon Mar  8 22:48:57 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.56       Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla P100-PCIE...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   43C    P0    29W / 250W |      0MiB / 16280MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [7]:
# Report the total system RAM and flag whether this is a high-RAM runtime.
from psutil import virtual_memory

# total is reported in bytes; dividing by 1e9 gives decimal gigabytes (GB)
ram_gb = virtual_memory().total/1e9
print('Your runtime has {:.1f} GB of RAM\n'.format(ram_gb))

# below 20 GB the runtime is treated as a standard (non high-RAM) one
if ram_gb < 20:
  print('Re-select the high-RAM runtime in the Runtime Menu.')
else:
  print('You are using high-RAM runtime!')

Your  runtime has 27.4 BG of Ram

You are using high-RAM runtime!


# Data Preprocessing

In [8]:
# Download the Shakespeare text file.
# The timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() fails loudly on an HTTP error instead of letting an error
# page be treated as the corpus further down.
response = requests.get('http://ocw.mit.edu/ans7870/6/6.006/s08/lecturenotes/files/t8.shakespeare.txt', timeout=30)
response.raise_for_status()

In [9]:
# Preview only the beginning of the download; the full text is too large to
# render usefully as cell output.
response.text[:500]

Output hidden; open in https://colab.research.google.com to view.

In [10]:
# generate a list of strings
data = response.text.split('\n')

In [11]:

# get a look
data[0]

'This is the 100th Etext file presented by Project Gutenberg, and'

In [12]:
# data starts much later
data[253]

'  From fairest creatures we desire increase,'

In [13]:
# Skip the Project Gutenberg preamble: the text proper starts at line 253.
# Slicing a fresh split of the response (rather than `data` itself) keeps this
# cell idempotent, so re-running it cannot trim another 253 lines.
data = response.text.split('\n')[253:]

In [14]:
data[0]

'  From fairest creatures we desire increase,'

In [15]:
len(data)

124204

In [16]:
# Join all the lines into one string so it can be cleaned and tokenized.
# The guard makes the cell safe to re-run: joining an already joined string
# would insert a space between every character.
if not isinstance(data, str):
  data = ' '.join(data)
# Preview only the beginning; the whole text is too large to display
data[:500]

Output hidden; open in https://colab.research.google.com to view.

In [17]:
# remove punctuation
def clean_text(doc):
  """Turn raw text into a list of lowercase, purely alphabetic word tokens.

  The text is split on whitespace and every ``string.punctuation`` character
  is deleted from each piece. Pieces that are not entirely alphabetic after
  that (empty strings, numbers, mixed tokens) are discarded, and the
  remaining words are lower-cased.
  """
  # translation table that deletes every punctuation character
  strip_punct = str.maketrans('', '', string.punctuation)

  cleaned = []
  for raw in doc.split():
    word = raw.translate(strip_punct)
    # keep alphabetic words only
    if word.isalpha():
      cleaned.append(word.lower())
  return cleaned

In [18]:
tokens = clean_text(data)
print(tokens[:50])

['from', 'fairest', 'creatures', 'we', 'desire', 'increase', 'that', 'thereby', 'beautys', 'rose', 'might', 'never', 'die', 'but', 'as', 'the', 'riper', 'should', 'by', 'time', 'decease', 'his', 'tender', 'heir', 'might', 'bear', 'his', 'memory', 'but', 'thou', 'contracted', 'to', 'thine', 'own', 'bright', 'eyes', 'feedst', 'thy', 'lights', 'flame', 'with', 'selfsubstantial', 'fuel', 'making', 'a', 'famine', 'where', 'abundance', 'lies', 'thy']


In [19]:
# combine the tokens together
len(tokens)

898199

In [20]:
# check unique words
len(set(tokens))

27956

# Create the Data Sequences
We will use a fixed-length window of consecutive words to predict the word that follows it.

Each window holds 50 words, which are used to predict the next (51st) word.

In [21]:
# Each training line holds 50 context words followed by the word(s) to predict
num_to_predict = 1
length = 50 + num_to_predict

# Only the first 200,001 tokens are windowed, to keep training time manageable
max_tokens_used = 200001

# i is the exclusive end index of a window. It runs up to and including the
# last usable token position so that the final full window is not dropped.
last_end = min(len(tokens), max_tokens_used)

lines = []
for i in range(length, last_end + 1):
  # window of `length` consecutive tokens ending just before index i
  seq = tokens[i-length:i]
  # store the window as one space-separated line
  line = ' '.join(seq)
  lines.append(line)

In [22]:
print(len(lines))

199951


In [23]:
# look at a single sequence
lines[0]

'from fairest creatures we desire increase that thereby beautys rose might never die but as the riper should by time decease his tender heir might bear his memory but thou contracted to thine own bright eyes feedst thy lights flame with selfsubstantial fuel making a famine where abundance lies thy self'

In [24]:
tokens[0], tokens[50]

('from', 'self')

In [25]:
lines[1]

'fairest creatures we desire increase that thereby beautys rose might never die but as the riper should by time decease his tender heir might bear his memory but thou contracted to thine own bright eyes feedst thy lights flame with selfsubstantial fuel making a famine where abundance lies thy self thy'

In [26]:
# shift one token
tokens[1], tokens[51]

('fairest', 'thy')

# Tokenization

In [27]:
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM, Embedding
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Tokenizing and Embedding
Fit the tokenizer on the lines; each line is then encoded as a sequence of integers, with every unique word assigned its own integer.

The sequences will have a list of integer values created by the tokenizer.

In [28]:
# tokenization
# instantiate a Tokenizer object, build its vocabulary from the training lines,
# then encode every line as a list of integer word indices
tokenizer = Tokenizer()
tokenizer.fit_on_texts(lines)
sequences = tokenizer.texts_to_sequences(lines)

In [None]:
# sequences

In [29]:
sequences = np.array(sequences)

In [30]:
sequences[0]

array([   47,  1408,  1264,    37,   451,  1406,     9,  2766,  1158,
        1213,   171,   132,   269,    20,    24,     1,  4782,    87,
          30,    98,  4781,    18,   715,  1263,   171,   211,    18,
         829,    20,    27,  3807,     4,   214,   121,  1212,   153,
       13004,    31,  2765,  1847,    16, 13003, 13002,   754,     7,
        3806,    99,  2430,   466,    31,   307])

# Create x and y

The first 50 words of each line (x) are used as the input vector, and the 51st word (y) is the target to predict.
`sequences` is a two-dimensional array of rows and columns:
each row is one line of text; the first 50 columns form x and the 51st column is y.

In [31]:
# Every column but the last is the input context; the last column is the
# word to predict.
x = sequences[:, :-1]
y = sequences[:, -1]

In [32]:
x[0]

array([   47,  1408,  1264,    37,   451,  1406,     9,  2766,  1158,
        1213,   171,   132,   269,    20,    24,     1,  4782,    87,
          30,    98,  4781,    18,   715,  1263,   171,   211,    18,
         829,    20,    27,  3807,     4,   214,   121,  1212,   153,
       13004,    31,  2765,  1847,    16, 13003, 13002,   754,     7,
        3806,    99,  2430,   466,    31])

In [33]:
x[0].shape

(50,)

In [34]:
y[0]

307

In [35]:
tokenizer.word_index

{'the': 1,
 'and': 2,
 'i': 3,
 'to': 4,
 'of': 5,
 'you': 6,
 'a': 7,
 'my': 8,
 'that': 9,
 'in': 10,
 'is': 11,
 'not': 12,
 'it': 13,
 'for': 14,
 'me': 15,
 'with': 16,
 'he': 17,
 'his': 18,
 'your': 19,
 'but': 20,
 'be': 21,
 'this': 22,
 'have': 23,
 'as': 24,
 'him': 25,
 'so': 26,
 'thou': 27,
 'will': 28,
 'what': 29,
 'by': 30,
 'thy': 31,
 'no': 32,
 'are': 33,
 'all': 34,
 'her': 35,
 'do': 36,
 'we': 37,
 'if': 38,
 'our': 39,
 'or': 40,
 'shall': 41,
 'thee': 42,
 'which': 43,
 'on': 44,
 'lord': 45,
 'o': 46,
 'from': 47,
 'good': 48,
 'more': 49,
 'sir': 50,
 'was': 51,
 'they': 52,
 'well': 53,
 'at': 54,
 'would': 55,
 'when': 56,
 'now': 57,
 'come': 58,
 'love': 59,
 'th': 60,
 'than': 61,
 'am': 62,
 'then': 63,
 'she': 64,
 'their': 65,
 'them': 66,
 'how': 67,
 'enter': 68,
 'let': 69,
 'did': 70,
 'ill': 71,
 'hath': 72,
 'one': 73,
 'us': 74,
 'know': 75,
 'first': 76,
 'make': 77,
 'had': 78,
 'like': 79,
 'here': 80,
 'upon': 81,
 'there': 82,
 'man': 83,


In [36]:
# length of total vocabulary
len(tokenizer.word_index)

13008

In [37]:
vocab_size = len(tokenizer.word_index) + 1 # word indices start at 1, so one extra slot is needed for the unused index 0

In [38]:
vocab_size

13009

In [39]:
# number of unique words across all cleaned tokens of the full text
# the tokenizer vocabulary is smaller because the tokenizer was fit only on
# the lines built from roughly the first 200K tokens, not on the whole text
len(set(tokens))

27956

In [40]:
# One-hot encode the target word ids.
# The labels are read from the last column of `sequences` rather than from `y`
# itself, so re-running this cell cannot one-hot encode an already encoded array.
y = to_categorical(sequences[:, -1], num_classes=vocab_size)

In [41]:
x.shape[1]



50

In [42]:
seq_length = x.shape[1]
seq_length

50

# Build the LSTM Model

In [43]:
# Stack: embedding -> two LSTM layers -> dense hidden layer -> softmax over the vocabulary
model = Sequential([
  # embedding layer: a 50-dimensional vector per word index
  Embedding(input_dim=vocab_size, output_dim=50, input_length=seq_length),
  # first LSTM layer returns the whole sequence so the next LSTM gets one vector per step
  LSTM(units=100, return_sequences=True),
  # second LSTM layer
  LSTM(units=100),
  # dense hidden layer
  Dense(units=100, activation='relu'),
  # final layer: one output per vocabulary entry
  Dense(units=vocab_size, activation='softmax'),
])

model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 50, 50)            650450    
_________________________________________________________________
lstm (LSTM)                  (None, 50, 100)           60400     
_________________________________________________________________
lstm_1 (LSTM)                (None, 100)               80400     
_________________________________________________________________
dense (Dense)                (None, 100)               10100     
_________________________________________________________________
dense_1 (Dense)              (None, 13009)             1313909   
Total params: 2,115,259
Trainable params: 2,115,259
Non-trainable params: 0
_________________________________________________________________


In [44]:
# compile the model
# categorical_crossentropy matches the one-hot encoded targets produced by to_categorical
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

In [45]:
# train the model
# no validation data is passed, so the reported accuracy is measured on the training data only
model.fit(x,y, batch_size=256, epochs=100)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

<tensorflow.python.keras.callbacks.History at 0x7f06bd7e6890>

In [46]:
# create a seed line
lines[12343]

'home of love if i have ranged like him that travels i return again just to the time not with the time exchanged so that my self bring water for my stain never believe though in my nature reigned all frailties that besiege all kinds of blood that it could so'

In [47]:
seed_text = lines[12343]

In [54]:
# define a function for generation

def generate_text_seq(model, tokenizer, text_seq_length, seed_text, n_words):
  """Generate ``n_words`` words by repeatedly predicting the next word.

  Args:
    model: trained Keras model that outputs one score per word index.
    tokenizer: the fitted Tokenizer that was used to encode the training lines.
    text_seq_length: number of word ids the model takes as input.
    seed_text: starting text; every predicted word is appended to it before
      the next prediction.
    n_words: how many words to generate.

  Returns:
    The generated words joined by single spaces (the seed text is not included).
  """
  text = []

  for _ in range(n_words):
    # encode the running text and keep only the last text_seq_length word ids
    encoded = tokenizer.texts_to_sequences([seed_text])[0]
    encoded = pad_sequences([encoded], maxlen = text_seq_length, truncating = 'pre')

    # predict_classes() is deprecated and removed in later TensorFlow releases;
    # the argmax over predict() is the equivalent call. int() gives a plain
    # scalar instead of a one-element array.
    predicted_index = int(model.predict(encoded, verbose=0).argmax(axis=-1)[0])

    # index_word is the reverse of word_index, so the lookup no longer scans the
    # whole vocabulary; index 0 has no word and maps to '' as before
    predicted_word = tokenizer.index_word.get(predicted_index, '')

    seed_text = seed_text + ' ' + predicted_word
    text.append(predicted_word)
  return ' '.join(text)

In [55]:
generate_text_seq(model, tokenizer, seq_length, seed_text, 10)



'preposterously be stained to leave for the nonce octavia twixt'