In [2]:
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F

import torchtext
from torch.utils.data import DataLoader, Subset

import matplotlib.pyplot as plt
from IPython import display
display.set_matplotlib_formats('svg')

In [3]:
!curl -O https://www.gutenberg.org/files/1268/1268-0.txt

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 1143k  100 1143k    0     0   299k      0  0:00:03  0:00:03 --:--:--  299k


In [4]:
with open('1268-0.txt', 'r', encoding = 'utf8') as fp:
  text = fp.read()
start_indx = text.find('THE MYSTERIOUS ISLAND')
end_indx = text.find('End of the Project Gutenberg')
text = text[start_indx:end_indx]

In [5]:
char_set = set(text)
print(len(text), len(char_set))

1130711 85


In [6]:
chars_sorted = sorted(char_set)
chars_array = np.array(chars_sorted)
print(len(chars_array))

85


In [7]:
char2int = {ch:i for i, ch in enumerate(chars_sorted)}
print(char2int)

{'\n': 0, ' ': 1, '!': 2, '"': 3, '$': 4, '%': 5, '&': 6, "'": 7, '(': 8, ')': 9, '*': 10, ',': 11, '-': 12, '.': 13, '/': 14, '0': 15, '1': 16, '2': 17, '3': 18, '4': 19, '5': 20, '6': 21, '7': 22, '8': 23, '9': 24, ':': 25, ';': 26, '=': 27, '?': 28, 'A': 29, 'B': 30, 'C': 31, 'D': 32, 'E': 33, 'F': 34, 'G': 35, 'H': 36, 'I': 37, 'J': 38, 'K': 39, 'L': 40, 'M': 41, 'N': 42, 'O': 43, 'P': 44, 'Q': 45, 'R': 46, 'S': 47, 'T': 48, 'U': 49, 'V': 50, 'W': 51, 'X': 52, 'Y': 53, 'Z': 54, 'a': 55, 'b': 56, 'c': 57, 'd': 58, 'e': 59, 'f': 60, 'g': 61, 'h': 62, 'i': 63, 'j': 64, 'k': 65, 'l': 66, 'm': 67, 'n': 68, 'o': 69, 'p': 70, 'q': 71, 'r': 72, 's': 73, 't': 74, 'u': 75, 'v': 76, 'w': 77, 'x': 78, 'y': 79, 'z': 80, '‘': 81, '’': 82, '“': 83, '”': 84}


In [8]:
text_encoded = np.array([char2int[ch] for ch in text], dtype = np.int32)
print(text_encoded[:15], text[:15]) 
print(chars_array[text_encoded[:15]], chars_array[:15])

[48 36 33  1 41 53 47 48 33 46 37 43 49 47  1] THE MYSTERIOUS 
['T' 'H' 'E' ' ' 'M' 'Y' 'S' 'T' 'E' 'R' 'I' 'O' 'U' 'S' ' '] ['\n' ' ' '!' '"' '$' '%' '&' "'" '(' ')' '*' ',' '-' '.' '/']


In [9]:
for ex in text_encoded[:5]:
  print(f'{ex} : {chars_array[ex]}')

48 : T
36 : H
33 : E
1 :  
41 : M


In [10]:
seq_length = 40
chunk_size = seq_length + 1
text_chunks = [text_encoded[i: i+ chunk_size] for i in range(len(text_encoded)-chunk_size)]

In [11]:
text_chunks[:4]

[array([48, 36, 33,  1, 41, 53, 47, 48, 33, 46, 37, 43, 49, 47,  1, 37, 47,
        40, 29, 42, 32,  1, 10, 10, 10,  0,  0,  0,  0,  0, 48, 36, 33,  1,
        41, 53, 47, 48, 33, 46, 37], dtype=int32),
 array([36, 33,  1, 41, 53, 47, 48, 33, 46, 37, 43, 49, 47,  1, 37, 47, 40,
        29, 42, 32,  1, 10, 10, 10,  0,  0,  0,  0,  0, 48, 36, 33,  1, 41,
        53, 47, 48, 33, 46, 37, 43], dtype=int32),
 array([33,  1, 41, 53, 47, 48, 33, 46, 37, 43, 49, 47,  1, 37, 47, 40, 29,
        42, 32,  1, 10, 10, 10,  0,  0,  0,  0,  0, 48, 36, 33,  1, 41, 53,
        47, 48, 33, 46, 37, 43, 49], dtype=int32),
 array([ 1, 41, 53, 47, 48, 33, 46, 37, 43, 49, 47,  1, 37, 47, 40, 29, 42,
        32,  1, 10, 10, 10,  0,  0,  0,  0,  0, 48, 36, 33,  1, 41, 53, 47,
        48, 33, 46, 37, 43, 49, 47], dtype=int32)]

In [12]:
from torch.utils.data import Dataset

In [13]:
class TextDataset(Dataset):
  def __init__(self, text_chunks):
    self.text_chunks = text_chunks
  
  def __len__(self):
    return len(self.text_chunks)
  
  def __getitem__(self, idx):
    text_chunk = self.text_chunks[idx]
    return text_chunk[:-1].long(), text_chunk[1:].long()

seq_dataset = TextDataset(torch.tensor(text_chunks))

  seq_dataset = TextDataset(torch.tensor(text_chunks))


In [14]:
for i, (seq, target) in enumerate(seq_dataset):
  print(f'Input: { repr("".join(chars_array[seq])) } ')
  print(f'Target: {repr("".join(chars_array[target]))}')
  if i == 2: break

Input: 'THE MYSTERIOUS ISLAND ***\n\n\n\n\nTHE MYSTER' 
Target: 'HE MYSTERIOUS ISLAND ***\n\n\n\n\nTHE MYSTERI'
Input: 'HE MYSTERIOUS ISLAND ***\n\n\n\n\nTHE MYSTERI' 
Target: 'E MYSTERIOUS ISLAND ***\n\n\n\n\nTHE MYSTERIO'
Input: 'E MYSTERIOUS ISLAND ***\n\n\n\n\nTHE MYSTERIO' 
Target: ' MYSTERIOUS ISLAND ***\n\n\n\n\nTHE MYSTERIOU'


In [15]:
batch_size = 64
seq_dl = DataLoader(seq_dataset, batch_size=batch_size, shuffle = True, drop_last=True)

In [16]:
class Rnn(nn.Module):
  def __init__(self, vocab_size, embed_dims, rnn_hidden_size, printtoggle=False):
    super().__init__()
    self.embedding = nn.Embedding(vocab_size, embed_dims)
    self.rnn_hidden_size = rnn_hidden_size
    self.rnn = nn.LSTM(embed_dims, rnn_hidden_size, batch_first = True)
    self.fc = nn.Linear(rnn_hidden_size, vocab_size)
    self.printtoggle = printtoggle
    
  def forward(self, x, hidden, cell):
    if self.printtoggle: print(f'X shape: {x.shape}')
    out = self.embedding(x).unsqueeze(1)
    if self.printtoggle: print(f'Embedding output shape: {out.shape}')
    if self.printtoggle: print(f'Embedding hidden, cell shape: {hidden.shape, cell.shape}')
    out, (hidden, cell) = self.rnn(out, (hidden, cell))
    if self.printtoggle: print(f'RNN-1 output shape: {out.shape, hidden.shape, cell.shape}')

    out = self.fc(out).reshape(out.size(0), -1)
    if self.printtoggle: print(f'Final output shape: {out.shape}')
    return out, hidden, cell 
    
  def init_hidden(self, batch_size):
    hidden = torch.zeros(1, batch_size, self.rnn_hidden_size)
    cell = torch.zeros(1, batch_size, self.rnn_hidden_size)
    return hidden, cell



In [17]:
vocab_size = len(chars_array)
embed_dim = 256
rnn_hidden_size = 512
torch.manual_seed(1)
model = Rnn(vocab_size, embed_dim, rnn_hidden_size, True)
loss_fun = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr = .001)
print(model)

Rnn(
  (embedding): Embedding(85, 256)
  (rnn): LSTM(256, 512, batch_first=True)
  (fc): Linear(in_features=512, out_features=85, bias=True)
)


In [18]:
num_epochs = 5
for epoch in range(num_epochs):
  hidden, cell = model.init_hidden(seq_length)
  seq_batch, target_batch = next(iter(seq_dl))
  optimizer.zero_grad()
  loss = 0
  for c in range(batch_size):
    pred, hidden, cell = model(seq_batch[c, :], hidden, cell)
    loss += loss_fun(pred, target_batch[c, :])
  loss.backward()
  optimizer.step()
  loss = loss.item()/seq_length
  if epoch % 500 == 0:
    print(f'Epoch {epoch} loss: {loss:.4f}')

X shape: torch.Size([40])
Embedding output shape: torch.Size([40, 1, 256])
Embedding hidden, cell shape: (torch.Size([1, 40, 512]), torch.Size([1, 40, 512]))
RNN-1 output shape: (torch.Size([40, 1, 512]), torch.Size([1, 40, 512]), torch.Size([1, 40, 512]))
Final output shape: torch.Size([40, 85])
X shape: torch.Size([40])
Embedding output shape: torch.Size([40, 1, 256])
Embedding hidden, cell shape: (torch.Size([1, 40, 512]), torch.Size([1, 40, 512]))
RNN-1 output shape: (torch.Size([40, 1, 512]), torch.Size([1, 40, 512]), torch.Size([1, 40, 512]))
Final output shape: torch.Size([40, 85])
X shape: torch.Size([40])
Embedding output shape: torch.Size([40, 1, 256])
Embedding hidden, cell shape: (torch.Size([1, 40, 512]), torch.Size([1, 40, 512]))
RNN-1 output shape: (torch.Size([40, 1, 512]), torch.Size([1, 40, 512]), torch.Size([1, 40, 512]))
Final output shape: torch.Size([40, 85])
X shape: torch.Size([40])
Embedding output shape: torch.Size([40, 1, 256])
Embedding hidden, cell shape: (

In [45]:
from torch.distributions.categorical import Categorical


In [62]:
def sample(model, starting_str,len_generated_text=500,scale_factor=1.0):
  encoded_input = torch.tensor( [char2int[s] for s in starting_str])
  encoded_input = torch.reshape(encoded_input, (1, -1))
  generated_str = starting_str
  model.eval()
  hidden, cell = model.init_hidden(1)
  for c in range(len(starting_str)-1):
    _, hidden, cell = model(encoded_input[:, c].view(1), hidden, cell)
  last_char = encoded_input[:, -1]
  for i in range(len_generated_text):
    logits, hidden, cell = model(last_char.view(1), hidden, cell)
    logits = torch.squeeze(logits, 0)
    scaled_logits = logits * scale_factor
    m = Categorical(logits=scaled_logits)
    last_char = m.sample()
    generated_str += str(chars_array[last_char])
  return generated_str

In [63]:
print(sample(model, starting_str='The island'))

The islande ged s!”
d o i acon--bere hists t, laffoa ald

forepen e the tre, in
fleg oulsare recrr’s waymondit wind t werug iceblopererid titwanof at ang Herin.
aved ther imethere the pthofictor-fer caive me hintofowalen Sowhas!” Theera is re has Noforoslaislifthintls banowhe t, ckexererincurevee “Werr, oulas by; that, QUngof Mbexilve, wo la Hal

me t roulererysl w s streapamonthes tee berepinet be rber ane. sarod pa tol Theng hipre d g ha amelffinomed cked tupors. hera we h g il wey f anowerapt ‘Ong oree


In [57]:
print(sample(model, starting_str='The island'))

RuntimeError: ignored

In [40]:
str1 = 'The island'
print(str1, encoded_input)
for c in range(len(str1) - 1):
  print(encoded_input[:, c])
  print(encoded_input[:, c].view(1))

The island tensor([[48, 62, 59,  1, 63, 73, 66, 55, 68, 58]])
tensor([48])
tensor([48])
tensor([62])
tensor([62])
tensor([59])
tensor([59])
tensor([1])
tensor([1])
tensor([63])
tensor([63])
tensor([73])
tensor([73])
tensor([66])
tensor([66])
tensor([55])
tensor([55])
tensor([68])
tensor([68])


In [37]:
encoded_input = torch.tensor([char2int[s] for s in str1])
print(str1)
print(encoded_input)
print(encoded_input.shape)
encoded_input = torch.reshape(encoded_input, (1, -1))
print(encoded_input.shape)
print(encoded_input)

The island
tensor([48, 62, 59,  1, 63, 73, 66, 55, 68, 58])
torch.Size([10])
torch.Size([1, 10])
tensor([[48, 62, 59,  1, 63, 73, 66, 55, 68, 58]])


In [51]:
seq_batch, target_batch = next(iter(seq_dl))
for c in range(seq_length):
  print(f'Input: { repr("".join(chars_array[seq_batch[c, :]])) } ')
  print(f'Target: {repr("".join(chars_array[target_batch[c, :]]))}')
  print(seq_batch[c,:], seq_batch[c,:].shape)

Input: 'Hemisphere. But this forest was only com' 
Target: 'emisphere. But this forest was only comp'
tensor([36, 59, 67, 63, 73, 70, 62, 59, 72, 59, 13,  1, 30, 75, 74,  1, 74, 62,
        63, 73,  1, 60, 69, 72, 59, 73, 74,  1, 77, 55, 73,  1, 69, 68, 66, 79,
         1, 57, 69, 67]) torch.Size([40])
Input: 'o place in the bare,\nstraight cliff, cou' 
Target: ' place in the bare,\nstraight cliff, coul'
tensor([69,  1, 70, 66, 55, 57, 59,  1, 63, 68,  1, 74, 62, 59,  1, 56, 55, 72,
        59, 11,  0, 73, 74, 72, 55, 63, 61, 62, 74,  1, 57, 66, 63, 60, 60, 11,
         1, 57, 69, 75]) torch.Size([40])
Input: 'al Project Gutenberg-tm electronic work ' 
Target: 'l Project Gutenberg-tm electronic work i'
tensor([55, 66,  1, 44, 72, 69, 64, 59, 57, 74,  1, 35, 75, 74, 59, 68, 56, 59,
        72, 61, 12, 74, 67,  1, 59, 66, 59, 57, 74, 72, 69, 68, 63, 57,  1, 77,
        69, 72, 65,  1]) torch.Size([40])
Input: 's under which the bodies had been found,' 
Target: ' under which the bodies ha