In [1]:
from tqdm import tqdm
import torch.nn as nn
import numpy as np
import torch
from torch.utils.data import Dataset
from torch.utils.data import DataLoader

### Get the data and process
- This is the Mysterious island found in Project Gutenberg.

In [3]:
## Reading and processing text
with open('data/mysterious_island.txt', 'r', encoding="utf8") as fp:
    text=fp.read()

# Get the index of 'THE MYSTERIOUS ISLAND' or 'The Mysterious Island'
start_indx = text.find('THE MYSTERIOUS ISLAND') 
# Get the index of 'End of the Project Gutenberg'
end_indx = text.find('End of the Project Gutenberg')
# Set text to the text between start and end idx.
text = text[start_indx:end_indx] 
# Get the unique set of characters.
char_set = set(text)
print('Total Length:', len(text))
print('Unique Characters:', len(char_set))
assert(len(text) == 1130711)
assert(len(char_set) == 85)

Total Length: 1130711
Unique Characters: 85


### Tokenize and get other helpers
- We do this manually since everything is character based.

In [4]:
# The universe of words
chars_sorted = sorted(char_set)

# Effectively, these maps are the tokenizer.
# Map each char to a unique int
char2int = {char: i for i, char in enumerate(chars_sorted)} 
# Reverse
int2char = np.array(chars_sorted) 

# Tokenize the entire corpus
text_encoded = np.array([char2int[char] for char in text], dtype=np.int32)

print('Text encoded shape: ', text_encoded.shape)

print(text[:15], '     == Encoding ==> ', text_encoded[:15])
print(text_encoded[15:21], ' == Reverse  ==> ', ''.join(int2char[text_encoded[15:21]]))

Text encoded shape:  (1130711,)
THE MYSTERIOUS       == Encoding ==>  [48 36 33  1 41 53 47 48 33 46 37 43 49 47  1]
[37 47 40 29 42 32]  == Reverse  ==>  ISLAND


#### Test Examples

In [5]:
print('Text encoded shape: ', text_encoded.shape)
print(text[:15], '     == Encoding ==> ', text_encoded[:15])
print(text_encoded[15:21], ' == Reverse  ==> ', ''.join(int2char[text_encoded[15:21]]))

Text encoded shape:  (1130711,)
THE MYSTERIOUS       == Encoding ==>  [48 36 33  1 41 53 47 48 33 46 37 43 49 47  1]
[37 47 40 29 42 32]  == Reverse  ==>  ISLAND


In [6]:
assert(
    np.array_equal(
    text_encoded[:15],
        [48, 36, 33, 1, 41, 53, 47, 48, 33, 46, 37, 43, 49, 47,  1]
    )
)

### Process the data and get the data loader

In [7]:
seq_length = 40
chunk_size = seq_length + 1

# Break up the data into chunks of size 41. This will be used to get (x, y) pairs.
text_chunks = [list(text_encoded[i:i+chunk_size]) for i in range(0, len(text_encoded)-41)]

In [8]:
class TextDataset(Dataset):
    def __init__(self, text_chunks):
        self.text_chunks = text_chunks

    def __len__(self):
        return len(self.text_chunks)
    
    def __getitem__(self, idx):
        # Get the text chunk at index idx.
        text_chunk = self.text_chunks[idx]
        # Return (x, y) where x has length 40 and y has length 40.
        # y should be x shifted by 1 time.
        return (text_chunk[:-1],text_chunk[1:])
    
seq_dataset = TextDataset(torch.tensor(text_chunks))

In [9]:
for i, (seq, target) in enumerate(seq_dataset):
    # 40 characters for source and target
    print(seq.shape, target.shape)
    print('Input (x):', repr(''.join(int2char[seq])))
    print('Target (y):', repr(''.join(int2char[target])))
    print()
    if i == 1:
        break 

torch.Size([40]) torch.Size([40])
Input (x): 'THE MYSTERIOUS ISLAND ***\n\n\n\n\nTHE MYSTER'
Target (y): 'HE MYSTERIOUS ISLAND ***\n\n\n\n\nTHE MYSTERI'

torch.Size([40]) torch.Size([40])
Input (x): 'HE MYSTERIOUS ISLAND ***\n\n\n\n\nTHE MYSTERI'
Target (y): 'E MYSTERIOUS ISLAND ***\n\n\n\n\nTHE MYSTERIO'



In [10]:
device = torch.device("cpu")

In [11]:
batch_size = 64
torch.manual_seed(1)
seq_dl = DataLoader(seq_dataset, batch_size=batch_size, shuffle=True, drop_last=True)

### Write the models

In [12]:
class RNN(nn.Module):
    def __init__(self, vocab_size, embed_dim, rnn_hidden_size):
        super().__init__()
        # Set to an embedding layer of vocab_size by embed_dim
        self.embedding = nn.Embedding(
            vocab_size,
            embed_dim
        ) 
        self.rnn_hidden_size = rnn_hidden_size
        # Set to an LSTM with x having embed_dim and h dimension rnn_hidden_size
        self.rnn = nn.LSTM(
            embed_dim,
            rnn_hidden_size,
            batch_first=True
        )
        
        # Make a linear layer from rnn_hidden_size to vocab_size
        # This will be used to get the yt for each xt
        self.fc = nn.Linear(rnn_hidden_size, vocab_size)

    def forward(self, text, hidden=None, cell=None):
        # Get the embeddings for text
        out = self.embedding(text)
        
        # Pass out, hidden and cell through the rnn
        if hidden is not None:
            out, (hidden, cell) = self.rnn(out, (hidden, cell)) 
        else:
            out, (hidden, cell) = self.rnn(out) 
        
        # Pass out through fc
        out = self.fc(out) 
        
        return out, (hidden, cell)

    def init_hidden(self, batch_size):
        hidden = torch.zeros(1, batch_size, self.rnn_hidden_size) 
        cell =  torch.zeros(1, batch_size, self.rnn_hidden_size) 
        return hidden.to(device), cell.to(device)

In [13]:
vocab_size = len(int2char)
embed_dim = 256
rnn_hidden_size = 512

torch.manual_seed(1)
model = RNN(vocab_size, embed_dim, rnn_hidden_size) 
model = model.to(device)
model

RNN(
  (embedding): Embedding(85, 256)
  (rnn): LSTM(256, 512, batch_first=True)
  (fc): Linear(in_features=512, out_features=85, bias=True)
)

In [14]:
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.005)

num_epochs = 10000

torch.manual_seed(1)

for epoch in range(num_epochs):
    hidden, cell = model.init_hidden(batch_size)
    
    # Get the next batch from seq_dl
    seq_batch, target_batch = next(iter(seq_dl))
        
    seq_batch = seq_batch.to(device)
    target_batch = target_batch.to(device)
    
    optimizer.zero_grad()
    
    loss = 0

    # Pass through the model
    logits, _ = model(seq_batch, hidden, cell)
    # Get the loss
    loss += criterion(logits.view(logits.shape[0] * logits.shape[1], -1),
            target_batch.view(-1).long()) 
        
    # Back propagation
    loss.backward() 
    optimizer.step() 
    
    # Get the value in the tensor loss
    loss = loss.item() 
    
    if epoch % 100 == 0:
        print(f'Epoch {epoch} loss: {loss:.4f}')

Epoch 0 loss: 4.4368
Epoch 100 loss: 1.7288
Epoch 200 loss: 1.4757
Epoch 300 loss: 1.4383
Epoch 400 loss: 1.3761
Epoch 500 loss: 1.3576
Epoch 600 loss: 1.3231
Epoch 700 loss: 1.3780
Epoch 800 loss: 1.2898
Epoch 900 loss: 1.2814
Epoch 1000 loss: 1.2473
Epoch 1100 loss: 1.2786
Epoch 1200 loss: 1.2971
Epoch 1300 loss: 1.2105
Epoch 1400 loss: 1.2104
Epoch 1500 loss: 1.2749
Epoch 1600 loss: 1.2072
Epoch 1700 loss: 1.2126
Epoch 1800 loss: 1.2599
Epoch 1900 loss: 1.1995
Epoch 2000 loss: 1.1940
Epoch 2100 loss: 1.2404
Epoch 2200 loss: 1.2516
Epoch 2300 loss: 1.1620
Epoch 2400 loss: 1.2366
Epoch 2500 loss: 1.2003
Epoch 2600 loss: 1.1787
Epoch 2700 loss: 1.1795
Epoch 2800 loss: 1.2657
Epoch 2900 loss: 1.1573
Epoch 3000 loss: 1.1851
Epoch 3100 loss: 1.1391
Epoch 3200 loss: 1.2206
Epoch 3300 loss: 1.1572
Epoch 3400 loss: 1.1195
Epoch 3500 loss: 1.1588
Epoch 3600 loss: 1.1817
Epoch 3700 loss: 1.1273
Epoch 3800 loss: 1.1181
Epoch 3900 loss: 1.1720
Epoch 4000 loss: 1.1684
Epoch 4100 loss: 1.1902
Epoc

In [15]:
from torch.distributions.categorical import Categorical

torch.manual_seed(1)

logits = torch.tensor([[-1.0, 1.0, 3.0]])

# Get the probabilities for these logits
print('Probabilities:', Categorical(logits=logits).probs)

# Get a Categorical random variable with the above probabilities for each of the classes
m = Categorical(logits=logits)
# Generate 10 things
samples = m.sample((10,))
 
print(samples.numpy())

Probabilities: tensor([[0.0159, 0.1173, 0.8668]])
[[1]
 [2]
 [2]
 [2]
 [2]
 [1]
 [2]
 [2]
 [2]
 [2]]


In [16]:
def random_sample(
    model,
    starting_str, 
    len_generated_text=500, 
):

    # Encode starting string into a tensor using char2str
    encoded_input = torch.tensor([char2int[s] for s in starting_str])
    
    encoded_input = encoded_input.unsqueeze(0) 
    
    generated_str = starting_str

    # Put model in eval mode
    model.eval() 
    
    hidden, cell = model.init_hidden(1)
    
    hidden = hidden.to(device)
    
    cell = cell.to(device)
        
    # Build up the starting hidden and cell states
    for c in range(len(starting_str)-1):
        # Feed each letter 1 by 1 and then get the final hidden state
        out = encoded_input[0][c:c+1].unsqueeze(0) 
        # Pass out through, note we update hidden and cell and use them again
        _, (hidden, cell) = model(out, hidden, cell) 
    
    # Get the last char; note we did not do go to the last char above
    last_char = encoded_input[0][-1].unsqueeze(0).unsqueeze(0) 
    # Generate chars one at a time, add them to generated_str
    # Do this over and over until you get the desired length
    for i in range(len_generated_text):
        
        # Use hidden and cell from the above.
        # Use last_char, which will be updated over and over.
        logits, (hidden, cell) = model(last_char, hidden, cell)
        
        # Get the logits
        logits = logits.squeeze(0) 
        
        m = Categorical(logits=logits) #random variable with probabilities based on the softmax of the logits
        
        # Generate from m 1 char
        last_char = torch.tensor(m.sample().item()).unsqueeze(0).unsqueeze(0)
        
        # Add the generated char to generated_str
        generated_str += int2char[last_char.item()]
        
    return generated_str

torch.manual_seed(1)
model.to(device)
print(random_sample(model, starting_str='The island'))

The island in their day and the diffactured for a
clump of turn of trees, and the shore between the base in the islet, was refeind that by the
fire from their circumstances neither a
ship up name on an opening began to drew himself even either to warn it, making again streaks than crize, and
its warves of the paper of the phown were mingled. At the engineer, for, taking it in his companions had again sailed always although to take to the other clames’ heartle,
and took that this lucky could arranged in th
