In [1]:
# First we implement a simple RNN cell,
# then we train a very simple language model with it.

In [26]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
# Turn on autograd anomaly detection for every backward pass run in this kernel.
# NOTE(review): PyTorch documents this mode as a debugging aid that slows execution;
# consider switching it off before the long training run — confirm it is still needed.
torch.autograd.set_detect_anomaly(True)

<torch.autograd.anomaly_mode.set_detect_anomaly at 0x7f9b99ddda50>

In [27]:
# Device that every batch (and the generation input) is moved to before the forward pass.
device = "cpu"

In [28]:
# The notation below follows Andrew Ng's deep learning lectures.
class RNN(nn.Module):
    """Minimal recurrent layer that returns only the final hidden state.

    At every time step the state is updated as
    ``h_t = tanh(h_{t-1} @ W_hh + x_t @ W_xh + b_h)``, starting from ``h_0 = 0``.

    Args:
        input_size: Number of features in each input vector ``x_t``.
        hidden_size: Number of units in the hidden state.
    """

    def __init__(self, input_size, hidden_size):
        super(RNN, self).__init__()
        # NOTE(review): weights come from an unscaled standard normal; with larger
        # hidden sizes this may push tanh into saturation — consider scaling the init.
        self.W_hh = torch.nn.Parameter(torch.randn((hidden_size, hidden_size)))
        self.W_xh = torch.nn.Parameter(torch.randn((input_size, hidden_size)))
        self.b_h = torch.nn.Parameter(torch.zeros(hidden_size))

    def forward(self, x):
        """Run the recurrence along dimension 1 of ``x``.

        Args:
            x: Input tensor with the batch on dim 0 and time on dim 1; the intended
                layout is ``(batch, seq_len, input_size)``.

        Returns:
            The hidden state after the last time step, shape ``(batch, hidden_size)``.
        """
        hidden_size = self.W_hh.shape[1]
        # Allocate h_0 with the parameters' dtype and device. A bare torch.zeros(...)
        # is always a default-device float32 tensor, which makes matmul fail as soon
        # as the module has been cast (.double()) or moved (.to("cuda")).
        h = torch.zeros(
            x.shape[0],
            hidden_size,
            dtype=self.W_hh.dtype,
            device=self.W_hh.device,
        )
        for t in range(x.shape[1]):
            h = torch.tanh(
                torch.matmul(h, self.W_hh)
                + torch.matmul(x[:, t], self.W_xh)
                + self.b_h
            )

        return h

In [29]:
# Instantiate the layer with input_size=24 and hidden_size=32 for a quick shape check.
rnn_cell = RNN(24, 32)

In [30]:
# Smoke-test input laid out as (batch=24, seq_len=32, input_size=24). The previous
# 2-D tensor only ran because matmul broadcasts a 1-D operand, so every batch row
# received the same vector instead of its own sequence.
x = torch.randn((24, 32, 24))
# Call the module itself rather than .forward so that nn.Module hooks are honoured.
a = rnn_cell(x)

In [31]:
# Shape of the hidden state returned by the smoke test above.
a.shape

torch.Size([24, 32])

In [32]:
# Let's build a very simple language model (LM).
class LM(nn.Module):
    """Next-token language model: embedding lookup -> RNN -> linear projection.

    Args:
        vocab_size: Number of distinct tokens; also the width of the output logits.
        input_size: Dimension of the token embeddings fed to the RNN.
        hidden_size: Dimension of the RNN hidden state.
    """

    def __init__(self, vocab_size, input_size, hidden_size):
        super().__init__()

        self.embeddings = nn.Embedding(num_embeddings=vocab_size, embedding_dim=input_size)
        self.rnn = RNN(input_size, hidden_size)
        self.fc = nn.Linear(in_features=hidden_size, out_features=vocab_size)

    def forward(self, x):
        """Map a batch of token-id sequences to one logits vector per sequence."""
        # The RNN returns its final hidden state only, so the projection yields
        # a single prediction per input sequence.
        return self.fc(self.rnn(self.embeddings(x)))

In [33]:
# We will use a very simple dataset to train the LM.
from nltk.corpus import gutenberg
# NOTE(review): fileids() is called without a filter, so this presumably lists the
# whole Gutenberg sample rather than only Shakespeare; it is unused in the visible cells.
shakespeare_works = gutenberg.fileids()
# Load Hamlet as a single lower-cased string and preview its first 100 characters.
data = gutenberg.raw('shakespeare-hamlet.txt').lower()
print(data[:100])

[the tragedie of hamlet by william shakespeare 1599]


actus primus. scoena prima.

enter barnardo a


In [34]:
# Sort the vocabulary so every character keeps the same index across kernel restarts.
# Iteration order of a set of strings can change between Python processes (string hash
# randomisation), which would silently re-map the indices a trained model relies on.
chars = sorted(set(data))
vocab_size = len(chars)

def encoding(chars):
    """Return a dict mapping each item of ``chars`` to its position in the iteration order."""
    lookup = {}
    for position, symbol in enumerate(chars):
        # A repeated symbol keeps the position of its last occurrence.
        lookup[symbol] = position
    return lookup

def decoding(chars):
    """Return a dict mapping each position in ``chars`` to the item found there."""
    return dict(enumerate(chars))

In [35]:
# Lookup tables in both directions, then the whole corpus as a list of integer ids.
idx_to_chr = decoding(chars)
chr_to_idx = encoding(chars)
data_idx = list(map(chr_to_idx.__getitem__, data))

In [36]:
# Contiguous 80% / 10% / remainder split of the encoded corpus (no shuffling).
train_size, valid_size = (int(fraction * len(data_idx)) for fraction in (0.8, 0.1))
train_data = data_idx[:train_size]
valid_data = data_idx[train_size:][:valid_size]
test_data = data_idx[train_size:][valid_size:]

In [37]:
def create_data_pairs(data, seq_len, stride=None):
    """Cut an id sequence into (context window, next id) training pairs.

    Args:
        data: Sequence of integer token ids supporting ``len`` and slicing.
        seq_len: Number of ids in every context window; must be >= 1.
        stride: Distance between the starts of consecutive windows. ``None``
            (the default) uses ``seq_len``, i.e. non-overlapping windows.
            Use ``1`` to extract every possible window.

    Returns:
        ``(x, y)`` as ``torch.long`` tensors: ``x`` has shape ``(n, seq_len)`` and
        ``y`` has shape ``(n, 1)`` holding the id that follows each window. When
        ``data`` is too short for a single pair, ``n`` is 0 and the shapes are kept.

    Raises:
        ValueError: If ``seq_len`` or ``stride`` is smaller than 1.
    """
    if seq_len < 1:
        raise ValueError(f"seq_len must be >= 1, got {seq_len}")
    step = seq_len if stride is None else stride
    if step < 1:
        raise ValueError(f"stride must be >= 1, got {step}")

    inputs = []
    labels = []

    # Stop early enough that every window still has a following id to predict.
    for i in range(0, len(data) - seq_len, step):
        inputs.append(data[i: i+seq_len])
        labels.append(data[i+seq_len: i+seq_len+1])

    # Explicit reshape keeps the (n, seq_len) / (n, 1) layout even when n == 0,
    # where torch.tensor([]) alone would collapse to a 1-D empty tensor.
    x = torch.tensor(inputs, dtype=torch.long).reshape(len(inputs), seq_len)
    y = torch.tensor(labels, dtype=torch.long).reshape(len(labels), 1)

    return x, y

In [38]:
# Hyperparameters shared by the cells below.
# Embedding dimension fed to the RNN (passed to LM as input_size).
input_size = 16
# Width of the RNN hidden state.
hidden_size = 16
# Number of characters in each context window built by create_data_pairs.
seq_length = 8
# Step size handed to the Adam optimizer.
learning_rate = 0.01
# Number of (window, label) pairs per DataLoader batch.
batch_size = 32

In [39]:
# Build (context window, next character) tensors for each of the three splits.
(train_x, train_y), (valid_x, valid_y), (test_x, test_y) = (
    create_data_pairs(split, seq_length)
    for split in (train_data, valid_data, test_data)
)

In [40]:
from torch.utils.data import TensorDataset
from torch.utils.data import DataLoader

# Wrap every split in a TensorDataset; only the training loader reshuffles each epoch.
train_dataset, valid_dataset, test_dataset = (
    TensorDataset(features, labels)
    for features, labels in ((train_x, train_y), (valid_x, valid_y), (test_x, test_y))
)

train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
valid_loader, test_loader = (
    DataLoader(held_out, batch_size=batch_size)
    for held_out in (valid_dataset, test_dataset)
)


In [45]:
# Place the model on the same device that the train/eval loops move every batch to;
# without this the two only agree by coincidence while device == "cpu".
model = LM(vocab_size, input_size, hidden_size).to(device)
criterion = nn.CrossEntropyLoss()
# Created after .to(device) so the optimizer holds the parameters that are trained.
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

In [46]:
def train_epoch(model, data_loader, optimizer, criterion):
    """Run one optimisation pass over ``data_loader`` and return the mean batch loss.

    Args:
        model: Module mapping a batch of inputs to logits whose last dimension
            indexes the classes.
        data_loader: Iterable yielding ``(inputs, targets)`` tensor pairs.
        optimizer: Optimizer holding ``model``'s parameters.
        criterion: Loss called as ``criterion(logits_2d, targets_1d)``.

    Returns:
        The average of the per-batch loss values, as a Python float.

    Raises:
        ValueError: If ``data_loader`` yields no batches.
    """
    model.train()

    # Follow the model's own placement instead of relying on a notebook-level
    # ``device`` global; fall back to CPU for a parameter-free model.
    first_param = next(model.parameters(), None)
    target_device = first_param.device if first_param is not None else torch.device("cpu")

    total_loss = 0.0
    num_batches = 0

    for inputs, targets in data_loader:
        inputs, targets = inputs.to(target_device), targets.to(target_device)

        # Reset the gradients accumulated by the previous step.
        optimizer.zero_grad()
        outputs = model(inputs)
        # Flatten with the class dimension taken from the logits themselves, so the
        # function no longer depends on a global ``vocab_size``.
        loss = criterion(outputs.view(-1, outputs.size(-1)), targets.view(-1))

        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        num_batches += 1

    if num_batches == 0:
        raise ValueError("data_loader yielded no batches")

    return total_loss / num_batches


In [47]:
def evaluate(model, data_loader, criterion):
    """Compute the mean batch loss of ``model`` over ``data_loader`` without training.

    Args:
        model: Module mapping a batch of inputs to logits whose last dimension
            indexes the classes.
        data_loader: Iterable yielding ``(inputs, targets)`` tensor pairs.
        criterion: Loss called as ``criterion(logits_2d, targets_1d)``.

    Returns:
        The average of the per-batch loss values, as a Python float.

    Raises:
        ValueError: If ``data_loader`` yields no batches.
    """
    model.eval()

    # Follow the model's own placement instead of relying on a notebook-level
    # ``device`` global; fall back to CPU for a parameter-free model.
    first_param = next(model.parameters(), None)
    target_device = first_param.device if first_param is not None else torch.device("cpu")

    val_loss = 0.0
    num_batches = 0

    # Gradients are not needed for scoring, so skip building the autograd graph.
    with torch.no_grad():
        for inputs, targets in data_loader:
            inputs, targets = inputs.to(target_device), targets.to(target_device)

            outputs = model(inputs)
            # Class dimension comes from the logits, not from a global ``vocab_size``.
            loss = criterion(outputs.view(-1, outputs.size(-1)), targets.view(-1))

            val_loss += loss.item()
            num_batches += 1

    if num_batches == 0:
        raise ValueError("data_loader yielded no batches")

    return val_loss / num_batches


In [48]:
n_epochs = 100
for epoch in range(n_epochs):
    # Fit on the training split first, then score the held-out split with the updated weights.
    train_loss = train_epoch(
        model=model,
        data_loader=train_loader,
        optimizer=optimizer,
        criterion=criterion,
    )
    valid_loss = evaluate(model=model, data_loader=valid_loader, criterion=criterion)

    print(f"Epoch [{epoch+1}/{n_epochs}], Train Loss: {train_loss:.4f}, Validation Loss: {valid_loss:.4f}")

Epoch [1/100], Train Loss: 2.8061, Validation Loss: 2.5604
Epoch [2/100], Train Loss: 2.4393, Validation Loss: 2.4223
Epoch [3/100], Train Loss: 2.3408, Validation Loss: 2.3973
Epoch [4/100], Train Loss: 2.2892, Validation Loss: 2.3636
Epoch [5/100], Train Loss: 2.2538, Validation Loss: 2.3337
Epoch [6/100], Train Loss: 2.2381, Validation Loss: 2.3398
Epoch [7/100], Train Loss: 2.2294, Validation Loss: 2.3000
Epoch [8/100], Train Loss: 2.2122, Validation Loss: 2.2927
Epoch [9/100], Train Loss: 2.2028, Validation Loss: 2.2907
Epoch [10/100], Train Loss: 2.2030, Validation Loss: 2.3024
Epoch [11/100], Train Loss: 2.1983, Validation Loss: 2.2830
Epoch [12/100], Train Loss: 2.1949, Validation Loss: 2.2829
Epoch [13/100], Train Loss: 2.1909, Validation Loss: 2.2927
Epoch [14/100], Train Loss: 2.1767, Validation Loss: 2.2880
Epoch [15/100], Train Loss: 2.1816, Validation Loss: 2.2886
Epoch [16/100], Train Loss: 2.1735, Validation Loss: 2.2944
Epoch [17/100], Train Loss: 2.1732, Validation Lo

In [49]:
# Score the trained model once on the split that was held out from training and validation.
test_loss = evaluate(model=model, data_loader=test_loader, criterion=criterion)
print(f"Test Loss: {test_loss:.4f}")

Test Loss: 2.2914


In [64]:
def generate_sequence(model, start_token, length=100, char_to_index=None, index_to_char=None):
    """Greedily extend ``start_token`` by ``length`` characters.

    At every step the text generated so far is fed to the model and the character
    with the highest logit is appended. Feeding the full history matters because the
    model restarts from a zero hidden state on each call; a single-character input
    would make each prediction depend on the previous character alone.

    Args:
        model: Module mapping a ``(1, n)`` tensor of character ids to ``(1, vocab)`` logits.
        start_token: Seed text; a single character or a longer prompt. Every
            character must be present in the character-to-index table.
        length: Number of characters to generate after the seed.
        char_to_index: Optional mapping from character to id. Defaults to the
            notebook-level ``chr_to_idx``.
        index_to_char: Optional mapping from id to character. Defaults to the
            notebook-level ``idx_to_chr``.

    Returns:
        The seed followed by the generated characters, as one string. Characters are
        concatenated directly; a separator would be indistinguishable from generated
        space characters in a character-level model.

    Raises:
        KeyError: If a character of ``start_token`` is missing from the table.
    """
    encode = chr_to_idx if char_to_index is None else char_to_index
    decode = idx_to_chr if index_to_char is None else index_to_char

    model.eval()
    # Build the input where the model lives; fall back to CPU for a parameter-free model.
    first_param = next(model.parameters(), None)
    target_device = first_param.device if first_param is not None else torch.device("cpu")

    generated_sequence = list(start_token)

    with torch.no_grad():
        for _ in range(length):
            # Batch of one sequence holding the entire history so far.
            input_ix = torch.tensor(
                [[encode[ch] for ch in generated_sequence]],
                dtype=torch.long,
                device=target_device,
            )
            output = model(input_ix)

            # Greedy choice: index of the largest logit.
            _, predicted_ix = torch.max(output, 1)
            generated_sequence.append(decode[predicted_ix.item()])

    return ''.join(generated_sequence)


In [None]:
# Greedy sample: seed character "m" followed by 8 generated characters.
generate_sequence(model, "m", 8)

In [75]:
# exp(-test_loss): with a mean cross-entropy loss this is roughly the geometric-mean
# probability the model assigns to the correct next character.
# NOTE(review): if perplexity was intended here, that is exp(+test_loss) — confirm.
torch.exp(-torch.tensor(test_loss)).item()

0.10112334787845612