In [1]:
import torch
from torch import nn
from torch import optim
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence, pack_padded_sequence, pad_packed_sequence
from torch.nn import functional as F

In [2]:
q1 = "Well done is better than well said".lower()
q2 = "Better slip with foot than tongue".lower()
q3 = "There never was a good war or a bad peace".lower()
quotes = [q1, q2, q3]
quotes

['well done is better than well said',
 'better slip with foot than tongue',
 'there never was a good war or a bad peace']

In [3]:
s1 = 0
s2 = 1
s3 = 2
sources = [s1, s2, s3]
sources

[0, 1, 2]

In [4]:
chars = ['<pad>', '<eos>', '<unk>'] + list(set(' '.join(quotes)))
nb_char = len(chars)

print(f'There are {nb_char} unique characters')
chars

There are 23 unique characters


['<pad>',
 '<eos>',
 '<unk>',
 'c',
 'f',
 'w',
 'a',
 'p',
 'h',
 'v',
 'l',
 'r',
 ' ',
 'u',
 'i',
 'e',
 'g',
 'd',
 'n',
 's',
 'o',
 't',
 'b']

In [5]:
char_to_ix = { ch:i for i,ch in enumerate(chars)}
ix_to_char = { i:ch for i,ch in enumerate(chars)}

char_to_ix

{'<pad>': 0,
 '<eos>': 1,
 '<unk>': 2,
 'c': 3,
 'f': 4,
 'w': 5,
 'a': 6,
 'p': 7,
 'h': 8,
 'v': 9,
 'l': 10,
 'r': 11,
 ' ': 12,
 'u': 13,
 'i': 14,
 'e': 15,
 'g': 16,
 'd': 17,
 'n': 18,
 's': 19,
 'o': 20,
 't': 21,
 'b': 22}

In [6]:
x_seq = [torch.tensor(list(map(lambda char: char_to_ix[char], i))) for i in quotes]
[x.shape for x in x_seq]

[torch.Size([34]), torch.Size([33]), torch.Size([41])]

In [7]:
x_padded = pad_sequence(x_seq, batch_first=True, padding_value=char_to_ix['<pad>'])
[x.shape for x in x_padded]

[torch.Size([41]), torch.Size([41]), torch.Size([41])]

In [8]:
class QuoteDataset(Dataset):
    """Dataset for Summarizing Benjamin Franklin Quotes"""
    
    def __init__(self, quotes, sources):
        """
        Args:
            quotes (list(string)): A list of quotes
            sources (list(string)): A list of source ids
        """
        self.quotes = quotes
        self.sources = sources
        
        assert len(quotes) == len(sources), "The number of quotes must match the number of sources!"
        
    def __len__(self):
        return len(quotes)
    
    def __getitem__(self, idx):
        if torch.is_tensor(idx):
            idx = idx.tolist()
        
        x_seq = torch.tensor(list(map(lambda char: char_to_ix[char], self.quotes[idx])))
        y = torch.tensor([self.sources[idx]])
            
        return (x_seq, y)
    
dset = QuoteDataset(quotes, sources)
dset[0]

(tensor([ 5, 15, 10, 10, 12, 17, 20, 18, 15, 12, 14, 19, 12, 22, 15, 21, 21, 15,
         11, 12, 21,  8,  6, 18, 12,  5, 15, 10, 10, 12, 19,  6, 14, 17]),
 tensor([0]))

In [9]:
def pad_collate(batch):
    (xx, yy) = zip(*batch)
    x_lens = [len(x) for x in xx]
    xx_pad = pad_sequence(xx, batch_first=True, padding_value=0)
    yy = torch.tensor(yy)
    
    return xx_pad, yy, x_lens

dataset = QuoteDataset(quotes, sources)
data_loader = DataLoader(dataset=dataset, batch_size=2, shuffle=True, collate_fn=pad_collate)

In [10]:
embedding = nn.Embedding(nb_char, 100)
for i,(x_padded, y, x_lens) in enumerate(data_loader):
    x_embed = embedding(x_padded)
    print(x_embed.shape)
    print(y)

torch.Size([2, 34, 100])
tensor([0, 1])
torch.Size([1, 41, 100])
tensor([2])


In [11]:
class RNN(nn.Module):
    def __init__(self):
        super(RNN, self).__init__()
        embedding_dim = 100
        hidden_size = 100
        
        self.embedding = nn.Embedding(nb_char, embedding_dim)
        self.gru = nn.GRU(embedding_dim, hidden_size, batch_first=True)
        self.fc_out = nn.Linear(hidden_size, len(sources))
        
    def forward(self, x, x_lens):
        x_embed = self.embedding(x)
        x_packed = pack_padded_sequence(x_embed, x_lens, batch_first=True, enforce_sorted=False)
        output_packed, hidden = self.gru(x_packed)
        output_padded, output_lengths = pad_packed_sequence(output_packed, batch_first=True)
        output = self.fc_out(output_padded[:, -1, :])
        return output
        

In [14]:
rnn = RNN()
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(rnn.parameters(), lr=0.001)
for epoch in range(10):
    for i,(x_padded, y, x_lens) in enumerate(data_loader):
        # This whole section below probably belongs in its own model
        optimizer.zero_grad()
        output = rnn(x_padded, x_lens)
        loss = criterion(output, y)
        loss.backward()
        optimizer.step()

        print(f"Batch Cross Entropy Loss: {loss.item()}")

Batch Cross Entropy Loss: 0.973617434501648
Batch Cross Entropy Loss: 1.0577738285064697
Batch Cross Entropy Loss: 0.8178512454032898
Batch Cross Entropy Loss: 0.7443614602088928
Batch Cross Entropy Loss: 0.7431101202964783
Batch Cross Entropy Loss: 0.4948538839817047
Batch Cross Entropy Loss: 0.7500364780426025
Batch Cross Entropy Loss: 1.5231322050094604
Batch Cross Entropy Loss: 0.6639506220817566
Batch Cross Entropy Loss: 0.240587517619133
Batch Cross Entropy Loss: 0.6594473719596863
Batch Cross Entropy Loss: 1.3392215967178345
Batch Cross Entropy Loss: 0.6280755996704102
Batch Cross Entropy Loss: 0.1610555350780487
Batch Cross Entropy Loss: 0.6162017583847046
Batch Cross Entropy Loss: 0.1374739110469818
Batch Cross Entropy Loss: 0.6053919792175293
Batch Cross Entropy Loss: 0.11427903920412064
Batch Cross Entropy Loss: 0.5964523553848267
Batch Cross Entropy Loss: 0.09237660467624664
