In [1]:
import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence, pack_padded_sequence, pad_packed_sequence
from torch.nn import functional as F

In [2]:
q1 = "Well done is better than well said".lower()
q2 = "Better slip with foot than tongue".lower()
q3 = "There never was a good war or a bad peace".lower()
quotes = [q1, q2, q3]
quotes

['well done is better than well said',
 'better slip with foot than tongue',
 'there never was a good war or a bad peace']

In [3]:
s1 = "Ben Franklin".lower()
s2 = "Be very careful of what you say".lower()
s3 = "War is never good and peace is never bad".lower()
sources = [s1, s2, s3]
sources

['take action instead of speaking',
 'be very careful of what you say',
 'war is never good and peace is never bad']

In [4]:
chars = ['<pad>', '<eos>', '<unk>'] + list(set(' '.join(quotes + summaries)))
nb_char = len(chars)

print(f'There are {nb_char} unique characters')
chars

There are 25 unique characters


['<pad>',
 '<eos>',
 '<unk>',
 'e',
 'd',
 'y',
 'i',
 's',
 'c',
 't',
 'u',
 'o',
 'v',
 'a',
 'w',
 'n',
 'r',
 'h',
 'l',
 'k',
 'p',
 ' ',
 'b',
 'g',
 'f']

In [5]:
char_to_ix = { ch:i for i,ch in enumerate(chars)}
ix_to_char = { i:ch for i,ch in enumerate(chars)}

char_to_ix

{'<pad>': 0,
 '<eos>': 1,
 '<unk>': 2,
 'e': 3,
 'd': 4,
 'y': 5,
 'i': 6,
 's': 7,
 'c': 8,
 't': 9,
 'u': 10,
 'o': 11,
 'v': 12,
 'a': 13,
 'w': 14,
 'n': 15,
 'r': 16,
 'h': 17,
 'l': 18,
 'k': 19,
 'p': 20,
 ' ': 21,
 'b': 22,
 'g': 23,
 'f': 24}

In [6]:
x_seq = [torch.tensor(list(map(lambda char: char_to_ix[char], i))) for i in quotes + summaries]
[x.shape for x in x_seq]

[torch.Size([34]),
 torch.Size([33]),
 torch.Size([41]),
 torch.Size([31]),
 torch.Size([31]),
 torch.Size([40])]

In [7]:
x_padded = pad_sequence(x_seq, batch_first=True, padding_value=char_to_ix['<pad>'])
[x.shape for x in x_padded]

[torch.Size([41]),
 torch.Size([41]),
 torch.Size([41]),
 torch.Size([41]),
 torch.Size([41]),
 torch.Size([41])]

In [8]:
class BenFranklinSummaryDataset(Dataset):
    """Dataset for Summarizing Benjamin Franklin Quotes"""
    
    def __init__(self, quotes, summaries):
        """
        Args:
            quotes (list(string)): A list of quotes
            summaries (list(string)): A list of summaries
        """
        self.quotes = quotes
        self.summaries = summaries
        
        assert len(quotes) == len(summaries), "The number of quotes must match the number of summaries!"
        
    def __len__(self):
        return len(quotes)
    
    def __getitem__(self, idx):
        if torch.is_tensor(idx):
            idx = idx.tolist()
        
        x_seq = torch.tensor(list(map(lambda char: char_to_ix[char], self.quotes[idx])))
        y_seq = torch.tensor(list(map(lambda char: char_to_ix[char], self.summaries[idx])))
            
        return (x_seq, y_seq)
    
dset = BenFranklinSummaryDataset(quotes, summaries)
dset[0]

(tensor([14,  3, 18, 18, 21,  4, 11, 15,  3, 21,  6,  7, 21, 22,  3,  9,  9,  3,
         16, 21,  9, 17, 13, 15, 21, 14,  3, 18, 18, 21,  7, 13,  6,  4]),
 tensor([ 9, 13, 19,  3, 21, 13,  8,  9,  6, 11, 15, 21,  6, 15,  7,  9,  3, 13,
          4, 21, 11, 24, 21,  7, 20,  3, 13, 19,  6, 15, 23]))

In [9]:
def pad_collate(batch):
    (xx, yy) = zip(*batch)
    x_lens = [len(x) for x in xx]
    y_lens = [len(y) for y in yy]

    xx_pad = pad_sequence(xx, batch_first=True, padding_value=0)
    yy_pad = pad_sequence(yy, batch_first=True, padding_value=0)
    
    return xx_pad, yy_pad, x_lens, y_lens

dataset = BenFranklinSummaryDataset(quotes, summaries)
data_loader = DataLoader(dataset=dataset, batch_size=2, shuffle=True, collate_fn=pad_collate)

In [10]:
embedding = nn.Embedding(nb_char, 100)
for i,(x_padded, y_padded, x_lens, y_lens) in enumerate(data_loader):
    x_embed = embedding(x_padded)
    print(x_embed.shape)

torch.Size([2, 41, 100])
torch.Size([1, 33, 100])


In [11]:
class RNN(nn.Module):
    def __init__(self):
        super(RNN, self).__init__()
        embedding_dim = 100
        hidden_size = 100
        
        self.embedding = nn.Embedding(nb_char, embedding_dim)
        self.gru = nn.GRU(embedding_dim, hidden_size, batch_first=True)
        self.fc_out = nn.Linear(hidden_size, nb_char)
        
    def forward(self, x, x_lens):
        x_embed = self.embedding(x)
        x_packed = pack_padded_sequence(x_embed, x_lens, batch_first=True, enforce_sorted=False)
        output_packed, hidden = self.gru(x_packed)
        output_padded, output_lengths = pad_packed_sequence(output_packed, batch_first=True)
        output = self.fc_out(output_padded)
        return output
        

In [16]:
rnn = RNN()
for i,(x_padded, y_padded, x_lens, y_lens) in enumerate(data_loader):
    # This whole section below probably belongs in its own model
    output = rnn(x_padded, x_lens)
    print(output[0].shape)
    print(y_padded.shape)
    
    batch_ce_loss = 0.0
    for i in range(output.size(0)):
        ce_loss = F.cross_entropy(output[i], y_padded[i], reduction="sum", ignore_index=0)
        batch_ce_loss += ce_loss
    print(f"Batch Cross Entropy Loss: {batch_ce_lossce_loss}")

torch.Size([34, 25])
torch.Size([2, 31])


ValueError: Expected input batch_size (34) to match target batch_size (31).