<a href="https://colab.research.google.com/github/baiyunming/I2DL-WS2020/blob/main/%E2%80%9CLecture_6_Recurrent_Neural_Network%E2%80%9D.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

from tqdm.notebook import tqdm

In [2]:
!wget http://www.cs.cmu.edu/afs/cs/project/ai-repository/ai/areas/nlp/corpora/names/female.txt

--2021-01-08 07:18:24--  http://www.cs.cmu.edu/afs/cs/project/ai-repository/ai/areas/nlp/corpora/names/female.txt
Resolving www.cs.cmu.edu (www.cs.cmu.edu)... 128.2.42.95
Connecting to www.cs.cmu.edu (www.cs.cmu.edu)|128.2.42.95|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 35751 (35K) [text/plain]
Saving to: ‘female.txt’


2021-01-08 07:18:24 (465 KB/s) - ‘female.txt’ saved [35751/35751]



In [3]:
device = 'cuda'

In [4]:
with open('female.txt', 'r') as f:
    lines = f.readlines()
names = []
max_len = 0
for l in lines[6:]:
    curr_name = l[:-1].lower()
    if curr_name.isalpha():
        #names.append(l[:-1].lower())
        names.append(curr_name)        
        max_len = max(len(names[-1]), max_len)
max_len += 1 # consider the 'EOS' (end of signal)
print('Maximum Length : ' + str(max_len))

Maximum Length : 14


In [None]:
names

In [6]:
class NameDataset(Dataset):
    def __init__(self, names, max_len):
        self.names = names
        self.max_len = max_len
        self.a_order = ord('a')  #character 
        self.z_order = ord('z') 
        self.num_classes = 26 + 1 # a-z + include the end of signal

    def __len__(self):
        return len(names)

    def __getitem__(self, idx):
        padding_name = [self.num_classes-1 for _ in range(self.max_len)]  #padding ensure same length
        curr_name = [ord(n)-self.a_order for n in names[idx]]
        padding_name[:len(curr_name)] = curr_name
        
        # Slide the input to make a output
        sample = dict()
        sample['input'] = torch.LongTensor(padding_name[:-1]) # h y e m i n  -1 -1 -1
        sample['output'] = torch.LongTensor(padding_name[1:]) # y e m i n -1 -1 -1 -1
        sample['length'] = len(names[idx])
        sample['original'] = names[idx]

        return sample

In [7]:
batch_size = 64
dataset = NameDataset(names, max_len)
dataloader = DataLoader(dataset, batch_size=64, shuffle=True)

In [8]:
sample = next(iter(dataloader))
print(sample['input'][0])
print(sample['output'][0])
print(sample['length'][0])
print(sample['original'][0])
print(sample['input'].shape, sample['output'].shape)

tensor([ 5, 11, 14, 17,  8, 18, 26, 26, 26, 26, 26, 26, 26])
tensor([11, 14, 17,  8, 18, 26, 26, 26, 26, 26, 26, 26, 26])
tensor(6)
floris
torch.Size([64, 13]) torch.Size([64, 13])


In [10]:
# This could be useful with variable lengths
total_lengths = sample['length']
sort_length, sort_idx = torch.sort(total_lengths, descending=True)
sort_input = sample['input'][sort_idx]
sort_output = sample['output'][sort_idx]
print(sort_length)
print(sort_input.shape)

tensor([10,  9,  9,  9,  8,  8,  8,  8,  8,  7,  7,  7,  7,  7,  7,  7,  7,  7,
         7,  7,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,
         6,  6,  6,  6,  6,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,
         5,  4,  4,  4,  4,  4,  4,  4,  4,  4])
torch.Size([64, 13])


In [11]:
class RNNmodel(nn.Module):
    def __init__(self, lstm_dim=256, num_classes=dataset.num_classes, max_len=max_len):
        super(RNNmodel, self).__init__()
        self.lstm_dim = lstm_dim
        self.num_classes = num_classes
        self.max_len = max_len
        self.char_embedding = nn.Embedding(num_embeddings=num_classes, embedding_dim=lstm_dim) # num_classes * lstm_dim
        self.lstm = nn.LSTM(input_size=lstm_dim, hidden_size=lstm_dim, num_layers=1, batch_first=True) 
        #(B T D) <- tensor/ B: Batch_size T: sequence length D: dimension of each element vector
        self.out_linear = nn.Linear(lstm_dim, num_classes)

    def forward(self, sort_input, sort_output, sort_length):
        ## originally, recommended to use torch.nn.utils.rnn.pack_padded_sequence,when we have variable lengths
        ## but in this case, I just neglected it because beginners can be more confused with this
        lstm_input = self.char_embedding(sort_input)
        lstm_out, (h, c) = self.lstm(lstm_input)
        out = self.out_linear(lstm_out)

        return out

    def test(self, start_char):
        generated_name = list()
        generated_name.append(start_char)

        start_order = torch.LongTensor([ord(start_char)]).to(device) - ord('a')
        start_order = start_order.reshape(1, 1)
        cnt = 0

        while cnt <= self.max_len:
            curr_embed = self.char_embedding(start_order)
            if cnt == 0:
                lstm_out, (h, c) = self.lstm(curr_embed)
            else:
                lstm_out, (h, c) = self.lstm(curr_embed, (h, c))
            out = self.out_linear(lstm_out)

            sample_next = torch.distributions.Categorical(logits = out[0, 0, :]).sample().item()
            if sample_next == 26:
                break
            else:
                generated_name.append(chr(ord('a')+sample_next))
                sample_next = torch.LongTensor([sample_next]).to(device)
                start_order = sample_next.reshape(1, 1)

                cnt += 1

        return ''.join(generated_name)


In [None]:
model = RNNmodel()
model = model.to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)

In [None]:
def train(model, optimizer, sample):
    optimizer.zero_grad()
    criteria = nn.CrossEntropyLoss()

    total_lengths = sample['length']
    sort_length, sort_idx = torch.sort(total_lengths, descending=True)

    sort_input = sample['input'][sort_idx].to(device)
    sort_output = sample['output'][sort_idx].to(device)
    sort_length = sort_length.to(device)

    pred = model(sort_input, sort_output, sort_length) # B T C
    B, T, C = pred.shape
    
    curr_loss = criteria(pred.reshape(B*T, C), sort_output.reshape(B*T))

    curr_loss.backward()
    optimizer.step()

    return curr_loss.item()

In [None]:
max_epoch = 100
for epoch in tqdm(range(max_epoch)):
    total_loss = 0.0
    for sample in dataloader:
        curr_loss = train(model, optimizer, sample)
        total_loss += curr_loss / len(dataloader)

    start_char = chr(np.random.randint(ord('a'), ord('z')))
    print('[EPOCH {}] TRAIN LOSS: {}, SAMPLED NAME: {}'.format(epoch, total_loss, model.test(start_char)))


HBox(children=(FloatProgress(value=0.0), HTML(value='')))

[EPOCH 0] TRAIN LOSS: 1.8830124307901437, SAMPLED NAME: yzvvgtcwbdbapnfi
[EPOCH 1] TRAIN LOSS: 1.2530618921304353, SAMPLED NAME: yfndnha
[EPOCH 2] TRAIN LOSS: 1.1317325356679084, SAMPLED NAME: wiilit
[EPOCH 3] TRAIN LOSS: 1.0657492180665333, SAMPLED NAME: xpeh
[EPOCH 4] TRAIN LOSS: 1.021871271805885, SAMPLED NAME: ninse
[EPOCH 5] TRAIN LOSS: 0.9894907497442683, SAMPLED NAME: qqesisolle
[EPOCH 6] TRAIN LOSS: 0.9635974642557978, SAMPLED NAME: qobuyna
[EPOCH 7] TRAIN LOSS: 0.9425856471061705, SAMPLED NAME: yna
[EPOCH 8] TRAIN LOSS: 0.9253952258672469, SAMPLED NAME: kopina
[EPOCH 9] TRAIN LOSS: 0.9099500721845873, SAMPLED NAME: brijalone
[EPOCH 10] TRAIN LOSS: 0.8973082074752216, SAMPLED NAME: w
[EPOCH 11] TRAIN LOSS: 0.8853509517816397, SAMPLED NAME: uktie
[EPOCH 12] TRAIN LOSS: 0.8750358804678307, SAMPLED NAME: bgarra
[EPOCH 13] TRAIN LOSS: 0.8649777708909453, SAMPLED NAME: ynemena
[EPOCH 14] TRAIN LOSS: 0.8561465793695203, SAMPLED NAME: neruwsa
[EPOCH 15] TRAIN LOSS: 0.847805723165854, 