In [2]:
import numpy as np 
import pandas as pd 
from faker import Faker
import random
from tqdm import tqdm
from babel.dates import format_date
import torch
import torch.nn as nn
import torch.nn.functional as F 
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

In [274]:
fake = Faker()
Faker.seed(12345)
random.seed(12345)

In [336]:
FORMATS = ['short',
           'medium',
           'long',
           'full',
           'full',
           'full',
           'full',
           'full',
           'full',
           'full',
           'full',
           'full',
           'full',
           'd MMM YYY', 
           'd MMMM YYY',
           'dd MMM YYY',
           'd MMM, YYY',
           'd MMMM, YYY',
           'dd, MMM YYY',
           'd MM YY',
           'd MMMM YYY',
           'MMMM d YYY',
           'MMMM d, YYY',
           'dd.MM.YY']

# change this if you want it to work with another language
LOCALES = ['en_US']

In [337]:
def load_date():
    """
        Loads some fake dates 
        :returns: tuple containing human readable string, machine readable string, and date object
    """
    dt = fake.date_object()

    try:
        human_readable = format_date(dt, format=random.choice(FORMATS),  locale='en_US') # locale=random.choice(LOCALES))
        human_readable = human_readable.lower()
        human_readable = human_readable.replace(',','')
        machine_readable = dt.isoformat()
        
    except AttributeError as e:
        return None, None, None

    return human_readable, machine_readable, dt

def load_dataset(m):
    """
        Loads a dataset with m examples and vocabularies
        :m: the number of examples to generate
    """
    
    human_vocab = set()
    machine_vocab = set()
    dataset = []
    

    for i in tqdm(range(m)):
        h, m, _ = load_date()
        if h is not None:
            dataset.append((h, m))
            human_vocab.update(tuple(h))
            machine_vocab.update(tuple(m))
    
    human = dict(zip(sorted(human_vocab) + ['<unk>', '<pad>'], 
                     list(range(len(human_vocab) + 2))))
    inv_machine = dict(enumerate(sorted(machine_vocab)))
    machine = {v:k for k,v in inv_machine.items()}
    return dataset, human, machine, inv_machine

In [338]:
m = 10000
dataset, human_vocab, machine_vocab, inv_machine_vocab = load_dataset(m)

100%|██████████| 10000/10000 [00:00<00:00, 43742.25it/s]


In [339]:
human_dates, machine_dates = zip(*dataset)

X, Y = [], []

Tx = 30
Ty = 10

for string in human_dates:
    rep = list(map(lambda x: human_vocab.get(x, '<unk>'), string))
    if len(string) < Tx:
        rep += [human_vocab['<pad>']] * (Tx - len(string))
    X.append(rep)
    

for date in machine_dates:
    rep = list(map(lambda x: machine_vocab.get(x), date))
    Y.append(rep)

In [340]:
X = torch.tensor(X, dtype=torch.long)
Y = torch.tensor(Y, dtype=torch.long)

In [341]:
class TranslationData(Dataset):
    
    def __init__(self, X, Y):
        self.X = X
        self.Y = Y
        
    def __len__(self):
        return len(self.X)
        
    def __getitem__(self, index):
        return self.X[index], self.Y[index]

In [342]:
batch_size = 32
dataset = TranslationData(X, Y)
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

In [343]:
# Defining model
class Encoder(nn.Module):
    
    def __init__(self, vocab_size, embedding_dim, hidden_dim):
        super(Encoder, self).__init__()
        self.vocab_size = vocab_size
        self.embedding_dim = embedding_dim
        self.hidden_dim = hidden_dim
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.GRU = nn.GRU(embedding_dim, hidden_dim, batch_first=True)
        
        
    def forward(self, x):
        x = self.embedding(x)
        o, h = self.GRU(x)
        return o, h  

In [19]:
x = tensor = torch.zeros(32, 10, dtype=torch.long)
e = Encoder(30, 64, 40)

In [37]:
e.embedding(x).shape

torch.Size([32, 10, 64])

In [20]:
o, h = e.forward(x)
print(o.shape)
print(h.shape)

torch.Size([32, 10, 40])
torch.Size([1, 32, 40])


In [22]:
h_tmp = h.squeeze()

In [25]:
repeat_h = h_tmp.view(32, 1, 40).repeat(1, 10, 1)
repeat_h.shape

torch.Size([32, 10, 40])

In [26]:
concat_vector = torch.cat([o, repeat_h], dim=2)
concat_vector.shape

torch.Size([32, 10, 80])

In [29]:
att_hidden = nn.Linear(80, 40)
att_final = nn.Linear(40, 1)

In [30]:
f1 = att_hidden(concat_vector)
f2 = att_final(f1)
f2.shape

torch.Size([32, 10, 1])

In [344]:
class Attention(nn.Module):
    
    def __init__(self, embedding_dim):
        super(Attention, self).__init__()
        self.embedding_dim = embedding_dim
        self.fc = nn.Linear(2 * embedding_dim, embedding_dim)
        self.out = nn.Linear(embedding_dim, 1)
        
    def forward(self, decoder_hidden, encoder_output):
        # decoder hidden (1, batch_size, hidden_dim)
        # encoder output (batch_size, Tx, hidden_dim)
        Tx = encoder_output.shape[1]
        decoder_hidden = decoder_hidden.permute(1, 0, 2)
        decoder_hidden_repeat = decoder_hidden.repeat(1, Tx, 1)
        concat = torch.concat([encoder_output, decoder_hidden_repeat], dim=2)
        att_hidden = self.fc(concat) #(batch_size, TX, hidden_dim)
        raw_att = self.out(att_hidden) #(batch_size, Tx, 1)
        raw_att = raw_att.squeeze(2) # (batch_size, Tx)
        att = torch.softmax(raw_att, dim=1) # (batch_size, Tx)
        att = att.unsqueeze(1) #(batch_size, 1, Tx)
        context = torch.matmul(att, encoder_output) #(batch_size, 1, hidden_dim)
        return context

In [151]:
c = torch.sum(f2 * o, dim=1, keepdim=True)

In [89]:
c.shape

torch.Size([32, 1, 40])

In [345]:
class Decoder(nn.Module):
    
    def __init__(self, vocab_size, embedding_dim, encoder_hidden_dim, hidden_dim):
        super(Decoder, self).__init__()
        self.vocab_size = vocab_size
        self.embedding_dim = embedding_dim
        self.encoder_hidden_dim = encoder_hidden_dim
        self.hidden_dim = hidden_dim
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.GRU = nn.GRU(embedding_dim + encoder_hidden_dim, hidden_dim, batch_first=True)
        self.out = nn.Linear(hidden_dim, vocab_size)
        
    def forward(self, x, context, hidden):
        x = self.embedding(x)
        context_input = torch.concat([context, x], axis=2)
        o, h = self.GRU(context_input, hidden)
        o = self.out(o)
        return o, h   

In [100]:
d = Decoder(30, 34, 40, 50)

In [101]:
x1 = torch.zeros(32, 1, dtype=torch.long)

In [102]:
h_d = torch.randn((1, 32, 50))

In [103]:
do, dh = d(x1, c, h_d)

In [104]:
do.shape, dh.shape

(torch.Size([32, 1, 30]), torch.Size([1, 32, 50]))

In [346]:
class AttentionNMT(nn.Module):
    
    def __init__(self, encoder, decoder, attention):
        super(AttentionNMT, self).__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.attention = attention
        
    def forward(self, src, trg, teacher_forcing_ratio=0.5):
        Ty = trg.shape[1] 
        decoder_vocab_size = self.decoder.vocab_size
        encoder_output, encoder_hidden = self.encoder(src)
        decoder_hidden_dim = self.decoder.hidden_dim
        batch_size = src.shape[0]
        decoder_hidden = encoder_hidden
        x = trg[:, 0:1]
        outputs = torch.zeros((batch_size, Ty, decoder_vocab_size))
        for ty in range(1, Ty):
            context = self.attention(decoder_hidden, encoder_output)
            output, decoder_hidden = self.decoder(x, context, decoder_hidden)
            outputs[:, ty:ty+1, :] = output
            teacher_force = random.random() < teacher_forcing_ratio
            top1 = output.argmax(2)
            x = trg[:, ty:ty+1] if teacher_force else top1
        return outputs    
        

In [298]:
encoder = Encoder(30, 12, 18)
attention = Attention(18)
decoder = Decoder(30, 12, 18, 18)

In [299]:
atnmt = AttentionNMT(encoder, decoder, attention)

In [300]:
src = torch.zeros((8, 20), dtype=torch.long)
trg = torch.zeros((8, 20), dtype=torch.long)

In [301]:
Ty = trg.shape[1]
decoder_vocab_size = atnmt.decoder.vocab_size
encoder_output, encoder_hidden = atnmt.encoder(src)
batch_size = src.shape[0]
decoder_hidden = encoder_hidden


In [302]:
x_debug = trg[:, 0:1]
x_debug.shape

torch.Size([8, 1])

In [303]:
outputs = torch.zeros((batch_size, Ty, decoder_vocab_size))

In [304]:
decoder_hidden.shape, encoder_output.shape

(torch.Size([1, 8, 18]), torch.Size([8, 20, 18]))

In [306]:
context = atnmt.attention(decoder_hidden, encoder_output)

In [307]:
context.shape

torch.Size([8, 1, 18])

In [308]:
decoder_hidden = torch.randn((1, 8, 18))

In [309]:
output, decoder_hidden = atnmt.decoder(x_debug, context, decoder_hidden)

In [310]:
atnmt(src, trg).shape

torch.Size([8, 20, 30])

In [288]:
Y.shape

torch.Size([10000, 10])

In [347]:
human_vocab_size = len(human_vocab)
machine_vocab_size = len(machine_vocab)

embedding_dim = 10
hidden_dim = 64

In [357]:
encoder = Encoder(human_vocab_size, embedding_dim, hidden_dim)
attention = Attention(hidden_dim)
decoder = Decoder(machine_vocab_size, embedding_dim, hidden_dim, hidden_dim)

In [358]:
model = AttentionNMT(encoder, decoder, attention)


In [359]:
optimizer = optim.AdamW(model.parameters())
criterion = nn.CrossEntropyLoss()


In [376]:
n_epochs = 100
for epoch in range(n_epochs):
    total_loss = 0
    for X, Y in dataloader:
        optimizer.zero_grad()
        outputs = model(X, Y)
        #print(outputs.shape)
        #print(Y.shape)
        loss = criterion(outputs.view(-1, machine_vocab_size), Y.view(-1))
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
        
    print(f'Epoch [{epoch+1}/{n_epochs}], Loss: {total_loss/len(dataloader)}')   
    
    

Epoch [1/100], Loss: 0.24060807046227561
Epoch [2/100], Loss: 0.24053240641237447
Epoch [3/100], Loss: 0.2495299641031046
Epoch [4/100], Loss: 0.2412043185279773
Epoch [5/100], Loss: 0.24042587786817704
Epoch [6/100], Loss: 0.2427664488649216
Epoch [7/100], Loss: 0.2435496563728625
Epoch [8/100], Loss: 0.24036414659442232
Epoch [9/100], Loss: 0.240243951923931
Epoch [10/100], Loss: 0.24010902743179577
Epoch [11/100], Loss: 0.2400962632780258
Epoch [12/100], Loss: 0.24007657684457187
Epoch [13/100], Loss: 0.24043203599917623
Epoch [14/100], Loss: 0.24594179700357846
Epoch [15/100], Loss: 0.24088659430273807
Epoch [16/100], Loss: 0.24008390502617383
Epoch [17/100], Loss: 0.2400757955571714
Epoch [18/100], Loss: 0.23996134811696915
Epoch [19/100], Loss: 0.23995831365973805
Epoch [20/100], Loss: 0.24022830241975693
Epoch [21/100], Loss: 0.2474093888514339
Epoch [22/100], Loss: 0.2401900974611124
Epoch [23/100], Loss: 0.24003631871538803
Epoch [24/100], Loss: 0.24005432764943035
Epoch [25/1

In [386]:
def decode_date(human_date):
    rep = list(map(lambda x: human_vocab.get(x, '<unk>'), human_date))
    if len(human_date) < Tx:
        rep += [human_vocab['<pad>']] * (Tx - len(human_date))
    
    X = torch.tensor(rep, dtype=torch.long)
    X = X.view(1, X.shape[0])
    
    date_start_idx = machine_vocab.get("2")
    decoder_input = torch.tensor([date_start_idx]).view(1, 1)
    with torch.no_grad():
        encoder_output, encoder_hidden = encoder(X)
        decoder_hidden = encoder_hidden
        output = [date_start_idx]
        for ty in range(1, Ty):
            context = model.attention(decoder_hidden, encoder_output)
            decoder_output, decoder_hidden = model.decoder(decoder_input, context, decoder_hidden)
            #print(decoder_output.shape)
            top_token_idx = torch.argmax(decoder_output, dim=2)
            output.append(top_token_idx.item())
            decoder_input = top_token_idx
    
    output = output
    ds = list(map(lambda x: inv_machine_vocab.get(x), output))
    ds = "".join(ds)
    return ds

In [387]:
decode_date("4th of july 2001")

'2001-06-04'

In [388]:
torch.save(model, "nmt_attention.pth")