In [1]:
from keras.layers import Bidirectional, Concatenate, Permute, Dot, Input, LSTM, Multiply
from keras.layers import RepeatVector, Dense, Activation, Lambda
from keras.optimizers import Adam
from keras.utils import to_categorical
from keras.models import load_model, Model
import keras.backend as K
import numpy as np
from faker import Faker
import random
from tqdm import tqdm
from babel.dates import format_date
import matplotlib.pyplot as plt
import tensorflow as tf
%matplotlib inline


fake = Faker()

# We need to seed these guys. For some reason I always use 101
fake.seed(101)
random.seed(101)

FORMATS = ['short', # d/M/YY
           'medium', # MMM d, YYY
           'medium',
           'medium',
           'long', # MMMM dd, YYY
           'long',
           'long',
           'long',
           'long',
           'full', # EEEE, MMM dd, YYY
           'full',
           'full',
           'd MMM YYY',
           'd MMMM YYY',
           'd MMMM YYY',
           'd MMMM YYY',
           'd MMMM YYY',
           'd MMMM YYY',
           'dd/MM/YYY',
           'EE d, MMM YYY',
           'EEEE d, MMMM YYY']


for format in FORMATS:
    print('%s => %s' %(format, format_date(fake.date_object(), format=format, locale='en')))


Using TensorFlow backend.


short => 7/19/09
medium => Apr 3, 1983
medium => Sep 11, 2006
medium => May 29, 1994
long => October 15, 2001
long => April 20, 1973
long => February 24, 2015
long => April 7, 2004
long => August 6, 1984
full => Monday, December 20, 2010
full => Friday, February 1, 1985
full => Sunday, August 20, 1989
d MMM YYY => 14 Jan 2003
d MMMM YYY => 13 February 2017
d MMMM YYY => 14 June 1984
d MMMM YYY => 23 May 1992
d MMMM YYY => 22 October 1999
d MMMM YYY => 15 October 1974
dd/MM/YYY => 16/05/1987
EE d, MMM YYY => Sat 26, Feb 1983
EEEE d, MMMM YYY => Wednesday 17, December 1980


In [2]:
def random_date():
    dt = fake.date_object()

    try:
        date = format_date(dt, format=random.choice(FORMATS), locale='en')
        human_readable = date.lower().replace(',', '')
        machine_readable = dt.isoformat()

    except AttributeError as e:
        return None, None, None

    return human_readable, machine_readable, dt

In [3]:
def create_dataset(m):
    human_vocab = set()
    machine_vocab = set()
    dataset = []
    
    for i in tqdm(range(m)):
        h, m, _ = random_date()
        if h is not None:
            dataset.append((h, m))
            human_vocab.update(tuple(h))
            machine_vocab.update(tuple(m))
    
    # We also add two special chars, <unk> for unknown characters, and <pad> to add padding at the end
    human = dict(zip(sorted(human_vocab) + ['<unk>', '<pad>'], list(range(len(human_vocab) + 2))))
    inv_machine = dict(enumerate(sorted(machine_vocab)))
    machine = {v: k for k, v in inv_machine.items()}
 
    return dataset, human, machine, inv_machine

In [21]:
m = 5000
dataset, human_vocab, machine_vocab, inv_machine_vocab = create_dataset(m)

100%|██████████| 5000/5000 [00:00<00:00, 11740.44it/s]


In [22]:
dataset[:10]

[('8 may 2017', '2017-05-08'),
 ('13 march 1970', '1970-03-13'),
 ('july 10 2016', '2016-07-10'),
 ('3/31/15', '2015-03-31'),
 ('27 june 1978', '1978-06-27'),
 ('may 16 1997', '1997-05-16'),
 ('thursday 17 january 2019', '2019-01-17'),
 ('14 mar 1974', '1974-03-14'),
 ('6/27/98', '1998-06-27'),
 ('9 may 2009', '2009-05-09')]

In [23]:
def preprocess_data(dataset, human_vocab, machine_vocab, Tx, Ty):
    X, Y = zip(*dataset)
    
    X = np.array([string_to_int(i, Tx, human_vocab) for i in X])
    Y = [string_to_int(t, Ty, machine_vocab) for t in Y]
    
    Xoh = np.array(list(map(lambda x: to_categorical(x, num_classes=len(human_vocab)), X)))
    Yoh = np.array(list(map(lambda x: to_categorical(x, num_classes=len(machine_vocab)), Y)))

    return X, np.array(Y), Xoh, Yoh

In [24]:
def string_to_int(string, length, vocab):
    string = string.lower()
    string = string.replace(',','')
    
    if len(string) > length:
        string = string[:length]
        
    rep = list(map(lambda x: vocab.get(x, '<unk>'), string))
    
    if len(string) < length:
        rep += [vocab['<pad>']] * (length - len(string))
    
    return rep

In [25]:
string_to_int('September 10, 1978', 30, human_vocab)

[28,
 16,
 26,
 29,
 16,
 23,
 13,
 16,
 27,
 0,
 3,
 2,
 0,
 3,
 11,
 9,
 10,
 35,
 35,
 35,
 35,
 35,
 35,
 35,
 35,
 35,
 35,
 35,
 35,
 35]

In [26]:
Tx = 30
Ty = 10
X, Y, Xoh, Yoh = preprocess_data(dataset, human_vocab, machine_vocab, Tx, Ty)

print("X.shape:", X.shape)
print("Y.shape:", Y.shape)
print("Xoh.shape:", Xoh.shape)
print("Yoh.shape:", Yoh.shape)

X.shape: (5000, 30)
Y.shape: (5000, 10)
Xoh.shape: (5000, 30, 36)
Yoh.shape: (5000, 10, 11)


In [27]:
X = np.random.randint(5000, size=(5000, 64))
Y = np.random.randint(5000, size=(5000, 64))
Tx = 64
Ty = 64

In [28]:
index = 0
print("Source date:", dataset[index][0])
print("Target date:", dataset[index][1])
print()
print("Source after preprocessing (indices):", X[index])
print("Target after preprocessing (indices):", Y[index])
print()
print("Source after preprocessing (one-hot):", Xoh[index])
print("Target after preprocessing (one-hot):", Yoh[index])

Source date: 8 may 2017
Target date: 2017-05-08

Source after preprocessing (indices): [4007 3117 4514  232 1459 2939 1142 3381 1885 4460 3299 2922 4496 3689
 4738  803 1252 3601 1741   11 3236 1108  804 1475 1209  294 2656 3172
 4547  381 1180 4840 4657 3750 4051  741 4875  554 1369 1822  253 1026
  836  440 1284 1612 3099  169 2408 2108 1846 3178   52 2779 2825 1906
 1846 3031 2866 2156 2492 1082 3494 4049]
Target after preprocessing (indices): [3821 4087 1314 2245 4713 3623 1292 3324 1501 2164 4749 2955 1901 1414
  790 2788 2884 4519 4017 2020 4051 3967 4271  974 1557 4351 2337  664
 2389 4555 3229 4634 1923 2466 3476 2505 4700 1630  771 4531 3690 1473
 2468 2395 1635   81 2415 4105 1776 4521 3696 3209 3011 4962 2026 1359
  176  669 4271 1729  278 2160  528 1306]

Source after preprocessing (one-hot): [[0. 0. 0. ... 0. 0. 0.]
 [1. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 1.]
 [0. 0. 0. ... 0. 0. 1.]
 [0. 0. 0. ... 0. 0. 1.]]
Target after preprocessing (

In [29]:
repeator = RepeatVector(Tx)
concatenator = Concatenate(axis=-1)
densor1 = Dense(10, activation = "tanh")
densor2 = Dense(1, activation = "relu")
activator = Activation('softmax', name='attention_weights')
dotor = Dot(axes = 1)

In [30]:
def one_step_attention(a, s_prev):
    s_prev = repeator(s_prev)
    concat = concatenator([a, s_prev])
    e = densor1(concat)
    energies = densor2(e)
    alphas = activator(energies)
    context = dotor([alphas, a])
    
    return context

In [31]:
len(machine_vocab)

11

In [32]:
n_a = 32
n_s = 64
post_activation_LSTM_cell = LSTM(n_s, return_state = True)
output_layer = Dense(n_s, activation='softmax')

In [50]:
def model(Tx, Ty, n_a, n_s, human_vocab_size, machine_vocab_size):
    X = Input(shape=(Tx, human_vocab_size))
    s0 = Input(shape=(n_s,), name='s0')
    c0 = Input(shape=(n_s,), name='c0')
    s = s0
    c = c0
    
    outputs = []
    print(X)
    a = Bidirectional(LSTM(n_s, return_sequences = True))(X)
    
    for t in range(Ty):
        context = one_step_attention(a, s)
        s, _, c = post_activation_LSTM_cell(context, initial_state=[s, c])
        out = output_layer(s)
        outputs.append(out)
    
    model = Model([X, s0, c0], outputs)
    return model

In [51]:
mod = model(Tx, Ty, n_a, n_s, 64, 11)
opt = Adam(lr=0.005, beta_1=0.9, beta_2=0.999, decay=0.01)
mod.compile(optimizer=opt, loss='categorical_crossentropy', metrics=['accuracy'])

Tensor("input_6:0", shape=(?, 64, 64), dtype=float32)


KeyboardInterrupt: 

In [44]:
s0 = np.zeros((m, n_s))
c0 = np.zeros((m, n_s))
outputs = list(Yoh.swapaxes(0,1))

In [45]:
#mod.fit([X, s0, c0], epochs=5, batch_size=100)
X_new = np.zeros((5000, 64, 64))
X_new[:,:,0] = X
result = mod.predict([X_new, s0, c0])


In [47]:
print(len(result[0]))
print(len(result))

5000
64
