In [1]:
import tensorflow as tf
import numpy as np
import time
import json
import random
from faker import Faker
import babel
from babel.dates import format_date
import os
import tensorflow.contrib.legacy_seq2seq as seq2seq
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
%matplotlib inline

  from ._conv import register_converters as _register_converters


Instructions for updating:
Use the retry module or similar alternatives.


In [2]:
fake = Faker()
fake.seed(42)
random.seed(42)

FORMATS=['short',
        'medium',
        'long',
        'full',
        'd MMM YYY',
        'd MMMM YYY',
        'dd MMM YYY',
        'd MMM, YYY',
        'd MMMM, YYY',
        'dd, MMM YYY',
        'd MM YY',
        'd MMMM YYY',
        'MMMM d YYY',
        'MMMM d, YYY',
        'dd.MM.YY',
           ]

# change this if you want it to work with only a single language
LOCALES = babel.localedata.locale_identifiers()
LOCALES = [lang for lang in LOCALES if 'en' in str(lang)]

In [3]:
def create_date():
    """
        Creates some fake dates 
        :returns: tuple containing 
                  1. human formatted string
                  2. machine formatted string
                  3. date object.
    """
    dt = fake.date_object()

    # wrapping this in a try catch because
    # the locale 'vo' and format 'full' will fail
    try:
        human = format_date(dt,
                            format=random.choice(FORMATS),
                            locale=random.choice(LOCALES))

        case_change = random.randint(0,3) # 1/2 chance of case change
        if case_change == 1:
            human = human.upper()
        elif case_change == 2:
            human = human.lower()

        machine = dt.isoformat()
    except AttributeError as e:
        return None, None, None

    return human, machine #, dt

data = [create_date() for _ in range(50000)]

In [4]:
data[:5]

[('7 07 13', '2013-07-07'),
 ('30 JULY 1977', '1977-07-30'),
 ('Tuesday, 14 September 1971', '1971-09-14'),
 ('18 09 88', '1988-09-18'),
 ('31, Aug 1986', '1986-08-31')]

In [5]:
x = [x for x,y in data]
y = [y for x,y in data]

u_charactersX = set(' '.join(x))
character2numX = dict(zip(u_charactersX,range(len(u_charactersX))))

u_charactersY = set(' '.join(y))
character2numY = dict(zip(u_charactersY,range(len(u_charactersY))))


In [6]:
character2numX['<PAD>']=len(character2numX)

In [7]:
num2charx = dict(zip(character2numX.values(),character2numX.keys()))
max_len = max([len(date) for date in x])
x = np.array([[character2numX['<PAD>']] * (max_len-len(date))+[character2numX[x_] for x_ in date] for date in x])

In [8]:
character2numY['<GO>']=len(character2numY)
max_len = max([len(date) for date in y])
y = np.array([[character2numY['<GO>']] + [character2numY[y_] for y_ in date] for date in y])


In [9]:
num2chary = dict(zip(character2numY.values(),character2numY.keys()))

In [10]:
x_seq_length= len(x[0])
y_seq_length=len(y[0])-1

In [11]:
x_seq_length

29

In [12]:
y_seq_length

10

In [13]:
def gen_batch(x,y,batch_size):
    shuffle = np.random.permutation(len(x))
    start = 0
    x = x[shuffle]
    y = y[shuffle]
    
    while start+batch_size <= len(x):
        yield x[start:start+batch_size],y[start:start+batch_size]
        start +=batch_size

In [14]:
epoch = 20
batch_size=128
nodes =32
embedding = 10

tf.reset_default_graph()
sess = tf.InteractiveSession()

# Tensor where we will feed the data into graph
inputs = tf.placeholder(tf.int32,[None,x_seq_length],name='inputs')
outputs = tf.placeholder(tf.int32,[None,None],name='outputs')
targets = tf.placeholder(tf.int32,[None,None],name='targets')

# Embedding layers

input_embedding = tf.Variable(tf.random_uniform([len(character2numX),embedding],-1.0,1.0),name='x_embeding')
output_embedding = tf.Variable(tf.random_uniform([len(character2numY),embedding],-1.0,1.0),name='y_embeding')

date_input_embed = tf.nn.embedding_lookup(input_embedding, inputs)
date_output_embed = tf.nn.embedding_lookup(output_embedding, outputs)

with tf.variable_scope("encoding") as encoding_scope:
    lstm_enc_fw = tf.contrib.rnn.BasicLSTMCell(nodes)
    lstm_enc_bw = tf.contrib.rnn.BasicLSTMCell(nodes)
    #_, last_state = tf.nn.dynamic_rnn(lstm_enc, inputs=date_input_embed, dtype=tf.float32)
    ((lstm_fw_out,lstm_bw_out),(lstm_fw_final,lstm_bw_final))=tf.nn.bidirectional_dynamic_rnn(cell_fw=lstm_enc_fw,
                                                                                             cell_bw=lstm_enc_bw,
                                                                                             inputs=date_input_embed,
                                                                                             dtype=tf.float32)
    enc_fin_c = tf.concat((lstm_fw_final.c,lstm_bw_final.c),1)
    enc_fin_h = tf.concat((lstm_fw_final.h,lstm_bw_final.h),1)
    last_state = tf.contrib.rnn.LSTMStateTuple(c=enc_fin_c,h=enc_fin_h)

with tf.variable_scope("decoding") as decoding_scope:
    lstm_dec = tf.contrib.rnn.BasicLSTMCell(nodes*2)
    dec_outputs, _ = tf.nn.dynamic_rnn(lstm_dec, inputs=date_output_embed, initial_state=last_state)
    
logits = tf.layers.dense(dec_outputs, units=len(character2numY)) 
with tf.name_scope("optimization"):
    # Loss function
    loss = tf.contrib.seq2seq.sequence_loss(logits, targets, tf.ones([batch_size, y_seq_length]))
    # Optimizer
    optimizer = tf.train.RMSPropOptimizer(1e-3).minimize(loss)

In [15]:
dec_outputs.get_shape().as_list()

[None, None, 64]

In [16]:
last_state[0].get_shape().as_list()

[None, 64]

In [17]:

date_input_embed.get_shape().as_list()

[None, 29, 10]

In [18]:
inputs.get_shape().as_list()

[None, 29]

In [19]:
logits.get_shape()

TensorShape([Dimension(None), Dimension(None), Dimension(13)])

In [20]:
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.33, random_state=42)

In [26]:
sess.run(tf.global_variables_initializer())
epochs = 20
for epoch_i in range(epochs):
    start_time = time.time()
    for batch_i, (source_batch, target_batch) in enumerate(gen_batch(X_train, y_train, batch_size)):
        _, batch_loss, batch_logits = sess.run([optimizer, loss, logits],
            feed_dict = {inputs: source_batch,
             outputs: target_batch[:, :-1],
             targets: target_batch[:, 1:]})
    accuracy = np.mean(batch_logits.argmax(axis=-1) == target_batch[:,1:])
    print('Epoch {:3} Loss: {:>6.3f} Accuracy: {:>6.4f} Epoch duration: {:>6.3f}s'.format(epoch_i, batch_loss, accuracy, time.time() - start_time)) 

Epoch   0 Loss:  0.995 Accuracy: 0.6297 Epoch duration:  8.549s
Epoch   1 Loss:  0.684 Accuracy: 0.7406 Epoch duration:  7.962s
Epoch   2 Loss:  0.527 Accuracy: 0.8156 Epoch duration:  7.753s
Epoch   3 Loss:  0.450 Accuracy: 0.8305 Epoch duration:  7.400s
Epoch   4 Loss:  0.394 Accuracy: 0.8461 Epoch duration:  7.587s
Epoch   5 Loss:  0.315 Accuracy: 0.8812 Epoch duration:  7.831s
Epoch   6 Loss:  0.277 Accuracy: 0.9047 Epoch duration:  8.463s
Epoch   7 Loss:  0.191 Accuracy: 0.9383 Epoch duration:  7.849s
Epoch   8 Loss:  0.152 Accuracy: 0.9539 Epoch duration:  7.381s
Epoch   9 Loss:  0.160 Accuracy: 0.9477 Epoch duration:  7.323s
Epoch  10 Loss:  0.117 Accuracy: 0.9602 Epoch duration:  7.355s
Epoch  11 Loss:  0.090 Accuracy: 0.9711 Epoch duration:  7.419s
Epoch  12 Loss:  0.167 Accuracy: 0.9461 Epoch duration:  8.140s
Epoch  13 Loss:  0.038 Accuracy: 0.9930 Epoch duration:  8.476s
Epoch  14 Loss:  0.034 Accuracy: 0.9922 Epoch duration:  8.687s
Epoch  15 Loss:  0.048 Accuracy: 0.9914 

In [29]:
source_batch, target_batch = next(gen_batch(X_test, y_test, batch_size))
dec_input = np.zeros((len(source_batch), 1)) + character2numY['<GO>']
for i in range(y_seq_length):
    batch_logits = sess.run(logits,
                feed_dict = {inputs: source_batch,
                 outputs: dec_input})
    prediction = batch_logits[:,-1].argmax(axis=-1)
    dec_input = np.hstack([dec_input, prediction[:,None]])
    
print('Accuracy on test set is: {:>6.3f}'.format(np.mean(dec_input == target_batch)))

Accuracy on test set is:  0.988


In [30]:
num_preds = 10
source_chars = [[num2charx[l] for l in sent if num2charx[l]!="<PAD>"] for sent in source_batch[:num_preds]]
dest_chars = [[num2chary[l] for l in sent] for sent in dec_input[:num_preds,1:]]

for date_in, date_out in zip(source_chars, dest_chars):
    print(''.join(date_in)+' => '+''.join(date_out))

10 08 98 => 1998-08-10
29, may 1990 => 1990-05-29
20 jul 1970 => 1970-07-20
15 september, 2002 => 2002-09-15
15 09 15 => 2015-09-15
01/09/2002 => 2002-01-09
24.11.91 => 1991-11-24
13 SEPTEMBER 2000 => 2000-09-13
12, Sep 1993 => 1993-09-12
march 10 2007 => 2007-03-10


In [24]:
source_batch[:2]

array([[59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59,
        59, 59, 23, 23,  7, 37, 20, 28,  7, 27,  1,  1, 19],
       [59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59,
         1,  7, 50, 52, 48, 52, 15, 31,  7, 23, 32, 27, 27]])

In [32]:
dec_input[:2,1:].shape

(2, 10)