In [2]:
import re
import random
import numpy as np
from faker import Faker
from tqdm.auto import tqdm
from babel.dates import format_date
from sklearn.model_selection import train_test_split


from tensorflow.keras.utils import to_categorical
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [3]:
# Seed both random sources used below (`random.choice` and Faker's shared
# generator) so that re-running the notebook draws the same pseudo-random
# sequence, in line with the fixed `random_state` used for the train/test split.
SEED = 0
random.seed(SEED)
Faker.seed(SEED)

fake = Faker()

In [4]:
# Babel format names and custom patterns that `random.choice` draws from.
# Repeated entries are picked proportionally more often.
#
# The custom patterns use lowercase `y` (calendar year). Uppercase `Y` is the
# ISO week-numbering year, which differs from the calendar year for some dates
# around New Year and would then contradict the `isoformat()` label.
DATE_FORMATS = ['short',
                'medium',
                'medium',
                'medium',
                'long',
                'long',
                'long',
                'long',
                'long',
                'full',
                'full',
                'full',
                'd MMM yyyy',
                'd MMMM yyyy',
                'd MMMM yyyy',
                'd MMMM yyyy',
                'd MMMM yyyy',
                'd MMMM yyyy',
                'dd/MM/yyyy',
                'EE d, MMM yyyy',
                'EEEE d, MMMM yyyy',
                'd of MMMM yyyy']

In [5]:
print('Sample dates for each format\n')

# The loop variable is `date_format`, not `format`: at notebook scope a variable
# called `format` would shadow the built-in for every later cell.
for date_format in DATE_FORMATS:
    sample = format_date(fake.date_object(), format=date_format, locale='en')
    print('%s => %s' % (date_format, sample))

Sample dates for each format

short => 2/18/15
medium => Dec 27, 1979
medium => Jan 3, 1980
medium => May 19, 2016
long => August 12, 1977
long => September 15, 2013
long => May 2, 2007
long => January 1, 1989
long => March 23, 2016
full => Wednesday, December 28, 1994
full => Saturday, June 29, 2019
full => Friday, July 30, 1982
d MMM YYY => 25 May 2003
d MMMM YYY => 1 May 2002
d MMMM YYY => 21 October 2008
d MMMM YYY => 5 January 1993
d MMMM YYY => 12 September 1983
d MMMM YYY => 24 March 2012
dd/MM/YYY => 22/10/2012
EE d, MMM YYY => Wed 28, May 1980
EEEE d, MMMM YYY => Wednesday 23, March 2016
d of MMMM YYY => 8 of June 2020


In [6]:
def clean_date(raw_date):
    """Return `raw_date` lower-cased with every comma removed."""
    lowered = raw_date.lower()
    # Splitting on ',' and re-joining with nothing drops all commas.
    return ''.join(lowered.split(','))

In [7]:
def random_date():
    """Draw one random date and render it in a randomly chosen format.

    Returns:
        tuple: ``(human_readable, machine_readable, dt)`` where
        ``human_readable`` is the cleaned Babel rendering of the date,
        ``machine_readable`` is ``dt.isoformat()`` and ``dt`` is the object
        returned by ``fake.date_object()``. ``(None, None, None)`` is
        returned when formatting raises ``AttributeError``.
    """
    dt = fake.date_object()
    chosen_format = random.choice(DATE_FORMATS)

    # Only the Babel call is guarded; the remaining steps operate on a plain
    # string and on `dt` itself.
    try:
        formatted = format_date(dt, format=chosen_format, locale='en')
    except AttributeError:
        # Best effort: the caller skips samples whose first element is None.
        return None, None, None

    return clean_date(formatted), dt.isoformat(), dt

In [8]:
def create_dataset(m):
    """Generate up to ``m`` ``(human_readable, machine_readable)`` date pairs.

    Args:
        m: Number of random dates to draw.

    Returns:
        list: One ``(human_readable, machine_readable)`` tuple per successful
        draw. Draws for which ``random_date()`` returns ``None`` are skipped,
        so the list can be shorter than ``m``.
    """
    dataset = []

    for _ in tqdm(range(m)):
        # Distinct local names: the original rebound the parameter `m` here.
        human_readable, machine_readable, _dt = random_date()
        if human_readable is not None:
            dataset.append((human_readable, machine_readable))

    return dataset

In [9]:
# Number of random dates to draw for the dataset.
m = 500_000

dataset = create_dataset(m)

HBox(children=(FloatProgress(value=0.0, max=500000.0), HTML(value='')))




In [10]:
# Peek at the first five (human_readable, machine_readable) pairs.
dataset[:5]

[('feb 22 2016', '2016-02-22'),
 ('may 18 1973', '1973-05-18'),
 ('6 may 2005', '2005-05-06'),
 ('feb 9 2010', '2010-02-09'),
 ('april 4 1978', '1978-04-04')]

In [11]:
# Character-level vocabulary for the human-readable inputs; unseen characters
# map to the '<oov>' token.
human_tokenizer = Tokenizer(char_level=True, oov_token='<oov>')
human_tokenizer.fit_on_texts([human for human, _ in dataset])

# Character-level vocabulary for the ISO-formatted targets (no OOV token).
machine_tokenizer = Tokenizer(char_level=True)
machine_tokenizer.fit_on_texts([machine for _, machine in dataset])

In [12]:
# Character -> integer index mapping learned from the human-readable dates.
print(human_tokenizer.word_index)

{'<oov>': 1, ' ': 2, '1': 3, '9': 4, '2': 5, '0': 6, 'e': 7, 'a': 8, 'r': 9, 'u': 10, 'y': 11, '8': 12, '7': 13, 'm': 14, 'n': 15, 'b': 16, 'o': 17, 'd': 18, 't': 19, 's': 20, '3': 21, 'j': 22, 'c': 23, '6': 24, '5': 25, '4': 26, '/': 27, 'p': 28, 'f': 29, 'l': 30, 'h': 31, 'i': 32, 'g': 33, 'v': 34, 'w': 35}


In [13]:
# Length of the longest human-readable string (commas ignored) plus one
# extra position.
max_len = 1 + max(len(human.replace(',', '')) for human, _ in dataset)
max_len

28

In [14]:
def preprocess_input(date, tokenizer, max_len):
    """One-hot encode a date string character by character.

    Args:
        date: Text to encode. It is lower-cased and stripped of commas first,
            the same normalisation `clean_date` applies.
        tokenizer: Fitted tokenizer exposing ``texts_to_sequences`` and
            ``word_index``. Assumed to be character-level
            (``char_level=True``), as both tokenizers in this notebook are.
        max_len: Number of positions in the output; shorter sequences are
            zero-padded at the end.

    Returns:
        Array of shape ``(max_len, len(tokenizer.word_index) + 1)``.
    """
    text = date.lower().replace(',', '')

    # Encode the text as a single-element batch. Passing the bare string made
    # Keras treat every character as a separate text, and taking `[0]` of each
    # result raised IndexError whenever the tokenizer dropped a character
    # (unknown character and no OOV token).
    seq = tokenizer.texts_to_sequences([text])[0]
    seq = pad_sequences([seq], padding='post', maxlen=max_len)[0]

    # Index 0 is the padding value, hence the extra class.
    return to_categorical(seq, num_classes=len(tokenizer.word_index) + 1)

In [15]:
# Split the pairs into parallel tuples of inputs and targets.
X, y = zip(*dataset)

# Targets are padded to 10 positions: machine-readable strings such as
# '2016-02-22' are 10 characters long.
X_ohe = np.array([preprocess_input(text, human_tokenizer, max_len) for text in X])
y_ohe = np.array([preprocess_input(text, machine_tokenizer, 10) for text in y])

In [16]:
# Both arrays are laid out as (samples, sequence length, one-hot classes).
print(X_ohe.shape)
print(y_ohe.shape)

(500000, 28, 36)
(500000, 10, 12)


In [17]:
# Hold out 30% of the samples for testing; `random_state` pins the shuffle.
X_train, X_test, y_train, y_test = train_test_split(X_ohe, y_ohe, test_size=0.3, random_state=0)

In [18]:
# Confirm the sizes of the resulting train and test arrays.
print(f'Train data shape: {X_train.shape}, {y_train.shape}')
print(f'Test data shape: {X_test.shape}, {y_test.shape}')

Train data shape: (350000, 28, 36), (350000, 10, 12)
Test data shape: (150000, 28, 36), (150000, 10, 12)
