In [1]:
import re
import random
from faker import Faker
from tqdm.auto import tqdm
from babel.dates import format_date

In [2]:
fake = Faker()

In [3]:
DATE_FORMATS = ['short',
                'medium',
                'medium',
                'medium',
                'long',
                'long',
                'long',
                'long',
                'long',
                'full',
                'full',
                'full',
                'd MMM YYY', 
                'd MMMM YYY',
                'd MMMM YYY',
                'd MMMM YYY',
                'd MMMM YYY',
                'd MMMM YYY',
                'dd/MM/YYY',
                'EE d, MMM YYY',
                'EEEE d, MMMM YYY',
                'd of MMMM YYY']

In [4]:
print('Sample dates for each format\n')

for format in DATE_FORMATS:
    print('%s => %s' %(format, format_date(fake.date_object(), format=format, locale='en')))

Sample dates for each format

short => 9/30/00
medium => May 6, 2015
medium => Oct 10, 2003
medium => Aug 18, 2014
long => November 3, 1981
long => January 28, 1992
long => January 27, 2019
long => March 4, 2003
long => November 15, 1986
full => Friday, October 4, 1974
full => Friday, February 3, 2006
full => Thursday, January 27, 2000
d MMM YYY => 20 Feb 2019
d MMMM YYY => 10 May 1996
d MMMM YYY => 13 April 1983
d MMMM YYY => 5 July 2006
d MMMM YYY => 20 December 2007
d MMMM YYY => 26 November 2001
dd/MM/YYY => 23/04/2014
EE d, MMM YYY => Sun 11, Aug 1991
EEEE d, MMMM YYY => Saturday 26, April 2008
d of MMMM YYY => 15 of January 1972


In [5]:
def clean_date(raw_date):
  return raw_date.lower().replace(',', '')

In [6]:
def random_date():
    dt = fake.date_object()

    try:
        date = format_date(dt, format=random.choice(DATE_FORMATS), locale='en')
        human_readable = clean_date(date)
        machine_readable = dt.isoformat()

    except AttributeError as e:
        return None, None, None

    return human_readable, machine_readable, dt

In [7]:
def create_dataset(m):
    dataset = []
    
    for i in tqdm(range(m)):
        h, m, _ = random_date()
        if h is not None:
            dataset.append((h, m))
 
    return dataset

In [None]:
m = 500000

dataset = create_dataset(m)

In [9]:
dataset[:5]

[('15 may 2013', '2013-05-15'),
 ('24 december 1991', '1991-12-24'),
 ('dec 25 1979', '1979-12-25'),
 ('19 april 2010', '2010-04-19'),
 ('1 february 1974', '1974-02-01')]