In [None]:
!pip install faker
from keras.layers import Bidirectional, Concatenate, Permute, Dot, Input, LSTM, Multiply
from keras.layers import RepeatVector, Dense, Activation, Lambda
from keras.optimizers import Adam
from keras.utils import to_categorical
from keras.models import load_model, Model
import keras.backend as K
from keras.wrappers.scikit_learn import KerasClassifier
from sklearn.model_selection import GridSearchCV
import numpy as np
import pickle
from faker import Faker
import random
from tqdm import tqdm
from babel.dates import format_date
import matplotlib.pyplot as plt
from nltk.translate.bleu_score import sentence_bleu
%matplotlib inline

# Generating Dataset

In [None]:
from faker import Faker

# One fixed seed for both random sources (Faker and the stdlib RNG) so that
# the generated dates are reproducible between runs.
SEED = 101

fake = Faker()
Faker.seed(SEED)
random.seed(SEED)

In [None]:
# Date patterns that random_date() samples from with random.choice(); a pattern
# listed several times is proportionally more likely to be picked.
#
# The named formats ('short', 'medium', 'long', 'full') resolve to the locale's
# own pattern; the trailing comments show the 'en' pattern for reference.
#
# Custom patterns use lowercase 'y' (calendar year). Uppercase 'Y' is the ISO
# week-numbering year, which differs from the calendar year for some dates
# around New Year and would make the rendered text disagree with the ISO
# target produced by dt.isoformat().
FORMATS = ['short',   # M/d/yy
           'medium',  # MMM d, y
           'medium',
           'medium',
           'long',    # MMMM d, y
           'long',
           'long',
           'long',
           'long',
           'full',    # EEEE, MMMM d, y
           'full',
           'full',
           'd MMM yyy',
           'd MMMM yyy',
           'd MMMM yyy',
           'd MMMM yyy',
           'd MMMM yyy',
           'd MMMM yyy',
           'dd/MM/yyy',
           'EE d, MMM yyy',
           'EEEE d, MMMM yyy']

In [None]:
# Preview one random date rendered with each pattern in FORMATS.
# The loop variable is called `fmt` so that the builtin `format` is not
# overwritten in the notebook's global namespace for later cells.
for fmt in FORMATS:
    print('%s => %s' % (fmt, format_date(fake.date_object(), format=fmt, locale='en')))

In [None]:
def random_date():
    """Draw a random date and render it with a randomly chosen pattern.

    Returns:
        A tuple ``(human_readable, machine_readable, dt)`` where
        ``human_readable`` is the formatted date, lower-cased and with commas
        removed, ``machine_readable`` is ``dt.isoformat()`` and ``dt`` is the
        value returned by ``fake.date_object()``. If an ``AttributeError`` is
        raised while building the strings, ``(None, None, None)`` is returned.
    """
    dt = fake.date_object()

    try:
        rendered = format_date(dt, format=random.choice(FORMATS), locale='en')
        return rendered.lower().replace(',', ''), dt.isoformat(), dt
    except AttributeError:
        # Signal "no sample" to the caller instead of propagating the error.
        return None, None, None


`create_dataset(m)` generates our dataset, taking `m` as the number of samples to create. It returns the dataset as a list of (human-readable, machine-readable) pairs, two vocabularies mapping characters to indices (`human` and `machine`), and the inverse mapping `inv_machine`, from indices back to characters:

In [None]:
def create_dataset(m):
    """Generate ``m`` random dates and build the character vocabularies.

    Args:
        m: number of times ``random_date()`` is called; samples for which it
            returns ``None`` are skipped, so the dataset may be shorter.

    Returns:
        A tuple ``(dataset, human, machine, inv_machine)``:
        ``dataset`` is a list of ``(human_readable, machine_readable)`` pairs,
        ``human`` maps each source character (plus ``'<unk>'`` and
        ``'<pad>'``) to an index, ``machine`` maps each target character to an
        index, and ``inv_machine`` maps indices back to target characters.
    """
    dataset = []
    human_chars, machine_chars = set(), set()

    for _ in tqdm(range(m)):
        human_text, machine_text, _dt = random_date()
        if human_text is None:
            continue
        dataset.append((human_text, machine_text))
        human_chars.update(human_text)
        machine_chars.update(machine_text)

    # Two special symbols go after the sorted characters:
    # '<unk>' for unknown characters and '<pad>' for end-of-sequence padding.
    human_symbols = sorted(human_chars) + ['<unk>', '<pad>']
    human = {symbol: index for index, symbol in enumerate(human_symbols)}

    inv_machine = dict(enumerate(sorted(machine_chars)))
    machine = {symbol: index for index, symbol in inv_machine.items()}

    return dataset, human, machine, inv_machine

Let's generate a dataset with 30k samples. That's probably way too much, but it should do a good job

In [None]:
# m = 30000
# dataset, human_vocab, machine_vocab, inv_machine_vocab = create_dataset(m)

Inspecting the first 10 entries. Remember it contains a list of tuples => (human readable, machine readable):

In [None]:
# dataset[:10]

In [None]:
# Path to 'TrainSet' file
trainSetFile = "../input/iut-machine-translation/TrainSet.pickle"

# Open the file and load its contents.
# NOTE: pickle.load can execute arbitrary code; only load files from a trusted source.
with open(trainSetFile, 'rb') as file:
    trainSet = pickle.load(file)

# Show up to the first 10 samples; slicing avoids an IndexError when the set
# holds fewer than 10 entries.
print("Number of samples in train set:", len(trainSet), "\n")
for sample in trainSet[:10]:
    print("Human Input:", sample[0], "\t\tMachine Readable:", sample[1])

In [None]:
# Display the first 10 raw training entries as the cell output.
trainSet[:10]

In [None]:
# Path to 'ValidationSet' file
validationSetFile = "../input/iut-machine-translation/ValidationSet.pickle"

# Open the file and load its contents.
# NOTE: pickle.load can execute arbitrary code; only load files from a trusted source.
with open(validationSetFile, 'rb') as file:
    validationSet = pickle.load(file)
print("Number of samples in validation set:", len(validationSet), "\n")


# Path to 'TestSet' file
testSetFile = "../input/iut-machine-translation/TestSet.pickle"

# Open the file and load its contents.
with open(testSetFile, 'rb') as file:
    testSet = pickle.load(file)
# The label now says "test set"; it used to repeat "validation set" here.
print("Number of samples in test set:", len(testSet))

In [None]:
# Source-side (human-readable) vocabulary: character -> integer index.
# NOTE(review): some Persian letters (e.g. 'ژ' and 'آ') have no entry here, so
# string_to_int maps them to '<unk>' — confirm this is intended.
human_vocab = {
    # Separators
    ' ': 0,
    '/': 1,
    # Extended Arabic-Indic (Persian) digits
    '۰': 2,
    '۱': 3,
    '۲': 4,
    '۳': 5,
    '۴': 6,
    '۵': 7,
    '۶': 8,
    '۷': 9,
    '۸': 10,
    '۹': 11,
    # Persian letters
    'ا': 12,
    'ب': 13,
    'پ': 14,
    'ت': 15,
    'ث': 16,
    'ج': 17,
    'چ': 18,
    'ح': 19,
    'خ': 20,
    'د': 21,
    'ذ': 22,
    'ر': 23,
    'ز': 24,
    'س': 25,
    'ش': 26,
    'ص': 27,
    'ض': 28,
    'ط': 29,
    'ظ': 30,
    'ع': 31,
    'غ': 32,
    'ف': 33,
    'ق': 34,
    'ک': 35,
    'گ': 36,
    'ل': 37,
    'م': 38,
    'ن': 39,
    'و': 40,
    'ه': 41,
    'ی': 42,
    # ASCII digits
    '0': 43,
    '1': 44,
    '2': 45,
    '3': 46,
    '4': 47,
    '5': 48,
    '6': 49,
    '7': 50,
    '8': 51,
    '9': 52,
    # Invisible zero-width character (presumably U+200C ZERO WIDTH NON-JOINER —
    # confirm). It shares index 0 with the space, so the dict has 56 keys but
    # only 55 distinct indices (0-54); len(human_vocab) is therefore 56.
    '‌': 0,
    # Special symbols: unknown character and padding
    '<unk>': 53,
    '<pad>': 54
}

# Display the vocabulary as the cell output.
human_vocab

In [None]:
# Target-side (machine-readable) vocabulary: character -> integer index.
# '-' gets index 0 and the ASCII digits '0'..'9' get indices 1..10.
machine_vocab = {symbol: index for index, symbol in enumerate('-0123456789')}

In [None]:
# Reverse lookup used when decoding predictions: integer index -> character.
inv_machine_vocab = dict(zip(machine_vocab.values(), machine_vocab.keys()))
inv_machine_vocab

# Preprocessing

In [None]:
def preprocess_data(dataset, human_vocab, machine_vocab, Tx, Ty):
    """Turn (source, target) string pairs into index and one-hot arrays.

    Args:
        dataset: iterable of ``(source_text, target_text)`` pairs.
        human_vocab: character -> index mapping for the source strings.
        machine_vocab: character -> index mapping for the target strings.
        Tx: length every source sequence is truncated/padded to.
        Ty: length every target sequence is truncated/padded to.

    Returns:
        ``(X, Y, Xoh, Yoh)``: the index arrays for sources and targets,
        followed by their one-hot encodings with ``len(human_vocab)`` and
        ``len(machine_vocab)`` classes respectively.
    """
    sources, targets = zip(*dataset)

    X = np.array([string_to_int(text, Tx, human_vocab) for text in sources])
    Y = [string_to_int(text, Ty, machine_vocab) for text in targets]

    n_human, n_machine = len(human_vocab), len(machine_vocab)
    Xoh = np.array([to_categorical(row, num_classes=n_human) for row in X])
    Yoh = np.array([to_categorical(row, num_classes=n_machine) for row in Y])

    return X, np.array(Y), Xoh, Yoh

In [None]:
def string_to_int(string, length, vocab):
    """Encode ``string`` as a list of vocabulary indices of size ``length``.

    Commas are removed first. The text is then cut to at most ``length``
    characters, each character is looked up in ``vocab`` (falling back to
    ``vocab.get('<unk>')``, which is ``None`` if that key is absent), and the
    result is right-padded with ``vocab['<pad>']`` when it is too short.

    Args:
        string: text to encode.
        length: target sequence length.
        vocab: mapping from character to integer index.

    Returns:
        A list of indices.

    Raises:
        KeyError: if padding is needed and ``vocab`` has no ``'<pad>'`` key.
    """
    text = string.replace(',', '')[:length]
    fallback = vocab.get('<unk>')
    encoded = [vocab.get(char, fallback) for char in text]

    missing = length - len(text)
    if missing > 0:
        encoded.extend([vocab['<pad>']] * missing)

    return encoded

In [None]:
# Sanity check: encode a string holding only the invisible zero-width character
# from human_vocab, padded out to length 100.
string_to_int('‌', 100, human_vocab)

In [None]:
# Fixed sequence lengths: source strings are padded/truncated to Tx characters,
# target strings to Ty characters.
Tx = 50
Ty = 10
X, Y, Xoh, Yoh = preprocess_data(trainSet, human_vocab, machine_vocab, Tx, Ty)

# Report the shape of every array produced by the preprocessing step.
for caption, array in (("X.shape:", X), ("Y.shape:", Y), ("Xoh.shape:", Xoh), ("Yoh.shape:", Yoh)):
    print(caption, array.shape)

In [None]:
# Show one training example at every preprocessing stage.
index = 0
source_text, target_text = trainSet[index][0], trainSet[index][1]

print("Source date:", source_text)
print("Target date:", target_text)
print()
print("Source after preprocessing (indices):", X[index])
print("Target after preprocessing (indices):", Y[index])
print()
print("Source after preprocessing (one-hot):", Xoh[index])
print("Target after preprocessing (one-hot):", Yoh[index])

# Define Model

In [None]:
# Attention layers are created once at module level so that every decoder time
# step reuses the same layer objects (and therefore the same weights).
repeator = RepeatVector(Tx)
concatenator = Concatenate(axis=-1)
densor1 = Dense(10, activation="tanh")
densor2 = Dense(1, activation="relu")
# densor2 has a single unit, so the energies' last axis has size 1. A softmax
# over the default last axis would therefore return 1.0 for every input
# position. Normalise over axis 1 (the Tx axis that RepeatVector(Tx) creates and
# that Dot(axes=1) contracts) so the weights form a distribution over positions.
activator = Lambda(lambda energies: K.softmax(energies, axis=1), name='attention_weights')
dotor = Dot(axes=1)

In [None]:
def one_step_attention(a, s_prev):
    """Compute the attention context for one decoder step.

    Uses the shared module-level layers ``repeator``, ``concatenator``,
    ``densor1``, ``densor2``, ``activator`` and ``dotor``.

    Args:
        a: encoder output sequence (presumably shaped (batch, Tx, features) —
            confirm against the caller).
        s_prev: previous decoder hidden state; it is repeated by ``repeator``
            and concatenated with ``a`` along the last axis.

    Returns:
        The result of ``dotor`` applied to the attention weights and ``a``.
    """
    tiled_state = repeator(s_prev)
    scores = densor2(densor1(concatenator([a, tiled_state])))
    weights = activator(scores)
    return dotor([weights, a])

In [None]:
# n_a: units per direction in the bidirectional encoder LSTM built in model().
# n_s: units in the decoder LSTM below.
n_a = 32
n_s = 64

# Shared decoder layers, reused at every output time step.
post_activation_LSTM_cell = LSTM(units=n_s, return_state=True)
output_layer = Dense(units=len(machine_vocab), activation='softmax')

In [None]:
def model(Tx, Ty, n_a, n_s, human_vocab_size, machine_vocab_size):
    """Build the attention-based sequence-to-sequence Keras model.

    Args:
        Tx: length of the input sequence.
        Ty: number of decoder steps (one model output per step).
        n_a: units per direction of the bidirectional encoder LSTM.
        n_s: size of the decoder state inputs ``s0`` and ``c0``.
        human_vocab_size: size of the one-hot dimension of the input.
        machine_vocab_size: not used in the body; the output width comes from
            the module-level ``output_layer``.

    Returns:
        A ``Model`` taking ``[X, s0, c0]`` and returning a list of ``Ty``
        outputs from ``output_layer``.
    """
    inputs = Input(shape=(Tx, human_vocab_size))
    initial_hidden = Input(shape=(n_s,), name='s0')
    initial_cell = Input(shape=(n_s,), name='c0')

    # Encoder: one annotation per input position.
    encoder = Bidirectional(LSTM(n_a, return_sequences=True))
    annotations = encoder(inputs)

    # Decoder: attend, advance the shared LSTM cell, emit one output per step.
    hidden, cell = initial_hidden, initial_cell
    step_outputs = []
    for _step in range(Ty):
        context = one_step_attention(annotations, hidden)
        hidden, _unused, cell = post_activation_LSTM_cell(context, initial_state=[hidden, cell])
        step_outputs.append(output_layer(hidden))

    return Model([inputs, initial_hidden, initial_cell], step_outputs)

In [None]:
# Instantiate the network with the notebook-level sequence lengths, layer sizes
# and vocabulary sizes.
mod = model(Tx, Ty, n_a, n_s, len(human_vocab), len(machine_vocab))

In [None]:
# Print the model summary.
mod.summary()

# Train The Model

In [None]:
# Adam optimizer with its hyper-parameters spelled out.
opt = Adam(
    lr=0.005,
    beta_1=0.9,
    beta_2=0.999,
    decay=0.01,
)

# Categorical cross-entropy loss, with accuracy reported as a metric.
mod.compile(
    loss='categorical_crossentropy',
    optimizer=opt,
    metrics=['accuracy'],
)

In [None]:
# Zero initial decoder states, one row per training sample.
num_train = len(trainSet)
s0 = np.zeros((num_train, n_s))
c0 = np.zeros((num_train, n_s))

# The model has a list of outputs, so swap the first two axes of Yoh and split
# along the new first axis into a list of target arrays.
outputs = list(Yoh.swapaxes(0, 1))

mod.fit([Xoh, s0, c0], outputs, epochs=20, batch_size=100)

# Testing the Model

In [None]:
# Collect the first element of every test pair.
# NOTE(review): index 0 is the human-readable source text in the evaluation
# loop, while the name suggests the target (index 1) — confirm intent.
expected = [pair[0] for pair in testSet]

In [None]:
# Evaluate on the test set one example at a time, recording a character-level
# BLEU score and exact-match counts.
bleu_scores = []
true = 0
false = 0

# predict() receives a single example per call, so the initial decoder states
# need one row as well (the training-time s0/c0 have one row per training
# sample, which does not match a batch of 1).
s0_single = np.zeros((1, n_s))
c0_single = np.zeros((1, n_s))

for example in testSet:
    source = string_to_int(example[0], Tx, human_vocab)
    source = to_categorical(source, num_classes=len(human_vocab))
    source = source[np.newaxis, ...]  # add the batch dimension

    # The model returns one array per output position; take the arg-max class
    # of each and flatten to a 1-D sequence of indices.
    prediction = mod.predict([source, s0_single, c0_single])
    predicted_indices = np.argmax(prediction, axis=-1).ravel()
    output = [inv_machine_vocab[int(i)] for i in predicted_indices]
    predicted_text = ''.join(output)

    # sentence_bleu expects (list of reference token lists, hypothesis token
    # list). Passing raw strings would treat each reference character as its
    # own one-token reference, so tokenise both sides into characters.
    bleu_score = sentence_bleu([list(example[1])], output)
    bleu_scores.append(bleu_score)

    print("source:", example[0])
    print("output:", predicted_text)
    print("expected:", example[1])
    if predicted_text == example[1]:
        true += 1
    else:
        false += 1
    print("bleu_score:", bleu_score)
    print("----")

In [None]:
# Mean of the per-example BLEU scores collected in the evaluation loop.
print("bleu_scores:", np.average(bleu_scores))

In [None]:
# Exact-match accuracy: share of test examples whose predicted string equalled
# the expected string (the `true` counter from the evaluation loop).
true / len(testSet)

In [None]:
# Character count of a long spelled-out sample date, for comparison with Tx.
len("چهار‌شنبه دی ماه ۲۲ هزار و سیصد و نود و چهار")