In [44]:
import tensorflow as tf
import pandas as pd
import numpy as np

In [84]:
# Load the data

df = pd.read_csv("data.csv")
input_dates = df['date_from']
output_dates = df['date_to']

# shuffle
df = df.sample(frac=1).reset_index(drop=True)

dataset_size = len(df)

In [86]:
# Helper functions and utilities

MONTH_MAP = {
  "January": "01",
  "February": "02",
  "March": "03",
  "April": "04",
  "May": "05",
  "June": "06",
  "July": "07",
  "August": "08",
  "September": "09",
  "October": "10",
  "November": "11",
  "December": "12",
}
MONTHS = MONTH_MAP.keys()

# All possible input chars
INPUT_CHARS = "".join(sorted(set("".join(MONTHS) + "0123456789, ")))

# All possible output chars
OUTPUT_CHARS = "0123456789-"

# Convert a date into char IDs
def date_str_to_ids(date_str, chars=INPUT_CHARS):
    return [chars.index(c) for c in date_str]

# Covert char ids into a date
def ids_to_date_strs(ids, chars=OUTPUT_CHARS):
    return ["".join([("?" + chars)[index] for index in sequence]) for sequence in ids]

print ("Example encoded input: ", date_str_to_ids(input_dates[0], INPUT_CHARS))
print ("Example encoded output: ", date_str_to_ids(output_dates[0], OUTPUT_CHARS))

# Convert all date strings into char id tensors
def prepare_date_strs(date_strs, chars=INPUT_CHARS):
    X_ids = [date_str_to_ids(dt, chars) for dt in date_strs]
    X = tf.ragged.constant(X_ids, ragged_rank=1)
    return (X + 1).to_tensor() # using 0 as the padding token ID

# Prepare entire dataset
def create_dataset(input_dates, output_dates):
    return prepare_date_strs(input_dates, INPUT_CHARS), prepare_date_strs(output_dates, OUTPUT_CHARS)

Example encoded input:  [15, 20, 29, 35, 20, 32, 37, 0, 3, 11, 1, 0, 5, 11, 9, 3]
Example encoded output:  [3, 9, 7, 1, 10, 0, 1, 10, 1, 9]


In [87]:
# Split to train and valid

dataset = create_dataset(input_dates, output_dates)
dataset_size = len(dataset)
train_size = int(0.8 * len(dataset[0]))

X_train, y_train = dataset[0][:train_size], dataset[1][:train_size]
X_valid, y_valid = dataset[0][train_size:], dataset[1][train_size:]

In [88]:
# Encoder-decoder version 1

embedding_size = 32
max_output_length = y_train.shape[1]

np.random.seed(42)
tf.random.set_seed(42)

encoder = tf.keras.Sequential([
    tf.keras.layers.Embedding(input_dim=len(INPUT_CHARS) + 1, output_dim=embedding_size, input_shape=[None]),
    tf.keras.layers.LSTM(128)
])

decoder = tf.keras.Sequential([
    tf.keras.layers.LSTM(128, return_sequences=True),
    tf.keras.layers.Dense(len(OUTPUT_CHARS) + 1, activation="softmax")
])

model = tf.keras.Sequential([
    encoder,
    tf.keras.layers.RepeatVector(max_output_length),
    decoder
])

optimizer = tf.keras.optimizers.Nadam()
model.compile(loss="sparse_categorical_crossentropy", optimizer=optimizer, metrics=["accuracy"])
history = model.fit(X_train, y_train, epochs=20, validation_data=(X_valid, y_valid))



Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [91]:
# Not sure what's the problem with this model. I did it almost exactly the same as the one in the book. 
# The only thing that may cause an issue is the data itself and its distribution.

test_date = "January 17, 1994"
test_date_encoded = date_str_to_ids(test_date, INPUT_CHARS)
char_ids_inference = model.predict([test_date_encoded]).argmax(axis=-1)
ids_to_date_strs(char_ids_inference)



['8888800874']