# RNN Date Generation Demo on Pytoch Lightning: Date Generation (One-to-Many)

In this demo, we will show you how to create a date generator using Pytoch Lightning. This demo is inspired by Andrew Ng's deeplearning.ai course on sequence models. In this demo, we create a one-to-many RNN model for generating date in the following format: e.g. "2002-03-11".  

In [1]:
import csv
import numpy as np
import random
import math
import sys

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
import torch.optim as optim
!pip install lightning
import lightning as L
from lightning import Trainer


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.2[0m[39;49m -> [0m[32;49m24.3.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


# Generate Dataset
We generate a toy dataset using datetime library.  The target output only comes in one format (iso format).

In [2]:
#Generating a toy dataset
import datetime
base = datetime.datetime.today()
base = datetime.date(base.year, base.month, base.day)
date_list = [base - datetime.timedelta(days=x) for x in range(0, 1500)]
data = [date.isoformat() for date in date_list]
print(data[:5])
maxlen=10 #all the seqeunces have 10 characters

['2025-01-20', '2025-01-19', '2025-01-18', '2025-01-17', '2025-01-16']


In [3]:
chars = list(set(''.join(data)))
data_size, vocab_size = len(data), len(chars)
print('There are %d lines and %d unique characters in your data.' % (data_size, vocab_size))
print("max length =",maxlen)
sorted_chars= sorted(chars)
print(sorted_chars)

There are 1500 lines and 11 unique characters in your data.
max length = 10
['-', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9']


In [4]:
# In this demo, we will use "<S>" as a seed character to initiate the sequence
sorted_chars.insert(0,"<S>")
vocab_size = len(sorted_chars)

print(f"All Characters: {sorted_chars}")
print(f"Vocab Size: {vocab_size}")

All Characters: ['<S>', '-', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9']
Vocab Size: 12


In [5]:
# Quick implementation of character tokenizer
# create a mapping from characters to integers
stoi = { ch:i for i,ch in enumerate(sorted_chars) }
itos = { i:ch for i,ch in enumerate(sorted_chars) }
encode = lambda s: [stoi[c] for c in s] # encoder: take a string, output a list of integers
decode = lambda l: ''.join([itos[i] for i in l]) # decoder: take a list of integers, output a string

print(encode("2024-10-26"))
print(decode(encode("2024-10-26")))

[4, 2, 4, 6, 1, 3, 2, 1, 4, 8]
2024-10-26


In [6]:
itos

{0: '<S>',
 1: '-',
 2: '0',
 3: '1',
 4: '2',
 5: '3',
 6: '4',
 7: '5',
 8: '6',
 9: '7',
 10: '8',
 11: '9'}

In [7]:
stoi

{'<S>': 0,
 '-': 1,
 '0': 2,
 '1': 3,
 '2': 4,
 '3': 5,
 '4': 6,
 '5': 7,
 '6': 8,
 '7': 9,
 '8': 10,
 '9': 11}

# Preprocessing data

In [8]:
#Encoding data
encoded = []
for line in data:
    line = [l for l in line] #change from string to list
    indices = encode(line)
    encoded.append(indices)

In [9]:
class DateDataset(Dataset):
  def __init__(self, data):
    data = [[0] + d for d in data] # add <s> at the start of every data point
    self.encoded = torch.LongTensor(data)

  def __getitem__(self, idx):
    return self.encoded[idx]

  def __len__(self):
    return len(self.encoded)

In [10]:
class DateDataModule(L.LightningDataModule):

  def __init__(self, train_data, batch_size, num_workers=0):
      super().__init__()
      self.train_data = train_data
      self.batch_size = batch_size
      self.num_workers = num_workers


  def setup(self, stage: str):
    pass

  def collate_fn(self, batch):
      one_hot_x = torch.stack([F.one_hot(b, num_classes=vocab_size) for b in batch])
      return {"x": one_hot_x.float(), "y": torch.stack(batch)}

  def train_dataloader(self):
      train_dataset = DateDataset(self.train_data)
      train_loader = DataLoader(train_dataset,
                                batch_size = self.batch_size,
                                shuffle = True,
                                collate_fn = self.collate_fn,
                                num_workers = self.num_workers)

      return train_loader

In [11]:
batch_size = 16
data_module = DateDataModule(encoded, batch_size=batch_size,num_workers=0)

# Create & train model


In [12]:
class SimpleRNN(L.LightningModule):
    def __init__(self, vocab_size, learning_rate, criterion):

        super().__init__()
        self.hidden_dim = 16
        self.vocab_size = vocab_size
        # rnn cell = single RNN cell
        self.rnn = nn.RNNCell(self.vocab_size, self.hidden_dim) # vocab_size is the input size (embedding dim)

        self.fc = nn.Linear(self.hidden_dim, self.vocab_size)
        self.learning_rate = learning_rate
        self.criterion = criterion


    def forward(self, src, hx):
        hx = self.rnn(src, hx)
        prediction_logit = self.fc(hx)
        return prediction_logit, hx

    def training_step(self, batch, batch_idx):
        src = batch['x'][:, :-1]
        target = batch['y'][:, 1:]
        temp = []
        hx = torch.randn(src.shape[0], self.hidden_dim).to(self.rnn.weight_ih.device)
        prediction = torch.zeros((src.shape[0], src.shape[1], self.vocab_size) ,device=hx.device)

        for i in range(src.shape[1]):
          prediction_logit, hx = self(src[:,i], hx) # forward(current_char, hidden_state)
          prediction[:, i, :] = prediction_logit

        prediction = prediction.reshape(-1, vocab_size)
        target = target.reshape(-1)
        loss = self.criterion(prediction, target)
        self.log("train_loss", loss)
        return loss

    def configure_optimizers(self):
        return optim.Adam(self.parameters(), lr=self.learning_rate)

In [13]:
criterion = nn.CrossEntropyLoss()
vocab_size = vocab_size
lr = 0.005
model = SimpleRNN(vocab_size, lr, criterion)

In [14]:
def generate(model):
  model.eval()
  with torch.no_grad():
    output_list = []
    input = F.one_hot(torch.zeros([1], dtype=torch.long), num_classes=vocab_size)
    input = input.float()
    input = input.to(model.device)
    hx = torch.randn(input.shape[0], 16).to(model.device)
    for i in range(10):
      logit, hx = model(input, hx)
      prob = F.softmax(logit, dim=-1)
      pred = torch.multinomial(prob, 1)
      output = pred.item()
      output_list.append(output)

      input = F.one_hot(torch.tensor([output], dtype=torch.long), num_classes=vocab_size) # use the predicted character as the next input
      input = input.float()
      input = input.to(model.device)
  return decode(output_list)

In [15]:
class PrintCallback(L.pytorch.callbacks.Callback):
  def __init__(self, what="epochs", verbose=True):
        self.what = what
        self.verbose = verbose
        self.state = {"epochs": 0, "batches": 0}

  def on_train_epoch_end(self, *args, **kwargs):
        if self.what == "epochs":
            self.state["epochs"] += 1
        if self.state["epochs"] % 2 == 0:
            print('----- Generating text after Epoch: %d' % self.state["epochs"])
            for i in range(3):
              print(generate(model))


In [16]:
trainer = Trainer(
    max_epochs=10,
    callbacks=[PrintCallback()]
)

GPU available: True (mps), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
/Users/idhibhatpankam/Code/courses/NLP-SYS/.venv/lib/python3.12/site-packages/lightning/pytorch/trainer/connectors/logger_connector/logger_connector.py:76: Starting from v1.9.0, `tensorboardX` has been removed as a dependency of the `lightning.pytorch` package, due to potential conflicts with other packages in the ML ecosystem. For this reason, `logger=True` will use `CSVLogger` as the default logger, unless the `tensorboard` or `tensorboardX` packages are found. Please `pip install lightning[extra]` or one of them to enable TensorBoard support by default


# Let's train the model and generate some text

In [17]:
for i in range(3): #before training
  print(generate(model))

940-584-20
7341<S><S>6367
3032-0-542


In [18]:
trainer.fit(model, data_module)


  | Name      | Type             | Params | Mode
------------------------------------------------------
0 | rnn       | RNNCell          | 480    | eval
1 | fc        | Linear           | 204    | eval
2 | criterion | CrossEntropyLoss | 0      | eval
------------------------------------------------------
684       Trainable params
0         Non-trainable params
684       Total params
0.003     Total estimated model params size (MB)
0         Modules in train mode
3         Modules in eval mode
/Users/idhibhatpankam/Code/courses/NLP-SYS/.venv/lib/python3.12/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:425: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=7` in the `DataLoader` to improve performance.


Epoch 1: 100%|██████████| 94/94 [00:01<00:00, 55.25it/s, v_num=0]----- Generating text after Epoch: 2
2021-10-21
2024-17-07
2021-32-10
Epoch 3: 100%|██████████| 94/94 [00:01<00:00, 56.57it/s, v_num=0]----- Generating text after Epoch: 4
2021-01-18
2022-03-06
2023-05-08
Epoch 5: 100%|██████████| 94/94 [00:01<00:00, 57.53it/s, v_num=0]----- Generating text after Epoch: 6
2023-10-01
2023-01-27
2021-09-11
Epoch 7: 100%|██████████| 94/94 [00:01<00:00, 54.46it/s, v_num=0]----- Generating text after Epoch: 8
2024-12-27
2022-06-17
2022-06-28
Epoch 9: 100%|██████████| 94/94 [00:01<00:00, 58.78it/s, v_num=0]----- Generating text after Epoch: 10
2024-07-26
2022-04-19
2022-04-29
Epoch 9: 100%|██████████| 94/94 [00:01<00:00, 55.44it/s, v_num=0]

`Trainer.fit` stopped: `max_epochs=10` reached.


Epoch 9: 100%|██████████| 94/94 [00:01<00:00, 55.17it/s, v_num=0]


In [19]:
for i in range(10):
  print(generate(model))

2024-02-16
2022-04-15
2024-01-08
2023-11-12
2023-12-26
2022-11-13
2021-05-02
2023-06-09
2022-10-12
2024-03-09
