In [None]:
!pip3 install ncps transformers datasets pytorch-lightning

In [None]:
import torch
from torch.utils.data import DataLoader
from torch import nn
import pytorch_lightning as pl
from ncps.torch import CfC
from ncps.wirings import AutoNCP, Random
import transformers
import datasets
from transformers import BertTokenizer, BertTokenizerFast
from datasets import load_dataset
import torch.nn.functional as F

### Load the dataset

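The models are trained on a subset of the WMT14 German-to-English translation dataset (German sentences are the input, English sentences the target). You can change the number of training iterations, which determines how much of the data is seen, and the number of epochs.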

In [None]:
# Cap on the number of optimizer steps per training run (used as the Trainer's `max_steps`).
num_iterations = 100 #@param {type:"integer"}
# Intended cap on the number of training epochs.
num_epochs = 10 #@param {type:"integer"}

In [None]:
# Fetch the German-English configuration of WMT14 (train / validation / test splits).
ds = load_dataset(path='wmt14', name='de-en')

Downloading and preparing dataset wmt14/de-en to /root/.cache/huggingface/datasets/wmt14/de-en/1.0.0/2de185b074515e97618524d69f5e27ee7545dcbed4aa9bc1a4235710ffca33f4...


Downloading data files:   0%|          | 0/5 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/658M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/919M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/80.5M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/38.7M [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/5 [00:00<?, ?it/s]

Extracting data files: 0it [00:00, ?it/s]

Generating train split:   0%|          | 0/4508785 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/3000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/3003 [00:00<?, ? examples/s]

Dataset wmt14 downloaded and prepared to /root/.cache/huggingface/datasets/wmt14/de-en/1.0.0/2de185b074515e97618524d69f5e27ee7545dcbed4aa9bc1a4235710ffca33f4. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

In [None]:
# Show the splits of the loaded DatasetDict together with their features and row counts.
print(ds)

DatasetDict({
    train: Dataset({
        features: ['translation'],
        num_rows: 4508785
    })
    validation: Dataset({
        features: ['translation'],
        num_rows: 3000
    })
    test: Dataset({
        features: ['translation'],
        num_rows: 3003
    })
})


In [None]:
# Wrap each split in a DataLoader that yields batches of 32 translation pairs.
train_loader, valid_loader, test_loader = (
    DataLoader(ds[split_name], batch_size=32)
    for split_name in ('train', 'validation', 'test')
)

In [None]:
# Print the dataset summary again.
print(ds)

### Closed-form Continuous Networks for Machine Translation
We experiment with using CfCs as the bottleneck layer of a CLIP encoder-decoder architecture.

In [None]:
# Load the WMT14 German-English dataset; this re-binds `ds` with the same
# dataset name and configuration used by the earlier loading cell.
ds = datasets.load_dataset(path='wmt14', name='de-en')

Downloading builder script:   0%|          | 0.00/2.97k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/15.3k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/9.37k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/41.2k [00:00<?, ?B/s]

Downloading and preparing dataset wmt14/de-en to /root/.cache/huggingface/datasets/wmt14/de-en/1.0.0/2de185b074515e97618524d69f5e27ee7545dcbed4aa9bc1a4235710ffca33f4...


Downloading data files:   0%|          | 0/5 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/658M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/919M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/80.5M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/38.7M [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/5 [00:00<?, ?it/s]

Extracting data files: 0it [00:00, ?it/s]

Generating train split:   0%|          | 0/4508785 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/3000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/3003 [00:00<?, ? examples/s]

Dataset wmt14 downloaded and prepared to /root/.cache/huggingface/datasets/wmt14/de-en/1.0.0/2de185b074515e97618524d69f5e27ee7545dcbed4aa9bc1a4235710ffca33f4. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

In [None]:
from transformers import BertTokenizer
# LightningModule for training a RNNSequence module

class LiquidRNN(pl.LightningModule):
  """Embedding -> CfC -> MLP decoder, trained to predict English token ids from German ones.

  Args:
    cfc: Recurrent module called as ``cfc(x)`` that returns an
      ``(output, state)`` pair. Its output feature size must equal ``d_model``
      because the output is fed into a ``Linear(d_model, d_model)`` layer.
    d_model: Width of the token embeddings and of the decoder's hidden layer.
    sample: Unused; kept so existing call sites keep working.
  """

  def __init__(self, cfc, d_model, sample=False):
    super().__init__()
    # Number of rows in the embedding table and number of output logits.
    # NOTE(review): the tokenizer below is bert-base-uncased; confirm that
    # 50257 is the intended vocabulary size for it.
    embedding_size = 50257
    self.embedding = torch.nn.Embedding(embedding_size, d_model)
    self.cfc = cfc
    self.decoder = nn.Sequential(
         torch.nn.Linear(d_model, d_model),
         torch.nn.LeakyReLU(),
         torch.nn.Linear(d_model, embedding_size),
    )
    self.embedding_size = embedding_size
    self.tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased")

  def forward(self, x):
    """Map token ids to per-position logits.

    Args:
      x: Integer tensor of token ids with shape (batch, seq_len).

    Returns:
      Tensor of logits with shape (batch, seq_len, embedding_size).
    """
    encoded_input = self.embedding(x)
    encoded_output, _ = self.cfc(encoded_input)
    # Decode the recurrent layer's output rather than the raw embeddings, so
    # the CfC actually contributes to the prediction and receives gradients.
    # nn.Linear acts on the last dimension, so no flatten/unflatten is needed.
    return self.decoder(encoded_output)

  def _encode(self, sentences):
    """Tokenize a list of strings into a (batch, 256) id tensor on this module's device."""
    encoded = self.tokenizer(sentences, padding="max_length", max_length=256, truncation='longest_first', return_tensors='pt')
    # Follow the module's device instead of hard-coding CUDA so CPU runs work too.
    return encoded['input_ids'].to(self.device)

  def training_step(self, batch, batch_idx):
    """Compute the token-level cross-entropy between predictions for the German
    input and the English target ids at the same positions.

    Args:
      batch: Mapping with ``batch['translation']['de']`` and
        ``batch['translation']['en']`` holding lists of sentences.
      batch_idx: Index of the batch (unused).

    Returns:
      Dict with the scalar training loss under the key ``'loss'``.
    """
    x = self._encode(batch['translation']['de'])
    y = self._encode(batch['translation']['en'])
    y_hat = self(x)

    # Flatten (batch, seq_len, vocab) -> (batch * seq_len, vocab) for cross_entropy.
    # NOTE(review): padding positions are included in the loss; consider
    # `ignore_index=self.tokenizer.pad_token_id` if that is not intended.
    loss = F.cross_entropy(y_hat.reshape(-1, y_hat.size(-1)), y.reshape(-1))

    # Log through Lightning (progress bar + logger) instead of printing every
    # step and returning the legacy 'progress_bar' / 'log' dictionary keys.
    self.log('train_loss', loss, prog_bar=True)
    return {'loss': loss}

  def configure_optimizers(self):
    """Return the AdamW optimizer used for all parameters."""
    return torch.optim.AdamW(self.parameters(), lr=3e-4, weight_decay=0.3)

In [None]:
# Stop after `num_iterations` optimizer steps or `num_epochs` epochs, whichever
# comes first (both come from the configuration cell instead of being hard-coded).
trainer = pl.Trainer(max_epochs=num_epochs, max_steps=num_iterations)
#wiring = Random(128, 128)
# CfC with 128 input features and 128 units, matching d_model=128 below.
model = CfC(128, 128)
liquid_rnn = LiquidRNN(model, 128)

INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:IPU available: False, using: 0 IPUs
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs


In [None]:
# Train the CfC-based model on the training DataLoader.
trainer.fit(liquid_rnn, train_loader)

INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO:pytorch_lightning.callbacks.model_summary:
  | Name      | Type       | Params
-----------------------------------------
0 | embedding | Embedding  | 6.4 M 
1 | cfc       | CfC        | 98.9 K
2 | decoder   | Sequential | 6.5 M 
-----------------------------------------
13.0 M    Trainable params
0         Non-trainable params
13.0 M    Total params
52.126    Total estimated model params size (MB)
  rank_zero_warn(


Training: 0it [00:00, ?it/s]

tensor(10.7195, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(10.6299, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(10.5692, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(10.4697, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(10.3779, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(10.3721, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(10.2398, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(10.1871, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(10.0934, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(10.0351, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(9.9690, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(9.9606, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(9.8021, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(9.7498, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(9.6033, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(9.6217, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(9.5127, device='cuda:0'

INFO:pytorch_lightning.utilities.rank_zero:`Trainer.fit` stopped: `max_steps=100` reached.


### RNN Baseline

In [None]:
from torch.nn import RNN
# LightningModule for training an RNN module

class RNNModel(pl.LightningModule):
  """Embedding -> RNN -> MLP decoder, trained to predict English token ids from German ones.

  Args:
    rnn: Recurrent module called as ``rnn(x)`` that returns an
      ``(output, state)`` pair. Its output feature size must equal ``d_model``.
      ``forward`` feeds it (batch, seq_len, d_model) tensors, so a
      ``torch.nn.RNN`` should be constructed with ``batch_first=True``.
    d_model: Width of the token embeddings and of the decoder's hidden layer.
    sample: Unused; kept so existing call sites keep working.
  """

  def __init__(self, rnn, d_model, sample=False):
    super().__init__()
    # Number of rows in the embedding table and number of output logits.
    # NOTE(review): the tokenizer below is bert-base-uncased; confirm that
    # 50257 is the intended vocabulary size for it.
    embedding_size = 50257
    self.embedding = torch.nn.Embedding(embedding_size, d_model)
    self.rnn = rnn
    self.decoder = nn.Sequential(
         torch.nn.Linear(d_model, d_model),
         torch.nn.LeakyReLU(),
         torch.nn.Linear(d_model, embedding_size),
    )
    self.embedding_size = embedding_size
    self.tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased")

  def forward(self, x):
    """Map token ids to per-position logits.

    Args:
      x: Integer tensor of token ids with shape (batch, seq_len).

    Returns:
      Tensor of logits with shape (batch, seq_len, embedding_size).
    """
    encoded_input = self.embedding(x)
    encoded_output, _ = self.rnn(encoded_input)
    # Decode the recurrent layer's output rather than the raw embeddings, so
    # the RNN actually contributes to the prediction and receives gradients.
    # nn.Linear acts on the last dimension, so no flatten/unflatten is needed.
    return self.decoder(encoded_output)

  def _encode(self, sentences):
    """Tokenize a list of strings into a (batch, 256) id tensor on this module's device."""
    encoded = self.tokenizer(sentences, padding="max_length", max_length=256, truncation='longest_first', return_tensors='pt')
    # Follow the module's device instead of hard-coding CUDA so CPU runs work too.
    return encoded['input_ids'].to(self.device)

  def training_step(self, batch, batch_idx):
    """Compute the token-level cross-entropy between predictions for the German
    input and the English target ids at the same positions.

    Args:
      batch: Mapping with ``batch['translation']['de']`` and
        ``batch['translation']['en']`` holding lists of sentences.
      batch_idx: Index of the batch (unused).

    Returns:
      Dict with the scalar training loss under the key ``'loss'``.
    """
    x = self._encode(batch['translation']['de'])
    y = self._encode(batch['translation']['en'])
    y_hat = self(x)

    # Flatten (batch, seq_len, vocab) -> (batch * seq_len, vocab) for cross_entropy.
    # NOTE(review): padding positions are included in the loss; consider
    # `ignore_index=self.tokenizer.pad_token_id` if that is not intended.
    loss = F.cross_entropy(y_hat.reshape(-1, y_hat.size(-1)), y.reshape(-1))

    # Log through Lightning (progress bar + logger) instead of printing every
    # step and returning the legacy 'progress_bar' / 'log' dictionary keys.
    self.log('train_loss', loss, prog_bar=True)
    return {'loss': loss}

  def configure_optimizers(self):
    """Return the AdamW optimizer used for all parameters."""
    return torch.optim.AdamW(self.parameters(), lr=3e-4, weight_decay=0.3)

In [None]:
# Same stopping criteria as the CfC run: `num_iterations` steps or `num_epochs`
# epochs, whichever comes first.
trainer = pl.Trainer(max_epochs=num_epochs, max_steps=num_iterations)
#wiring = Random(128, 128)
# The model feeds (batch, seq_len, features) tensors to the recurrent layer;
# torch.nn.RNN defaults to (seq_len, batch, features), so request batch_first.
model = RNN(128, 128, batch_first=True)
rnn = RNNModel(model, 128)

INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:IPU available: False, using: 0 IPUs
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs


In [None]:
# Train the RNN baseline on the training DataLoader.
trainer.fit(rnn, train_loader)

### LSTM Baseline

In [None]:
from torch.nn import LSTM
# LightningModule for training a LSTM module

class LSTMModel(pl.LightningModule):
  """Embedding -> LSTM -> MLP decoder, trained to predict English token ids from German ones.

  Args:
    lstm: Recurrent module called as ``lstm(x)`` that returns an
      ``(output, state)`` pair. Its output feature size must equal ``d_model``.
      ``forward`` feeds it (batch, seq_len, d_model) tensors, so a
      ``torch.nn.LSTM`` should be constructed with ``batch_first=True``.
    d_model: Width of the token embeddings and of the decoder's hidden layer.
    sample: Unused; kept so existing call sites keep working.
  """

  def __init__(self, lstm, d_model, sample=False):
    super().__init__()
    # Number of rows in the embedding table and number of output logits.
    # NOTE(review): the tokenizer below is bert-base-uncased; confirm that
    # 50257 is the intended vocabulary size for it.
    embedding_size = 50257
    self.embedding = torch.nn.Embedding(embedding_size, d_model)
    self.lstm = lstm
    self.decoder = nn.Sequential(
         torch.nn.Linear(d_model, d_model),
         torch.nn.LeakyReLU(),
         torch.nn.Linear(d_model, embedding_size),
    )
    self.embedding_size = embedding_size
    self.tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased")

  def forward(self, x):
    """Map token ids to per-position logits.

    Args:
      x: Integer tensor of token ids with shape (batch, seq_len).

    Returns:
      Tensor of logits with shape (batch, seq_len, embedding_size).
    """
    encoded_input = self.embedding(x)
    encoded_output, _ = self.lstm(encoded_input)
    # Decode the recurrent layer's output rather than the raw embeddings, so
    # the LSTM actually contributes to the prediction and receives gradients.
    # nn.Linear acts on the last dimension, so no flatten/unflatten is needed.
    return self.decoder(encoded_output)

  def _encode(self, sentences):
    """Tokenize a list of strings into a (batch, 256) id tensor on this module's device."""
    encoded = self.tokenizer(sentences, padding="max_length", max_length=256, truncation='longest_first', return_tensors='pt')
    # Follow the module's device instead of hard-coding CUDA so CPU runs work too.
    return encoded['input_ids'].to(self.device)

  def training_step(self, batch, batch_idx):
    """Compute the token-level cross-entropy between predictions for the German
    input and the English target ids at the same positions.

    Args:
      batch: Mapping with ``batch['translation']['de']`` and
        ``batch['translation']['en']`` holding lists of sentences.
      batch_idx: Index of the batch (unused).

    Returns:
      Dict with the scalar training loss under the key ``'loss'``.
    """
    x = self._encode(batch['translation']['de'])
    y = self._encode(batch['translation']['en'])
    y_hat = self(x)

    # Flatten (batch, seq_len, vocab) -> (batch * seq_len, vocab) for cross_entropy.
    # NOTE(review): padding positions are included in the loss; consider
    # `ignore_index=self.tokenizer.pad_token_id` if that is not intended.
    loss = F.cross_entropy(y_hat.reshape(-1, y_hat.size(-1)), y.reshape(-1))

    # Log through Lightning (progress bar + logger) instead of printing every
    # step and returning the legacy 'progress_bar' / 'log' dictionary keys.
    self.log('train_loss', loss, prog_bar=True)
    return {'loss': loss}

  def configure_optimizers(self):
    """Return the AdamW optimizer used for all parameters."""
    return torch.optim.AdamW(self.parameters(), lr=3e-4, weight_decay=0.3)

In [None]:
# Use the same step/epoch budget as the other models; without `max_steps` the
# LSTM run would iterate over the full training split for every epoch.
trainer = pl.Trainer(max_epochs=num_epochs, max_steps=num_iterations)
#wiring = Random(128, 128)
# The model feeds (batch, seq_len, features) tensors to the recurrent layer;
# torch.nn.LSTM defaults to (seq_len, batch, features), so request batch_first.
model = LSTM(128, 128, batch_first=True)
rnn = LSTMModel(model, 128)

INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:IPU available: False, using: 0 IPUs
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs


In [None]:
# Train the LSTM baseline (bound to `rnn` in the cell above) on the training DataLoader.
trainer.fit(rnn, train_loader)

INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO:pytorch_lightning.callbacks.model_summary:
  | Name      | Type       | Params
-----------------------------------------
0 | embedding | Embedding  | 6.4 M 
1 | lstm      | LSTM       | 132 K 
2 | decoder   | Sequential | 6.5 M 
-----------------------------------------
13.1 M    Trainable params
0         Non-trainable params
13.1 M    Total params
52.259    Total estimated model params size (MB)


Training: 0it [00:00, ?it/s]

tensor(11.0781, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(10.9941, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(10.9102, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(10.8163, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(10.7265, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(10.6856, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(10.5814, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(10.5154, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(10.4283, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(10.3654, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(10.3108, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(10.2911, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(10.1573, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(10.1103, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(9.9809, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(9.9884, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(9.8930, device='cud

  rank_zero_warn("Detected KeyboardInterrupt, attempting graceful shutdown...")


### Comparison (Loss curve, accuracy, time per iteration)

In [None]:
# TODO: compare the training loss curves of the CfC, RNN and LSTM models

In [None]:
# TODO: compare the time per training iteration of the CfC, RNN and LSTM models

In [None]:
# 