In [None]:
!pip3 install ncps transformers datasets pytorch-lightning

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting ncps
  Downloading ncps-0.0.7-py3-none-any.whl (44 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.8/44.8 kB[0m [31m1.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting transformers
  Downloading transformers-4.28.1-py3-none-any.whl (7.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.0/7.0 MB[0m [31m80.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting datasets
  Downloading datasets-2.11.0-py3-none-any.whl (468 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m468.7/468.7 kB[0m [31m26.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting pytorch-lightning
  Downloading pytorch_lightning-2.0.2-py3-none-any.whl (719 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m719.0/719.0 kB[0m [31m32.9 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.11.0
  Downloading huggingface_hub-0.14.1-py3-no

In [None]:
import torch
from torch.utils.data import DataLoader
from torch import nn
import pytorch_lightning as pl
from pytorch_lightning.loggers import TensorBoardLogger
from ncps.torch import CfC
from ncps.wirings import AutoNCP, Random
import transformers
import datasets
from transformers import BertTokenizer, BertTokenizerFast
from datasets import load_dataset
import torch.nn.functional as F
import time
# Load the TensorBoard notebook extension
%load_ext tensorboard

### Load the dataset

The models are trained on a subset of the WMT14 German–English translation dataset (German sentences are the input, English sentences are the target). You can change the subset size and the number of epochs.

In [None]:
# Colab form parameters (the `#@param` annotations render these as editable form fields).
# NOTE(review): num_iterations is not referenced by any other cell visible in this notebook — confirm it is still needed.
num_iterations = 100 #@param {type:"integer"}
# Number of training epochs; passed to pl.Trainer(max_epochs=...) further down.
num_epochs = 5 #@param {type:"integer"}

In [None]:
# Load the 'de-en' configuration of WMT14; each row has a single 'translation' feature.
ds = load_dataset('wmt14', 'de-en')

Downloading builder script:   0%|          | 0.00/2.97k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/15.3k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/9.37k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/41.2k [00:00<?, ?B/s]

Downloading and preparing dataset wmt14/de-en to /root/.cache/huggingface/datasets/wmt14/de-en/1.0.0/2de185b074515e97618524d69f5e27ee7545dcbed4aa9bc1a4235710ffca33f4...


Downloading data files:   0%|          | 0/5 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/658M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/919M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/80.5M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/38.7M [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/5 [00:00<?, ?it/s]

Extracting data files: 0it [00:00, ?it/s]

Generating train split:   0%|          | 0/4508785 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/3000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/3003 [00:00<?, ? examples/s]

Dataset wmt14 downloaded and prepared to /root/.cache/huggingface/datasets/wmt14/de-en/1.0.0/2de185b074515e97618524d69f5e27ee7545dcbed4aa9bc1a4235710ffca33f4. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

In [None]:
# Show the available splits, their features and row counts.
print(ds)

DatasetDict({
    train: Dataset({
        features: ['translation'],
        num_rows: 4508785
    })
    validation: Dataset({
        features: ['translation'],
        num_rows: 3000
    })
    test: Dataset({
        features: ['translation'],
        num_rows: 3003
    })
})


In [None]:
# Build the train/validation dataloaders.
# Number of leading training rows to use; lower it for faster experiments.
TRAIN_SUBSET_SIZE = 120000

# Dataset.select() returns a view over the chosen rows, so we avoid copying
# 120k examples one index at a time into a Python list. The size is clamped so
# the cell also works if the subset size exceeds the split length.
small_ds = ds['train'].select(range(min(TRAIN_SUBSET_SIZE, len(ds['train']))))

# NOTE(review): the training loader does not shuffle — confirm that is intended.
train_loader = DataLoader(small_ds, batch_size=32, num_workers=4)
valid_loader = DataLoader(ds['validation'], batch_size=32, num_workers=4) #3000 examples

### Closed-form Continuous Networks for Machine Translation
We experiment with using CfCs as the bottleneck layer of a simple encoder-decoder architecture: a token-embedding encoder, a CfC recurrent layer, and an MLP decoder.

In [None]:
from transformers import BertTokenizer
# LightningModule for training a CfC-based sequence model

class LiquidRNN(pl.LightningModule):
  """Lightning wrapper around a CfC layer: token embedding -> CfC -> MLP decoder.

  Each batch is expected to look like ``{'translation': {'de': [...], 'en': [...]}}``.
  The German sentences are tokenized into the model input and the English
  sentences into per-position targets; both are padded/truncated to 256 tokens
  so that logits and targets line up position by position.

  Args:
    cfc: recurrent module called as ``cfc(x)`` that returns ``(output, state)``.
      It receives batch-first embedded tokens, and its output is fed to the
      decoder, so its output feature size must equal ``d_model``.
    d_model: embedding width and decoder hidden width.
    sample: accepted for backward compatibility; not used.
  """

  def __init__(self, cfc, d_model, sample=False):
    super(LiquidRNN, self).__init__()
    # NOTE(review): hard-coded vocabulary size; it is not derived from the
    # tokenizer loaded below — confirm it matches tokenizer.vocab_size.
    embedding_size = 50257
    self.embedding = torch.nn.Embedding(embedding_size, d_model)
    self.cfc = cfc
    self.decoder = nn.Sequential(
         torch.nn.Linear(d_model, d_model),
         torch.nn.LeakyReLU(),
         torch.nn.Linear(d_model, embedding_size),
    )
    self.embedding_size = embedding_size
    self.tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased")
    # Per-step training losses, averaged and cleared at the end of every epoch.
    self._train_step_outputs = []

  def forward(self, x):
    """Map token ids of shape (batch, seq) to logits of shape (batch, seq, embedding_size)."""
    encoded_input = self.embedding(x)
    encoded_output, _ = self.cfc(encoded_input)
    # Decode the recurrent layer's OUTPUT. Decoding `encoded_input` instead would
    # bypass the CfC entirely, so it would never influence the loss.
    # nn.Linear acts on the last dimension, so no flattening is needed.
    return self.decoder(encoded_output)

  def _tokenize(self, sentences):
    """Tokenize a list of sentences into fixed-length (256) token ids on this module's device."""
    encoded = self.tokenizer(sentences, padding="max_length", max_length=256, truncation='longest_first', return_tensors='pt')
    # Use self.device rather than .cuda() so the module also runs on CPU or other accelerators.
    return encoded['input_ids'].to(self.device)

  def _shared_step(self, batch):
    """Run tokenization + forward + loss for one batch.

    Returns:
      (loss, elapsed_seconds, batch_size) where elapsed_seconds covers
      tokenization, the forward pass and the loss computation.
    """
    t1 = time.perf_counter()

    x = self._tokenize(batch['translation']['de'])
    y = self._tokenize(batch['translation']['en'])
    y_hat = self(x)

    # Flatten (batch, seq, vocab) logits and (batch, seq) targets for cross-entropy.
    # NOTE(review): padding positions are included in the loss — confirm that is intended.
    loss = F.cross_entropy(y_hat.reshape(-1, y_hat.size(-1)), y.reshape(-1))

    t2 = time.perf_counter()
    return loss, t2 - t1, x.size(0)

  def on_train_epoch_end(self):
    """Lightning hook: log the mean training loss of the epoch that just finished."""
    if self._train_step_outputs:
      self.on_training_epoch_end(self._train_step_outputs)
      # Free the stored losses so they do not accumulate across epochs.
      self._train_step_outputs.clear()

  def on_training_epoch_end(self, outputs):
    """Average the ``'loss'`` entries of ``outputs`` and write them to TensorBoard as "Loss/Train".

    This name is not a Lightning hook, so Lightning never calls it directly;
    it is invoked from ``on_train_epoch_end`` with the collected step outputs.

    Returns:
      ``{'loss': avg_loss}``.
    """
    avg_loss = torch.stack([x['loss'] for x in outputs]).mean()
    if self.logger is not None:
      self.logger.experiment.add_scalar("Loss/Train",avg_loss,self.current_epoch)
    epoch_dictionary={'loss': avg_loss}
    return epoch_dictionary

  def training_step(self, batch, batch_idx):
    """Compute and log the training loss and step runtime for one batch."""
    loss, runtime, batch_size = self._shared_step(batch)
    self.log('train_loss', loss, batch_size=batch_size)
    self.log('train_runtime', runtime, batch_size=batch_size)
    # Keep a detached copy so the epoch average does not hold on to the autograd graph.
    self._train_step_outputs.append({ 'loss': loss.detach() })
    return { 'loss': loss }

  def validation_step(self, batch, batch_idx):
    """Compute and log the validation loss and step runtime for one batch."""
    loss, runtime, batch_size = self._shared_step(batch)
    # Pass the real batch size: the last batch can be smaller than the loader's batch_size,
    # and Lightning weights the epoch average by this value.
    self.log('valid_loss', loss, batch_size=batch_size)
    self.log('valid_runtime', runtime, batch_size=batch_size)
    return { 'loss': loss }

  def configure_optimizers(self):
    """Return the AdamW optimizer used for all parameters."""
    return torch.optim.AdamW(self.parameters(), lr=3e-4, weight_decay=0.3)

In [None]:
# Run a full garbage-collection pass; gc.collect() returns the number of unreachable objects it found.
import gc
gc.collect()

0

In [None]:
# TensorBoard logger and trainer for the CfC ("liquid") run.
cfc_logger = TensorBoardLogger(
    save_dir='/content',
    name="cfc",
    log_graph=False,
    default_hp_metric=False,
    flush_secs=5,
)
trainer = pl.Trainer(
    default_root_dir='/content',
    max_epochs=num_epochs,
    logger=cfc_logger,
    log_every_n_steps=1,
)
# CfC layer with 128 input features and 128 units, wrapped by the Lightning module (d_model=128).
model = CfC(128, 128)
liquid_rnn = LiquidRNN(model, 128)

INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:IPU available: False, using: 0 IPUs
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs


In [None]:
# Wall-clock the whole CfC run (everything trainer.fit does: sanity check, training, validation).
t1_liquid = time.perf_counter()
trainer.fit(liquid_rnn, train_loader, valid_loader)
t2_liquid = time.perf_counter()
time_liquid = t2_liquid - t1_liquid
# time.perf_counter() measures fractional seconds, so the value is labelled in seconds.
print(f'total time for liquid: {time_liquid:.2f} s')

INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO:pytorch_lightning.callbacks.model_summary:
  | Name      | Type       | Params
-----------------------------------------
0 | embedding | Embedding  | 6.4 M 
1 | cfc       | CfC        | 98.9 K
2 | decoder   | Sequential | 6.5 M 
-----------------------------------------
13.0 M    Trainable params
0         Non-trainable params
13.0 M    Total params
52.126    Total estimated model params size (MB)


Sanity Checking: 0it [00:00, ?it/s]

Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

### RNN Baseline

In [None]:
from transformers import BertTokenizer
from torch.nn import RNN
# LightningModule for training the vanilla RNN baseline

class RNNModel(pl.LightningModule):
  """Lightning wrapper for the RNN baseline: token embedding -> RNN -> MLP decoder.

  Each batch is expected to look like ``{'translation': {'de': [...], 'en': [...]}}``.
  The German sentences are tokenized into the model input and the English
  sentences into per-position targets; both are padded/truncated to 256 tokens
  so that logits and targets line up position by position.

  Args:
    rnn: recurrent module called as ``rnn(x)`` that returns ``(output, state)``.
      It receives batch-first embedded tokens (so a ``torch.nn.RNN`` should be
      built with ``batch_first=True``), and its output is fed to the decoder,
      so its output feature size must equal ``d_model``.
    d_model: embedding width and decoder hidden width.
    sample: accepted for backward compatibility; not used.
  """

  def __init__(self, rnn, d_model, sample=False):
    super(RNNModel, self).__init__()
    # NOTE(review): hard-coded vocabulary size; it is not derived from the
    # tokenizer loaded below — confirm it matches tokenizer.vocab_size.
    embedding_size = 50257
    self.embedding = torch.nn.Embedding(embedding_size, d_model)
    self.rnn = rnn
    self.decoder = nn.Sequential(
         torch.nn.Linear(d_model, d_model),
         torch.nn.LeakyReLU(),
         torch.nn.Linear(d_model, embedding_size),
    )
    self.embedding_size = embedding_size
    self.tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased")
    # Per-step training losses, averaged and cleared at the end of every epoch.
    self._train_step_outputs = []

  def forward(self, x):
    """Map token ids of shape (batch, seq) to logits of shape (batch, seq, embedding_size)."""
    encoded_input = self.embedding(x)
    encoded_output, _ = self.rnn(encoded_input)
    # Decode the recurrent layer's OUTPUT. Decoding `encoded_input` instead would
    # bypass the RNN entirely, so it would never influence the loss.
    # nn.Linear acts on the last dimension, so no flattening is needed.
    return self.decoder(encoded_output)

  def _tokenize(self, sentences):
    """Tokenize a list of sentences into fixed-length (256) token ids on this module's device."""
    encoded = self.tokenizer(sentences, padding="max_length", max_length=256, truncation='longest_first', return_tensors='pt')
    # Use self.device rather than .cuda() so the module also runs on CPU or other accelerators.
    return encoded['input_ids'].to(self.device)

  def _shared_step(self, batch):
    """Run tokenization + forward + loss for one batch.

    Returns:
      (loss, elapsed_seconds, batch_size) where elapsed_seconds covers
      tokenization, the forward pass and the loss computation.
    """
    t1 = time.perf_counter()

    x = self._tokenize(batch['translation']['de'])
    y = self._tokenize(batch['translation']['en'])
    y_hat = self(x)

    # Flatten (batch, seq, vocab) logits and (batch, seq) targets for cross-entropy.
    # NOTE(review): padding positions are included in the loss — confirm that is intended.
    loss = F.cross_entropy(y_hat.reshape(-1, y_hat.size(-1)), y.reshape(-1))

    t2 = time.perf_counter()
    return loss, t2 - t1, x.size(0)

  def on_train_epoch_end(self):
    """Lightning hook: log the mean training loss of the epoch that just finished."""
    if self._train_step_outputs:
      self.on_training_epoch_end(self._train_step_outputs)
      # Free the stored losses so they do not accumulate across epochs.
      self._train_step_outputs.clear()

  def on_training_epoch_end(self, outputs):
    """Average the ``'loss'`` entries of ``outputs`` and write them to TensorBoard as "Loss/Train".

    This name is not a Lightning hook, so Lightning never calls it directly;
    it is invoked from ``on_train_epoch_end`` with the collected step outputs.

    Returns:
      ``{'loss': avg_loss}``.
    """
    avg_loss = torch.stack([x['loss'] for x in outputs]).mean()
    if self.logger is not None:
      self.logger.experiment.add_scalar("Loss/Train",avg_loss,self.current_epoch)
    epoch_dictionary={'loss': avg_loss}
    return epoch_dictionary

  def training_step(self, batch, batch_idx):
    """Compute and log the training loss and step runtime for one batch."""
    loss, runtime, batch_size = self._shared_step(batch)
    self.log('train_loss', loss, batch_size=batch_size)
    self.log('train_runtime', runtime, batch_size=batch_size)
    # Keep a detached copy so the epoch average does not hold on to the autograd graph.
    self._train_step_outputs.append({ 'loss': loss.detach() })
    return { 'loss': loss }

  def validation_step(self, batch, batch_idx):
    """Compute and log the validation loss and step runtime for one batch."""
    loss, runtime, batch_size = self._shared_step(batch)
    # Pass the real batch size: the last batch can be smaller than the loader's batch_size,
    # and Lightning weights the epoch average by this value.
    self.log('valid_loss', loss, batch_size=batch_size)
    self.log('valid_runtime', runtime, batch_size=batch_size)
    return { 'loss': loss }

  def configure_optimizers(self):
    """Return the AdamW optimizer used for all parameters."""
    return torch.optim.AdamW(self.parameters(), lr=3e-4, weight_decay=0.3)

In [None]:
# TensorBoard logger and trainer for the RNN baseline.
rnn_logger = TensorBoardLogger(save_dir='/content', name="rnn", log_graph=False, default_hp_metric=False, flush_secs=5)
# Use the shared num_epochs setting so every model trains for the same number of epochs.
trainer = pl.Trainer(default_root_dir='/content',max_epochs=num_epochs, logger=rnn_logger, log_every_n_steps=1)
# RNNModel feeds (batch, seq, feature) tensors. torch.nn.RNN defaults to
# batch_first=False, which would run the recurrence over the batch axis
# instead of the token axis, so batch_first=True is required here.
model = RNN(128, 128, batch_first=True)
rnn = RNNModel(model, 128)

In [None]:
# Wall-clock the whole RNN run (everything trainer.fit does: sanity check, training, validation).
t1_rnn = time.perf_counter()
trainer.fit(rnn, train_loader, valid_loader)
t2_rnn = time.perf_counter()
time_rnn = t2_rnn - t1_rnn
# time.perf_counter() measures fractional seconds, so the value is labelled in seconds.
print(f'total time for rnn: {time_rnn:.2f} s')

### LSTM Baseline

In [None]:
from torch.nn import LSTM
# LightningModule for training the LSTM baseline

class LSTMModel(pl.LightningModule):
  """Lightning wrapper for the LSTM baseline: token embedding -> LSTM -> MLP decoder.

  Each batch is expected to look like ``{'translation': {'de': [...], 'en': [...]}}``.
  The German sentences are tokenized into the model input and the English
  sentences into per-position targets; both are padded/truncated to 256 tokens
  so that logits and targets line up position by position.

  Args:
    lstm: recurrent module called as ``lstm(x)`` that returns ``(output, state)``.
      It receives batch-first embedded tokens (so a ``torch.nn.LSTM`` should be
      built with ``batch_first=True``), and its output is fed to the decoder,
      so its output feature size must equal ``d_model``.
    d_model: embedding width and decoder hidden width.
    sample: accepted for backward compatibility; not used.
  """

  def __init__(self, lstm, d_model, sample=False):
    super(LSTMModel, self).__init__()
    # NOTE(review): hard-coded vocabulary size; it is not derived from the
    # tokenizer loaded below — confirm it matches tokenizer.vocab_size.
    embedding_size = 50257
    self.embedding = torch.nn.Embedding(embedding_size, d_model)
    self.lstm = lstm
    self.decoder = nn.Sequential(
         torch.nn.Linear(d_model, d_model),
         torch.nn.LeakyReLU(),
         torch.nn.Linear(d_model, embedding_size),
    )
    self.embedding_size = embedding_size
    self.tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased")
    # Per-step training losses, averaged and cleared at the end of every epoch.
    self._train_step_losses = []

  def forward(self, x):
    """Map token ids of shape (batch, seq) to logits of shape (batch, seq, embedding_size)."""
    encoded_input = self.embedding(x)
    encoded_output, _ = self.lstm(encoded_input)
    # Decode the recurrent layer's OUTPUT. Decoding `encoded_input` instead would
    # bypass the LSTM entirely, so it would never influence the loss.
    # nn.Linear acts on the last dimension, so no flattening is needed.
    return self.decoder(encoded_output)

  def _tokenize(self, sentences):
    """Tokenize a list of sentences into fixed-length (256) token ids on this module's device."""
    encoded = self.tokenizer(sentences, padding="max_length", max_length=256, truncation='longest_first', return_tensors='pt')
    # Use self.device rather than .cuda() so the module also runs on CPU or other accelerators.
    return encoded['input_ids'].to(self.device)

  def _shared_step(self, batch):
    """Run tokenization + forward + loss for one batch.

    Returns:
      (loss, elapsed_seconds, batch_size) where elapsed_seconds covers
      tokenization, the forward pass and the loss computation.
    """
    t1 = time.perf_counter()

    x = self._tokenize(batch['translation']['de'])
    y = self._tokenize(batch['translation']['en'])
    y_hat = self(x)

    # Flatten (batch, seq, vocab) logits and (batch, seq) targets for cross-entropy.
    # NOTE(review): padding positions are included in the loss — confirm that is intended.
    loss = F.cross_entropy(y_hat.reshape(-1, y_hat.size(-1)), y.reshape(-1))

    t2 = time.perf_counter()
    return loss, t2 - t1, x.size(0)

  def on_train_epoch_end(self):
    """Lightning hook: write the epoch's mean training loss to TensorBoard as "Loss/Train"."""
    if self._train_step_losses:
      avg_loss = torch.stack(self._train_step_losses).mean()
      if self.logger is not None:
        self.logger.experiment.add_scalar("Loss/Train", avg_loss, self.current_epoch)
      # Free the stored losses so they do not accumulate across epochs.
      self._train_step_losses.clear()

  def training_step(self, batch, batch_idx):
    """Compute and log the training loss and step runtime for one batch."""
    loss, runtime, batch_size = self._shared_step(batch)
    self.log('train_loss', loss, batch_size=batch_size)
    self.log('train_runtime', runtime, batch_size=batch_size)
    # Keep a detached copy so the epoch average does not hold on to the autograd graph.
    self._train_step_losses.append(loss.detach())
    return { 'loss': loss }

  def validation_step(self, batch, batch_idx):
    """Compute and log the validation loss and step runtime for one batch."""
    loss, runtime, batch_size = self._shared_step(batch)
    # Pass the real batch size: the last batch can be smaller than the loader's batch_size,
    # and Lightning weights the epoch average by this value.
    self.log('valid_loss', loss, batch_size=batch_size)
    self.log('valid_runtime', runtime, batch_size=batch_size)
    return { 'loss': loss }

  def configure_optimizers(self):
    """Return the AdamW optimizer used for all parameters."""
    return torch.optim.AdamW(self.parameters(), lr=3e-4, weight_decay=0.3)

In [None]:
# TensorBoard logger and trainer for the LSTM baseline.
lstm_logger = TensorBoardLogger(save_dir='/content', name="lstm", log_graph=False, default_hp_metric=False, flush_secs=5)
# Use the shared num_epochs setting so every model trains for the same number of epochs.
trainer = pl.Trainer(default_root_dir='/content',max_epochs=num_epochs, logger=lstm_logger, log_every_n_steps=1)
# LSTMModel feeds (batch, seq, feature) tensors. torch.nn.LSTM defaults to
# batch_first=False, which would run the recurrence over the batch axis
# instead of the token axis, so batch_first=True is required here.
model = LSTM(128, 128, batch_first=True)
lstm = LSTMModel(model, 128)

In [None]:
# Wall-clock the whole LSTM run (everything trainer.fit does: sanity check, training, validation).
t1_lstm = time.perf_counter()
trainer.fit(lstm, train_loader, valid_loader)
t2_lstm = time.perf_counter()
time_lstm = t2_lstm - t1_lstm
# time.perf_counter() measures fractional seconds, so the value is labelled in seconds.
print(f'total time for lstm: {time_lstm:.2f} s')

### View Results

In [None]:
# Side-by-side wall-clock training times. Each value is the difference of two
# time.perf_counter() readings taken around trainer.fit(), i.e. seconds.
print(f'total execution time for liquid rnn: {time_liquid}')
print(f'total execution time for rnn: {time_rnn}')
print(f'total execution time for lstm: {time_lstm}')

In [None]:
!find logs/cfc/version_0 | grep tfevents

find: ‘logs/cfc/version_0’: No such file or directory


In [None]:
# Inspect the event files found under /content (summary only, via --inspect).
%tensorboard --inspect --logdir /content/

In [None]:
# View results in TensorBoard: reads all runs logged under /content and serves the UI on port 8013.
%tensorboard --logdir /content --port 8013

In [None]:
# NOTE(review): 4697 is a hard-coded PID left over from an earlier session (presumably the
# TensorBoard server — confirm); the recorded output shows no such process exists any more.
!kill 4697

/bin/bash: line 0: kill: (4697) - No such process
