## Testing the dataset module

In [1]:
from src import get_dataset, create_mask, default_collate_fn
import torch
from torch.utils.data import DataLoader

In [14]:
# ---- Synthetic fixture configuration ----
BATCH_SIZE = 6        # number of examples generated for each split
BATCH_SEQ_LEN = 10    # sequence length drawn before per-example truncation
FEAT_DIM = 8          # size of each embedding vector
# One target sequence length per example, for each split.
SEQ_LENS_TRAIN = torch.tensor([6, 3, 5, 10, 4, 2], dtype=torch.long)
SEQ_LENS_VAL = torch.tensor([4, 2, 7, 8, 9, 3], dtype=torch.long)
SEQ_LENS_TEST = torch.tensor([5, 3, 6, 9, 5, 7], dtype=torch.long)
torch.manual_seed(42)  # fixed seed so the random fixtures are reproducible


def _make_split(split_lens):
    """Build one synthetic split of variable-length sequences.

    Args:
        split_lens: 1-D long tensor holding one sequence length per example.

    Returns:
        dict with keys
            'embeddings': list of BATCH_SIZE tensors shaped (1, length, FEAT_DIM),
            'labels': long tensor of shape (BATCH_SIZE,) with values in {0, 1},
            'seq_num': list of ``[0, 1, ..., length - 1]`` per example.
    """
    lengths = split_lens.tolist()
    assert len(lengths) == BATCH_SIZE, "expected one sequence length per example"
    # Embeddings are drawn before labels so the RNG stream is consumed in a
    # fixed order for a given seed.
    full_embeddings = torch.randn(BATCH_SIZE, 1, BATCH_SEQ_LEN, FEAT_DIM)
    labels = torch.randint(0, 2, (BATCH_SIZE, ), dtype=torch.long)
    return {
        # Truncate every example along the sequence axis to its own length.
        'embeddings': [sample[:, :n] for sample, n in zip(full_embeddings, lengths)],
        'labels': labels,
        'seq_num': [list(range(n)) for n in lengths],
    }


# Generate the splits in train -> val -> test order: all of them share the one
# seeded RNG stream, so reordering these calls would change the data.
train_data = _make_split(SEQ_LENS_TRAIN)
val_data = _make_split(SEQ_LENS_VAL)
test_data = _make_split(SEQ_LENS_TEST)

datasetDict = get_dataset(train_data=train_data, val_data=val_data, test_data=test_data)

In [15]:
# Draw a fresh, untruncated batch and print the shape each example would have
# once cut to its configured training length.
embeddings_train = torch.randn(BATCH_SIZE, 1, BATCH_SEQ_LEN, FEAT_DIM)
for i in range(BATCH_SIZE):
    embeddings = embeddings_train[i]
    print(embeddings[:, :SEQ_LENS_TRAIN[i]].shape)

torch.Size([1, 6, 8])
torch.Size([1, 3, 8])
torch.Size([1, 5, 8])
torch.Size([1, 10, 8])
torch.Size([1, 4, 8])
torch.Size([1, 2, 8])


In [16]:
# Shape of the second example in the freshly drawn tensor from the previous
# cell; that tensor was only sliced for printing, so it is still untruncated.
embeddings_train[1].shape

torch.Size([1, 10, 8])

In [17]:
# Inspect the train split: column names and row count.
datasetDict['train']

Dataset({
    features: ['embeddings', 'labels', 'seq_num'],
    num_rows: 6
})

In [18]:
# Round-trip check: the stored embeddings of example 5 (configured length 2)
# converted back to a tensor should keep their truncated shape.
torch.tensor(datasetDict['train'][5]['embeddings']).shape

torch.Size([1, 2, 8])

In [19]:
# Run create_mask over every example of every split; the result is kept under
# a new name so the unmapped datasetDict stays available.
datasetDict_mask = datasetDict.map(create_mask)

  0%|          | 0/6 [00:00<?, ?ex/s]

  0%|          | 0/6 [00:00<?, ?ex/s]

  0%|          | 0/6 [00:00<?, ?ex/s]

In [20]:
# Masks of the first four train examples (unpadded, one entry per timestep).
datasetDict_mask['train'][0:4]['mask']

[[1, 1, 1, 1, 1, 1],
 [1, 1, 1],
 [1, 1, 1, 1, 1],
 [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]

In [21]:
# Batch three examples at a time in dataset order (shuffle=False), combining
# the variable-length examples with the project's default_collate_fn.
train_dataloader = DataLoader(datasetDict_mask['train'], batch_size=3, shuffle=False, collate_fn=default_collate_fn)

In [22]:
# First batch only; with shuffle=False this is train examples 0-2.
nxt = next(iter(train_dataloader))

  batch[k] = torch.cat([torch.tensor(f[k]) for f in features], dim=0)


In [23]:
# seq_num of the first example before batching, for comparison with the
# collated batch inspected below.
datasetDict_mask['train'][0]['seq_num']

[0, 1, 2, 3, 4, 5]

In [24]:
# Shape of the collated embeddings for the first batch.
nxt['embeddings'].shape

torch.Size([3, 6, 8])

In [25]:
# Collated mask for the first batch.
nxt['mask']

tensor([[ 1,  1,  1,  1,  1,  1],
        [ 1,  1,  1, -1, -1, -1],
        [ 1,  1,  1,  1,  1, -1]])

In [26]:
# Collated labels; in the recorded output each example's label is repeated
# across its timesteps and the remaining positions are filled with -1.
nxt['labels']

tensor([[ 1,  1,  1,  1,  1,  1],
        [ 1,  1,  1, -1, -1, -1],
        [ 0,  0,  0,  0,  0, -1]])

In [27]:
# Collated per-timestep sequence indices for the first batch.
nxt['seq_num']

tensor([[ 0,  1,  2,  3,  4,  5],
        [ 0,  1,  2, -1, -1, -1],
        [ 0,  1,  2,  3,  4, -1]])

In [28]:
# Embeddings of the last example in the batch (configured length 5); the
# recorded output ends in an all-zero row where it was padded to length 6.
nxt['embeddings'][-1]

tensor([[-1.9006,  0.2286,  0.0249, -0.3460,  0.2868, -0.7308,  0.1748, -1.0939],
        [-1.6022,  1.3529,  1.2888,  0.0523, -1.5469,  0.7567,  0.7755,  2.0265],
        [ 0.0358,  0.1206, -0.8057, -0.2076, -0.9319, -1.5910, -1.1360, -0.5226],
        [-0.5188, -1.5013, -1.9267,  0.1279,  1.0229, -0.5558,  0.7043,  0.7099],
        [ 1.7744, -0.9216,  0.9624, -0.3370, -1.1753,  0.3581,  0.4788,  1.3537],
        [ 0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000]])

In [29]:
# Scratch check: torch.empty returns uninitialized memory, so the displayed
# values are arbitrary and differ between runs.
torch.empty(3,3)

tensor([[-5.7076e-02,  3.0798e-41, -5.7342e-02],
        [ 3.0798e-41, -5.7051e-02,  3.0798e-41],
        [-1.2028e-05,  3.0798e-41, -6.4062e-02]])

## Testing MIDataLoaderModule

In [2]:
import torch
from src import get_dataset, create_mask, default_collate_fn, recurrent
from src import MIDataLoaderModule

In [2]:
# ---- Synthetic fixture configuration ----
BATCH_SIZE = 6        # number of examples generated for each split
BATCH_SEQ_LEN = 10    # sequence length drawn before per-example truncation
FEAT_DIM = 8          # size of each embedding vector
# One target sequence length per example, for each split.
SEQ_LENS_TRAIN = torch.tensor([6, 3, 5, 10, 4, 2], dtype=torch.long)
SEQ_LENS_VAL = torch.tensor([4, 2, 7, 8, 9, 3], dtype=torch.long)
SEQ_LENS_TEST = torch.tensor([5, 3, 6, 9, 5, 7], dtype=torch.long)
torch.manual_seed(42)  # fixed seed so the random fixtures are reproducible


def _make_split(split_lens):
    """Build one synthetic split of variable-length sequences.

    Args:
        split_lens: 1-D long tensor holding one sequence length per example.

    Returns:
        dict with keys
            'embeddings': list of BATCH_SIZE tensors shaped (1, length, FEAT_DIM),
            'labels': long tensor of shape (BATCH_SIZE,) with values in {0, 1},
            'seq_num': list of ``[0, 1, ..., length - 1]`` per example.
    """
    lengths = split_lens.tolist()
    assert len(lengths) == BATCH_SIZE, "expected one sequence length per example"
    # Embeddings are drawn before labels so the RNG stream is consumed in a
    # fixed order for a given seed.
    full_embeddings = torch.randn(BATCH_SIZE, 1, BATCH_SEQ_LEN, FEAT_DIM)
    labels = torch.randint(0, 2, (BATCH_SIZE, ), dtype=torch.long)
    return {
        # Truncate every example along the sequence axis to its own length.
        'embeddings': [sample[:, :n] for sample, n in zip(full_embeddings, lengths)],
        'labels': labels,
        'seq_num': [list(range(n)) for n in lengths],
    }


# Generate the splits in train -> val -> test order: all of them share the one
# seeded RNG stream, so reordering these calls would change the data.
train_data = _make_split(SEQ_LENS_TRAIN)
val_data = _make_split(SEQ_LENS_VAL)
test_data = _make_split(SEQ_LENS_TEST)

datasetDict = get_dataset(train_data=train_data, val_data=val_data, test_data=test_data)

In [3]:
# Run create_mask over every example of every split; the result is kept under
# a new name so the unmapped datasetDict stays available.
datasetDict_mask = datasetDict.map(create_mask)

  0%|          | 0/6 [00:00<?, ?ex/s]

  0%|          | 0/6 [00:00<?, ?ex/s]

  0%|          | 0/6 [00:00<?, ?ex/s]

In [4]:
# Wrap the masked splits in the project's data module.
# NOTE(review): data_args=None presumably makes the module fall back to its
# own defaults (batch size etc.) - confirm against MIDataLoaderModule.
dataloader = MIDataLoaderModule(data_args=None, datasets=datasetDict_mask)

In [15]:
# Ask the data module for its training loader.
train_dataloader = dataloader.train_dataloader()

In [16]:
# Fetch a single batch from the training loader.
a = next(iter(train_dataloader))

In [17]:
# Display the whole batch dict (embeddings, labels, seq_num, mask).
a

{'embeddings': tensor([[[-1.9006,  0.2286,  0.0249, -0.3460,  0.2868, -0.7308,  0.1748,
           -1.0939],
          [-1.6022,  1.3529,  1.2888,  0.0523, -1.5469,  0.7567,  0.7755,
            2.0265],
          [ 0.0358,  0.1206, -0.8057, -0.2076, -0.9319, -1.5910, -1.1360,
           -0.5226],
          [-0.5188, -1.5013, -1.9267,  0.1279,  1.0229, -0.5558,  0.7043,
            0.7099],
          [ 1.7744, -0.9216,  0.9624, -0.3370, -1.1753,  0.3581,  0.4788,
            1.3537]]]),
 'labels': tensor([[0, 0, 0, 0, 0]]),
 'seq_num': tensor([[0, 1, 2, 3, 4]]),
 'mask': tensor([[1, 1, 1, 1, 1]])}

## Testing the model

In [3]:
import torch
from torch.utils.data import DataLoader
from src import get_dataset, create_mask, default_collate_fn, recurrent

In [4]:
# ---- Synthetic fixture configuration ----
BATCH_SIZE = 6        # number of examples generated for each split
BATCH_SEQ_LEN = 10    # sequence length drawn before per-example truncation
FEAT_DIM = 8          # size of each embedding vector
# One target sequence length per example, for each split.
SEQ_LENS_TRAIN = torch.tensor([6, 3, 5, 10, 4, 2], dtype=torch.long)
SEQ_LENS_VAL = torch.tensor([4, 2, 7, 8, 9, 3], dtype=torch.long)
SEQ_LENS_TEST = torch.tensor([5, 3, 6, 9, 5, 7], dtype=torch.long)
torch.manual_seed(42)  # fixed seed so the random fixtures are reproducible


def _make_split(split_lens):
    """Build one synthetic split of variable-length sequences.

    Args:
        split_lens: 1-D long tensor holding one sequence length per example.

    Returns:
        dict with keys
            'embeddings': list of BATCH_SIZE tensors shaped (1, length, FEAT_DIM),
            'labels': long tensor of shape (BATCH_SIZE,) with values in {0, 1},
            'seq_num': list of ``[0, 1, ..., length - 1]`` per example.
    """
    lengths = split_lens.tolist()
    assert len(lengths) == BATCH_SIZE, "expected one sequence length per example"
    # Embeddings are drawn before labels so the RNG stream is consumed in a
    # fixed order for a given seed.
    full_embeddings = torch.randn(BATCH_SIZE, 1, BATCH_SEQ_LEN, FEAT_DIM)
    labels = torch.randint(0, 2, (BATCH_SIZE, ), dtype=torch.long)
    return {
        # Truncate every example along the sequence axis to its own length.
        'embeddings': [sample[:, :n] for sample, n in zip(full_embeddings, lengths)],
        'labels': labels,
        'seq_num': [list(range(n)) for n in lengths],
    }


# Generate the splits in train -> val -> test order: all of them share the one
# seeded RNG stream, so reordering these calls would change the data.
train_data = _make_split(SEQ_LENS_TRAIN)
val_data = _make_split(SEQ_LENS_VAL)
test_data = _make_split(SEQ_LENS_TEST)

datasetDict = get_dataset(train_data=train_data, val_data=val_data, test_data=test_data)

In [5]:
# Run create_mask over every example of every split; the result is kept under
# a new name so the unmapped datasetDict stays available.
datasetDict_mask = datasetDict.map(create_mask)

  0%|          | 0/6 [00:00<?, ?ex/s]

  0%|          | 0/6 [00:00<?, ?ex/s]

  0%|          | 0/6 [00:00<?, ?ex/s]

In [6]:
# Batch three examples at a time in dataset order (shuffle=False), combining
# the variable-length examples with the project's default_collate_fn.
train_dataloader = DataLoader(datasetDict_mask['train'], batch_size=3, shuffle=False, collate_fn=default_collate_fn)

In [7]:
# Small unidirectional recurrent model; input_size=8 matches FEAT_DIM of the
# synthetic embeddings.
model = recurrent(input_size=8, hidden_size=4, bidirectional=False, num_classes=2)

In [8]:
# Pull the first batch and unpack the two tensors the model call consumes.
input_data = next(iter(train_dataloader))
input_rep = input_data['embeddings']
mask = input_data['mask']

  batch[k] = torch.cat([torch.tensor(f[k]) for f in features], dim=0)


In [9]:
# Confirm the embeddings and the mask agree on the batch and sequence axes.
input_rep.shape, mask.shape

(torch.Size([3, 6, 8]), torch.Size([3, 6]))

In [10]:
# Forward pass with the model's default settings.
output = model(input_rep=input_rep, mask=mask)

In [11]:
# Output shape for the default forward pass (no output recorded for this cell).
output.shape

In [12]:
# Second forward pass with predict_last_valid_hidden_state disabled.
# NOTE(review): presumably this yields per-timestep rather than per-sequence
# predictions - confirm against the recurrent model's forward().
output = model(input_rep=input_rep, mask=mask, predict_last_valid_hidden_state=False)

In [13]:
# Output shape with predict_last_valid_hidden_state=False (no output recorded
# for this cell).
output.shape

In [14]:
# Display the collated mask that was passed to the model.
mask

tensor([[ True,  True,  True,  True,  True,  True],
        [ True,  True,  True, False, False, False],
        [ True,  True,  True,  True,  True, False]])

## Testing the trainer

In [1]:
import argparse
import torch
import pytorch_lightning as pl
from src import (
    get_data_args, get_model_args, get_training_args, get_default_args,
    get_dataset, create_mask, MIDataLoaderModule,
    MILightningModule
)

In [2]:
# Command-line style construction of the arguments, kept for reference;
# inside the notebook the defaults are requested via get_default_args instead.
# parser = argparse.ArgumentParser()
# get_data_args(parser)
# get_model_args(parser)
# get_training_args(parser)
# args = parser.parse_args()
args = get_default_args(jupyter=True)
# NOTE(review): the printed Namespace also carries the kernel's own launch
# arguments (connection ports, runtime file path, Session.key); clear this
# output before sharing the notebook.
print (args)

Namespace(ip='127.0.0.1', stdin='9008', control='9006', hb='9005', shell='9007', transport='"tcp"', iopub='9009', f='/users2/avirinchipur/.local/share/jupyter/runtime/kernel-v2-7102eEPBjpVfeR5r.json', data_dir=None, train_file=None, dev_file=None, test_file=None, model='gru', input_size=8, num_classes=2, hidden_size=128, num_layers=1, dropout=0.0, bidirectional=False, epochs=10, train_batch_size=32, eval_batch_size=64, cross_entropy_class_weight=None, log_interval=10, save_strategy='best', save_dir=None, lr=0.001, weight_decay=0.0, num_workers=4, seed=42, **{'Session.signature_scheme': '"hmac-sha256"', 'Session.key': 'b"e4658aae-f44c-4eb2-b9b0-128d996e7c25"'})


In [3]:
# ---- Synthetic fixture configuration ----
BATCH_SIZE = 6        # number of examples generated for each split
BATCH_SEQ_LEN = 10    # sequence length drawn before per-example truncation
FEAT_DIM = 8          # size of each embedding vector
# One target sequence length per example, for each split.
SEQ_LENS_TRAIN = torch.tensor([6, 3, 5, 10, 4, 2], dtype=torch.long)
SEQ_LENS_VAL = torch.tensor([4, 2, 7, 8, 9, 3], dtype=torch.long)
SEQ_LENS_TEST = torch.tensor([5, 3, 6, 9, 5, 7], dtype=torch.long)
torch.manual_seed(42)  # fixed seed so the random fixtures are reproducible


def _make_split(split_lens):
    """Build one synthetic split of variable-length sequences.

    Args:
        split_lens: 1-D long tensor holding one sequence length per example.

    Returns:
        dict with keys
            'embeddings': list of BATCH_SIZE tensors shaped (1, length, FEAT_DIM),
            'labels': long tensor of shape (BATCH_SIZE,) with values in {0, 1},
            'seq_num': list of ``[0, 1, ..., length - 1]`` per example.
    """
    lengths = split_lens.tolist()
    assert len(lengths) == BATCH_SIZE, "expected one sequence length per example"
    # Embeddings are drawn before labels so the RNG stream is consumed in a
    # fixed order for a given seed.
    full_embeddings = torch.randn(BATCH_SIZE, 1, BATCH_SEQ_LEN, FEAT_DIM)
    labels = torch.randint(0, 2, (BATCH_SIZE, ), dtype=torch.long)
    return {
        # Truncate every example along the sequence axis to its own length.
        'embeddings': [sample[:, :n] for sample, n in zip(full_embeddings, lengths)],
        'labels': labels,
        'seq_num': [list(range(n)) for n in lengths],
    }


# Generate the splits in train -> val -> test order: all of them share the one
# seeded RNG stream, so reordering these calls would change the data.
train_data = _make_split(SEQ_LENS_TRAIN)
val_data = _make_split(SEQ_LENS_VAL)
test_data = _make_split(SEQ_LENS_TEST)

datasetDict = get_dataset(train_data=train_data, val_data=val_data, test_data=test_data)

In [4]:
# Run create_mask over every example of every split.
# NOTE(review): this rebinds datasetDict to its own mapped result, so running
# the cell a second time maps create_mask over already-mapped data; the other
# sections of this notebook store the result as datasetDict_mask instead.
datasetDict = datasetDict.map(create_mask)

  0%|          | 0/6 [00:00<?, ?ex/s]

  0%|          | 0/6 [00:00<?, ?ex/s]

  0%|          | 0/6 [00:00<?, ?ex/s]

In [5]:
# Data module built from the parsed default args and the masked splits.
dataloader = MIDataLoaderModule(args, datasetDict)

In [10]:
# accelerator='auto' lets Lightning pick the available hardware: it selects
# the GPU where one exists (as in the recorded run) and falls back to CPU
# elsewhere instead of raising, so the notebook also runs on CPU-only machines.
trainer = pl.Trainer(accelerator='auto', devices=1, max_epochs=args.epochs)

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


In [11]:
# Build the Lightning module (model + loss) from the same args namespace.
lightning_module = MILightningModule(args)

In [12]:
# Display the module's layer structure and loss.
lightning_module

MITrainer(
  (model): recurrent(
    (model): ModuleList(
      (0): GRU(8, 128, batch_first=True)
      (1): Linear(in_features=128, out_features=1, bias=True)
    )
  )
  (loss): BCEWithLogitsLoss()
)

In [13]:
# Train and validate using the loaders provided by the data module; stops
# after the max_epochs configured on the Trainer.
trainer.fit(lightning_module, datamodule=dataloader)

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1,2]

  | Name  | Type              | Params
--------------------------------------------
0 | model | recurrent         | 53.1 K
1 | loss  | BCEWithLogitsLoss | 0     
--------------------------------------------
53.1 K    Trainable params
0         Non-trainable params
53.1 K    Total params
0.212     Total estimated model params size (MB)


Sanity Checking: 0it [00:00, ?it/s]

  rank_zero_warn(
  batch[k] = torch.cat([torch.tensor(f[k]) for f in features], dim=0)
  rank_zero_warn(
  rank_zero_warn(


Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

`Trainer.fit` stopped: `max_epochs=10` reached.


In [10]:
# Grab the training loader directly to inspect a batch outside the Trainer.
tr_dl = dataloader.train_dataloader()

In [17]:
# Fetch a single batch from the training loader.
batch = next(iter(tr_dl))

  batch[k] = torch.cat([torch.tensor(f[k]) for f in features], dim=0)


In [18]:
# Verify the collated mask is boolean. Comparing the dtype works on any
# device, whereas isinstance(..., torch.BoolTensor) only matches CPU tensors
# and would report False for the same mask once it is moved to the GPU.
batch['mask'].dtype == torch.bool

True

In [10]:
# Scratch check: squeeze(-1) only drops the last dimension when its size is 1;
# here it is 2, so the shape comes back unchanged.
torch.randn(6,9,2).squeeze(-1).shape

torch.Size([6, 9, 2])