## Testing the dataset module

In [2]:
from src import get_datasetDict, create_mask, default_collate_fn
import torch
from torch.utils.data import DataLoader

In [20]:
BATCH_SIZE = 6
BATCH_SEQ_LEN = 10
FEAT_DIM = 8
SEQ_LENS_TRAIN = torch.tensor([6, 3, 5, 10, 4, 2], dtype=torch.long)
SEQ_LENS_VAL = torch.tensor([4, 2, 7, 8, 9, 3], dtype=torch.long)
SEQ_LENS_TEST = torch.tensor([5, 3, 6, 9, 5, 7], dtype=torch.long)
torch.manual_seed(42)

embeddings_train = torch.randn(BATCH_SIZE, 1, BATCH_SEQ_LEN, FEAT_DIM)
labels_train = torch.randint(0, 2, (BATCH_SIZE, ), dtype=torch.long).unsqueeze(-1).expand(BATCH_SIZE, BATCH_SEQ_LEN)
seq_lens = [list(range(SEQ_LENS_TRAIN[i])) for i in range(BATCH_SIZE)]
embeddings_train = [embeddings[:, :SEQ_LENS_TRAIN[i]] for i, embeddings in enumerate(embeddings_train)]
train_data = {'embeddings': embeddings_train, 'labels': labels_train, 'seq_num': seq_lens}

embeddings_val = torch.randn(BATCH_SIZE, 1, BATCH_SEQ_LEN, FEAT_DIM)
labels_val = torch.randint(0, 2, (BATCH_SIZE, ), dtype=torch.long).unsqueeze(-1).expand(BATCH_SIZE, BATCH_SEQ_LEN)
seq_lens = [list(range(SEQ_LENS_VAL[i])) for i in range(BATCH_SIZE)]
embeddings_val = [embeddings[:, :SEQ_LENS_VAL[i]] for i, embeddings in enumerate(embeddings_val)]
val_data = {'embeddings': embeddings_val, 'labels': labels_val, 'seq_num': seq_lens}

embeddings_test = torch.randn(BATCH_SIZE, 1, BATCH_SEQ_LEN, FEAT_DIM)
labels_test = torch.randint(0, 2, (BATCH_SIZE, ), dtype=torch.long).unsqueeze(-1).expand(BATCH_SIZE, BATCH_SEQ_LEN)
seq_lens = [list(range(SEQ_LENS_TEST[i])) for i in range(BATCH_SIZE)]
embeddings_test = [embeddings[:, :SEQ_LENS_TEST[i]] for i, embeddings in enumerate(embeddings_test)]
test_data = {'embeddings': embeddings_test, 'labels': labels_test, 'seq_num': seq_lens}

datasetDict = get_datasetDict(train_data=train_data, val_data=val_data, test_data=test_data)

In [22]:
embeddings_train = torch.randn(BATCH_SIZE, 1, BATCH_SEQ_LEN, FEAT_DIM)
for i, embeddings in enumerate(embeddings_train):
    print (embeddings[:, :SEQ_LENS_TRAIN[i]].shape)

torch.Size([1, 6, 8])
torch.Size([1, 3, 8])
torch.Size([1, 5, 8])
torch.Size([1, 10, 8])
torch.Size([1, 4, 8])
torch.Size([1, 2, 8])


In [23]:
embeddings_train[1].shape

torch.Size([1, 10, 8])

In [24]:
datasetDict['train']

Dataset({
    features: ['embeddings', 'labels', 'seq_num'],
    num_rows: 6
})

In [25]:
torch.tensor(datasetDict['train'][5]['embeddings']).shape, torch.tensor(datasetDict['train'][5]['labels'])

(torch.Size([1, 2, 8]), tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1]))

In [26]:
datasetDict_mask = datasetDict.map(create_mask)

Map:   0%|          | 0/6 [00:00<?, ? examples/s]

Map:   0%|          | 0/6 [00:00<?, ? examples/s]

Map:   0%|          | 0/6 [00:00<?, ? examples/s]

In [27]:
datasetDict_mask['train'][0:4]['mask']

[[1, 1, 1, 1, 1, 1],
 [1, 1, 1],
 [1, 1, 1, 1, 1],
 [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]

In [28]:
train_dataloader = DataLoader(datasetDict_mask['train'], batch_size=3, shuffle=False, collate_fn=default_collate_fn)

In [29]:
nxt = next(iter(train_dataloader))

  batch[k] = torch.cat([torch.tensor(f[k]) for f in features], dim=0)


In [30]:
datasetDict_mask['train'][0]['seq_num']

[0, 1, 2, 3, 4, 5]

In [31]:
nxt['embeddings'].shape

torch.Size([3, 6, 8])

In [32]:
nxt['mask']

tensor([[ True,  True,  True,  True,  True,  True],
        [ True,  True,  True, False, False, False],
        [ True,  True,  True,  True,  True, False]])

In [33]:
nxt['labels']

tensor([[1., 1., 1., 1., 1., 1., 1., 1., 1., 1.],
        [1., 1., 1., 1., 1., 1., 1., 1., 1., 1.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]])

In [34]:
nxt['seq_num']

tensor([[0, 1, 2, 3, 4, 5],
        [0, 1, 2, 0, 0, 0],
        [0, 1, 2, 3, 4, 0]])

In [35]:
nxt['embeddings'][-1]

tensor([[-1.9006,  0.2286,  0.0249, -0.3460,  0.2868, -0.7308,  0.1748, -1.0939],
        [-1.6022,  1.3529,  1.2888,  0.0523, -1.5469,  0.7567,  0.7755,  2.0265],
        [ 0.0358,  0.1206, -0.8057, -0.2076, -0.9319, -1.5910, -1.1360, -0.5226],
        [-0.5188, -1.5013, -1.9267,  0.1279,  1.0229, -0.5558,  0.7043,  0.7099],
        [ 1.7744, -0.9216,  0.9624, -0.3370, -1.1753,  0.3581,  0.4788,  1.3537],
        [ 0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000]])

In [36]:
torch.empty(3,3)

tensor([[-2.9968e+11,  4.5695e-41,  3.5287e-19],
        [ 3.0740e-41,  4.4842e-44,  0.0000e+00],
        [ 1.5695e-43,  0.0000e+00,  3.4970e-19]])

## Testing MIDataLoaderModule

In [44]:
import torch
from src import get_default_args
from src import get_datasetDict, create_mask, default_collate_fn, recurrent
from src import MIDataLoaderModule

In [38]:
BATCH_SIZE = 6
BATCH_SEQ_LEN = 10
FEAT_DIM = 8
SEQ_LENS_TRAIN = torch.tensor([6, 3, 5, 10, 4, 2], dtype=torch.long)
SEQ_LENS_VAL = torch.tensor([4, 2, 7, 8, 9, 3], dtype=torch.long)
SEQ_LENS_TEST = torch.tensor([5, 3, 6, 9, 5, 7], dtype=torch.long)
torch.manual_seed(42)

embeddings_train = torch.randn(BATCH_SIZE, 1, BATCH_SEQ_LEN, FEAT_DIM)
labels_train = torch.randint(0, 2, (BATCH_SIZE, ), dtype=torch.long).unsqueeze(-1).expand(BATCH_SIZE, BATCH_SEQ_LEN)
seq_lens = [list(range(SEQ_LENS_TRAIN[i])) for i in range(BATCH_SIZE)]
embeddings_train = [embeddings[:, :SEQ_LENS_TRAIN[i]] for i, embeddings in enumerate(embeddings_train)]
train_data = {'embeddings': embeddings_train, 'labels': labels_train, 'seq_num': seq_lens}

embeddings_val = torch.randn(BATCH_SIZE, 1, BATCH_SEQ_LEN, FEAT_DIM)
labels_val = torch.randint(0, 2, (BATCH_SIZE, ), dtype=torch.long).unsqueeze(-1).expand(BATCH_SIZE, BATCH_SEQ_LEN)
seq_lens = [list(range(SEQ_LENS_VAL[i])) for i in range(BATCH_SIZE)]
embeddings_val = [embeddings[:, :SEQ_LENS_VAL[i]] for i, embeddings in enumerate(embeddings_val)]
val_data = {'embeddings': embeddings_val, 'labels': labels_val, 'seq_num': seq_lens}

embeddings_test = torch.randn(BATCH_SIZE, 1, BATCH_SEQ_LEN, FEAT_DIM)
labels_test = torch.randint(0, 2, (BATCH_SIZE, ), dtype=torch.long).unsqueeze(-1).expand(BATCH_SIZE, BATCH_SEQ_LEN)
seq_lens = [list(range(SEQ_LENS_TEST[i])) for i in range(BATCH_SIZE)]
embeddings_test = [embeddings[:, :SEQ_LENS_TEST[i]] for i, embeddings in enumerate(embeddings_test)]
test_data = {'embeddings': embeddings_test, 'labels': labels_test, 'seq_num': seq_lens}

datasetDict = get_datasetDict(train_data=train_data, val_data=val_data, test_data=test_data)

In [45]:
datasetDict_mask = datasetDict.map(create_mask)

Map:   0%|          | 0/6 [00:00<?, ? examples/s]

Map:   0%|          | 0/6 [00:00<?, ? examples/s]

Map:   0%|          | 0/6 [00:00<?, ? examples/s]

In [47]:
dataloader = MIDataLoaderModule(data_args=get_default_args(jupyter=True), datasets=datasetDict_mask)

In [48]:
train_dataloader = dataloader.train_dataloader()

In [49]:
a = next(iter(train_dataloader))

  batch[k] = torch.cat([torch.tensor(f[k]) for f in features], dim=0)


In [50]:
a

{'embeddings': tensor([[[ 1.9269,  1.4873,  0.9007, -2.1055,  0.6784, -1.2345, -0.0431,
           -1.6047],
          [-0.7521,  1.6487, -0.3925, -1.4036, -0.7279, -0.5594, -0.7688,
            0.7624],
          [ 1.6423, -0.1596, -0.4974,  0.4396, -0.7581,  1.0783,  0.8008,
            1.6806],
          [ 1.2791,  1.2964,  0.6105,  1.3347, -0.2316,  0.0418, -0.2516,
            0.8599],
          [-1.3847, -0.8712, -0.2234,  1.7174,  0.3189, -0.4245,  0.3057,
           -0.7746],
          [-1.5576,  0.9956, -0.8798, -0.6011, -1.2742,  2.1228, -1.2347,
           -0.4879],
          [ 0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,
            0.0000],
          [ 0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,
            0.0000],
          [ 0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,
            0.0000],
          [ 0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,
            0.0000]],
 
         [[ 0.0109, -0.3387, -1.3

## Testing the model

In [51]:
import torch
from torch.utils.data import DataLoader
from src import get_datasetDict, create_mask, default_collate_fn, recurrent

In [55]:
BATCH_SIZE = 6
BATCH_SEQ_LEN = 10
FEAT_DIM = 8
SEQ_LENS_TRAIN = torch.tensor([6, 3, 5, 10, 4, 2], dtype=torch.long)
SEQ_LENS_VAL = torch.tensor([4, 2, 7, 8, 9, 3], dtype=torch.long)
SEQ_LENS_TEST = torch.tensor([5, 3, 6, 9, 5, 7], dtype=torch.long)
torch.manual_seed(42)

embeddings_train = torch.randn(BATCH_SIZE, 1, BATCH_SEQ_LEN, FEAT_DIM)
labels_train = torch.randint(0, 2, (BATCH_SIZE, ), dtype=torch.long).unsqueeze(-1).expand(BATCH_SIZE, BATCH_SEQ_LEN)
seq_lens = [list(range(SEQ_LENS_TRAIN[i])) for i in range(BATCH_SIZE)]
embeddings_train = [embeddings[:, :SEQ_LENS_TRAIN[i]] for i, embeddings in enumerate(embeddings_train)]
train_data = {'embeddings': embeddings_train, 'labels': labels_train, 'seq_num': seq_lens}

embeddings_val = torch.randn(BATCH_SIZE, 1, BATCH_SEQ_LEN, FEAT_DIM)
labels_val = torch.randint(0, 2, (BATCH_SIZE, ), dtype=torch.long).unsqueeze(-1).expand(BATCH_SIZE, BATCH_SEQ_LEN)
seq_lens = [list(range(SEQ_LENS_VAL[i])) for i in range(BATCH_SIZE)]
embeddings_val = [embeddings[:, :SEQ_LENS_VAL[i]] for i, embeddings in enumerate(embeddings_val)]
val_data = {'embeddings': embeddings_val, 'labels': labels_val, 'seq_num': seq_lens}

embeddings_test = torch.randn(BATCH_SIZE, 1, BATCH_SEQ_LEN, FEAT_DIM)
labels_test = torch.randint(0, 2, (BATCH_SIZE, ), dtype=torch.long).unsqueeze(-1).expand(BATCH_SIZE, BATCH_SEQ_LEN)
seq_lens = [list(range(SEQ_LENS_TEST[i])) for i in range(BATCH_SIZE)]
embeddings_test = [embeddings[:, :SEQ_LENS_TEST[i]] for i, embeddings in enumerate(embeddings_test)]
test_data = {'embeddings': embeddings_test, 'labels': labels_test, 'seq_num': seq_lens}

datasetDict = get_datasetDict(train_data=train_data, val_data=val_data, test_data=test_data)

In [56]:
datasetDict_mask = datasetDict.map(create_mask)

Map:   0%|          | 0/6 [00:00<?, ? examples/s]

Map:   0%|          | 0/6 [00:00<?, ? examples/s]

Map:   0%|          | 0/6 [00:00<?, ? examples/s]

In [57]:
train_dataloader = DataLoader(datasetDict_mask['train'], batch_size=3, shuffle=False, collate_fn=default_collate_fn)

In [58]:
model = recurrent(input_size=8, hidden_size=4, bidirectional=False, num_classes=2)

In [59]:
input_data = next(iter(train_dataloader))
input_rep, mask = input_data['embeddings'], input_data['mask']

  batch[k] = torch.cat([torch.tensor(f[k]) for f in features], dim=0)


In [60]:
input_rep.shape, mask.shape

(torch.Size([3, 6, 8]), torch.Size([3, 6]))

In [62]:
output = model(embeddings=input_rep, mask=mask)

In [63]:
output.shape

torch.Size([3])

In [65]:
output = model(embeddings=input_rep, mask=mask, predict_last_valid_hidden_state=False)

In [66]:
output.shape

torch.Size([3, 6])

In [67]:
mask

tensor([[ True,  True,  True,  True,  True,  True],
        [ True,  True,  True, False, False, False],
        [ True,  True,  True,  True,  True, False]])

## Testing the trainer

In [1]:
import torch
import pytorch_lightning as pl
from src import (
    get_default_args,
    get_datasetDict, create_mask, MIDataLoaderModule,
    MILightningModule
)

In [2]:
# parser = argparse.ArgumentParser()
# get_data_args(parser)
# get_model_args(parser)
# get_training_args(parser)
# args = parser.parse_args()
args = get_default_args(jupyter=True)
print (args)

Namespace(ip='127.0.0.1', stdin='9008', control='9006', hb='9005', shell='9007', transport='"tcp"', iopub='9009', f='/users2/avirinchipur/.local/share/jupyter/runtime/kernel-v2-6457bPEAU76Ye9a.json', data_dir=None, train_file=None, dev_file=None, test_file=None, model='gru', input_size=8, num_classes=2, hidden_size=128, num_layers=1, dropout=0.0, bidirectional=False, epochs=10, train_batch_size=32, eval_batch_size=64, cross_entropy_class_weight=None, log_interval=10, save_strategy='best', save_dir=None, lr=0.001, weight_decay=0.0, num_workers=4, seed=42, **{'Session.signature_scheme': '"hmac-sha256"', 'Session.key': 'b"38786ab2-19df-4f95-8885-1b584cd50b4a"'})


In [3]:
BATCH_SIZE = 6
BATCH_SEQ_LEN = 10
FEAT_DIM = 8
SEQ_LENS_TRAIN = torch.tensor([6, 3, 5, 10, 4, 2], dtype=torch.long)
SEQ_LENS_VAL = torch.tensor([4, 2, 7, 8, 9, 10], dtype=torch.long)
SEQ_LENS_TEST = torch.tensor([5, 3, 6, 9, 5, 10], dtype=torch.long)
torch.manual_seed(42)

embeddings_train = torch.randn(BATCH_SIZE, 1, BATCH_SEQ_LEN, FEAT_DIM)
labels_train = torch.randint(0, 2, (BATCH_SIZE, ), dtype=torch.long).unsqueeze(-1).expand(BATCH_SIZE, BATCH_SEQ_LEN)
seq_lens = [list(range(SEQ_LENS_TRAIN[i])) for i in range(BATCH_SIZE)]
embeddings_train = [embeddings[:, :SEQ_LENS_TRAIN[i]] for i, embeddings in enumerate(embeddings_train)]
train_data = {'embeddings': embeddings_train, 'labels': labels_train, 'seq_num': seq_lens}

embeddings_val = torch.randn(BATCH_SIZE, 1, BATCH_SEQ_LEN, FEAT_DIM)
labels_val = torch.randint(0, 2, (BATCH_SIZE, ), dtype=torch.long).unsqueeze(-1).expand(BATCH_SIZE, BATCH_SEQ_LEN)
seq_lens = [list(range(SEQ_LENS_VAL[i])) for i in range(BATCH_SIZE)]
embeddings_val = [embeddings[:, :SEQ_LENS_VAL[i]] for i, embeddings in enumerate(embeddings_val)]
val_data = {'embeddings': embeddings_val, 'labels': labels_val, 'seq_num': seq_lens}

embeddings_test = torch.randn(BATCH_SIZE, 1, BATCH_SEQ_LEN, FEAT_DIM)
labels_test = torch.randint(0, 2, (BATCH_SIZE, ), dtype=torch.long).unsqueeze(-1).expand(BATCH_SIZE, BATCH_SEQ_LEN)
seq_lens = [list(range(SEQ_LENS_TEST[i])) for i in range(BATCH_SIZE)]
embeddings_test = [embeddings[:, :SEQ_LENS_TEST[i]] for i, embeddings in enumerate(embeddings_test)]
test_data = {'embeddings': embeddings_test, 'labels': labels_test, 'seq_num': seq_lens}

datasetDict = get_datasetDict(train_data=train_data, val_data=val_data, test_data=test_data)

In [4]:
datasetDict = datasetDict.map(create_mask)

Map:   0%|          | 0/6 [00:00<?, ? examples/s]

Map:   0%|          | 0/6 [00:00<?, ? examples/s]

Map:   0%|          | 0/6 [00:00<?, ? examples/s]

In [5]:
dataloader = MIDataLoaderModule(args, datasetDict)

In [6]:
trainer = pl.Trainer(accelerator='gpu', devices=1, max_epochs=args.epochs)

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


In [7]:
trainer.max_epochs

10

In [8]:
lightning_module = MILightningModule(args) 

In [9]:
lightning_module

MILightningModule(
  (model): recurrent(
    (model): ModuleList(
      (0): GRU(8, 128, batch_first=True)
      (1): Linear(in_features=128, out_features=1, bias=True)
    )
  )
  (loss): BCEWithLogitsLoss()
)

In [10]:
trainer.fit(lightning_module, datamodule=dataloader)

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1,2]

  | Name  | Type              | Params
--------------------------------------------
0 | model | recurrent         | 53.1 K
1 | loss  | BCEWithLogitsLoss | 0     
--------------------------------------------
53.1 K    Trainable params
0         Non-trainable params
53.1 K    Total params
0.212     Total estimated model params size (MB)


Sanity Checking: 0it [00:00, ?it/s]

  rank_zero_warn(
  batch[k] = torch.cat([torch.tensor(f[k]) for f in features], dim=0)
  rank_zero_warn(
  rank_zero_warn(


Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

`Trainer.fit` stopped: `max_epochs=10` reached.


In [11]:
lightning_module.epoch_loss

{'train': [0.6943069696426392,
  0.6897984147071838,
  0.6854313015937805,
  0.6811696887016296,
  0.6769759058952332,
  0.6728121042251587,
  0.6686407923698425,
  0.6644251942634583,
  0.6601274609565735,
  0.6557108163833618],
 'val': [0.6947422623634338,
  0.6961889863014221,
  0.6977480053901672,
  0.6994255185127258,
  0.7012319564819336,
  0.7031817436218262,
  0.7052913904190063,
  0.7075844407081604,
  0.7100932002067566,
  0.7128605246543884,
  0.7159401178359985],
 'test': []}

In [12]:
tr_dl = dataloader.val_dataloader()

In [18]:
batch = next(iter(tr_dl))

  batch[k] = torch.cat([torch.tensor(f[k]) for f in features], dim=0)


In [19]:
isinstance(batch['mask'], torch.BoolTensor)

True

In [20]:
for k, v in batch.items():
    print (k, v.shape)


embeddings torch.Size([6, 9, 8])
labels torch.Size([6, 10])
seq_num torch.Size([6, 9])
mask torch.Size([6, 9])
