## Testing the dataset module

In [1]:
from src import get_dataset, create_mask, default_collate_fn
import torch
from torch.utils.data import DataLoader

In [14]:
# Dimensions of the synthetic batches used throughout this notebook.
BATCH_SIZE = 6
BATCH_SEQ_LEN = 10
FEAT_DIM = 8

# True (unpadded) sequence length of every sample, one tensor per split.
SEQ_LENS_TRAIN = torch.tensor([6, 3, 5, 10, 4, 2], dtype=torch.long)
SEQ_LENS_VAL = torch.tensor([4, 2, 7, 8, 9, 3], dtype=torch.long)
SEQ_LENS_TEST = torch.tensor([5, 3, 6, 9, 5, 7], dtype=torch.long)

# Fixed seed so the random embeddings/labels are reproducible across runs.
torch.manual_seed(42)


def _make_split(lengths):
    """Build one synthetic split with variable-length sequences.

    Draws a padded ``(BATCH_SIZE, 1, BATCH_SEQ_LEN, FEAT_DIM)`` embedding
    tensor and a ``(BATCH_SIZE,)`` vector of binary labels, then trims each
    sample along its sequence axis to the length given in ``lengths``.

    Args:
        lengths: 1-D integer tensor holding ``BATCH_SIZE`` sequence lengths.

    Returns:
        dict with keys ``'embeddings'`` (list of ``(1, length, FEAT_DIM)``
        tensors), ``'labels'`` (long tensor) and ``'seq_num'`` (list of
        ``[0, ..., length - 1]`` lists).
    """
    # Draw order matters: embeddings first, then labels, so the seeded RNG
    # stream is consumed exactly as it would be with the steps written inline.
    padded = torch.randn(BATCH_SIZE, 1, BATCH_SEQ_LEN, FEAT_DIM)
    labels = torch.randint(0, 2, (BATCH_SIZE,), dtype=torch.long)
    per_sample_len = lengths.tolist()
    seq_num = [list(range(n)) for n in per_sample_len]
    embeddings = [sample[:, :n] for sample, n in zip(padded, per_sample_len)]
    return {'embeddings': embeddings, 'labels': labels, 'seq_num': seq_num}


# The splits must be generated in this order (train, val, test) to keep the
# random values stable for the fixed seed above.
train_data = _make_split(SEQ_LENS_TRAIN)
val_data = _make_split(SEQ_LENS_VAL)
test_data = _make_split(SEQ_LENS_TEST)

# Flat aliases kept so any cell that refers to the individual pieces by name
# keeps working; `seq_lens` ends up bound to the test split's positions.
embeddings_train, labels_train = train_data['embeddings'], train_data['labels']
embeddings_val, labels_val = val_data['embeddings'], val_data['labels']
embeddings_test, labels_test = test_data['embeddings'], test_data['labels']
seq_lens = test_data['seq_num']

# Hand the three synthetic splits to the project's dataset builder; the
# result is indexed by split name (e.g. 'train') in the cells below.
datasetDict = get_dataset(
    train_data=train_data,
    val_data=val_data,
    test_data=test_data,
)

In [15]:
# Draw a fresh padded batch and print the shape each sample has once it is
# trimmed to its true length. Note this rebinds `embeddings_train` to the
# full padded tensor (it no longer refers to the trimmed list).
embeddings_train = torch.randn(BATCH_SIZE, 1, BATCH_SEQ_LEN, FEAT_DIM)
for padded_sample, true_len in zip(embeddings_train, SEQ_LENS_TRAIN):
    print(padded_sample[:, :true_len].shape)

torch.Size([1, 6, 8])
torch.Size([1, 3, 8])
torch.Size([1, 5, 8])
torch.Size([1, 10, 8])
torch.Size([1, 4, 8])
torch.Size([1, 2, 8])


In [16]:
# `embeddings_train` was rebound to the padded tensor in the previous cell,
# so sample 1 shows the full BATCH_SEQ_LEN here rather than its true length.
embeddings_train[1].shape

torch.Size([1, 10, 8])

In [17]:
# Show the train split returned by get_dataset (its features and row count).
datasetDict['train']

Dataset({
    features: ['embeddings', 'labels', 'seq_num'],
    num_rows: 6
})

In [18]:
# Row 5 was built with sequence length 2 (SEQ_LENS_TRAIN[5]); wrap the stored
# embeddings in a tensor to confirm the trimmed shape survived the round trip.
torch.tensor(datasetDict['train'][5]['embeddings']).shape

torch.Size([1, 2, 8])

In [19]:
# Run create_mask over every example; one progress bar is shown per split
# (6 examples each). The mapped rows are read back via a 'mask' field below.
datasetDict_mask = datasetDict.map(create_mask)

  0%|          | 0/6 [00:00<?, ?ex/s]

  0%|          | 0/6 [00:00<?, ?ex/s]

  0%|          | 0/6 [00:00<?, ?ex/s]

In [20]:
# Masks of the first four train rows: before batching each mask is all ones
# and exactly as long as that row's sequence (6, 3, 5 and 10 entries).
datasetDict_mask['train'][0:4]['mask']

[[1, 1, 1, 1, 1, 1],
 [1, 1, 1],
 [1, 1, 1, 1, 1],
 [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]

In [21]:
# Batches of three rows, served in dataset order (no shuffling) and assembled
# by the project's collate function.
train_dataloader = DataLoader(
    datasetDict_mask['train'],
    batch_size=3,
    shuffle=False,
    collate_fn=default_collate_fn,
)

In [22]:
# Pull the first batch; with batch_size=3 and shuffle=False this is train
# rows 0-2 (sequence lengths 6, 3 and 5).
nxt = next(iter(train_dataloader))

  batch[k] = torch.cat([torch.tensor(f[k]) for f in features], dim=0)


In [23]:
# Unbatched reference for row 0: positions 0..5 of its length-6 sequence.
datasetDict_mask['train'][0]['seq_num']

[0, 1, 2, 3, 4, 5]

In [24]:
# Collated embeddings: 3 samples, each padded to the longest sequence in the
# batch (6), with FEAT_DIM features per step.
nxt['embeddings'].shape

torch.Size([3, 6, 8])

In [25]:
# Batched mask: 1 on real time steps; padded positions are filled with -1.
nxt['mask']

tensor([[ 1,  1,  1,  1,  1,  1],
        [ 1,  1,  1, -1, -1, -1],
        [ 1,  1,  1,  1,  1, -1]])

In [26]:
# Batched labels: each sample's label appears once per real time step, and
# padded positions are filled with -1.
nxt['labels']

tensor([[ 1,  1,  1,  1,  1,  1],
        [ 1,  1,  1, -1, -1, -1],
        [ 0,  0,  0,  0,  0, -1]])

In [27]:
# Batched positions: 0..len-1 for each sample, padded positions filled with -1.
nxt['seq_num']

tensor([[ 0,  1,  2,  3,  4,  5],
        [ 0,  1,  2, -1, -1, -1],
        [ 0,  1,  2,  3,  4, -1]])

In [28]:
# Last sample of the batch (true length 5): the sixth row is padding and is
# all zeros.
nxt['embeddings'][-1]

tensor([[-1.9006,  0.2286,  0.0249, -0.3460,  0.2868, -0.7308,  0.1748, -1.0939],
        [-1.6022,  1.3529,  1.2888,  0.0523, -1.5469,  0.7567,  0.7755,  2.0265],
        [ 0.0358,  0.1206, -0.8057, -0.2076, -0.9319, -1.5910, -1.1360, -0.5226],
        [-0.5188, -1.5013, -1.9267,  0.1279,  1.0229, -0.5558,  0.7043,  0.7099],
        [ 1.7744, -0.9216,  0.9624, -0.3370, -1.1753,  0.3581,  0.4788,  1.3537],
        [ 0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000]])