In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import utils.data as data

# Obtain the two partitions returned by the project helper.
# NOTE(review): presumably 0.01 is the test fraction and 42 the random seed —
# confirm against utils.data.get_train_test_split.
train, test = data.get_train_test_split(0.01, 42)

In [3]:
# Wrap the `test` partition in the project dataset class.
dataset = data.Marconi100Dataset(test)
# Indexing the dataset yields a pair; presumably (features, labels) for one
# item — the cells below inspect both halves.
df, lab = dataset[0]

Loading: 100%|██████████| 3/3 [00:03<00:00,  1.11s/it]


In [4]:
lab.unique()

array([0, 1])

In [5]:
df.shape

(13195, 460)

In [6]:
import torch

# All-ones placeholder with the same shape as `df`; the mask experiments
# below use it as a stand-in for the real data.
tensor = torch.ones(df.shape)
print(tensor.shape)

torch.Size([13195, 460])


In [7]:
import numpy as np

def generate_masks(M, shape=None, n_rows=10, n_cols=10, verbose=True):
    """Generate ``M`` boolean masks, each with a random block of holes.

    For every mask, ``n_cols`` distinct column indices and ``n_rows`` distinct
    row indices are drawn, and every (row, column) combination of the two sets
    is marked as a hole (``False``). All other entries are ``True``.

    Holes are applied to *every* mask. Previously the hole-punching loop was
    nested under the ``i == 0`` debug branch, so only the first mask contained
    holes, and the inner loop also re-used the outer loop variable ``i``.

    Args:
        M: number of masks to generate.
        shape: 2-D shape of each mask. When ``None`` (the default), a clone of
            the notebook-level ``tensor`` is used as the base, as before.
        n_rows: number of distinct rows that receive holes (default 10).
        n_cols: number of distinct columns that receive holes (default 10).
        verbose: when true, print the indices chosen for the first mask.

    Returns:
        A list of ``M`` boolean tensors.

    Raises:
        ValueError: raised by ``np.random.choice`` when ``n_rows`` or
            ``n_cols`` exceeds the corresponding dimension.
    """
    masks = []
    for mask_idx in range(M):
        if shape is None:
            # Backward-compatible default: start from the global ones tensor.
            base = tensor.clone().detach()
        else:
            base = torch.ones(tuple(shape))

        # get column indices for the features to mask
        col_idxs = np.random.choice(base.shape[1], n_cols, replace=False)
        # get row indices for the samples to mask
        row_idxs = np.random.choice(base.shape[0], n_rows, replace=False)

        if verbose and mask_idx == 0:
            print("col_idxs: ", col_idxs)
            print("row_idxs: ", row_idxs)

        # Broadcasting a (n_rows, 1) index against a (n_cols,) index zeroes the
        # whole row x column grid in one assignment.
        rows = torch.as_tensor(row_idxs, dtype=torch.long).unsqueeze(1)
        cols = torch.as_tensor(col_idxs, dtype=torch.long)
        base[rows, cols] = 0

        masks.append(base > 0)
    return masks

def generate_mask_V1(shape, n):
    """Build a 2-D mask of ones with ``int(prod(shape) / n)`` random zeros.

    Holes are placed one at a time by rejection sampling: a random
    (row, column) position is drawn, and it counts only if that entry has not
    been zeroed already. This is the slow reference implementation.

    Compared with the first draft, the number of holes is tracked with a
    counter instead of re-counting the whole mask on every iteration, and each
    index is drawn with ``np.random.randint`` rather than
    ``np.random.choice(range(...), replace=False)``. The distribution is
    unchanged, but the exact values under a fixed seed differ.

    Args:
        shape: 2-D shape ``(rows, cols)`` of the mask.
        n: one entry out of every ``n`` is masked.

    Returns:
        A ``torch.Tensor`` of the given shape holding ones and zeros.

    Raises:
        ValueError: if more holes are requested than the mask has entries
            (``n < 1``); the loop could otherwise never terminate.
    """
    total = int(np.prod(shape))
    n_mask = int(np.prod(shape) / n)
    if n_mask > total:
        raise ValueError(
            f"cannot mask {n_mask} entries in a mask with only {total} entries"
        )

    mask = torch.ones(shape)
    n_masked = 0
    print(n_masked, n_mask)

    while n_masked < n_mask:
        # get column index for the element to mask
        col_idx = np.random.randint(shape[1])
        # get row index for the element to mask
        row_idx = np.random.randint(shape[0])
        # Only count positions that were not already a hole.
        if mask[row_idx, col_idx].item() != 0:
            mask[row_idx, col_idx] = 0
            n_masked += 1

    return mask

def generate_mask_V2(shape, n):
    """Build a mask of ones with ``int(prod(shape) / n)`` uniformly placed zeros.

    Much faster than ``generate_mask_V1``: the zeros are written into a flat
    vector, the vector is shuffled once, and the result is reshaped.

    The earlier version reshaped first and then permuted whole rows and whole
    columns. Because the zeros started as a contiguous prefix, that left all
    holes packed into roughly ``n_mask / n_cols`` rows instead of scattering
    them over the mask. Shuffling the flat vector fixes this.

    Args:
        shape: shape of the mask (tuple or ``torch.Size``).
        n: one entry out of every ``n`` is masked.

    Returns:
        A float32 ``numpy.ndarray`` of the given shape holding ones and zeros
        (the return type is kept as NumPy for backward compatibility).
    """
    dims = tuple(shape)
    total = int(np.prod(dims))
    n_mask = int(np.prod(dims) / n)

    flat = np.ones(total, dtype=np.float32)
    # make sure as many as n_mask samples are masked
    flat[:n_mask] = 0

    # scatter the holes uniformly over all positions
    np.random.shuffle(flat)

    return flat.reshape(dims)

def generate_mask_V3(shape, n):
    """Return a tensor of ones in which ``int(prod(shape) / n)`` entries are zero.

    Slightly more efficient variant: a flat vector is filled with the required
    number of zeros, its elements are randomly permuted, and the result is
    converted to a tensor and reshaped to ``shape``.

    Args:
        shape: shape of the mask to produce.
        n: one entry out of every ``n`` is masked.

    Returns:
        A ``torch.Tensor`` of the given shape (float64, as produced by
        ``np.ones``) holding ones and zeros.
    """
    total = np.prod(shape)
    hole_count = int(total / n)

    # Flat vector whose leading ``hole_count`` entries are the holes.
    flat = np.ones(total)
    flat[:hole_count] = 0

    # Shuffle the individual elements, then restore the requested shape.
    shuffled = np.random.permutation(flat)
    return torch.tensor(shuffled).reshape(shape)

In [15]:
generate_mask_V3((5, 5), 2)

tensor([[1., 1., 1., 1., 0.],
        [0., 1., 1., 1., 0.],
        [0., 0., 1., 0., 1.],
        [0., 0., 0., 0., 0.],
        [1., 1., 1., 0., 1.]], dtype=torch.float64)

In [73]:
def generate_non_overlapping_masks(shape, n):
    """Generate ``n`` masks whose holes never overlap.

    Each mask is a tensor of ones with ``int(prod(shape) / n)`` zeros. All
    positions are permuted once and consecutive chunks of the permutation are
    assigned to consecutive masks, so no position is zeroed in more than one
    mask. This replaces the earlier approach of repeatedly converting a set of
    remaining indices to a tuple and sampling from it. The result is still a
    uniformly random disjoint assignment, but the exact values under a fixed
    seed differ.

    Args:
        shape: shape of a single mask (tuple, ``torch.Size`` or int).
        n: number of masks; also, one entry out of every ``n`` is masked in
            each of them.

    Returns:
        A float64 ``torch.Tensor`` of shape ``(n, *shape)``.

    Raises:
        ValueError: if ``n`` is not a positive integer.
    """
    if n < 1 or int(n) != n:
        raise ValueError("n must be a positive integer")
    n = int(n)

    dims = tuple(int(d) for d in np.atleast_1d(shape))
    total = int(np.prod(dims))
    n_mask = int(total / n)

    # One permutation of all positions; mask k owns the k-th chunk of it.
    hole_positions = np.random.permutation(total)[: n * n_mask]
    owner = np.repeat(np.arange(n), n_mask)

    flat = np.ones((n, total))
    # set to 0
    flat[owner, hole_positions] = 0

    # reshape every mask to the input shape
    masks = torch.from_numpy(flat).reshape((n, *dims))
    # Each position may be a hole in at most one mask.
    assert ((masks.size(0) - masks.sum(0)) <= 1).all(), "Something wrong"
    return masks

In [76]:
%%timeit
# Benchmark 1: drop the elements of `c` from a set with set.difference, which
# returns a new set and leaves `s` untouched (so no copy is needed per round).
# Note that the two setup lines below are part of the timed cell body too.
s = set(range(100))
c = list(range(0, 100, 3))

for _ in range(100):
    a = s.difference(c)

218 µs ± 23.4 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


In [81]:
%%timeit
# Benchmark 2: same removal, but done element by element with set.remove on a
# fresh copy of the set each round (remove mutates in place).
s = set(range(100))
c = list(range(0, 100, 3))

for _ in range(100):
    s2 = s.copy()
    for i in c:
        s2.remove(i)

526 µs ± 125 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


In [82]:
%%timeit
# Benchmark 3: same element-by-element removal, but on a list instead of a
# set; list.remove has to search the list for each value.
s = list(range(100))
c = list(range(0, 100, 3))

for _ in range(100):
    s2 = s.copy()
    for i in c:
        s2.remove(i)

2.3 ms ± 175 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [74]:
masks = generate_non_overlapping_masks((1024, 416), 20)

In [67]:
masks.size()

torch.Size([20, 1024, 416])

In [83]:
tensor.shape

torch.Size([13195, 460])

In [87]:
# Slice dimension 0 into windows of 1024 rows taken every 512 rows (so
# consecutive windows overlap by half). `unfold` puts the window dimension
# last, hence the permute to get (window, row-in-window, column).
# Only complete windows are produced; trailing rows that do not fill a whole
# window are not included.
res = tensor.unfold(0, 1024, 512).permute(0, 2, 1)
print(res.shape)

torch.Size([24, 1024, 460])


In [111]:
# Mask one entry out of every 2000 using the slow reference implementation.
# NOTE(review): generate_mask_V1 returns a single 2-D mask, not a list of
# masks, so the commented-out `masks[0]` below would select its first row.
masks = generate_mask_V1(tensor.shape, 2000)
# first_mask = masks[0]
# print indices of holes
# print(torch.logical_not(first_mask).nonzero())

CPU times: user 6 µs, sys: 1 µs, total: 7 µs
Wall time: 11.4 µs
0 3078


In [115]:
# Mask one entry out of every 2000 using the faster implementation.
mask = generate_mask_V2(tensor.shape, 2000)
# Number of holes (zero entries) in the mask.
print(np.count_nonzero(mask == 0))
# Sanity check: zero and non-zero entries together account for every element.
print(np.count_nonzero(mask == 0) + np.count_nonzero(mask) == np.prod(mask.shape))
# print indices of holes
# NOTE(review): generate_mask_V2 returns a NumPy array, so the torch call
# below would need the mask converted to a tensor first.
# print(torch.logical_not(mask).nonzero())

CPU times: user 9 µs, sys: 0 ns, total: 9 µs
Wall time: 15.7 µs
tensor([1., 1., 1.,  ..., 1., 1., 1.])
tensor([0., 0., 0.,  ..., 1., 1., 1.])
tensor([[0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        ...,
        [1., 1., 1.,  ..., 1., 1., 1.],
        [1., 1., 1.,  ..., 1., 1., 1.],
        [1., 1., 1.,  ..., 1., 1., 1.]])
3078
True


In [4]:
from deep_fib.data import unfolded_indexes

# Compute window indices over the dataset. Presumably 1024 is the window
# length and 512 the stride, mirroring the unfold call used earlier — confirm
# against deep_fib.data.unfolded_indexes.
idxs = unfolded_indexes(dataset, 1024, 512)
# Each entry unpacks as (something, (start, end)); check that every window
# spans exactly 1024 positions.
for _, (s, e) in idxs:
    assert e - s == 1024
len(idxs)

79

In [5]:
# Keep only the windows whose first field is 0 (presumably the windows that
# belong to dataset item 0 — confirm).
idxs0 = list(filter(lambda d: d[0] == 0, idxs))
print(dataset[0][0].shape)
# NOTE(review): the recorded output ends with the window (12170, 13194) while
# the item has 13195 rows. If `end` is exclusive, the final row is never
# covered — confirm whether that is intended.
idxs0

(13195, 460)


[(0, (0, 1024)),
 (0, (512, 1536)),
 (0, (1024, 2048)),
 (0, (1536, 2560)),
 (0, (2048, 3072)),
 (0, (2560, 3584)),
 (0, (3072, 4096)),
 (0, (3584, 4608)),
 (0, (4096, 5120)),
 (0, (4608, 5632)),
 (0, (5120, 6144)),
 (0, (5632, 6656)),
 (0, (6144, 7168)),
 (0, (6656, 7680)),
 (0, (7168, 8192)),
 (0, (7680, 8704)),
 (0, (8192, 9216)),
 (0, (8704, 9728)),
 (0, (9216, 10240)),
 (0, (9728, 10752)),
 (0, (10240, 11264)),
 (0, (10752, 11776)),
 (0, (11264, 12288)),
 (0, (11776, 12800)),
 (0, (12170, 13194))]

In [5]:
import random

# Candidate mask indices to sample from.
# NOTE(review): range(0, 19) yields 0..18 (19 values). The mask stack built
# earlier in this notebook holds 20 masks, so index 19 could never be drawn —
# confirm whether range(0, 20) was intended. This line also rebinds the name
# `masks`, which earlier cells use for mask tensors.
masks = range(0, 19)

# Draw 32 mask indices with replacement (random.choices allows repeats);
# presumably one index per element of a batch of 32.
random_sample = random.choices(masks, k = 32)
print(random_sample)

[7, 6, 12, 6, 12, 17, 12, 0, 4, 3, 4, 10, 16, 16, 0, 7, 2, 11, 4, 0, 10, 7, 0, 12, 14, 7, 4, 18, 16, 13, 14, 13]


In [15]:
inputs = [np.ones((5, 5)) for i in range(0, 32)]
len(inputs)

32

In [16]:
print(inputs[0])

[[1. 1. 1. 1. 1.]
 [1. 1. 1. 1. 1.]
 [1. 1. 1. 1. 1.]
 [1. 1. 1. 1. 1.]
 [1. 1. 1. 1. 1.]]


In [18]:
inputs = np.array(inputs)
print(inputs.shape)

(32, 5, 5)


In [19]:
print(len(inputs))

32


In [20]:
# Pair every sampled mask index with one input and count the pairs, to check
# that zip covers all inputs. `mask` is the sampled index here, not a mask
# tensor, and `elem` is not used yet: nothing is actually applied.
count = 0
for (mask, elem) in zip(random_sample, inputs):
    print("apply mask: ", mask)
    count += 1
print(count)

apply mask:  7
apply mask:  6
apply mask:  12
apply mask:  6
apply mask:  12
apply mask:  17
apply mask:  12
apply mask:  0
apply mask:  4
apply mask:  3
apply mask:  4
apply mask:  10
apply mask:  16
apply mask:  16
apply mask:  0
apply mask:  7
apply mask:  2
apply mask:  11
apply mask:  4
apply mask:  0
apply mask:  10
apply mask:  7
apply mask:  0
apply mask:  12
apply mask:  14
apply mask:  7
apply mask:  4
apply mask:  18
apply mask:  16
apply mask:  13
apply mask:  14
apply mask:  13
32
