In [1]:
%load_ext autoreload
%autoreload 2

In [1]:
import torch
import numpy as np
import random

from torch.optim import Adam
from torch.optim.lr_scheduler import StepLR
from torch.utils.data import DataLoader
from sklearn.model_selection import train_test_split


from deep_fib.sci_net import SCIBlockCfg, SCINet
from deep_fib.data import DeepFIBDataset, get_masks
from deep_fib.core import DeepFIBEngine

from utils.data import Marconi100Dataset, get_dataset_paths
from utils.training import training_loop
from utils.summary import SummaryWriter

In [11]:
paths = get_dataset_paths("../data")
train, test = train_test_split(paths, test_size=0.1, random_state=42)

In [12]:
dataset = Marconi100Dataset(test, scaling="minmax")
df, lab = dataset[0]

Loading: 100%|██████████| 25/25 [00:51<00:00,  2.07s/it]


In [13]:
dataset = Marconi100Dataset(test, scaling="minmax")
print(min(dataset[i][0].min().min() for i in range(len(dataset))))
    

Loading: 100%|██████████| 25/25 [00:50<00:00,  2.04s/it]


0.0


In [14]:
dataset = Marconi100Dataset(test, scaling=None)
print(min(dataset[i][0].min().min() for i in range(len(dataset))))

Loading: 100%|██████████| 25/25 [00:02<00:00, 10.91it/s]


-72.0


In [11]:
lab.unique()

array([0, 1])

In [12]:
df.shape

(13383, 460)

In [13]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

horizon = 1024
stride = 1000
n_masks = 20

batch_size = 32
num_workers = 2

num_encoder_levels = 2

log_dir = "./trash"
lr = 1e-3
num_epochs = 3
step_size = 2

hidden = None
block_cfg = SCIBlockCfg(input_dim=460, hidden_size=4, kernel_size=3, dropout=0.5,)

anomaly_threshold = 0.7

cpu


In [37]:
dataset_train = DeepFIBDataset(
    dataset, horizon=horizon, stride=stride
)
dataset_test = DeepFIBDataset(
    dataset, horizon=horizon, stride=stride
)

In [21]:
masks = get_masks(horizon, n_masks).float()
print(masks.shape)

torch.Size([20, 1024, 460])


In [38]:
train_loader = DataLoader(
    dataset_train,
    batch_size,
    shuffle=True,
    num_workers=num_workers,
    persistent_workers=(num_workers != 0),
)

test_loader = DataLoader(
    dataset_test,
    batch_size,
    shuffle=False,
    num_workers=num_workers,
    persistent_workers=(num_workers != 0),
)

len(train_loader), len(test_loader)

(2, 2)

In [40]:
model = SCINet(
    output_len=horizon,
    input_len=horizon,
    num_encoder_levels=num_encoder_levels,
    hidden_decoder_sizes=hidden,
    block_config=block_cfg,
).float()

engine = DeepFIBEngine(anomaly_threshold, masks)

optim = Adam(model.parameters(), lr=lr)
lr_sched = StepLR(optim, step_size)

with SummaryWriter(log_dir) as writer:
    training_loop(
        model=model,
        engine=engine,
        num_epochs=num_epochs,
        train_dataloader=train_loader,
        test_dataloader=test_loader,
        device=device,
        optimizer=optim,
        lr_scheduler=lr_sched,
        writer=writer,
        save_path=log_dir + "/models",
    )

                                                                                

TypeError: 'builtin_function_or_method' object does not support item assignment

In [6]:
import torch

tensor = torch.ones(df.shape)
print(tensor.shape)

torch.Size([13195, 460])


In [23]:
def generate_masks(M):
    masks = []
    for i in range(0, M):
        x = tensor.clone().detach()

        #get column indices for the features to mask
        col_idxs = np.random.choice(range(0, df.shape[1]), 10, replace=False)
        #get row indices for the samples to mask
        row_idxs = np.random.choice(range(0, df.shape[0]), 10, replace=False)
        if i == 0:
            print("col_idxs: ", col_idxs)
            print("row_idxs: ", row_idxs)
            for i in row_idxs:
                for j in col_idxs:
                    x[i][j] = 0

        mask = x > 0
        masks.append(mask)
    return masks

def generate_mask_V1(shape, n):
    
    # inefficiente
    n_mask = int(np.prod(shape) / n)
    
    mask = torch.ones(shape)
    print(np.count_nonzero(mask == 0), n_mask)
    
    while np.count_nonzero(mask == 0) < n_mask:
        
        # get column index for the element to mask
        col_idx = np.random.choice(range(shape[1]), replace=False)
        # get row index for the element to mask
        row_idx = np.random.choice(range(shape[0]), replace=False)
        mask[row_idx, col_idx] = 0
        
    return mask

def generate_mask_V2(shape, n):
    
    # molto più efficiente
    
    mask = torch.ones(np.prod(shape))
    
    n_mask = int(np.prod(shape) / n)
    
    # make sure as many as n_mask samples are masked
    mask[:n_mask] = 0
    
    mask = torch.reshape(mask, shape)
    
    # permute rows
    mask = np.random.permutation(mask)
    
    # permute columns
    mask = mask[:, np.random.permutation(mask.shape[1])]
    
    return mask

def generate_mask_V3(shape, n):
    
    # un po' più efficiente
    mask = np.ones(np.prod(shape))
    n_mask = int(np.prod(shape) / n)
    # make sure as many as n_mask samples are masked
    mask[:n_mask] = 0

    # permute elements
    mask = np.random.permutation(mask)

    # reshape to the input shape
    mask = torch.tensor(mask).reshape(shape)
    
    return mask

In [15]:
generate_mask_V3((5, 5), 2)

tensor([[1., 1., 1., 1., 0.],
        [0., 1., 1., 1., 0.],
        [0., 0., 1., 0., 1.],
        [0., 0., 0., 0., 0.],
        [1., 1., 1., 0., 1.]], dtype=torch.float64)

In [73]:
def generate_non_overlapping_masks(shape, n):
    masks = []
    prod = np.prod(shape)
    n_mask = int(prod / n)
    # set are much more efficient at removing
    not_used = set(i for i in range(prod))

    while len(masks) < n:
        mask = np.ones(prod)
        # choose from the aviable indices
        idxs = np.random.choice(tuple(not_used), n_mask, replace=False)
        # set to 0
        mask[idxs] = 0
        # mark as used
        not_used = not_used.difference(idxs)
        # reshape to the input shape
        mask = torch.tensor(mask).reshape(shape)
        masks.append(mask)
    
    masks = torch.stack(masks)
    assert ((masks.size(0) - masks.sum(0)) <= 1).all(), "Something wrong"
    return masks

In [76]:
%%timeit
s = set(range(100))
c = list(range(0, 100, 3))

for _ in range(100):
    a = s.difference(c)

218 µs ± 23.4 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


In [81]:
%%timeit
s = set(range(100))
c = list(range(0, 100, 3))

for _ in range(100):
    s2 = s.copy()
    for i in c:
        s2.remove(i)

526 µs ± 125 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


In [82]:
%%timeit
s = list(range(100))
c = list(range(0, 100, 3))

for _ in range(100):
    s2 = s.copy()
    for i in c:
        s2.remove(i)

2.3 ms ± 175 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [74]:
masks = generate_non_overlapping_masks((1024, 416), 20)

In [67]:
masks.size()

torch.Size([20, 1024, 416])

In [83]:
tensor.shape

torch.Size([13195, 460])

In [87]:
res = tensor.unfold(0, 1024, 512).permute(0, 2, 1)
print(res.shape)

torch.Size([24, 1024, 460])


In [111]:
masks = generate_mask_V1(tensor.shape, 2000)
# first_mask = masks[0]
# print indices of holes
# print(torch.logical_not(first_mask).nonzero())

CPU times: user 6 µs, sys: 1 µs, total: 7 µs
Wall time: 11.4 µs
0 3078


In [115]:
mask = generate_mask_V2(tensor.shape, 2000)
print(np.count_nonzero(mask == 0))
print(np.count_nonzero(mask == 0) + np.count_nonzero(mask) == np.prod(mask.shape))
# print indices of holes
# print(torch.logical_not(mask).nonzero())

CPU times: user 9 µs, sys: 0 ns, total: 9 µs
Wall time: 15.7 µs
tensor([1., 1., 1.,  ..., 1., 1., 1.])
tensor([0., 0., 0.,  ..., 1., 1., 1.])
tensor([[0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        ...,
        [1., 1., 1.,  ..., 1., 1., 1.],
        [1., 1., 1.,  ..., 1., 1., 1.],
        [1., 1., 1.,  ..., 1., 1., 1.]])
3078
True


In [4]:
from deep_fib.data import unfolded_indexes

idxs = unfolded_indexes(dataset, 1024, 512)
for _, (s, e) in idxs:
    assert e - s == 1024
len(idxs)

79

In [5]:
idxs0 = list(filter(lambda d: d[0] == 0, idxs))
print(dataset[0][0].shape)
idxs0

(13195, 460)


[(0, (0, 1024)),
 (0, (512, 1536)),
 (0, (1024, 2048)),
 (0, (1536, 2560)),
 (0, (2048, 3072)),
 (0, (2560, 3584)),
 (0, (3072, 4096)),
 (0, (3584, 4608)),
 (0, (4096, 5120)),
 (0, (4608, 5632)),
 (0, (5120, 6144)),
 (0, (5632, 6656)),
 (0, (6144, 7168)),
 (0, (6656, 7680)),
 (0, (7168, 8192)),
 (0, (7680, 8704)),
 (0, (8192, 9216)),
 (0, (8704, 9728)),
 (0, (9216, 10240)),
 (0, (9728, 10752)),
 (0, (10240, 11264)),
 (0, (10752, 11776)),
 (0, (11264, 12288)),
 (0, (11776, 12800)),
 (0, (12170, 13194))]

In [30]:
masks = np.random.rand(20, 5, 5)

random_sample = random.choices(masks, k = 32)
print(len(random_sample))

32


In [41]:
inputs = [np.ones((5, 5)) for i in range(0, 32)]
len(inputs)

32

In [42]:
print(inputs[0])

[[1. 1. 1. 1. 1.]
 [1. 1. 1. 1. 1.]
 [1. 1. 1. 1. 1.]
 [1. 1. 1. 1. 1.]
 [1. 1. 1. 1. 1.]]


In [43]:
inputs = np.array(inputs)
print(inputs.shape)

(32, 5, 5)


In [44]:
print(len(inputs))

32


In [50]:
MASK = -1
masked_inputs = []
for (mask, elem) in zip(random_sample, inputs):
    masked_input = elem.copy()
    masked_input[mask > 0.5] = MASK
    masked_inputs.append(masked_input)
    
print(len(masked_inputs))
print(masked_inputs[0])

32
[[ 1.  1.  1.  1. -1.]
 [-1.  1.  1. -1.  1.]
 [-1.  1. -1. -1. -1.]
 [-1.  1.  1.  1.  1.]
 [ 1. -1.  1.  1. -1.]]
