In [1]:
%load_ext autoreload
%autoreload 2

import os
import torch 
import numpy as np
import pickle
from torch.utils.data import IterableDataset, DataLoader
from models import TransactionsModel
from data_generators import batches_generator, cat_features_names, num_features_names, meta_features_names

from embedding import EmbeddingLayer
from ptls.frames import PtlsDataModule
from ptls.frames.bert import MLMPretrainModule

from ptls.nn import RnnSeqEncoder
from ptls.nn import TransformerEncoder
from functools import partial
from collections import namedtuple

# Prefer the GPU when CUDA is usable, otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

In [2]:
def _read_pickle(path):
    """Open `path` in binary mode and return the unpickled object."""
    with open(path, 'rb') as handle:
        return pickle.load(handle)


# Projection tables stored as pickles under ./assets.
# The numeric and categorical ones are passed to the embedding layers below.
num_embedding_projections = _read_pickle('./assets/num_embedding_projections.pkl')
cat_embedding_projections = _read_pickle('./assets/cat_embedding_projections.pkl')
meta_embedding_projections = _read_pickle('./assets/meta_embedding_projections.pkl')

In [3]:
class PaddedBatch:
    """Minimal batch container exposing `payload` and `seq_lens`.

    Every row is treated as full length: `seq_lens` holds `data.shape[1]`
    repeated `data.shape[0]` times.

    Args:
        data: tensor whose first two dimensions are indexed as
            (rows, sequence positions).
    """
    def __init__(self, data):
        self.payload = data
        n_rows, n_steps = data.shape[0], data.shape[1]
        # Keep the lengths on the payload's own device instead of a notebook
        # global, so both tensors can always be used together.
        self.seq_lens = torch.full((n_rows,), n_steps, dtype=torch.long, device=data.device)
        
class IterDataset(IterableDataset):
    """Iterable dataset that streams batches produced by `batches_generator`.

    Args:
        dataset_train: value forwarded to `batches_generator` as its first
            argument (the notebook passes a list of bucket file paths).
        batch_size: batch size forwarded to `batches_generator`.
        device: device forwarded to `batches_generator`.
    """
    def __init__(self, dataset_train, batch_size=64, device='cuda'):
        self.data = dataset_train
        self.batch_size = batch_size
        self.device = device

    def foo(self):
        """Create a fresh batch generator over `self.data`.

        This is a regular method rather than a lambda stored on the instance,
        so the dataset stays picklable (needed by multi-process DataLoader
        workers). The name `foo` is kept so existing `dataset.foo()` calls
        keep working.
        """
        return batches_generator(
            self.data,
            batch_size=self.batch_size,
            shuffle=True,
            device=self.device,
            is_train=True,
            output_format='torch',
            min_seq_len=200,
        )

    def __iter__(self):
        # Each iteration request starts a new pass over the data.
        return self.foo()

In [4]:
# Directory holding the training bucket files (machine-specific absolute path).
path_to_dataset = '/home/jovyan/afilatov/data/alfa/train_buckets'

dir_with_datasets = os.listdir(path_to_dataset)
# Only the first bucket in lexicographic order is used for training here.
dataset_train = [
    os.path.join(path_to_dataset, bucket_name)
    for bucket_name in sorted(dir_with_datasets)[:1]
]

#train_dataloader = batches_generator(dataset_train, batch_size=64, shuffle=True,
#                                            device=device, is_train=True, output_format='torch', min_seq_len=200)

In [5]:
# Streaming dataset over the selected buckets; batch size and device use the class defaults.
dataset = IterDataset(dataset_train=dataset_train)

In [6]:
class PtlsEmbeddingLayer(EmbeddingLayer):
    """`EmbeddingLayer` variant whose forward output is wrapped in a `PaddedBatch`.

    Args:
        splitter: object stored on the instance as `self.splitter`.
        *args, **kwargs: forwarded unchanged to `EmbeddingLayer.__init__`.

    Attributes set here:
        splitter: the splitter passed in.
        output_size: value returned by `get_embedding_size()` at construction.
    """

    def __init__(self, splitter, *args, **kwargs):
        # The splitter is attached before the parent constructor runs.
        self.splitter = splitter
        super().__init__(*args, **kwargs)
        self.output_size = self.get_embedding_size()

    def forward(self, x):
        """Run the parent forward pass and wrap its result in a `PaddedBatch`."""
        return PaddedBatch(super().forward(x))

In [7]:
class MySampleUniform:
    """
    Deterministic splitter producing `split_count` index windows of length `seq_len`.

    Window start positions are spread evenly (with rounding) from the beginning
    of the sequence to the last position where a full window still fits:
    |---------------------|       whole sequence
    |------|              |        window 1
    |    |------|         |        window 2
    |         |------|    |        window 3
    |              |------|        window 4
    Nothing here is random, so the same input length always yields the same windows.
    When the input length is not greater than `seq_len + split_count`, every
    window is the full index range instead.
    """
    def __init__(self, split_count, seq_len, **_):
        self.seq_len = seq_len
        self.split_count = split_count

    def split(self, dates):
        """Return a list of `split_count` index arrays over `range(dates.shape[0])`."""
        total = dates.shape[0]
        positions = np.arange(total)

        if total > self.seq_len + self.split_count:
            last_start = total - self.seq_len
            starts = np.linspace(0, last_start, self.split_count).round().astype(int)
            windows = []
            for begin in starts:
                windows.append(positions[begin:begin + self.seq_len])
            return windows

        # Too short to cut distinct windows: hand back the whole range each time.
        return [positions] * self.split_count

In [8]:
def split_process(batch, splitter):
    """Cut every multi-tensor feature list in `batch` into sub-sequences.

    `splitter.split(batch['event_time'])` supplies a list of index sets. For
    each value in `batch` that is a list with more than one element, every
    element is indexed along dimension 1 with each index set. The resulting
    slices are zero-padded to a common width and concatenated along
    dimension 0.

    Args:
        batch: dict of features; must contain the key 'event_time'.
        splitter: object with a `split(dates)` method returning index sets.

    Returns:
        A new dict with the same keys. Multi-element lists are replaced by
        lists of the stacked slices; everything else is passed through as is.
    """
    indexes = splitter.split(batch['event_time'])

    res = {}
    for key, value in batch.items():
        # NOTE(review): single-element lists are intentionally left untouched,
        # matching the original `len(v) > 1` condition - confirm this is wanted.
        if isinstance(value, list) and len(value) > 1:
            res[key] = [_stack_slices(elem, indexes) for elem in value]
        else:
            res[key] = value
    return res


def _stack_slices(tensor, indexes):
    """Slice `tensor` along dim 1 per index set, pad to equal width, stack on dim 0."""
    pieces = [tensor[:, ix] for ix in indexes]
    # Pad to the widest slice, so a longer later slice no longer breaks the concat.
    width = max((piece.shape[1] for piece in pieces), default=0)

    padded = []
    for piece in pieces:
        missing = width - piece.shape[1]
        if missing > 0:
            pad_shape = list(piece.shape)
            pad_shape[1] = missing
            # new_zeros keeps the slice's dtype and device, so integer
            # (categorical) features are not promoted to float by the padding.
            piece = torch.cat([piece, piece.new_zeros(pad_shape)], dim=1)
        padded.append(piece)
    return torch.cat(padded, dim=0)

def my_collate_fn(batch, splitter):
    """Collate function for a DataLoader whose dataset already yields whole batches.

    Takes the single pre-built batch out of the one-element list given by the
    DataLoader, splits it into sub-sequences with `split_process`, and builds
    a label for every resulting row. Rows cut from the same original row share
    a label.

    Args:
        batch: sequence whose first element is the feature dict; the dict must
            contain 'num_features', a list of tensors.
        splitter: splitter object forwarded to `split_process`.

    Returns:
        Tuple `(batch, labels)`: the split feature dict and a 1-D tensor of
        labels, `arange(original_rows)` repeated once per split.
    """
    batch = batch[0]
    len_batch = batch['num_features'][0].shape[0]
    batch = split_process(batch, splitter)

    # Derive the number of splits from how much the row count actually grew,
    # rather than hard-coding 5, so labels stay aligned for any splitter.
    rows_after_split = batch['num_features'][0].shape[0]
    n_splits = rows_after_split // len_batch if len_batch else 0

    labels = torch.arange(len_batch).repeat(n_splits)
    return batch, labels

### COLES

In [9]:
from ptls.data_load.datasets import MemoryMapDataset
from ptls.data_load.iterable_processing import SeqLenFilter
from ptls.frames.coles.split_strategy import SampleSlices
from ptls.frames import PtlsDataModule

from ptls.frames.coles import CoLESModule
from ptls.data_load.utils import collate_feature_dict

In [11]:
# CoLES setup: splitter -> embedding layer -> GRU sequence encoder -> Lightning module.
# Modules are moved with `.to(device)` so they follow the device chosen in the
# first cell instead of requiring CUDA unconditionally.
coles_splitter = MySampleUniform(
    split_count=5,
    seq_len=100,
)

ptls_emb_layer = PtlsEmbeddingLayer(
    coles_splitter,
    cat_embedding_projections,
    cat_features_names,
    num_embedding_projections,
    num_features_names,
).to(device)

seq_encoder = RnnSeqEncoder(
    input_size=ptls_emb_layer.get_embedding_size(),
    trx_encoder=ptls_emb_layer,
    hidden_size=256,
    type='gru',
)

model = CoLESModule(
    seq_encoder=seq_encoder,
    optimizer_partial=partial(torch.optim.Adam, lr=0.001),
    lr_scheduler_partial=partial(torch.optim.lr_scheduler.StepLR, step_size=5, gamma=0.9),
).to(device)

In [12]:
# The dataset already yields complete batches, so the loader uses batch_size=1
# and the collate function unwraps that single element.
coles_dataloader = DataLoader(
    dataset=dataset,
    batch_size=1,
    num_workers=0,
    collate_fn=partial(my_collate_fn, splitter=coles_splitter),
)

In [13]:
import pytorch_lightning as pl
import logging

# Single-GPU trainer. `accelerator`/`devices` replace the deprecated `gpus=1`
# argument, which newer PyTorch Lightning releases no longer accept.
trainer = pl.Trainer(
    max_epochs=15,
    accelerator='gpu',
    devices=1,
)

GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


In [14]:
# Train CoLES; no validation loader is supplied. (`%debug` can be run after a failure.)
trainer.fit(model, train_dataloaders=coles_dataloader)

  rank_zero_warn("You defined a `validation_step` but have no `val_dataloader`. Skipping val loop.")
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name               | Type            | Params
-------------------------------------------------------
0 | _loss              | ContrastiveLoss | 0     
1 | _seq_encoder       | RnnSeqEncoder   | 352 K 
2 | _validation_metric | BatchRecallTopK | 0     
3 | _head              | Head            | 0     
-------------------------------------------------------
352 K     Trainable params
0         Non-trainable params
352 K     Total params
1.409     Total estimated model params size (MB)


Training: 0it [00:00, ?it/s]

  rank_zero_warn("Detected KeyboardInterrupt, attempting graceful shutdown...")


### CPC

In [15]:
from ptls.frames.cpc import CpcModule
from ptls.frames.coles.split_strategy import SampleSlices, SampleUniformBySplitCount

In [16]:
# CPC setup: splitter -> embedding layer -> GRU sequence encoder -> Lightning module.
# Modules are moved with `.to(device)` so they follow the device chosen in the
# first cell instead of requiring CUDA unconditionally.
cpc_splitter = SampleUniformBySplitCount(split_count=5)  # splitter should preserve order in samples

cpc_ptls_emb_layer = PtlsEmbeddingLayer(
    cpc_splitter,
    cat_embedding_projections,
    cat_features_names,
    num_embedding_projections,
    num_features_names,
).to(device)

cpc_seq_encoder = RnnSeqEncoder(
    input_size=cpc_ptls_emb_layer.get_embedding_size(),
    trx_encoder=cpc_ptls_emb_layer,
    hidden_size=256,
    type='gru',
)

cpc_model = CpcModule(
    seq_encoder=cpc_seq_encoder,
    optimizer_partial=partial(torch.optim.Adam, lr=0.001),
    lr_scheduler_partial=partial(torch.optim.lr_scheduler.StepLR, step_size=5, gamma=0.9),
).to(device)

In [17]:
# The dataset already yields complete batches, so the loader uses batch_size=1
# and the collate function unwraps that single element.
cpc_dataloader = DataLoader(
    dataset=dataset,
    batch_size=1,
    num_workers=0,
    collate_fn=partial(my_collate_fn, splitter=cpc_splitter),
)

In [18]:
import pytorch_lightning as pl

# Single-GPU trainer. `accelerator`/`devices` replace the deprecated `gpus=1`
# argument, which newer PyTorch Lightning releases no longer accept.
cpc_trainer = pl.Trainer(
    max_epochs=15,
    accelerator='gpu',
    devices=1,
)

GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


In [19]:
# Train CPC; no validation loader is supplied. (`%debug` can be run after a failure.)
cpc_trainer.fit(cpc_model, train_dataloaders=cpc_dataloader)

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name               | Type          | Params
-----------------------------------------------------
0 | _loss              | CPC_Loss      | 0     
1 | _seq_encoder       | RnnSeqEncoder | 352 K 
2 | _validation_metric | CpcAccuracy   | 0     
3 | _linears           | ModuleList    | 280 K 
-----------------------------------------------------
632 K     Trainable params
0         Non-trainable params
632 K     Total params
2.532     Total estimated model params size (MB)


Training: 0it [00:00, ?it/s]