In [4]:
# Run the standalone script ptls_run.py, passing `--group coles --splitter slices`.
# NOTE(review): the captured output below prints "Splitter is None." and "The mode is MLM",
# which does not obviously match these flags — confirm the script's argument handling
# or re-run the cell to refresh the output.
!python ptls_run.py --group coles --splitter slices

[34m[1mwandb[0m: Currently logged in as: [33mvasilev-va[0m. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Tracking run with wandb version 0.13.10
[34m[1mwandb[0m: Run data is saved locally in [35m[1m/home/jovyan/v_vasilev/romashka/wandb/run-20230208_221108-dqvs6bqu[0m
[34m[1mwandb[0m: Run [1m`wandb offline`[0m to turn off syncing.
[34m[1mwandb[0m: Syncing run [33mptls-mlm-rnn-splits=5-seqlen=200-bs=64[0m
[34m[1mwandb[0m: ⭐️ View project at [34m[4mhttps://wandb.ai/vasilev-va/romashka[0m
[34m[1mwandb[0m: 🚀 View run at [34m[4mhttps://wandb.ai/vasilev-va/romashka/runs/dqvs6bqu[0m
Splitter is None.
The mode is MLM. Param hidden_size was assigned to output_size of trx_encoder: 182
GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
  rank_zero_warn("You defined a `validation_step` but have no `val_dataloader`. Skipping val loop.")
LOCAL_RANK

In [51]:
%load_ext autoreload
%autoreload 2

import os
import torch 
import numpy as np
import pickle
from torch.utils.data import IterableDataset, DataLoader
from models import TransactionsModel
from data_generators import batches_generator, cat_features_names, num_features_names, meta_features_names

from embedding import EmbeddingLayer
from ptls.frames import PtlsDataModule
from ptls.frames.bert import MLMPretrainModule

from ptls.nn import RnnSeqEncoder
from ptls.nn import TransformerEncoder
from functools import partial
from collections import namedtuple

# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [52]:
def _load_pickle(path):
    """Return the object stored in the pickle file at ``path``."""
    # pickle.load can run arbitrary code while deserializing: only point this at trusted files.
    with open(path, 'rb') as f:
        return pickle.load(f)


# Load the three pickled `*_embedding_projections` objects kept under ./assets.
num_embedding_projections = _load_pickle('./assets/num_embedding_projections.pkl')
cat_embedding_projections = _load_pickle('./assets/cat_embedding_projections.pkl')
meta_embedding_projections = _load_pickle('./assets/meta_embedding_projections.pkl')

In [53]:
class PaddedBatch:
    """Small container that pairs a batch of sequences with its mask.

    Attributes:
        payload: the ``data`` object passed in, stored unchanged.
        seq_lens: 1-D long tensor with one entry per row of ``data``; every entry
            equals ``data.shape[1]`` (the padded length). ``mask`` is NOT used to
            derive per-row lengths.
        seq_len_mask: the ``mask`` object passed in, stored unchanged.

    NOTE(review): presumably a stand-in for the ptls ``PaddedBatch`` interface — confirm
    that reporting the full padded length for every row is intended.
    """

    def __init__(self, data, mask):
        self.payload = data
        batch_size, padded_len = data.shape[0], data.shape[1]
        # Build the lengths on the same device as the payload instead of relying on a
        # module-level `device` global, so the class works wherever `data` lives.
        self.seq_lens = torch.full(
            (batch_size,),
            padded_len,
            dtype=torch.long,
            device=getattr(data, 'device', None),
        )
        self.seq_len_mask = mask
        
class IterDataset(IterableDataset):
    """Iterable dataset that delegates iteration to ``batches_generator``.

    Args:
        dataset_train: passed to ``batches_generator`` as its first argument
            (in this notebook: a list of bucket file paths).
        batch_size: forwarded to ``batches_generator`` as ``batch_size``.
        device: forwarded to ``batches_generator`` as ``device``.

    Each ``iter()`` call creates a new generator with ``shuffle=True``,
    ``is_train=True``, ``output_format='torch'`` and ``min_seq_len=200``.
    """

    def __init__(self, dataset_train, batch_size=64, device='cuda'):
        self.data = dataset_train
        self.batch_size = batch_size
        self.device = device

    def foo(self):
        """Create a new batch generator over ``self.data``.

        This is a regular method rather than a lambda stored on the instance, so the
        dataset stays picklable (needed when a DataLoader uses worker processes).
        The name is kept so existing ``self.foo()`` calls continue to work.
        """
        return batches_generator(
            self.data,
            batch_size=self.batch_size,
            shuffle=True,
            device=self.device,
            is_train=True,
            output_format='torch',
            min_seq_len=200,
        )

    def __iter__(self):
        return self.foo()

In [54]:
# Directory holding the training bucket files. The default is the location used when the
# notebook was written; set the TRAIN_BUCKETS_PATH environment variable to override it
# instead of editing this absolute path.
path_to_dataset = os.environ.get(
    'TRAIN_BUCKETS_PATH',
    '/home/jovyan/afilatov/data/alfa/train_buckets',
)

dir_with_datasets = os.listdir(path_to_dataset)
# Sort for a deterministic order and keep only the first bucket file.
dataset_train = sorted(os.path.join(path_to_dataset, x) for x in dir_with_datasets)[:1]

# Batches are consumed through IterDataset (which calls batches_generator with
# batch_size=64, shuffle=True, is_train=True, output_format='torch', min_seq_len=200)
# rather than by creating the generator directly here.

In [55]:
# Wrap the selected bucket file(s); IterDataset's defaults (batch_size=64, device='cuda') apply.
dataset = IterDataset(dataset_train=dataset_train)

In [56]:
class PtlsEmbeddingLayer(EmbeddingLayer):
    """EmbeddingLayer variant whose forward pass returns a ``PaddedBatch``.

    The first constructor argument is a splitter object that is only stored on the
    instance; all remaining arguments are forwarded to ``EmbeddingLayer``. After
    construction, ``output_size`` holds the value of ``get_embedding_size()``.
    """

    def __init__(self, splitter, *args, **kwargs):
        # Stored before the parent initializer runs, exactly as the parent is then built
        # from the remaining arguments.
        self.splitter = splitter
        super().__init__(*args, **kwargs)
        self.output_size = self.get_embedding_size()

    def forward(self, x):
        """Embed ``x`` with the parent layer and pair the result with ``x['mask']``."""
        seq_mask = x['mask']
        embedded = super().forward(x)
        return PaddedBatch(embedded, seq_mask)

In [57]:
class MySampleUniform:
    """
    Deterministic splitter producing `split_count` windows of `seq_len` positions each.
    Window starts are spread evenly (fixed step) from the beginning of the sequence
    to the last position where a full window still fits:
    |---------------------|       main sequence
    |------|              |        sub seq 1
    |    |------|         |        sub seq 2
    |         |------|    |        sub seq 3
    |              |------|        sub seq 4
    Nothing here is random, so repeated calls give the same windows; this makes the
    splitter usable for test time augmentation during inference.
    If the sequence is not longer than `seq_len + split_count`, every sub sequence
    is simply the full index range.
    """
    def __init__(self, split_count, seq_len, **_):
        self.split_count = split_count
        self.seq_len = seq_len

    def split(self, dates):
        """Return a list of `split_count` index arrays into `dates` (length taken from `dates.shape[0]`)."""
        total = dates.shape[0]
        positions = np.arange(total)

        if total > self.seq_len + self.split_count:
            last_start = total - self.seq_len
            starts = np.round(np.linspace(0, last_start, self.split_count)).astype(int)
            return [positions[begin:begin + self.seq_len] for begin in starts]

        # Short sequence: hand back the same full range once per requested split.
        return [positions] * self.split_count

In [58]:
# Pull one batch from a fresh iterator over `dataset` so its structure can be inspected.
batch = next(iter(dataset))

In [59]:
# Show which fields a batch contains (the batch is a mapping keyed by field name).
batch.keys()

dict_keys(['num_features', 'cat_features', 'mask', 'event_time', 'meta_features', 'label', 'app_id'])

In [60]:
from copy import deepcopy

def split_process(batch, splitter):
    """Cut the list-valued entries of ``batch`` into sub-sequences and stack them.

    Args:
        batch: mapping of field name to value. ``batch['event_time']`` is handed to
            ``splitter.split`` to obtain the index sets.
        splitter: object with a ``split(dates)`` method returning a list of index
            sequences, or ``None`` to leave the batch untouched.

    Returns:
        A new dict with the same keys. For every value that is a ``list`` with more
        than one element, each tensor ``elem`` in it is replaced by the concatenation
        (along dim 0) of ``elem[:, ixs]`` for every index set ``ixs``; shorter pieces
        are right-padded with zeros to the longest index set. All other values are
        passed through unchanged.

    NOTE(review): non-list entries (e.g. ``mask``, ``event_time``) are not split, so
    their shapes no longer line up with the split features — confirm downstream code
    expects that.
    NOTE(review): splitters such as ``MySampleUniform`` read ``dates.shape[0]``; if
    ``event_time`` is laid out as (batch, seq) that is the batch size rather than the
    sequence length — confirm the layout.
    """
    if splitter is None:
        # Nothing to split: same keys and the very same value objects.
        return dict(batch)

    indexes = splitter.split(batch['event_time'])
    pad_size = max(len(ixs) for ixs in indexes)

    res = {}
    for k, v in batch.items():
        if isinstance(v, list) and len(v) > 1:
            new_v = []
            for elem in v:
                pieces = []
                for ixs in indexes:
                    piece = elem[:, ixs]
                    missing = pad_size - piece.shape[1]
                    if missing > 0:
                        # new_zeros keeps the dtype and device of the feature tensor, so
                        # integer (categorical) features are not promoted to float and no
                        # global `device` is needed.
                        padding = piece.new_zeros(piece.shape[0], missing)
                        piece = torch.cat([piece, padding], dim=1)
                    pieces.append(piece)
                new_v.append(torch.cat(pieces, dim=0))
        else:
            new_v = v
        res[k] = new_v
    return res

def replace_token(batch, replace_prob=0.15, skip_first=1):
    """Randomly overwrite positions of the list-valued features with values from other positions.

    Args:
        batch: mapping that contains a 2-D ``'mask'`` tensor; every list-valued entry
            with more than one element is treated as a list of feature tensors whose
            number of elements equals that of the mask.
        replace_prob: each position is selected with probability ``mask * replace_prob``.
        skip_first: the first ``skip_first`` columns are never selected.

    Returns:
        ``(new_batch, labels)`` where ``new_batch`` is a deep copy of ``batch`` with the
        selected positions overwritten, and ``labels`` is a flattened long tensor with
        1 at the selected positions and 0 elsewhere. ``batch`` itself is not modified.
    """
    mask = batch['mask']
    to_replace = torch.bernoulli(mask * replace_prob).bool()
    to_replace[:, :skip_first] = False
    to_replace_flatten = to_replace.flatten()
    labels = to_replace_flatten.long()

    new_x = deepcopy(batch)

    num_replaced = int(to_replace_flatten.sum().item())
    if num_replaced == 0:
        # torch.multinomial rejects num_samples=0, so return the untouched copy directly.
        return new_x, labels

    # Source positions are drawn (with replacement) with weights given by the flattened mask.
    sampled_trx_ids = torch.multinomial(
        mask.flatten().float(),
        num_samples=num_replaced,
        replacement=True,
    )

    for v in new_x.values():
        if isinstance(v, list) and len(v) > 1:
            for i, elem in enumerate(v):
                # Work on an explicitly contiguous flat tensor and store it back:
                # writing through `elem.flatten()[...]` is silently lost whenever
                # flatten() has to return a copy (non-contiguous input).
                flat = elem.contiguous().view(-1)
                flat[to_replace_flatten] = flat[sampled_trx_ids]
                v[i] = flat.view(elem.shape)
    return new_x, labels


def my_collate_fn(batch, splitter, rep=5, mode='coles'):
    """Collate function that adapts one pre-built batch to the selected training mode.

    Args:
        batch: sequence whose first element is the batch mapping; only ``batch[0]`` is used.
        splitter: forwarded to ``split_process`` (``None`` disables splitting).
        rep: how many times the ``arange(batch_len)`` labels are repeated
            (presumably the number of sub-sequences per sample — confirm it matches
            the splitter's split count).
        mode: one of ``'coles'``, ``'cpc'``, ``'rtd'``, ``'mlm'``.

    Returns:
        * ``'coles'``: ``(batch, labels)`` with ``labels = arange(batch_len).repeat(rep)``
        * ``'cpc'``: ``(batch, None)``
        * ``'rtd'``: the result of ``replace_token(batch)``
        * ``'mlm'``: ``batch``

    Raises:
        ValueError: if ``mode`` is not one of the supported values (previously the
            function silently returned ``None`` in that case).
    """
    supported_modes = ('coles', 'cpc', 'rtd', 'mlm')
    if mode not in supported_modes:
        raise ValueError(f"Unsupported mode {mode!r}; expected one of {supported_modes}")

    batch = batch[0]
    # Batch length is taken from the first numeric feature before any splitting happens.
    len_batch = batch['num_features'][0].shape[0]
    labels = torch.arange(len_batch).repeat(rep)
    batch = split_process(batch, splitter)

    if mode == 'coles':
        return batch, labels

    if mode == 'cpc':
        return batch, None

    if mode == 'rtd':
        batch, labels = replace_token(batch)
        return batch, labels

    # Only 'mlm' is left at this point.
    return batch

### COLES

In [61]:
from ptls.data_load.datasets import MemoryMapDataset
from ptls.data_load.iterable_processing import SeqLenFilter
from ptls.frames.coles.split_strategy import SampleSlices
from ptls.frames import PtlsDataModule

from ptls.frames.coles import CoLESModule
from ptls.data_load.utils import collate_feature_dict

In [62]:
# Only the SampleSlices splitter is in effect. A MySampleUniform(split_count=5, seq_len=100)
# instance used to be created first and was immediately overwritten, so it is dropped here.
coles_splitter = SampleSlices(split_count=5, cnt_min=50, cnt_max=100)

# Embedding layer; moved to the notebook-wide `device` instead of an unconditional .cuda().
coles_ptls_emb_layer = PtlsEmbeddingLayer(
    coles_splitter,
    cat_embedding_projections,
    cat_features_names,
    num_embedding_projections,
    num_features_names,
).to(device)

# GRU sequence encoder on top of the embedding layer.
coles_seq_encoder = RnnSeqEncoder(
    input_size=coles_ptls_emb_layer.get_embedding_size(),
    trx_encoder=coles_ptls_emb_layer,
    hidden_size=256,
    type='gru',
)

# CoLES module trained with Adam (lr=0.001) and a StepLR schedule (step_size=5, gamma=0.9).
coles_model = CoLESModule(
    seq_encoder=coles_seq_encoder,
    optimizer_partial=partial(torch.optim.Adam, lr=0.001),
    lr_scheduler_partial=partial(torch.optim.lr_scheduler.StepLR, step_size=5, gamma=0.9),
).to(device)

In [63]:
# my_collate_fn only uses batch[0], so with batch_size=1 each item yielded by `dataset`
# is passed on as one whole batch. rep=5 is the label repeat count for CoLES mode.
coles_collate = partial(my_collate_fn, splitter=coles_splitter, rep=5, mode='coles')
coles_dataloader = torch.utils.data.DataLoader(
    dataset,
    batch_size=1,
    num_workers=0,
    collate_fn=coles_collate,
)

In [64]:
import pytorch_lightning as pl
import logging

# Lightning trainer for CoLES: 15 epochs with gpus=1.
coles_trainer = pl.Trainer(gpus=1, max_epochs=15)

GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


In [65]:
# Fit the CoLES model on coles_dataloader; no validation dataloader is passed.
# %debug  (IPython magic, left disabled: uncomment to open the post-mortem debugger)
coles_trainer.fit(coles_model, coles_dataloader)

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name               | Type            | Params
-------------------------------------------------------
0 | _loss              | ContrastiveLoss | 0     
1 | _seq_encoder       | RnnSeqEncoder   | 352 K 
2 | _validation_metric | BatchRecallTopK | 0     
3 | _head              | Head            | 0     
-------------------------------------------------------
352 K     Trainable params
0         Non-trainable params
352 K     Total params
1.409     Total estimated model params size (MB)


Training: 0it [00:00, ?it/s]

### CPC

In [22]:
from ptls.frames.cpc import CpcModule
from ptls.frames.coles.split_strategy import SampleSlices, SampleUniformBySplitCount

In [23]:
# Only the SampleSlices splitter is in effect. A SampleUniformBySplitCount(split_count=5)
# instance used to be created first and was immediately overwritten, so it is dropped here.
# The splitter should preserve order in samples (is_sorted=True is passed, presumably for
# that reason — confirm).
cpc_splitter = SampleSlices(split_count=5, cnt_min=50, cnt_max=100, is_sorted=True)

# Embedding layer; moved to the notebook-wide `device` instead of an unconditional .cuda().
cpc_ptls_emb_layer = PtlsEmbeddingLayer(
    cpc_splitter,
    cat_embedding_projections,
    cat_features_names,
    num_embedding_projections,
    num_features_names,
).to(device)

# GRU sequence encoder on top of the embedding layer.
cpc_seq_encoder = RnnSeqEncoder(
    input_size=cpc_ptls_emb_layer.get_embedding_size(),
    trx_encoder=cpc_ptls_emb_layer,
    hidden_size=256,
    type='gru',
)

# CPC module trained with Adam (lr=0.001) and a StepLR schedule (step_size=5, gamma=0.9).
cpc_model = CpcModule(
    seq_encoder=cpc_seq_encoder,
    optimizer_partial=partial(torch.optim.Adam, lr=0.001),
    lr_scheduler_partial=partial(torch.optim.lr_scheduler.StepLR, step_size=5, gamma=0.9),
).to(device)

In [24]:
# my_collate_fn only uses batch[0], so with batch_size=1 each item yielded by `dataset`
# is passed on as one whole batch. In 'cpc' mode the collate function returns (batch, None).
cpc_collate = partial(my_collate_fn, splitter=cpc_splitter, mode='cpc')
cpc_dataloader = torch.utils.data.DataLoader(
    dataset,
    batch_size=1,
    num_workers=0,
    collate_fn=cpc_collate,
)

In [25]:
import pytorch_lightning as pl

# Lightning trainer for CPC: 15 epochs with gpus=1.
cpc_trainer = pl.Trainer(gpus=1, max_epochs=15)

GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


In [26]:
# Fit the CPC model on cpc_dataloader; no validation dataloader is passed.
#%debug  (IPython magic, left disabled: uncomment to open the post-mortem debugger)
cpc_trainer.fit(cpc_model, cpc_dataloader)

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name               | Type          | Params
-----------------------------------------------------
0 | _loss              | CPC_Loss      | 0     
1 | _seq_encoder       | RnnSeqEncoder | 352 K 
2 | _validation_metric | CpcAccuracy   | 0     
3 | _linears           | ModuleList    | 280 K 
-----------------------------------------------------
632 K     Trainable params
0         Non-trainable params
632 K     Total params
2.532     Total estimated model params size (MB)


Training: 0it [00:00, ?it/s]

### RTD

In [43]:
import torchmetrics
from ptls.frames.bert import RtdModule
from ptls.nn.seq_encoder.utils import AllStepsHead, FlattenHead
from ptls.frames.coles.split_strategy import SampleUniformBySplitCount

In [44]:
# NOTE(review): rtd_dataloader passes splitter=None to my_collate_fn, so this splitter is
# only stored on the embedding layer and never used to split batches — confirm intent.
rtd_splitter = SampleUniformBySplitCount(split_count=5)

# Embedding layer; moved to the notebook-wide `device` instead of an unconditional .cuda().
rtd_ptls_emb_layer = PtlsEmbeddingLayer(
    rtd_splitter,
    cat_embedding_projections,
    cat_features_names,
    num_embedding_projections,
    num_features_names,
).to(device)

# GRU sequence encoder on top of the embedding layer.
rtd_seq_encoder = RnnSeqEncoder(
    input_size=rtd_ptls_emb_layer.get_embedding_size(),
    trx_encoder=rtd_ptls_emb_layer,
    hidden_size=256,
    type='gru',
).to(device)

# RTD module: the head maps each 256-dim step output to one sigmoid score
# (Linear(256, 1) -> Sigmoid -> Flatten inside AllStepsHead, followed by FlattenHead).
# Validation metric is binary AUROC; optimizer is Adam (lr=0.001) with StepLR (5, 0.9).
rtd_model = RtdModule(
    seq_encoder=rtd_seq_encoder,
    validation_metric=torchmetrics.AUROC(task='binary'),
    optimizer_partial=partial(torch.optim.Adam, lr=0.001),
    lr_scheduler_partial=partial(torch.optim.lr_scheduler.StepLR, step_size=5, gamma=0.9),
    head=torch.nn.Sequential(
        AllStepsHead(
            torch.nn.Sequential(
                torch.nn.Linear(256, 1),
                torch.nn.Sigmoid(),
                torch.nn.Flatten(),
            )
        ),
        FlattenHead(),
    ),
).to(device)

In [45]:
# my_collate_fn only uses batch[0], so with batch_size=1 each item yielded by `dataset`
# is passed on as one whole batch. No splitting is applied here (splitter=None); in 'rtd'
# mode the collate function returns the output of replace_token.
rtd_collate = partial(my_collate_fn, splitter=None, mode='rtd', rep=1)
rtd_dataloader = torch.utils.data.DataLoader(
    dataset,
    batch_size=1,
    num_workers=0,
    collate_fn=rtd_collate,
)

In [46]:
import pytorch_lightning as pl

# Lightning trainer for RTD: 15 epochs with gpus=1.
rtd_trainer = pl.Trainer(gpus=1, max_epochs=15)

GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


In [47]:
# Fit the RTD model on rtd_dataloader; no validation dataloader is passed.
#%debug  (IPython magic, left disabled: uncomment to open the post-mortem debugger)
rtd_trainer.fit(rtd_model, rtd_dataloader)

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name               | Type          | Params
-----------------------------------------------------
0 | _loss              | BCELoss       | 0     
1 | _seq_encoder       | RnnSeqEncoder | 352 K 
2 | _validation_metric | BinaryAUROC   | 0     
3 | _head              | Sequential    | 257   
-----------------------------------------------------
352 K     Trainable params
0         Non-trainable params
352 K     Total params
1.410     Total estimated model params size (MB)


Training: 0it [00:00, ?it/s]

### MLM

In [48]:
from ptls.frames.bert import MLMPretrainModule
from ptls.nn import RnnEncoder

In [49]:
# Embedding layer for MLM; no splitter is attached (None). The stray `output_size` token
# that broke the argument list (syntax error) is removed, so the arguments match the other
# PtlsEmbeddingLayer constructions in this notebook.
mlm_ptls_emb_layer = PtlsEmbeddingLayer(
    None,
    cat_embedding_projections,
    cat_features_names,
    num_embedding_projections,
    num_features_names,
).to(device)

# The encoder gets no trx_encoder here: the embedding layer is handed to
# MLMPretrainModule directly below.
# NOTE(review): hidden_size=182 presumably has to equal the embedding size (the script log
# at the top reports hidden_size being set to the trx_encoder output_size of 182) — confirm.
mlm_seq_encoder = RnnEncoder(
    input_size=mlm_ptls_emb_layer.get_embedding_size(),
    is_reduce_sequence=False,
    hidden_size=182,
    type='gru',
).to(device)

mlm_model = MLMPretrainModule(
    trx_encoder=mlm_ptls_emb_layer,
    seq_encoder=mlm_seq_encoder,
    total_steps=10000,
).to(device)

In [34]:
# my_collate_fn only uses batch[0], so with batch_size=1 each item yielded by `dataset`
# is passed on as one whole batch. No splitting is applied (splitter=None); in 'mlm' mode
# the collate function returns just the batch, without labels.
mlm_collate = partial(my_collate_fn, splitter=None, rep=1, mode='mlm')
mlm_dataloader = torch.utils.data.DataLoader(
    dataset,
    batch_size=1,
    num_workers=0,
    collate_fn=mlm_collate,
)

In [35]:
import pytorch_lightning as pl

# Lightning trainer for MLM: 15 epochs with gpus=1.
mlm_trainer = pl.Trainer(gpus=1, max_epochs=15)

GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


In [36]:
# Fit the MLM pre-training model on mlm_dataloader; no validation dataloader is passed.
mlm_trainer.fit(mlm_model, mlm_dataloader)

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name            | Type               | Params
-------------------------------------------------------
0 | trx_encoder     | PtlsEmbeddingLayer | 14.1 K
1 | _seq_encoder    | RnnEncoder         | 200 K 
2 | fn_norm_predict | PBShell            | 0     
3 | loss_fn         | QuerySoftmaxLoss   | 0     
4 | train_mlm_loss  | MeanMetric         | 0     
5 | valid_mlm_loss  | MeanMetric         | 0     
-------------------------------------------------------
214 K     Trainable params
0         Non-trainable params
214 K     Total params
0.857     Total estimated model params size (MB)


Training: 0it [00:00, ?it/s]