In [24]:
#!python ptls_run.py --group coles --splitter slices --encoder gpt --lr 0.00001

In [24]:
#!python ptls_run.py --group coles --splitter slices --encoder whisper/small --lr 0.00001 --pretrained

In [24]:
#!python ptls_run.py --group coles --splitter slices --encoder lstm --lr 0.00001 --num_layers 1

In [1]:
%load_ext autoreload
%autoreload 2

import os
import torch 
import numpy as np
import pickle
from torch.utils.data import IterableDataset, DataLoader
from models import TransactionsModel
from data_generators import batches_generator, cat_features_names, num_features_names, meta_features_names

from embedding import EmbeddingLayer
from ptls.frames import PtlsDataModule
from ptls.frames.bert import MLMPretrainModule

from ptls.nn import RnnSeqEncoder
from ptls.nn import TransformerEncoder
from functools import partial
from collections import namedtuple

# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [2]:
def _read_pickle(path):
    """Deserialize and return the object stored in the pickle file at `path`."""
    # NOTE: unpickling can execute arbitrary code; only load trusted asset files.
    with open(path, 'rb') as handle:
        return pickle.load(handle)


# Embedding projection specs for the numeric, categorical and meta feature groups.
num_embedding_projections = _read_pickle('./assets/num_embedding_projections.pkl')
cat_embedding_projections = _read_pickle('./assets/cat_embedding_projections.pkl')
meta_embedding_projections = _read_pickle('./assets/meta_embedding_projections.pkl')

In [3]:
class MyPaddedBatch:
    """Minimal padded-batch container exposing `payload`, `seq_lens` and `seq_len_mask`.

    Args:
        data: tensor with at least two dimensions; dim 0 is the batch, dim 1 the padded length.
        mask: stored unchanged as `seq_len_mask`.
    """

    def __init__(self, data, mask):
        self.payload = data
        # Every row reports the full padded length (dim 1 of `data`), not the number of
        # valid steps in `mask`. The tensor is created on the payload's own device so the
        # class no longer depends on a module-level `device` global.
        self.seq_lens = torch.full(
            (data.shape[0],), data.shape[1], dtype=torch.long, device=data.device
        )
        self.seq_len_mask = mask
        
class IterDataset(IterableDataset):
    """Iterable dataset that streams batches produced by `batches_generator`.

    Args:
        dataset_train: passed to `batches_generator` as its first argument.
        batch_size: batch size forwarded to `batches_generator`.
        device: device forwarded to `batches_generator`.
    """

    def __init__(self, dataset_train, batch_size=64, device='cuda'):
        self.data = dataset_train
        self.batch_size = batch_size
        self.device = device

    def foo(self):
        """Create a fresh batch generator over `self.data`.

        Kept under its original name, but as a regular method: a lambda stored on the
        instance cannot be pickled, which breaks DataLoader workers that pickle the dataset.
        """
        return batches_generator(self.data, batch_size=self.batch_size, shuffle=True, device=self.device, is_train=True, output_format='torch', min_seq_len=200)

    def __iter__(self):
        return self.foo()

In [4]:
# Directory holding the training buckets. Set the ALFA_TRAIN_BUCKETS environment variable to
# point elsewhere instead of editing the notebook; the previous hard-coded path stays the default.
path_to_dataset = os.environ.get('ALFA_TRAIN_BUCKETS', '/home/jovyan/afilatov/data/alfa/train_buckets')

dir_with_datasets = os.listdir(path_to_dataset)
# Sort for a deterministic order and keep only the first bucket.
dataset_train = sorted(os.path.join(path_to_dataset, x) for x in dir_with_datasets)[0:1]

#train_dataloader = batches_generator(dataset_train, batch_size=64, shuffle=True,
#                                            device=device, is_train=True, output_format='torch', min_seq_len=200)

In [5]:
# Streaming dataset over the selected bucket(s); batch_size and device use IterDataset's defaults (64, 'cuda').
dataset = IterDataset(dataset_train)

In [6]:
class PtlsEmbeddingLayer(EmbeddingLayer):
    """EmbeddingLayer variant whose forward pass returns a `MyPaddedBatch`.

    The leading `splitter` argument is accepted but not stored; all remaining
    arguments are forwarded to `EmbeddingLayer`.
    """

    def __init__(self, splitter=None, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # Expose the embedding width under the `output_size` attribute.
        self.output_size = self.get_embedding_size()

    def forward(self, x):
        # Read the mask before delegating to the parent forward.
        padding_mask = x['mask']
        embedded = super().forward(x)
        return MyPaddedBatch(embedded, padding_mask)

In [7]:
class MySampleUniform:
    """
    Deterministic splitter producing `split_count` windows of `seq_len` positions each.

    Window start positions are spread evenly (rounded `np.linspace`) from the first
    position to the last one that still fits a full window:
    |---------------------|       main sequence
    |------|              |        window 1
    |    |------|         |        window 2
    |         |------|    |        window 3
    |              |------|        window 4
    Sequences with at most `seq_len + split_count` positions are not cut: the full
    index range is returned `split_count` times.
    No randomness is involved, so repeated calls give the same windows.
    """
    def __init__(self, split_count, seq_len, **_):
        self.seq_len = seq_len
        self.split_count = split_count

    def split(self, dates):
        """Return a list of `split_count` index arrays into a sequence of `dates.shape[0]` items."""
        total = dates.shape[0]
        positions = np.arange(total)

        if total > self.seq_len + self.split_count:
            last_start = total - self.seq_len
            starts = np.linspace(0, last_start, self.split_count).round().astype(int)
            return [positions[begin:begin + self.seq_len] for begin in starts]

        # Too short to cut: every sample is the whole index range.
        return [positions] * self.split_count

In [8]:
# Pull a single batch from the dataset to inspect its structure.
batch = next(iter(dataset))

In [9]:
# Show which fields a batch contains.
batch.keys()

dict_keys(['num_features', 'cat_features', 'mask', 'event_time', 'meta_features', 'label', 'app_id'])

In [10]:
from copy import deepcopy

def split_process(batch, splitter):
    """Cut the feature tensors of `batch` into sub-sequences and stack them along dim 0.

    Args:
        batch: dict of batch fields. Must contain 'cat_features' (a list of tensors) and,
            when a splitter is given, 'event_time'. Feature tensors are indexed as
            `elem[:, ixs]`, i.e. the sub-sequence indices select along dim 1.
        splitter: object with `split(dates)` returning a list of index sequences, or
            None to leave every field untouched.

    Returns:
        A new dict with the same keys. With a splitter, every value that is a list of more
        than one tensor is replaced by a list of tensors in which the sub-sequences are
        right-padded with zeros to the longest one and concatenated along dim 0 (all rows
        of sub-sequence 0 first, then sub-sequence 1, ...). All other values are passed
        through. 'mask' is always (re)computed as `cat_features[0] != 0`.
    """
    if splitter is None:
        res = dict(batch)
    else:
        # NOTE(review): the whole `event_time` field is handed to the splitter; splitters that
        # take the sequence length from `dates.shape[0]` need the time axis first — confirm.
        indexes = splitter.split(batch['event_time'])
        pad_size = max(len(ixs) for ixs in indexes)

        res = {}
        for k, v in batch.items():
            # Only lists holding more than one tensor are split (presumably intentional — verify).
            if isinstance(v, list) and len(v) > 1:
                new_v = []
                for elem in v:
                    pieces = []
                    for ixs in indexes:
                        piece = elem[:, ixs]
                        missing = pad_size - piece.shape[1]
                        if missing > 0:
                            # new_zeros keeps the dtype and device of the feature itself, so
                            # integer features stay integer and no global `device` is needed.
                            padding = piece.new_zeros(piece.shape[0], missing, *piece.shape[2:])
                            piece = torch.cat([piece, padding], dim=1)
                        pieces.append(piece)
                    new_v.append(torch.cat(pieces, dim=0))
                res[k] = new_v
            else:
                res[k] = v

    res['mask'] = res['cat_features'][0] != 0
    return res

def replace_token(batch, replace_prob=0.15, skip_first=1):
    """Build a replaced-token-detection sample by swapping random events inside the batch.

    Args:
        batch: dict with a boolean 'mask' tensor marking valid positions; values that are
            lists of more than one tensor are treated as features and must have as many
            elements as the mask.
        replace_prob: probability of replacing each valid position.
        skip_first: number of leading positions (along dim 1) that are never replaced.

    Returns:
        (new_batch, labels): a deep copy of `batch` in which the selected positions of every
        feature tensor are overwritten with the values found at positions sampled (with
        replacement) among the valid ones, and a flat long tensor with 1 at replaced
        positions and 0 elsewhere. The same sampled source positions are used for all features.
    """
    mask = batch['mask']
    to_replace = torch.bernoulli(mask * replace_prob).bool()
    to_replace[:, :skip_first] = False
    labels = to_replace.long().flatten()

    new_x = deepcopy(batch)
    n_replace = int(to_replace.sum().item())
    if n_replace == 0:
        # torch.multinomial rejects num_samples == 0; nothing was selected, so nothing to swap.
        return new_x, labels

    sampled_trx_ids = torch.multinomial(
        mask.flatten().float(),
        num_samples=n_replace,
        replacement=True,
    )

    for k, v in new_x.items():
        if isinstance(v, list) and len(v) > 1:
            for elem in v:
                # Gather the replacement values first, then write through a boolean index on
                # the tensor itself. Writing into `elem.flatten()` only updates `elem` when
                # flatten() happens to return a view (contiguous input).
                replacement = elem.reshape(-1)[sampled_trx_ids]
                elem[to_replace.reshape(elem.shape)] = replacement
    return new_x, labels


def my_collate_fn(batch, splitter, rep=5, mode='coles'):
    """Collate function for a DataLoader whose dataset already yields whole batches.

    Args:
        batch: list produced by the DataLoader; only its first element (a batch dict) is used.
        splitter: forwarded to `split_process`; None disables splitting.
        rep: how many times the sample ids are repeated to form the 'coles' labels; it has to
            match the number of sub-sequences the splitter produces per sample.
        mode: one of 'coles', 'cpc', 'rtd', 'mlm'.

    Returns:
        'coles': (batch, labels) where labels is `arange(batch_size)` repeated `rep` times;
        'cpc':   (batch, None);
        'rtd':   the (batch, labels) pair returned by `replace_token`;
        'mlm':   the batch dict alone.

    Raises:
        ValueError: if `mode` is not one of the supported values (previously None was returned).
    """
    if mode not in ('coles', 'cpc', 'rtd', 'mlm'):
        raise ValueError(f"Unknown mode {mode!r}; expected 'coles', 'cpc', 'rtd' or 'mlm'")

    batch = batch[0]
    # Batch size is read before splitting, which multiplies the number of rows.
    len_batch = batch['num_features'][0].shape[0]
    batch = split_process(batch, splitter)

    if mode == 'mlm':
        return batch

    if mode == 'cpc':
        return batch, None

    if mode == 'rtd':
        batch, labels = replace_token(batch)
        return batch, labels

    # mode == 'coles': rows that come from the same original sample share a label.
    labels = torch.arange(len_batch).repeat(rep)
    return batch, labels

In [11]:
from transformers import GPT2Config, GPT2Model, BertConfig, BertModel, T5Config, T5Model
from ptls.nn.seq_encoder.abs_seq_encoder import AbsSeqEncoder
from ptls.nn.seq_encoder.containers import SeqEncoderContainer
from transformers import AutoModel, AutoConfig
from tools import LambdaLayer
from tools import calculate_embedding_size
from embedding import LinearMapping

class MyEncoder(AbsSeqEncoder):
    """Sequence encoder that wraps a HuggingFace transformer backbone.

    Supported `encoder_type` values:
        'gpt'            GPT2Model built from a fresh GPT2Config
        'bert'           BertModel built from a fresh BertConfig
        't5'             T5Model built from a fresh T5Config
        'whisper/small'  decoder of 'openai/whisper-small', fed through a LinearMapping

    Args:
        input_size: width of the incoming event embeddings; also the model width for gpt/bert/t5.
        is_reduce_sequence: forwarded to AbsSeqEncoder.
        encoder_type: backbone selector, see above.
        num_layers: number of layers (gpt/bert/t5 only).
        num_heads: number of attention heads (gpt/bert/t5 only).
        pretrained: whisper only; load pretrained weights (True) or initialise from the config (False).

    Raises:
        ValueError: if `encoder_type` is not supported (previously `self.encoder` was left undefined).
    """

    SUPPORTED_ENCODERS = ('gpt', 'bert', 't5', 'whisper/small')

    def __init__(self,
                 input_size=None,
                 is_reduce_sequence=False,
                 encoder_type='gpt',
                 num_layers=2,
                 num_heads=1,
                 pretrained=True
                ):

        super().__init__(is_reduce_sequence=is_reduce_sequence)
        if encoder_type not in self.SUPPORTED_ENCODERS:
            raise ValueError(
                f'Unknown encoder_type {encoder_type!r}; expected one of {self.SUPPORTED_ENCODERS}'
            )
        self.encoder_type = encoder_type
        print(f'Sequential Encoder is {self.encoder_type}')

        # Width of the vectors produced by forward(); differs from input_size only for whisper.
        output_size = input_size

        if self.encoder_type == 'gpt':
            configuration = GPT2Config(n_positions=2048,
                                       n_embd=input_size, n_layer=num_layers,
                                       n_head=num_heads, resid_pdrop=0.1,
                                       embd_pdrop=0.1, attn_pdrop=0.1)

            self.encoder = GPT2Model(configuration)
        elif self.encoder_type == 'bert':
            configuration = BertConfig(hidden_size=input_size,
                                       num_hidden_layers=num_layers, num_attention_heads=num_heads,
                                       intermediate_size=512, hidden_dropout_prob=0.1,
                                       attention_probs_dropout_prob=0.1,
                                       max_position_embeddings=2048)

            self.encoder = BertModel(configuration)
        elif self.encoder_type == 't5':
            # NOTE(review): d_kv is `input_size // 1`, i.e. the full model width per head
            # regardless of num_heads — confirm this is intended rather than `// num_heads`.
            configuration = T5Config(d_model=input_size,
                                     d_kv=input_size // 1, d_ff=512,
                                     num_layers=num_layers, num_heads=num_heads)

            self.encoder = T5Model(configuration)
        elif self.encoder_type == 'whisper/small':
            config_name = 'openai/whisper-small'

            if pretrained:
                model = AutoModel.from_pretrained(config_name)
            else:
                config = AutoConfig.from_pretrained(config_name)
                model = AutoModel.from_config(config)

            # Only the decoder stack is used as the sequence encoder.
            self.encoder = model.decoder
            # The decoder's positional embedding is swapped for a layer wrapping a function that
            # returns 0 (presumably to disable Whisper's own position encoding — confirm).
            self.encoder.embed_positions = LambdaLayer(lambda x: 0)

            # Project event embeddings from input_size to the width reported for the whisper model.
            hidden_size = calculate_embedding_size(model)
            self.mapping_embedding = LinearMapping(input_size, hidden_size)
            # forward() returns decoder hidden states, so the embedding size is the whisper
            # width, not input_size.
            output_size = hidden_size

        self.hidden_size = output_size
        self.input_size = input_size

    def forward(self, x, mask):
        """Run the backbone on already-embedded events.

        :param x: batch object whose `payload` attribute holds the event embeddings
        :param mask: passed to the backbone as `attention_mask`
        :return: for 'whisper/small' the hidden state at the last position
                 (`last_hidden_state[:, -1]`); for the other encoders the full
                 `last_hidden_state`
        """
        # NOTE(review): the returned shape depends only on encoder_type and ignores
        # `is_reduce_sequence` — confirm this matches what the training modules expect.
        if self.encoder_type == 'whisper/small':
            embedding = self.mapping_embedding(x.payload, attention_mask=mask)
            out = self.encoder(inputs_embeds=embedding, attention_mask=mask).last_hidden_state[:, -1]

        elif self.encoder_type == 't5':
            # The same embeddings feed both the T5 encoder and decoder.
            out = self.encoder(inputs_embeds=x.payload,
                               decoder_inputs_embeds=x.payload,
                               attention_mask=mask).last_hidden_state
        else:
            out = self.encoder(inputs_embeds=x.payload,
                               attention_mask=mask).last_hidden_state

        return out

    @property
    def embedding_size(self):
        """Width of the vectors returned by `forward`."""
        return self.hidden_size

    
class MySeqEncoder(SeqEncoderContainer):
    """SeqEncoderContainer that pairs a transaction encoder with `MyEncoder`.

    Extra keyword arguments are collected and handed to the container as
    `seq_encoder_params`.
    """

    def __init__(self,
                 trx_encoder=None,
                 input_size=None,
                 is_reduce_sequence=False,
                 **seq_encoder_params,
                 ):
        container_kwargs = dict(
            trx_encoder=trx_encoder,
            seq_encoder_cls=MyEncoder,
            input_size=input_size,
            seq_encoder_params=seq_encoder_params,
            is_reduce_sequence=is_reduce_sequence,
        )
        super().__init__(**container_kwargs)

    def forward(self, x, h_0=None):
        """Embed the raw batch dict `x`, then encode it together with its 'mask' entry.

        `h_0` is accepted but not used.
        """
        # The mask is read before the transaction encoder runs.
        padding_mask = x['mask']
        embedded_events = self.trx_encoder(x)
        return self.seq_encoder(embedded_events, padding_mask)

### COLES-GPT

In [12]:
from ptls.frames.coles.split_strategy import SampleSlices
from ptls.frames.coles import CoLESModule

# Splitter that produces 5 sub-sequences per sample for CoLES.
coles_splitter = SampleSlices(split_count=5, cnt_min=50, cnt_max=100)

# Event embedding layer. The first argument (splitter) is accepted by PtlsEmbeddingLayer but
# not stored; the actual splitting happens in the collate function.
coles_ptls_emb_layer = PtlsEmbeddingLayer(coles_splitter,
                                    cat_embedding_projections,
                                    cat_features_names,
                                    num_embedding_projections,
                                    num_features_names).cuda()

# NOTE(review): despite the `gpt_` prefix this encoder is built with the pretrained
# 'whisper/small' backbone; switch `encoder_type` to compare other backbones.
gpt_coles_seq_encoder = MySeqEncoder(trx_encoder=coles_ptls_emb_layer, 
                                     input_size=coles_ptls_emb_layer.get_embedding_size(),
                                    encoder_type='whisper/small',
                                    pretrained=True)

Sequential Encoder is whisper/small


In [13]:
# Embedding width reported by the project helper for the assembled encoder.
calculate_embedding_size(gpt_coles_seq_encoder)

768

In [14]:
# CoLES training module around the transformer encoder: Adam (lr=1e-3) with a StepLR schedule.
gpt_coles_model = CoLESModule(
    seq_encoder=gpt_coles_seq_encoder,
    optimizer_partial=partial(torch.optim.Adam, lr=0.001),
    lr_scheduler_partial=partial(torch.optim.lr_scheduler.StepLR, step_size=5, gamma=0.9),
).cuda()

# The dataset already yields whole batches, so the DataLoader uses batch_size=1 and
# my_collate_fn unwraps the single element. `rep` matches the splitter's split_count (5),
# which the label layout relies on.
coles_dataloader = torch.utils.data.DataLoader(
    dataset,
    collate_fn=partial(my_collate_fn, splitter=coles_splitter, rep=5, mode='coles'),
    num_workers=0,
    batch_size=1
)

In [24]:
#gpt_coles_model._seq_encoder.trx_encoder

In [27]:
from tools import set_seeds, count_parameters

# Parameter count of the inner sequence encoder only (MyEncoder), without the embedding layer.
count_parameters(gpt_coles_model._seq_encoder.seq_encoder) # whisper

153377280

In [68]:
# NOTE(review): labelled "t5", but the encoder built above uses 'whisper/small'; the recorded
# output presumably comes from an earlier run with encoder_type='t5' — re-run to confirm.
count_parameters(gpt_coles_model) # t5

3099270

In [92]:
# NOTE(review): labelled "bert", but the encoder built above uses 'whisper/small'; the recorded
# output presumably comes from an earlier run with encoder_type='bert' — re-run to confirm.
count_parameters(gpt_coles_model) # bert

1705132

In [19]:
import pytorch_lightning as pl
import logging

# `gpus=` is a deprecated Trainer argument (removed in PyTorch Lightning 2.x);
# `accelerator`/`devices` requests the same single GPU.
coles_trainer = pl.Trainer(
    max_epochs=15,
    accelerator='gpu',
    devices=1,
)

# Train the CoLES model on the streaming dataloader (no validation loader is passed).
coles_trainer.fit(gpt_coles_model, coles_dataloader)

GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name               | Type            | Params
-------------------------------------------------------
0 | _loss              | ContrastiveLoss | 0     
1 | _seq_encoder       | MySeqEncoder    | 153 M 
2 | _validation_metric | BatchRecallTopK | 0     
3 | _head              | Head            | 0     
-------------------------------------------------------
153 M     Trainable params
0         Non-trainable params
153 M     Total params
613.566   Total estimated model params size (MB)


Training: 0it [00:00, ?it/s]

x <__main__.MyPaddedBatch object at 0x7f6231787cd0>
x <__main__.MyPaddedBatch object at 0x7f6231787090>
x <__main__.MyPaddedBatch object at 0x7f6231787750>
x <__main__.MyPaddedBatch object at 0x7f6231787b50>
x <__main__.MyPaddedBatch object at 0x7f6231787ed0>
x <__main__.MyPaddedBatch object at 0x7f6231787090>
x <__main__.MyPaddedBatch object at 0x7f6231787e10>
x <__main__.MyPaddedBatch object at 0x7f6231787f50>
x <__main__.MyPaddedBatch object at 0x7f6231787b50>
x <__main__.MyPaddedBatch object at 0x7f62317875d0>
x <__main__.MyPaddedBatch object at 0x7f6231787f10>
x <__main__.MyPaddedBatch object at 0x7f6231787750>
x <__main__.MyPaddedBatch object at 0x7f6231787bd0>
x <__main__.MyPaddedBatch object at 0x7f6231787d90>
x <__main__.MyPaddedBatch object at 0x7f6231787e10>
x <__main__.MyPaddedBatch object at 0x7f62317875d0>
x <__main__.MyPaddedBatch object at 0x7f6231787750>
x <__main__.MyPaddedBatch object at 0x7f6231787c10>
x <__main__.MyPaddedBatch object at 0x7f6231787e10>
x <__main__.

  rank_zero_warn("Detected KeyboardInterrupt, attempting graceful shutdown...")


In [83]:
from tools import set_seeds, count_parameters

# NOTE(review): labelled "gpt", but the encoder built above uses 'whisper/small'; the recorded
# output presumably comes from an earlier run with encoder_type='gpt' — re-run to confirm.
count_parameters(gpt_coles_model) # gpt

1978086

### COLES

In [29]:
from ptls.data_load.datasets import MemoryMapDataset
from ptls.data_load.iterable_processing import SeqLenFilter
from ptls.frames.coles.split_strategy import SampleSlices
from ptls.frames import PtlsDataModule

from ptls.frames.coles import CoLESModule
from ptls.data_load.utils import collate_feature_dict

In [32]:
# Splitter used for the RNN CoLES run. A MySampleUniform(split_count=5, seq_len=100) instance
# used to be assigned first and was overwritten on the next statement without being used;
# swap it in here if deterministic windows are wanted.
coles_splitter = SampleSlices(split_count=5, cnt_min=50, cnt_max=100)

# Event embedding layer for the RNN baseline (the splitter slot is unused, hence None).
coles_ptls_emb_layer = PtlsEmbeddingLayer(None,
                                    cat_embedding_projections,
                                    cat_features_names,
                                    num_embedding_projections,
                                    num_features_names).cuda()

# NOTE(review): the recorded training run of this model stopped with
# "RuntimeError: Expected hidden size (2, 320, 256), got [1, 320, 256]", which suggests the
# initial hidden state does not match a multi-layer LSTM — confirm num_layers > 1 is supported
# by the installed RnnSeqEncoder before relying on this configuration.
coles_seq_encoder = RnnSeqEncoder(
    input_size=coles_ptls_emb_layer.get_embedding_size(),
    trx_encoder=coles_ptls_emb_layer,
    hidden_size=256,
    num_layers=4,
    type='lstm',
)

# CoLES training module: Adam (lr=1e-3) with a StepLR schedule.
coles_model = CoLESModule(
    seq_encoder=coles_seq_encoder,
    optimizer_partial=partial(torch.optim.Adam, lr=0.001),
    lr_scheduler_partial=partial(torch.optim.lr_scheduler.StepLR, step_size=5, gamma=0.9),
).cuda()

In [33]:
from tools import count_parameters
# NOTE(review): the model above is built with type='lstm'; the recorded count may come from an
# earlier GRU configuration — re-run to confirm.
count_parameters(coles_model) # lstm (previously labelled gru)

2043948

In [20]:
# The dataset already yields whole batches, so batch_size=1 and my_collate_fn unwraps the single
# element. `rep` matches the splitter's split_count (5).
coles_dataloader = torch.utils.data.DataLoader(
    dataset,
    collate_fn=partial(my_collate_fn, splitter=coles_splitter, rep=5, mode='coles'),
    num_workers=0,
    batch_size=1
)

In [21]:
import pytorch_lightning as pl
import logging

# `gpus=` is a deprecated Trainer argument (removed in PyTorch Lightning 2.x);
# `accelerator`/`devices` requests the same single GPU.
coles_trainer = pl.Trainer(
    max_epochs=15,
    accelerator='gpu',
    devices=1,
)

GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


In [22]:
# %debug
# Train the RNN CoLES model on the streaming dataloader (no validation loader is passed).
coles_trainer.fit(coles_model, coles_dataloader)

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name               | Type            | Params
-------------------------------------------------------
0 | _loss              | ContrastiveLoss | 0     
1 | _seq_encoder       | RnnSeqEncoder   | 747 K 
2 | _validation_metric | BatchRecallTopK | 0     
3 | _head              | Head            | 0     
-------------------------------------------------------
747 K     Trainable params
0         Non-trainable params
747 K     Total params
2.988     Total estimated model params size (MB)


Training: 0it [00:00, ?it/s]

RuntimeError: Expected hidden size (2, 320, 256), got [1, 320, 256]

### CPC

In [34]:
from ptls.frames.cpc import CpcModule
from ptls.frames.coles.split_strategy import SampleSlices, SampleUniformBySplitCount

In [35]:
# CPC needs sub-sequences that keep the original event order, hence is_sorted=True.
# A SampleUniformBySplitCount(split_count=5) instance used to be assigned first and was
# overwritten on the next statement without being used.
cpc_splitter = SampleSlices(split_count=5, cnt_min=50, cnt_max=100, is_sorted=True)

# Event embedding layer; the splitter argument is accepted by PtlsEmbeddingLayer but not stored.
cpc_ptls_emb_layer = PtlsEmbeddingLayer(cpc_splitter,
                                        cat_embedding_projections,
                                        cat_features_names,
                                        num_embedding_projections,
                                        num_features_names).cuda()

# GRU sequence encoder with a 256-wide hidden state on top of the embedding layer.
cpc_seq_encoder = RnnSeqEncoder(
    input_size=cpc_ptls_emb_layer.get_embedding_size(),
    trx_encoder=cpc_ptls_emb_layer,
    hidden_size=256,
    type='gru',
)

# CPC training module: Adam (lr=1e-3) with a StepLR schedule.
cpc_model = CpcModule(
    seq_encoder=cpc_seq_encoder,
    optimizer_partial=partial(torch.optim.Adam, lr=0.001),
    lr_scheduler_partial=partial(torch.optim.lr_scheduler.StepLR, step_size=5, gamma=0.9)
).cuda()

In [36]:
# Parameter count of the whole CPC module.
count_parameters(cpc_model)

632944

In [24]:
# In 'cpc' mode my_collate_fn returns (batch, None); `rep` is left at its default because the
# labels it controls are only produced in 'coles' mode.
cpc_dataloader = torch.utils.data.DataLoader(
    dataset,
    collate_fn=partial(my_collate_fn, splitter=cpc_splitter, mode='cpc'),
    num_workers=0,
    batch_size=1
)

In [25]:
import pytorch_lightning as pl

# `gpus=` is a deprecated Trainer argument (removed in PyTorch Lightning 2.x);
# `accelerator`/`devices` requests the same single GPU.
cpc_trainer = pl.Trainer(
    max_epochs=15,
    accelerator='gpu',
    devices=1,
)

GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


In [26]:
#%debug
# Train the CPC model on the streaming dataloader (no validation loader is passed).
cpc_trainer.fit(cpc_model, cpc_dataloader)

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name               | Type          | Params
-----------------------------------------------------
0 | _loss              | CPC_Loss      | 0     
1 | _seq_encoder       | RnnSeqEncoder | 352 K 
2 | _validation_metric | CpcAccuracy   | 0     
3 | _linears           | ModuleList    | 280 K 
-----------------------------------------------------
632 K     Trainable params
0         Non-trainable params
632 K     Total params
2.532     Total estimated model params size (MB)


Training: 0it [00:00, ?it/s]

### RTD

In [37]:
import torchmetrics
from ptls.frames.bert import RtdModule
from ptls.nn.seq_encoder.utils import AllStepsHead, FlattenHead
from ptls.frames.coles.split_strategy import SampleUniformBySplitCount

In [38]:
# NOTE(review): this splitter is only handed to PtlsEmbeddingLayer, which does not store it,
# and the RTD dataloader is built with splitter=None — it appears to have no effect.
rtd_splitter = SampleUniformBySplitCount(split_count=5)
rtd_ptls_emb_layer = PtlsEmbeddingLayer(rtd_splitter,
                                        cat_embedding_projections,
                                        cat_features_names,
                                        num_embedding_projections,
                                        num_features_names).cuda()

# GRU sequence encoder with a 256-wide hidden state on top of the embedding layer.
rtd_seq_encoder = RnnSeqEncoder(
    input_size=rtd_ptls_emb_layer.get_embedding_size(),
    trx_encoder=rtd_ptls_emb_layer,
    hidden_size=256,
    type='gru',
).cuda()

# Replaced-token-detection module. The head applies Linear(256 -> 1) + Sigmoid through
# AllStepsHead and then FlattenHead; 256 matches the encoder's hidden_size.
rtd_model = RtdModule(
    seq_encoder=rtd_seq_encoder,
    validation_metric=torchmetrics.AUROC(task='binary'),
    optimizer_partial=partial(torch.optim.Adam, lr=0.001),
    lr_scheduler_partial=partial(torch.optim.lr_scheduler.StepLR, step_size=5, gamma=0.9),
    head = torch.nn.Sequential(
        AllStepsHead(
            torch.nn.Sequential(
                torch.nn.Linear(256, 1),
                torch.nn.Sigmoid(),
                torch.nn.Flatten(),
            )
        ),
        FlattenHead(),
    )
).cuda()

In [40]:
# Parameter count of the whole RTD module.
count_parameters(rtd_model)

352557

In [39]:
# RTD works on unsplit sequences (splitter=None); my_collate_fn returns the corrupted batch
# together with per-position replaced/not-replaced labels from replace_token.
rtd_dataloader = torch.utils.data.DataLoader(
    dataset,
    collate_fn=partial(my_collate_fn, splitter=None, mode='rtd', rep=1),
    num_workers=0,
    batch_size=1
)

In [48]:
import pytorch_lightning as pl

# `gpus=` is a deprecated Trainer argument (removed in PyTorch Lightning 2.x);
# `accelerator`/`devices` requests the same single GPU.
rtd_trainer = pl.Trainer(
    max_epochs=15,
    accelerator='gpu',
    devices=1,
)

GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


In [49]:
#%debug
# Train the RTD model on the streaming dataloader (no validation loader is passed).
rtd_trainer.fit(rtd_model, rtd_dataloader)

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name               | Type          | Params
-----------------------------------------------------
0 | _loss              | BCELoss       | 0     
1 | _seq_encoder       | RnnSeqEncoder | 352 K 
2 | _validation_metric | BinaryAUROC   | 0     
3 | _head              | Sequential    | 257   
-----------------------------------------------------
352 K     Trainable params
0         Non-trainable params
352 K     Total params
1.410     Total estimated model params size (MB)


Training: 0it [00:00, ?it/s]

### MLM

In [41]:
from ptls.frames.bert import MLMPretrainModule
from ptls.nn import RnnEncoder

In [44]:
# Event embedding layer for MLM pretraining (the splitter slot is unused, hence None).
mlm_ptls_emb_layer = PtlsEmbeddingLayer(None,
                                        cat_embedding_projections,
                                        cat_features_names,
                                        num_embedding_projections,
                                        num_features_names).cuda()

# Bare RNN encoder: the embedding layer is passed to MLMPretrainModule separately, and the
# encoder is configured with is_reduce_sequence=False.
mlm_seq_encoder = RnnEncoder(
    #trx_encoder=mlm_ptls_emb_layer,
    input_size=mlm_ptls_emb_layer.get_embedding_size(),
    is_reduce_sequence=False,
    hidden_size=182,
    type='gru',
).cuda()

# NOTE(review): hidden_size=182 presumably has to equal the embedding layer's output width for
# the MLM objective — confirm against get_embedding_size().
mlm_model = MLMPretrainModule(
    trx_encoder=mlm_ptls_emb_layer, 
    seq_encoder=mlm_seq_encoder,
    total_steps=10000
).cuda()

In [45]:
# Parameter count of the whole MLM module.
count_parameters(mlm_model)

214324

In [34]:
# MLM works on unsplit sequences (splitter=None); in 'mlm' mode my_collate_fn returns the batch
# dict alone, without labels.
mlm_dataloader = torch.utils.data.DataLoader(
    dataset,
    collate_fn=partial(my_collate_fn, splitter=None, rep=1, mode='mlm'),
    num_workers=0,
    batch_size=1
)

In [35]:
import pytorch_lightning as pl

# `gpus=` is a deprecated Trainer argument (removed in PyTorch Lightning 2.x);
# `accelerator`/`devices` requests the same single GPU.
mlm_trainer = pl.Trainer(
    max_epochs=15,
    accelerator='gpu',
    devices=1,
)

GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


In [36]:
# Train the MLM model on the streaming dataloader (no validation loader is passed).
mlm_trainer.fit(mlm_model, mlm_dataloader)

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name            | Type               | Params
-------------------------------------------------------
0 | trx_encoder     | PtlsEmbeddingLayer | 14.1 K
1 | _seq_encoder    | RnnEncoder         | 200 K 
2 | fn_norm_predict | PBShell            | 0     
3 | loss_fn         | QuerySoftmaxLoss   | 0     
4 | train_mlm_loss  | MeanMetric         | 0     
5 | valid_mlm_loss  | MeanMetric         | 0     
-------------------------------------------------------
214 K     Trainable params
0         Non-trainable params
214 K     Total params
0.857     Total estimated model params size (MB)


Training: 0it [00:00, ?it/s]