In [98]:
%load_ext autoreload
%autoreload 2

import os
import torch 
import pickle
from torch.utils.data import IterableDataset, DataLoader
from models import TransactionsModel
from data_generators import batches_generator, cat_features_names, num_features_names, meta_features_names

from embedding import EmbeddingLayer
from ptls.frames import PtlsDataModule
from ptls.frames.bert import MLMPretrainModule
from ptls.frames.coles import CoLESModule, ColesIterableDataset

from ptls.nn import TransformerEncoder
from functools import partial
from collections import namedtuple

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [99]:
def _load_pickle(path):
    """Return the object deserialized from the pickle file at ``path``."""
    with open(path, 'rb') as handle:
        return pickle.load(handle)


# Projection tables for the numerical, categorical and meta features,
# each read from its own pickle under ./assets.
num_embedding_projections = _load_pickle('./assets/num_embedding_projections.pkl')
cat_embedding_projections = _load_pickle('./assets/cat_embedding_projections.pkl')
meta_embedding_projections = _load_pickle('./assets/meta_embedding_projections.pkl')

In [100]:
class PaddedBatch:
    """Minimal batch container exposing ``payload`` and ``seq_lens``.

    ``payload`` is the object passed in. ``seq_lens`` is a plain list holding
    ``data.shape[1]`` repeated ``data.shape[0]`` times, i.e. every row is
    reported with the same length.
    """

    def __init__(self, data):
        self.payload = data
        steps_per_row = data.shape[1]
        row_count = data.shape[0]
        self.seq_lens = [steps_per_row for _ in range(row_count)]

In [101]:
class IterDataset(IterableDataset):
    """Iterable dataset that streams items from a user-supplied source.

    Args:
        generator: Either an iterable/iterator of items, or a zero-argument
            callable that returns a fresh iterable each time it is called.
            A bare generator object can be walked only once; pass a factory
            (for example ``functools.partial(make_batches, ...)``) when the
            dataset has to be iterated again, e.g. once per training epoch.
    """

    def __init__(self, generator):
        self.generator = generator

    def __iter__(self):
        """Return an iterator over the source, rebuilding it from a factory."""
        # A callable is treated as a factory so every pass starts from scratch.
        source = self.generator() if callable(self.generator) else self.generator
        # iter() is a no-op for iterators/generators and makes plain iterables
        # (lists, tuples) valid return values for __iter__ as well.
        return iter(source)

    def collate_fn(self, x):
        """Return the first element of ``x``.

        Presumably meant for loaders that wrap each already-assembled batch in
        a one-element list - confirm against the code that calls it.
        """
        return x[0]

In [87]:
# Directory that holds the training bucket files.
path_to_dataset = '/home/jovyan/data/alfa/train_buckets'

# Full paths of every directory entry, in sorted order.
dir_with_datasets = os.listdir(path_to_dataset)
dataset_train = [os.path.join(path_to_dataset, entry) for entry in dir_with_datasets]
dataset_train.sort()

# Use the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

In [88]:
# Training batch source built from the bucket paths: 128 per batch, shuffled,
# torch output format, targeting `device`.
train_dataloader = batches_generator(
    dataset_train,
    batch_size=128,
    shuffle=True,
    device=device,
    is_train=True,
    output_format='torch',
)

In [89]:
# Expose the batch generator through the IterableDataset interface.
d = IterDataset(generator=train_dataloader)

In [90]:
# Plain torch DataLoader over the iterable dataset, one item per batch.
dataloader = DataLoader(dataset=d, batch_size=1)

In [91]:
# Data module that serves `d` as its training data.
train_dl = PtlsDataModule(train_data=d)

In [92]:
# Pull the first batch from the data module's training dataloader for inspection.
# NOTE(review): `d` wraps a single generator object, so this batch is presumably
# consumed here and will not be seen again by later iteration - confirm.
b = next(iter(train_dl.train_dataloader()))

In [93]:
class PtlsEmbeddingLayer(EmbeddingLayer):
    """``EmbeddingLayer`` whose forward output is wrapped in a ``PaddedBatch``.

    Construction is inherited unchanged from ``EmbeddingLayer``; every
    positional and keyword argument goes straight to the parent class.
    """

    def forward(self, batch):
        """Run the parent ``forward`` on ``batch`` and wrap its result."""
        return PaddedBatch(super().forward(batch))

In [94]:
# Embedding layer over the categorical, numerical and meta features.
# It is moved to `device` (CUDA when available, CPU otherwise) rather than
# calling .cuda() unconditionally, so the cell also runs on CPU-only machines.
ptls_emb_layer = PtlsEmbeddingLayer(
    cat_embedding_projections,
    cat_features_names,
    num_embedding_projections,
    num_features_names,
    meta_embedding_projections,
    meta_features_names,
).to(device)

In [95]:
# Transformer encoder whose input size equals the embedding layer's output
# size, using two attention heads.
m = TransformerEncoder(
    input_size=ptls_emb_layer.get_embedding_size(),
    n_heads=2,
)

In [96]:
from ptls.nn import RnnSeqEncoder

# The sequence encoder is built first: the pretraining module below receives it
# as an argument, so it must exist before `MLMPretrainModule` is constructed
# (otherwise a fresh kernel fails with NameError on `seq_encoder`).
# It is placed on `device` instead of calling .cuda() so CPU-only runs work.
seq_encoder = RnnSeqEncoder(
    input_size=ptls_emb_layer.get_embedding_size(),
    trx_encoder=ptls_emb_layer,
    hidden_size=256,
    type='gru',
).to(device)

# NOTE(review): `ptls_emb_layer` is handed both to `seq_encoder` (as its
# trx_encoder) and to the module directly, and the module's hidden_size is the
# embedding size while the GRU uses 256 - confirm this is the intended wiring.
model = MLMPretrainModule(
    trx_encoder=ptls_emb_layer,
    hidden_size=ptls_emb_layer.get_embedding_size(),
    seq_encoder=seq_encoder,
    total_steps=10,
)

In [66]:
import logging

import pytorch_lightning as pl
import torch

# Train for 15 epochs on one GPU when CUDA is available (zero GPUs otherwise),
# with the progress bar switched off.
trainer = pl.Trainer(
    max_epochs=15,
    gpus=int(torch.cuda.is_available()),
    enable_progress_bar=False,
)

GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


In [67]:
# Start training `model` with the data provided by `train_dl`.
trainer.fit(model, train_dl)


Missing logger folder: /home/jovyan/romashka/lightning_logs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name               | Type            | Params
-------------------------------------------------------
0 | _loss              | ContrastiveLoss | 0     
1 | _seq_encoder       | RnnSeqEncoder   | 355 K 
2 | _validation_metric | BatchRecallTopK | 0     
3 | _head              | Head            | 0     
-------------------------------------------------------
355 K     Trainable params
0         Non-trainable params
355 K     Total params
1.422     Total estimated model params size (MB)


RuntimeError: selected index k out of range