In [2]:
%load_ext autoreload
%autoreload 2

import matplotlib as mpl
import seaborn as sns
import tqdm
from matplotlib import pyplot as plt

%matplotlib inline

# Plot styling: apply the seaborn theme, then set the explicit overrides
# (high-resolution canvas, white backgrounds for axes, figures and saved files).
sns.set(style='whitegrid')
import matplotlib
plt.rcParams.update({
    "figure.dpi": 300,
    'axes.facecolor': 'white',
    'figure.facecolor': 'white',
    'savefig.facecolor': 'white',
})

In [3]:
import sys

# Put the parent directory on the import path so the project imports in the
# following cells can resolve (presumably the `romashka` package lives there).
# Guarded so that re-running this cell does not keep appending duplicates.
PROJECT_PARENT_DIR = '../'
if PROJECT_PARENT_DIR not in sys.path:
    sys.path.append(PROJECT_PARENT_DIR)

In [4]:
%load_ext autoreload
%autoreload 2

import os
import sys
import pickle
import random
import numpy as np
import pandas as pd
import dataclasses
from pathlib import Path
from pprint import pprint

import torch
import transformers
import torch.nn as nn
import pytorch_lightning as pl

from torch.utils.data import IterableDataset, DataLoader

from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, AutoConfig

import tqdm 
from tqdm.notebook import tqdm

from typing import Dict, List, Optional

# Process-wide settings: restrict CUDA to device 0, turn off parallelism in
# the HF tokenizers library, and let pandas display every column.
os.environ.update({
    "CUDA_VISIBLE_DEVICES": '0',
    'TOKENIZERS_PARALLELISM': 'false',
})
pd.set_option('display.max_columns', None)


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
Extension horovod.torch has not been built: /home/user/conda/lib/python3.7/site-packages/horovod/torch/mpi_lib/_mpi_lib.cpython-37m-x86_64-linux-gnu.so not found
If this is not expected, reinstall Horovod with HOROVOD_WITH_PYTORCH=1 to debug the build error.


In [5]:
from romashka.models import TransactionsModel
from romashka.tools import (make_time_batch, 
                   calculate_embedding_size)

from romashka.data_generators import (batches_generator, 
                             cat_features_names, 
                             num_features_names, 
                             meta_features_names)

from romashka.pl_dataloader import TransactionQADataset, TransactionQADataModule
from romashka.transactions_qa.tqa_model import TransactionQAModel
from romashka.transactions_qa.utils import get_projections_maps
from romashka.transactions_qa.tasks import AbstractTask, AutoTask

### Base loading

In [6]:
# Per-feature embedding projection settings, read relative to the package folder.
projections_maps = get_projections_maps(relative_folder="../romashka")

# Print each feature group under its own heading.
for section_title, section_key in (
    ("Categorical", 'cat_embedding_projections'),
    ("Numeric", 'num_embedding_projections'),
    ("Meta", 'meta_embedding_projections'),
):
    print(f"\n{section_title} embeddings projections:")
    pprint(projections_maps[section_key])

# Use the GPU when torch can see one, otherwise fall back to the CPU.
device_name = 'cuda' if torch.cuda.is_available() else 'cpu'
device = torch.device(device_name)
print('Using device:', device)


Categorical embeddings projections:
{'card_type': (175, 29),
 'city': (163, 28),
 'country': (24, 9),
 'currency': (11, 6),
 'day_of_week': (7, 5),
 'ecommerce_flag': (3, 3),
 'hour': (24, 9),
 'income_flag': (3, 3),
 'mcc': (108, 22),
 'mcc_category': (28, 10),
 'operation_kind': (7, 5),
 'operation_type': (22, 9),
 'operation_type_group': (4, 3),
 'payment_system': (7, 5),
 'weekofyear': (53, 15)}

Numeric embeddings projections:
{'amnt': (10, 6), 'days_before': (23, 9), 'hour_diff': (10, 6)}

Meta embeddings projections:
{'product': (5, 4)}
Using device: cuda


In [7]:
# Build the transactions model from the feature name lists and projection maps.
# Note: this cell only constructs the model; no weights are loaded here.
print("Loading Transactions model...")

transactions_model_encoder_type = "whisper/tiny"
transactions_model_head_type = "next"

transactions_model_config = dict(
    cat_features=cat_features_names,
    cat_embedding_projections=projections_maps.get('cat_embedding_projections'),
    num_features=num_features_names,
    num_embedding_projections=projections_maps.get('num_embedding_projections'),
    meta_features=meta_features_names,
    meta_embedding_projections=projections_maps.get('meta_embedding_projections'),
    encoder_type=transactions_model_encoder_type,
    head_type=transactions_model_head_type,
    embedding_dropout=0.1,
)
transactions_model = TransactionsModel(**transactions_model_config)

Loading Transactions model...
USING whisper


In [8]:
# Language model: fetch config, tokenizer and seq2seq weights by name.
language_model_name_or_path = "google/flan-t5-small"
use_fast_tokenizer = True

print(f"Loading Language model: `{language_model_name_or_path}`...")
config_kwargs = dict(use_auth_token=None, return_unused_kwargs=True)

# NOTE(review): HF tokenizers usually spell this option `do_lower_case`;
# confirm that the `do_lowercase` key is actually consumed.
tokenizer_kwargs = dict(
    use_fast=use_fast_tokenizer,
    use_auth_token=None,
    do_lowercase=False,
)

# With `return_unused_kwargs=True` the call yields a (config, leftover kwargs) pair.
config, unused_kwargs = AutoConfig.from_pretrained(language_model_name_or_path, **config_kwargs)

# Tokenizer for the same checkpoint name.
tokenizer = AutoTokenizer.from_pretrained(language_model_name_or_path, **tokenizer_kwargs)

# Seq2seq language model instantiated with the config loaded above.
lm_model = AutoModelForSeq2SeqLM.from_pretrained(language_model_name_or_path, config=config)

Loading Language model: `google/flan-t5-small`...


In [9]:
# Dataset locations, all derived from a single absolute data root.
DATA_PATH = Path("../data").resolve()
TRAIN_BUCKETS_PATH = DATA_PATH.joinpath("train_buckets")
VAL_BUCKETS_PATH = DATA_PATH.joinpath("val_buckets")

# Count the pickled bucket files available for each split.
n_train_files = sum(1 for _ in TRAIN_BUCKETS_PATH.glob("*.pkl"))
print(f"Train contains files: {n_train_files} ")

n_val_files = sum(1 for _ in VAL_BUCKETS_PATH.glob("*.pkl"))
print(f"Validation contains files: {n_val_files} ")

# Metadata CSVs are kept as plain strings.
TRAIN_METAFILE_PATH = str(DATA_PATH.joinpath('train.csv'))
VAL_METAFILE_PATH = str(DATA_PATH.joinpath('val.csv'))

Train contains files: 10 
Validation contains files: 5 


In [10]:
def list_bucket_files(bucket_dir):
    """Return the sorted paths (as strings) of the `*.pkl` files directly inside `bucket_dir`.

    Only `*.pkl` entries are returned, matching the file count printed by the
    previous cell, so unrelated directory content (hidden files, checkpoint
    folders created by Jupyter, ...) is never handed to the dataset.
    """
    return sorted(str(path) for path in Path(bucket_dir).glob("*.pkl"))


# Split name -> list of bucket files for that split.
data_files = {}

train_dataset_files = list_bucket_files(TRAIN_BUCKETS_PATH)
n_train_files = len(train_dataset_files)
print(f"Train contains files: {n_train_files}")
data_files["train"] = train_dataset_files

for fn in train_dataset_files:
    print("\t", fn)

val_dataset_files = list_bucket_files(VAL_BUCKETS_PATH)
n_val_files = len(val_dataset_files)
print(f"\nVal contains files: {n_val_files}")
data_files["validation"] = val_dataset_files

for fn in val_dataset_files:
    print("\t", fn)

Train contains files: 10
	 /home/jovyan/data/train_buckets/processed_chunk_000.pkl
	 /home/jovyan/data/train_buckets/processed_chunk_001.pkl
	 /home/jovyan/data/train_buckets/processed_chunk_002.pkl
	 /home/jovyan/data/train_buckets/processed_chunk_003.pkl
	 /home/jovyan/data/train_buckets/processed_chunk_004.pkl
	 /home/jovyan/data/train_buckets/processed_chunk_005.pkl
	 /home/jovyan/data/train_buckets/processed_chunk_006.pkl
	 /home/jovyan/data/train_buckets/processed_chunk_007.pkl
	 /home/jovyan/data/train_buckets/processed_chunk_008.pkl
	 /home/jovyan/data/train_buckets/processed_chunk_009.pkl

Val contains files: 5
	 /home/jovyan/data/val_buckets/processed_chunk_000.pkl
	 /home/jovyan/data/val_buckets/processed_chunk_001.pkl
	 /home/jovyan/data/val_buckets/processed_chunk_002.pkl
	 /home/jovyan/data/val_buckets/processed_chunk_003.pkl
	 /home/jovyan/data/val_buckets/processed_chunk_004.pkl


In [11]:
# Settings for the validation split; only a validation config is passed to the data module.
val_dataset_config = dict(
    dataset=data_files['validation'],
    min_seq_len=0,
    max_seq_len=250,
    seed=42,
    buffer_size=0,
    batch_size=32,
    generator_batch_size=1,
    num_workers=5,
)

val_ds = TransactionQADataModule(val_dataset_config=val_dataset_config)

### Task and model setup (needs to be changed per experiment)

In [12]:
# Create tasks
tasks = []
task_names = ['default']
task_kwargs = [{"num_options": 6}]  # ground truth + 5 additional options

# Both settings may also be given as the string form of a Python list.
# NOTE(review): `eval` executes arbitrary code; it is only acceptable while
# these values are typed into the notebook. Do not feed it external input.
if isinstance(task_names, str):
    task_names = eval(task_names)
if isinstance(task_kwargs, str):
    task_kwargs = eval(task_kwargs)
print(f"Got task_names: {task_names} with task_kwargs: {task_kwargs}")

for task_i, task_name in enumerate(task_names):
    # Use a separate name and a copy for the per-task kwargs: rebinding
    # `task_kwargs` here would replace the list with a dict and break the
    # lookup for every task after the first, and adding the tokenizer to the
    # original dict would silently change the configuration above.
    single_task_kwargs = dict(task_kwargs[task_i]) if task_i < len(task_kwargs) else {}
    single_task_kwargs.setdefault('tokenizer', tokenizer)
    task = AutoTask.get(task_name=task_name, **single_task_kwargs)
    tasks.append(task)
print(f"Created {len(tasks)} tasks.")

2023-04-06 11:54:08,890 - [INFO] - Tasks - (task_abstract.py).extend_vocabulary(163) - Added to tokenizer: 2 tokens.
2023-04-06 11:54:08,891 - [INFO] - Tasks - (task_abstract.py).extend_vocabulary(169) - Notice: resize_token_embeddings of a model to adapt to the size of the new vocabulary!


Got task_names: ['default'] with task_kwargs: [{'num_options': 6}]
Created 1 tasks.


In [13]:
# `TransactionQAModel` is already imported from `romashka.transactions_qa.tqa_model`
# in the imports cell. It is deliberately not re-imported here through the bare
# `transactions_qa` path, which would shadow that name with a class coming
# from a second module path.

# Create the general Transactions QA model
max_steps = 100_000
warmup_steps = 1000
do_freeze_transactions_model = True
do_freeze_language_model = True
do_freeze_connector = False

transactionsQA_model_config = {
    "warmup_steps": warmup_steps,
    "training_steps": max_steps,
    "do_freeze_tm": do_freeze_transactions_model,
    "do_freeze_lm": do_freeze_language_model,
    "do_freeze_connector": do_freeze_connector,
    # Matches the embedding model output dimension reported in this cell's log (384).
    "connector_input_size": 384,
}
model = TransactionQAModel(
    language_model=lm_model,
    transaction_model=transactions_model,
    tokenizer=tokenizer,
    tasks=tasks,
    **transactionsQA_model_config
)

2023-04-06 11:54:09,370 - [INFO] - TransactionQAModel - (tqa_model.py)._prepare_model(120) - Running in `single task` settingwith a single task: default provided.
2023-04-06 11:54:09,371 - [INFO] - TransactionQAModel - (tqa_model.py)._set_model_type(155) - Language model type: `encoder-decoder`
2023-04-06 11:54:09,372 - [INFO] - TransactionQAModel - (tqa_model.py)._resize_text_embeddings(165) - LM initial `num_embeddings`: 32128, `embedding_dim`: 512


Output dimension of embedding model: 384
Input dimension of autoregressive model: 512
Creating linear connector from 384 to 512 and move to device: cpu.


2023-04-06 11:54:09,611 - [INFO] - TransactionQAModel - (tqa_model.py)._resize_text_embeddings(177) - LM resized `num_embeddings`: 32102, `embedding_dim`: 512
2023-04-06 11:54:09,616 - [INFO] - TransactionQAModel - (tqa_model.py)._prepare_model(131) - Freezing transaction model's parameters...
2023-04-06 11:54:09,617 - [INFO] - TransactionQAModel - (tqa_model.py)._prepare_model(136) - Freezing language model's parameters...


In [14]:
# Checkpoint to evaluate; edit this path to point at the desired training run.
CHECKPOINT_PATH = '/home/jovyan/checkpoints/checkpoints/shuffle-1k-tqa_flan-t5-small_300k-steps_default-binary/last.ckpt'

# Load the tensors onto the CPU regardless of the device they were saved from:
# the model above is built on the CPU, and this also keeps the cell usable on
# a machine without the GPU that produced the checkpoint.
ckpt = torch.load(CHECKPOINT_PATH, map_location='cpu')

FileNotFoundError: [Errno 2] No such file or directory: '/home/jovyan/checkpoints/checkpoints/shuffle-1k-tqa_flan-t5-small_300k-steps_default-binary/last.ckpt'

In [15]:
# Restore the model weights stored under the checkpoint's 'state_dict' entry.
# The call is left as the last expression so the key-matching report is displayed.
checkpoint_weights = ckpt['state_dict']
model.load_state_dict(checkpoint_weights)

<All keys matched successfully>

### AUC, Accuracy

In [None]:
# First token id of each of the two answer options.
ind_pos = model.tokenizer("Yes").input_ids[0]
ind_neg = model.tokenizer("No").input_ids[0]

list_targets = []
list_preds = []

# Switch to inference mode once, before iterating over the validation data.
model.eval()
with torch.no_grad():
    for batch in tqdm(val_ds.val_dataloader()):
        outputs, answers = model.model_step(batch)

        # assumes the answer token is at the second-to-last position of `answers` — TODO confirm
        targets = (answers[:, -2] == ind_pos).long()
        # sigmoid(logit_yes - logit_no) is the probability of "Yes" when only
        # the two option tokens are considered at the first output position.
        preds = torch.sigmoid(outputs.logits[:, 0, ind_pos] - outputs.logits[:, 0, ind_neg])

        list_targets.extend(targets.tolist())
        list_preds.extend(preds.tolist())

### Evaluate transaction model

In [71]:
from sklearn.metrics import accuracy_score

In [16]:
# Pull a single batch from the validation loader for inspection.
val_loader = val_ds.val_dataloader()
batch = next(iter(val_loader))

In [73]:
# Cut-off applied to both the true and the predicted value of the first numeric feature.
THRESHOLD = 0.41

list_preds = []
list_targets = []

transactions_model.eval()
with torch.no_grad():
    for batch in tqdm(val_ds.val_dataloader()):
        p = transactions_model(batch)

        # Index of the last counted position per row; `keepdim=True` keeps it
        # as a column so it can be used directly as a `gather` index.
        # assumes batch['mask'] is (batch, seq) — TODO confirm
        trx_index = batch['mask'].sum(1, keepdim=True) - 1
        input_labels = torch.gather(batch['num_features'][0], 1, trx_index)
        # Drop only the gathered column axis. A bare `.squeeze()` would also
        # drop the batch axis when a batch holds a single sequence.
        target = (input_labels < THRESHOLD).long().squeeze(1)

        target_feature_batch = p['num_features'][0]
        # presumably (batch, seq, 1); remove just the trailing axis so the
        # tensor stays 2-D for `gather` even for a batch of one — verify shape
        if target_feature_batch.dim() == 3:
            target_feature_batch = target_feature_batch.squeeze(-1)
        # NOTE(review): `trx_index - 1` is -1 for a row whose mask sums to 1,
        # which `gather` rejects — confirm such rows cannot occur.
        preds = torch.gather(target_feature_batch, 1, trx_index - 1)
        pred = (preds < THRESHOLD).long().squeeze(1)

        # Store plain Python ints, as the other evaluation loop does.
        list_preds.extend(pred.tolist())
        list_targets.extend(target.tolist())

0it [00:00, ?it/s]

IndexError: Dimension out of range (expected to be in range of [-1, 0], but got 1)

In [72]:
# Ground truth first, predictions second, as the scikit-learn signature expects.
accuracy_score(list_targets, list_preds)

0.3484180497925311