In [1]:
%load_ext autoreload
%autoreload 2

import matplotlib as mpl
import seaborn as sns
import tqdm
from matplotlib import pyplot as plt

%matplotlib inline

sns.set(style='whitegrid')
import matplotlib
matplotlib.rcParams["figure.dpi"] = 300
plt.rcParams['axes.facecolor'] = 'white'
plt.rcParams['figure.facecolor'] = 'white'
plt.rcParams['savefig.facecolor'] = 'white'
plt.rcParams['savefig.facecolor'] = 'white'

In [2]:
import sys 
sys.path.append('../')

In [3]:
import os
import pickle
import random
import numpy as np
import pandas as pd
from pathlib import Path
from pprint import pprint

import torch
import transformers
import torch.nn as nn
import pytorch_lightning as pl

from torch.utils.data import IterableDataset, DataLoader

from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, AutoConfig

import tqdm 
from tqdm.notebook import tqdm

from typing import Dict, List, Optional

os.environ["CUDA_VISIBLE_DEVICES"] = '0'
os.environ['TOKENIZERS_PARALLELISM'] = 'false'
pd.set_option('display.max_columns', None)


In [4]:
from src.transactions_qa.tqa_model import TransactionQAModel
from src.models.components.models import TransactionsModel
from src.utils.tools import (make_time_batch, 
                   calculate_embedding_size)

from src.data.alfa.components import ( 
                             cat_features_names, 
                             num_features_names, 
                             meta_features_names)

from src.data import AlfaDataModule 
from src.transactions_qa.tqa_model import TransactionQAModel
from src.transactions_qa.utils import get_projections_maps
from src.tasks import AbstractTask, AutoTask

### Loading transaction model

In [5]:
def load_transaction_model(encoder_type='whisper/tiny', head_type='next'):
    projections_maps = get_projections_maps(relative_folder="..")
    # Loading Transactions model & weights
    print(f"Loading Transactions model...")

    transactions_model_encoder_type = encoder_type
    transactions_model_head_type = head_type


    transactions_model_config = {
        "cat_features": cat_features_names,
        "cat_embedding_projections": projections_maps.get('cat_embedding_projections'),
        "num_features": num_features_names,
        "num_embedding_projections": projections_maps.get('num_embedding_projections'),
        "meta_features": meta_features_names,
        "meta_embedding_projections": projections_maps.get('meta_embedding_projections'),
        "encoder_type": transactions_model_encoder_type,
        "head_type": transactions_model_head_type,
        "embedding_dropout": 0.1
    }
    transactions_model = TransactionsModel(**transactions_model_config)

    return transactions_model, projections_maps

In [6]:
def load_language_model(language_model_name_or_path="google/flan-t5-small"):
    use_fast_tokenizer = True

    print(f"Loading Language model: `{language_model_name_or_path}`...")
    config_kwargs = {
        "use_auth_token": None,
        "return_unused_kwargs": True
    }

    tokenizer_kwargs = {
        "use_fast": use_fast_tokenizer,
        "use_auth_token": None,
        "do_lowercase": False
    }

    config, unused_kwargs = AutoConfig.from_pretrained(
        language_model_name_or_path, **config_kwargs
    )
    # Download vocabulary from huggingface.co and define model-specific arguments
    tokenizer = AutoTokenizer.from_pretrained(language_model_name_or_path, **tokenizer_kwargs)

    # Download model from huggingface.co and cache.
    lm_model = AutoModelForSeq2SeqLM.from_pretrained(
        language_model_name_or_path,
        config=config
    )
    return lm_model, tokenizer

In [7]:
def load_datamodule():
    DATA_PATH = '/home/jovyan/romashka/data' 
    dataset_config = {
                'data_dir': DATA_PATH,
                'batch_size': 32,
                'min_seq_len': 0,
                'max_seq_len': 250,
                'shuffle': True,
                'num_workers': 5,
                'pin_memory': True,
                'seed': 42
    }    

    dm = AlfaDataModule(**dataset_config)
    return dm

In [16]:
def load_tasks(task_names, tokenizer):
    # Create tasks
    tasks = []
    task_kwargs = [{"num_options": 6 }] # ground truth + 5 additional options
    if isinstance(task_names, str):
        task_names = eval(task_names)
    task_kwargs = task_kwargs
    if isinstance(task_kwargs, str):
        task_kwargs = eval(task_kwargs)
    print(f"Got task_names: {task_names} with task_kwargs: {task_kwargs}")

    for task_i, task_name in enumerate(task_names):
        task_kwargs = task_kwargs[task_i] if task_i < len(task_kwargs) else {}
        if "tokenizer" not in task_kwargs:
            task_kwargs['tokenizer'] = tokenizer
        task_kwargs['use_numerical'] = True    
        task = AutoTask.get(task_name=task_name, **task_kwargs)
        tasks.append(task)
    print(f"Created {len(tasks)} tasks.")
    return tasks

In [17]:
def make_tqa_model(lm_model, transactions_model, tokenizer, tasks):
    # Create general Tranactions QA model
    max_steps = 100_000
    warmup_steps = 1000
    do_freeze_transactions_model = True
    do_freeze_language_model = True
    do_freeze_connector = False

    transactionsQA_model_config = {
        "warmup_steps": warmup_steps,
        "training_steps": max_steps,
        "do_freeze_tm": do_freeze_transactions_model,
        "do_freeze_lm": do_freeze_language_model,
        "do_freeze_connector": do_freeze_connector,
        "connector_input_size": 384,
        "use_numerical": True
    }

    model = TransactionQAModel(
        language_model=lm_model,
        transaction_model=transactions_model,
        tokenizer=tokenizer,
        tasks=tasks,
        **transactionsQA_model_config
    )
    return model

In [18]:
task_names = ['next_amnt_open_ended']
LM_NAME = 'google/flan-t5-small'

lm_model, tokenizer = load_language_model(language_model_name_or_path=LM_NAME)
transactions_model, projections_maps = load_transaction_model()

tasks = load_tasks(task_names, tokenizer)
dm = load_datamodule()

tqa_model = make_tqa_model(lm_model, transactions_model, tokenizer, tasks)

Loading Language model: `google/flan-t5-small`...
Loading Transactions model...


2023-05-21 14:34:13,559 - [INFO] - Tasks - (task_abstract.py).extend_vocabulary(170) - Added to tokenizer: 3 tokens.
2023-05-21 14:34:13,561 - [INFO] - Tasks - (task_abstract.py).extend_vocabulary(176) - Notice: resize_token_embeddings of a model to adapt to the size of the new vocabulary!
2023-05-21 14:34:13,572 - [INFO] - TransactionQAModel - (tqa_model.py).__init__(73) - Setuping metrics.
2023-05-21 14:34:13,572 - [INFO] - TransactionQAModel - (tqa_model.py).__init__(73) - Setuping metrics.
2023-05-21 14:34:13,572 - [INFO] - TransactionQAModel - (tqa_model.py).__init__(73) - Setuping metrics.
2023-05-21 14:34:13,577 - [INFO] - TransactionQAModel - (tqa_model.py)._prepare_model(166) - Running in `single task` settingwith a single task: next_amnt_open_ended provided.
2023-05-21 14:34:13,577 - [INFO] - TransactionQAModel - (tqa_model.py)._prepare_model(166) - Running in `single task` settingwith a single task: next_amnt_open_ended provided.
2023-05-21 14:34:13,577 - [INFO] - Transactio

USING whisper
Got task_names: ['next_amnt_open_ended'] with task_kwargs: [{'num_options': 6}]
Created 1 tasks.
Output dimension of embedding model: 384
Input dimension of autoregressive model: 512
Creating linear connector from 384 to 512 and move to device: cpu.
ModuleDict(
  (next_amnt_open_ended): ModuleDict(
    (mae): MeanAbsoluteError()
  )
)


2023-05-21 14:34:13,805 - [INFO] - TransactionQAModel - (tqa_model.py)._resize_text_embeddings(223) - LM resized `num_embeddings`: 32103, `embedding_dim`: 512
2023-05-21 14:34:13,805 - [INFO] - TransactionQAModel - (tqa_model.py)._resize_text_embeddings(223) - LM resized `num_embeddings`: 32103, `embedding_dim`: 512
2023-05-21 14:34:13,805 - [INFO] - TransactionQAModel - (tqa_model.py)._resize_text_embeddings(223) - LM resized `num_embeddings`: 32103, `embedding_dim`: 512
2023-05-21 14:34:13,811 - [INFO] - TransactionQAModel - (tqa_model.py)._prepare_model(177) - Freezing transaction model's parameters...
2023-05-21 14:34:13,811 - [INFO] - TransactionQAModel - (tqa_model.py)._prepare_model(177) - Freezing transaction model's parameters...
2023-05-21 14:34:13,811 - [INFO] - TransactionQAModel - (tqa_model.py)._prepare_model(177) - Freezing transaction model's parameters...
2023-05-21 14:34:13,813 - [INFO] - TransactionQAModel - (tqa_model.py)._prepare_model(182) - Freezing language mode

In [19]:
ckpt = torch.load('/home/jovyan/romashka/checkpoints/checkpoints/tqa_200k-steps_ft=all_numerical_next_amnt_open_ended_flan-t5-small_v8/last.ckpt')

In [20]:
tqa_model.load_state_dict(ckpt['state_dict'])

<All keys matched successfully>

In [21]:
batch = next(iter(dm.val_dataloader()))

In [34]:
res = tqa_model.model_step(batch)

In [30]:
mantissa = res[0]['mantissa']
exponent = res[0]['exponent']

In [43]:
out = tasks[0].calculate_metrics(res[0], res[1], tqa_model.val_metrics[tasks[0].task_name], stage='val_')

In [54]:
preds, targets = tasks[0].process_num_outputs(res[0], res[1])

In [57]:
tqa_model.val_metrics[tasks[0].task_name]['mae'](preds, targets)

tensor(0.0516, grad_fn=<SqueezeBackward0>)

In [43]:
preds, targets = tqa_model.tasks[0].process_outputs(res[0], res[1])


# if 'accuracy' in task_metrics:
#     task_metrics['accuracy'](preds, targets)
#     metrics[self.task_name + '_accuracy'] = task_metrics['accuracy']

In [47]:
tqa_model.tasks[0].metrics['accuracy'](preds, targets)

tensor(0.)

In [27]:
# ckpt = torch.load('/home/jovyan/checkpoints/checkpoints/tqa_flan-t5-base_300k-steps_default-binary/last.ckpt')
# tqa_model.load_state_dict(ckpt['state_dict'])
# tqa_model.to('cuda');

### AUC, Accuracy

In [56]:
ind_pos = tqa_model.tokenizer("Yes").input_ids[0]
ind_neg = tqa_model.tokenizer("No").input_ids[0]

list_targets = []
list_preds = [] 

with torch.no_grad():
    for batch in tqdm(dm.val_dataloader()):
        tqa_model.eval()
        cuda_batch = {k: v.to(device='cuda', non_blocking=True) for k, v in batch.items()}
        outputs, answers = tqa_model.model_step(cuda_batch)

        targets = (answers[:, -2] == ind_pos).long()
        preds = torch.sigmoid(outputs.logits[:, 0, ind_pos] - outputs.logits[:, 0, ind_neg])

        list_targets.extend(targets.tolist())
        list_preds.extend(preds.tolist())

0it [00:00, ?it/s]

### Calculate metrics

In [57]:
from sklearn.metrics import accuracy_score, roc_auc_score

In [58]:
hard_preds =list(map(lambda x: int(x>0.5), list_preds))

In [59]:
roc_auc_score(list_targets, list_preds)

0.7496266470785984

In [60]:
accuracy_score(list_targets, hard_preds)

0.9725179756637168

In [1]:
from transformers import Adafactor

In [1]:
import torch

In [2]:
t = torch.arange(10) * 3.5

In [13]:
list(map(lambda x: str(round(x.item(), 2)),t))

['0.0', '3.5', '7.0', '10.5', '14.0', '17.5', '21.0', '24.5', '28.0', '31.5']