In [45]:
%load_ext autoreload
%autoreload 2

import matplotlib as mpl
import seaborn as sns
import tqdm
import re

from matplotlib import pyplot as plt

%matplotlib inline

# Global plotting style: seaborn whitegrid theme, high-resolution figures and
# white backgrounds for axes, figures and saved files.
sns.set(style='whitegrid')
import matplotlib
matplotlib.rcParams["figure.dpi"] = 300
plt.rcParams['axes.facecolor'] = 'white'
plt.rcParams['figure.facecolor'] = 'white'
plt.rcParams['savefig.facecolor'] = 'white'

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [3]:
import sys 
sys.path.append('../')

#### Imports

In [4]:
import os
import numpy as np
import torch
import torch.nn as nn
import wandb

import tqdm
import pickle
import pytorch_lightning as pl
import random

from torch.utils.data import IterableDataset, DataLoader
from pytorch_lightning.loggers import WandbLogger
from pytorch_lightning.callbacks import ModelCheckpoint

from functools import partial
from collections import namedtuple
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

In [5]:
from src.transactions_qa.tqa_model import TransactionQAModel
from src.models.components.models import TransactionsModel
from src.utils.tools import (make_time_batch, 
                   calculate_embedding_size)

from src.data.alfa.components import ( 
                             cat_features_names, 
                             num_features_names, 
                             meta_features_names)

from src.data import AlfaDataModule 
from src.transactions_qa.tqa_model import TransactionQAModel
from src.transactions_qa.utils import get_projections_maps, get_exponent_number, get_mantissa_number
from src.tasks import AbstractTask, AutoTask
from src.transactions_qa.utils import get_split_indices,  prepare_splitted_batch, collate_batch_dict

In [6]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, AutoConfig

In [7]:
os.environ['TOKENIZERS_PARALLELISM'] = 'false'

In [8]:
def load_transaction_model(encoder_type='whisper/tiny', head_type='next'):
    """Instantiate a `TransactionsModel` and return it with its projection maps.

    Args:
        encoder_type: value passed to the model as `encoder_type`.
        head_type: value passed to the model as `head_type`.

    Returns:
        A tuple `(transactions_model, projections_maps)`, where
        `projections_maps` is whatever `get_projections_maps(relative_folder="..")`
        returned.

    Note:
        Only the model object is constructed here; no checkpoint / weights are
        loaded by this function.
    """
    # Projection maps are looked up relative to the parent folder of the notebook.
    projections_maps = get_projections_maps(relative_folder="..")
    print("Loading Transactions model...")

    transactions_model = TransactionsModel(
        cat_features=cat_features_names,
        cat_embedding_projections=projections_maps.get('cat_embedding_projections'),
        num_features=num_features_names,
        num_embedding_projections=projections_maps.get('num_embedding_projections'),
        meta_features=meta_features_names,
        meta_embedding_projections=projections_maps.get('meta_embedding_projections'),
        encoder_type=encoder_type,
        head_type=head_type,
        embedding_dropout=0.1,
    )
    return transactions_model, projections_maps

In [9]:
def load_language_model(language_model_name_or_path="google/flan-t5-small"):
    """Load a seq2seq language model and its tokenizer.

    Args:
        language_model_name_or_path: model identifier or local path handed to
            the `from_pretrained` loaders.

    Returns:
        A tuple `(lm_model, tokenizer)`.
    """
    print(f"Loading Language model: `{language_model_name_or_path}`...")

    # With return_unused_kwargs=True the loader hands back a (config, leftovers)
    # pair; the leftovers are not needed here.
    config, _unused_kwargs = AutoConfig.from_pretrained(
        language_model_name_or_path,
        use_auth_token=None,
        return_unused_kwargs=True,
    )

    # Tokenizer (fast implementation requested).
    # NOTE(review): `do_lowercase` may be a misspelling of `do_lower_case` —
    # confirm against the tokenizer class before changing it.
    tokenizer = AutoTokenizer.from_pretrained(
        language_model_name_or_path,
        use_fast=True,
        use_auth_token=None,
        do_lowercase=False,
    )

    # Model weights, built from the config loaded above.
    lm_model = AutoModelForSeq2SeqLM.from_pretrained(
        language_model_name_or_path,
        config=config,
    )
    return lm_model, tokenizer

In [10]:
def load_datamodule(data_dir='/home/jovyan/romashka/data', **overrides):
    """Create the `AlfaDataModule` used by this notebook.

    Args:
        data_dir: directory with the dataset. Defaults to the path originally
            hard-coded in this notebook so existing calls keep working; pass a
            different location when running on another machine.
        **overrides: optional replacements / additions for the default
            data-module settings (e.g. `batch_size=8`, `num_workers=0`).

    Returns:
        The constructed `AlfaDataModule`.
    """
    dataset_config = {
        'data_dir': data_dir,
        'batch_size': 32,
        'min_seq_len': 0,
        'max_seq_len': 250,
        'shuffle': True,
        'num_workers': 5,
        'pin_memory': True,
        'seed': 42,
    }
    # Caller-supplied settings win over the defaults above.
    dataset_config.update(overrides)

    dm = AlfaDataModule(**dataset_config)
    return dm

In [56]:
def load_tasks(task_names, tokenizer):
    """Create task objects for the given task names.

    Args:
        task_names: a list of task names, or a string. A string holding a
            Python literal (e.g. "['a', 'b']") is parsed into that literal;
            any other string is treated as a single task name.
        tokenizer: tokenizer handed to every task that has no explicit
            'tokenizer' entry in its keyword arguments.

    Returns:
        List of tasks created via `AutoTask.get`, in the order of `task_names`.
    """
    # Local import keeps this cell self-contained.
    import ast

    # Per-task keyword arguments, matched to `task_names` BY POSITION.
    # ground truth + 5 additional options
    # NOTE(review): because matching is positional, a one-element `task_names`
    # always receives the first entry, whatever the task is — confirm intended.
    tasks_kwargs = [
        {"num_options": 6, "floating_threshold": True, 'answer2text': False, 'use_numerical_output': False},
        {"num_options": 6, "floating_threshold": False, 'use_numerical_output': False},
    ]

    if isinstance(task_names, str):
        # Security: `eval` would execute arbitrary code embedded in the string;
        # `ast.literal_eval` only accepts Python literals.
        try:
            task_names = ast.literal_eval(task_names)
        except (ValueError, SyntaxError):
            # Not a literal -> interpret the raw string as one task name.
            task_names = [task_names]
        else:
            if isinstance(task_names, str):
                # A quoted single name; avoid iterating over its characters.
                task_names = [task_names]
    print(f"Got task_names: {task_names} with task_kwargs: {tasks_kwargs}")

    tasks = []
    for task_i, task_name in enumerate(task_names):
        # Copy so that injecting the tokenizer never alters the defaults above.
        task_kwargs = dict(tasks_kwargs[task_i]) if task_i < len(tasks_kwargs) else {}
        task_kwargs.setdefault('tokenizer', tokenizer)
        tasks.append(AutoTask.get(task_name=task_name, **task_kwargs))
    print(f"Created {len(tasks)} tasks.")
    return tasks

In [57]:
def make_tqa_model(lm_model, transactions_model, tokenizer, tasks):
    """Assemble the general Transactions QA model.

    Args:
        lm_model: language model passed as `language_model`.
        transactions_model: transactions model passed as `transaction_model`.
        tokenizer: tokenizer passed to the QA model.
        tasks: list of task objects passed to the QA model.

    Returns:
        The constructed `TransactionQAModel`. It is configured with the
        transactions model and language model frozen and the connector left
        unfrozen.
    """
    return TransactionQAModel(
        language_model=lm_model,
        transaction_model=transactions_model,
        tokenizer=tokenizer,
        tasks=tasks,
        # Schedule settings.
        warmup_steps=1000,
        training_steps=100_000,
        # Freeze flags: only the connector stays unfrozen.
        do_freeze_tm=True,
        do_freeze_lm=True,
        do_freeze_connector=False,
        # Connector / numerical settings.
        connector_input_size=384,
        use_numerical_input=False,
        use_numerical_output=False,
        numerical_context="context",
    )

In [58]:
# task_names = ['next_transactions_30_days_binary', 'default', 'next_mcc_binary', 'next_mcc_open_ended']
# Two single-task setups for the same "next amount" target:
# one open-ended (numeric answer) and one binary.
task_names1 = ['next_amnt_open_ended']
task_names2 = ['next_amnt_binary']

LM_NAME = 'google/flan-t5-small'

# One language model, tokenizer and transactions model are created here and
# passed to BOTH QA models below.
lm_model, tokenizer = load_language_model(language_model_name_or_path=LM_NAME)
transactions_model, projections_maps = load_transaction_model()

# Tasks are created before the QA models and receive the shared tokenizer.
# NOTE(review): the task log reports tokens being added to the tokenizer —
# presumably task creation must therefore precede model creation; confirm.
tasks1 = load_tasks(task_names1, tokenizer)
tasks2 = load_tasks(task_names2, tokenizer)
dm = load_datamodule()

# NOTE(review): both wrappers are given the same `lm_model` and
# `transactions_model` objects — verify that sharing them is intended.
tqa_model1 = make_tqa_model(lm_model, transactions_model, tokenizer, tasks1)
tqa_model2 = make_tqa_model(lm_model, transactions_model, tokenizer, tasks2)

Loading Language model: `google/flan-t5-small`...
Loading Transactions model...


2023-06-22 11:27:22,967 - [INFO] - Tasks - (task_abstract.py).generate_question_templates(206) - Given 5 starting options and 1 ending options results in 5 total combinations.
2023-06-22 11:27:22,970 - [INFO] - Tasks - (task_abstract.py).extend_vocabulary(173) - Added to tokenizer: 2 tokens.
2023-06-22 11:27:22,971 - [INFO] - Tasks - (task_abstract.py).extend_vocabulary(179) - Notice: resize_token_embeddings of a model to adapt to the size of the new vocabulary!
2023-06-22 11:27:22,978 - [INFO] - Tasks - (task_abstract.py).generate_question_templates(206) - Given 5 starting options and 1 ending options results in 5 total combinations.
2023-06-22 11:27:22,979 - [INFO] - Tasks - (task_abstract.py).extend_vocabulary(173) - Added to tokenizer: 0 tokens.
2023-06-22 11:27:22,980 - [INFO] - Tasks - (task_abstract.py).extend_vocabulary(179) - Notice: resize_token_embeddings of a model to adapt to the size of the new vocabulary!
2023-06-22 11:27:22,982 - [INFO] - Tasks - (task_abstract.py).gene

USING whisper
Got task_names: ['next_amnt_open_ended'] with task_kwargs: [{'num_options': 6, 'floating_threshold': True, 'answer2text': False, 'use_numerical_output': False}, {'num_options': 6, 'floating_threshold': False, 'use_numerical_output': False}]
Created 1 tasks.
Got task_names: ['next_amnt_binary'] with task_kwargs: [{'num_options': 6, 'floating_threshold': True, 'answer2text': False, 'use_numerical_output': False}, {'num_options': 6, 'floating_threshold': False, 'use_numerical_output': False}]
Created 1 tasks.
Output dimension of embedding model: 384
Input dimension of autoregressive model: 512
Creating linear connector from 384 to 512 and move to device: cpu.
ModuleDict(
  (next_amnt_open_ended): ModuleDict(
    (mae): MeanAbsoluteError()
    (mape): MeanAbsolutePercentageError()
    (rouge): ROUGEScore()
  )
)


2023-06-22 11:27:23,293 - [INFO] - TransactionQAModel - (tqa_model.py)._resize_text_embeddings(263) - LM resized `num_embeddings`: 32102, `embedding_dim`: 512
2023-06-22 11:27:23,293 - [INFO] - TransactionQAModel - (tqa_model.py)._resize_text_embeddings(263) - LM resized `num_embeddings`: 32102, `embedding_dim`: 512
2023-06-22 11:27:23,293 - [INFO] - TransactionQAModel - (tqa_model.py)._resize_text_embeddings(263) - LM resized `num_embeddings`: 32102, `embedding_dim`: 512
2023-06-22 11:27:23,299 - [INFO] - TransactionQAModel - (tqa_model.py)._prepare_model(217) - Freezing transaction model's parameters...
2023-06-22 11:27:23,299 - [INFO] - TransactionQAModel - (tqa_model.py)._prepare_model(217) - Freezing transaction model's parameters...
2023-06-22 11:27:23,299 - [INFO] - TransactionQAModel - (tqa_model.py)._prepare_model(217) - Freezing transaction model's parameters...
2023-06-22 11:27:23,304 - [INFO] - TransactionQAModel - (tqa_model.py)._prepare_model(222) - Freezing language mode

Output dimension of embedding model: 384
Input dimension of autoregressive model: 512
Creating linear connector from 384 to 512 and move to device: cpu.
ModuleDict(
  (next_amnt_binary): ModuleDict(
    (auc): BinaryAUROC()
    (accuracy): BinaryAccuracy()
  )
)


In [59]:
# ckpt = torch.load('/home/jovyan/romashka/checkpoints/checkpoints/tqa_200k-steps_ft=all_numerical_v5/last.ckpt')['state_dict']

# tqa_model.load_state_dict(ckpt, strict=False)
# tqa_model.cuda();

### Numerical Input

In [60]:
# Pull one batch from the validation dataloader for ad-hoc inspection.
batch = next(iter(dm.val_dataloader()))

In [71]:
import os
import contextlib

import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning) 

In [None]:
# Per-batch correctness masks for the two approaches, collected over the
# validation set.
open_ended_results = []
binary_results = []

# First decimal number found in a decoded question is used as its threshold.
# Raw string: "\d" in a normal string literal is an invalid escape sequence.
threshold_pattern = re.compile(r"\d+\.\d+")

# Switching to eval mode is loop-invariant, so do it once up front.
tqa_model1.eval()
tqa_model2.eval()

with torch.no_grad():
    for batch in tqdm.tqdm(dm.val_dataloader()):
        # Removing prints
        with open(os.devnull, "w") as f, contextlib.redirect_stdout(f):
            # Open-ended model (its own targets, `true_output`, are not used below).
            out1, batch_answers1 = tqa_model1.model_step(batch)
            pred1, true_output = tqa_model1.tasks[0].process_outputs(out1, batch_answers1)

            # Binary model.
            out2, batch_answers2 = tqa_model2.model_step(batch)
            pred2, true_output2 = tqa_model2.tasks[0].process_outputs(out2, batch_answers2)

            # Recover the threshold of each binary question from its decoded text.
            # NOTE(review): raises IndexError if a question holds no decimal number.
            questions = tqa_model2.tokenizer.batch_decode(out2['question_encoded'], skip_special_tokens=True)
            thresholds = torch.tensor([float(threshold_pattern.findall(string)[0]) for string in questions])

            # Open-ended predictions are binarised against the question threshold,
            # binary predictions against 0.5; both are scored on the binary targets.
            open_ended_predictions = (pred1 > thresholds) == true_output2
            binary_predictions = (pred2 > 0.5) == true_output2

            open_ended_results.append(open_ended_predictions)
            binary_results.append(binary_predictions)