In [35]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [36]:
import os
import sys 
import tqdm
import numpy as np
sys.path.append('../')


from functools import partial
from collections import namedtuple
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, AutoConfig

In [37]:
# !pip install lightgbm
# !pip install catboost

In [38]:
import torch

from src.transactions_qa.tqa_model import TransactionQAModel
from src.models.components.models import TransactionsModel
from src.utils.tools import (make_time_batch, 
                   calculate_embedding_size)

from src.data.alfa.components import ( 
                             cat_features_names, 
                             num_features_names, 
                             meta_features_names)

from src.data import AlfaDataModule 
from src.transactions_qa.tqa_model import TransactionQAModel
from src.transactions_qa.utils import get_projections_maps, get_exponent_number, get_mantissa_number
from src.tasks import AbstractTask, AutoTask
from src.transactions_qa.utils import get_split_indices,  prepare_splitted_batch, collate_batch_dict

In [39]:
# Set TOKENIZERS_PARALLELISM to 'false' in this process' environment.
# NOTE(review): presumably avoids tokenizer thread/fork warnings once the
# DataLoader spawns workers (num_workers=5 below) — confirm.
os.environ['TOKENIZERS_PARALLELISM'] = 'false'

In [40]:
def load_transaction_model(encoder_type='whisper/tiny', head_type='next'):
    """Build a `TransactionsModel` and return it with its projection maps.

    The projection maps are read via `get_projections_maps` relative to the
    parent folder (".."). No checkpoint weights are loaded here.

    Args:
        encoder_type: forwarded to `TransactionsModel` as `encoder_type`.
        head_type: forwarded to `TransactionsModel` as `head_type`.

    Returns:
        A `(transactions_model, projections_maps)` tuple.
    """
    projections_maps = get_projections_maps(relative_folder="..")
    print(f"Loading Transactions model...")

    # Each feature group (categorical / numerical / meta) is described by its
    # list of feature names plus the matching entry of the projection maps;
    # a missing entry is passed through as None.
    transactions_model = TransactionsModel(
        cat_features=cat_features_names,
        cat_embedding_projections=projections_maps.get('cat_embedding_projections'),
        num_features=num_features_names,
        num_embedding_projections=projections_maps.get('num_embedding_projections'),
        meta_features=meta_features_names,
        meta_embedding_projections=projections_maps.get('meta_embedding_projections'),
        encoder_type=encoder_type,
        head_type=head_type,
        embedding_dropout=0.1,
    )
    return transactions_model, projections_maps

In [41]:
def load_datamodule(data_dir='/home/jovyan/romashka/data', batch_size=32, **overrides):
    """Create the `AlfaDataModule` used to feed the transactions model.

    Calling it without arguments reproduces the original hard-coded
    configuration, so existing call sites keep working. The parameters make
    the notebook usable on machines where the data lives elsewhere.

    Args:
        data_dir: directory with the dataset; defaults to the original
            absolute path.
        batch_size: number of sequences per batch.
        **overrides: any other `AlfaDataModule` keyword (e.g. `num_workers`,
            `shuffle`, `max_seq_len`) to replace the defaults below.

    Returns:
        The constructed `AlfaDataModule`.
    """
    dataset_config = {
        'data_dir': data_dir,
        'batch_size': batch_size,
        'min_seq_len': 0,
        'max_seq_len': 250,
        'shuffle': True,
        'num_workers': 5,
        'pin_memory': True,
        'seed': 42,
    }
    # Explicit overrides win over the defaults above.
    dataset_config.update(overrides)

    return AlfaDataModule(**dataset_config)

In [42]:
def load_tasks(task_names, tokenizer, tasks_kwargs=None):
    """Instantiate one task per entry of `task_names` through `AutoTask.get`.

    Args:
        task_names: a list of task names, or a string spelling a Python
            literal list (e.g. "['next_amnt_open_ended']").
        tokenizer: added to every task's kwargs unless those kwargs already
            contain a "tokenizer" entry.
        tasks_kwargs: optional per-task keyword arguments, matched to
            `task_names` by position (list of dicts, or a string spelling
            such a literal). Defaults to the two presets that used to be
            hard-coded here. Tasks beyond the end of the list receive only
            the tokenizer.

    Returns:
        The list of created tasks, in the order of `task_names`.

    Raises:
        ValueError: if a string argument is not a valid Python literal.
    """
    import ast  # local import so the cell does not depend on the import cell

    def _parse_if_string(value, arg_name):
        """Convert a string argument to the Python literal it spells out."""
        if not isinstance(value, str):
            return value
        try:
            # literal_eval only accepts literals, so unlike eval() it cannot
            # execute arbitrary code embedded in the string.
            return ast.literal_eval(value)
        except (ValueError, SyntaxError, TypeError) as err:
            raise ValueError(
                f"`{arg_name}` must be a Python literal, got: {value!r}"
            ) from err

    if tasks_kwargs is None:
        # ground truth + 5 additional options
        tasks_kwargs = [
            {"num_options": 6, "floating_threshold": False, 'answer2text': True, 'use_numerical_output': False},
            {"num_options": 6, "floating_threshold": False, 'use_numerical_output': False},
        ]

    task_names = _parse_if_string(task_names, "task_names")
    if isinstance(task_names, str):
        # A literal that is itself a single name: treat it as a one-element
        # list instead of iterating over its characters.
        task_names = [task_names]
    tasks_kwargs = _parse_if_string(tasks_kwargs, "tasks_kwargs")
    print(f"Got task_names: {task_names} with task_kwargs: {tasks_kwargs}")

    tasks = []
    for task_i, task_name in enumerate(task_names):
        # Copy so the caller's / preset dicts are never mutated.
        task_kwargs = dict(tasks_kwargs[task_i]) if task_i < len(tasks_kwargs) else {}
        task_kwargs.setdefault('tokenizer', tokenizer)
        tasks.append(AutoTask.get(task_name=task_name, **task_kwargs))
    print(f"Created {len(tasks)} tasks.")
    return tasks

In [43]:
def load_language_model(language_model_name_or_path="google/flan-t5-small"):
    """Load a seq2seq language model and its tokenizer by name or path.

    Args:
        language_model_name_or_path: identifier handed to every
            `from_pretrained` call below.

    Returns:
        A `(lm_model, tokenizer)` tuple.
    """
    print(f"Loading Language model: `{language_model_name_or_path}`...")

    # With `return_unused_kwargs=True` the result is unpacked as a
    # (config, leftover kwargs) pair; the leftovers are not used.
    config, _leftover_kwargs = AutoConfig.from_pretrained(
        language_model_name_or_path,
        use_auth_token=None,
        return_unused_kwargs=True,
    )

    # Tokenizer (fast implementation requested).
    tokenizer = AutoTokenizer.from_pretrained(
        language_model_name_or_path,
        use_fast=True,
        use_auth_token=None,
        do_lowercase=False,
    )

    # Model weights, built with the config loaded above.
    lm_model = AutoModelForSeq2SeqLM.from_pretrained(language_model_name_or_path, config=config)
    return lm_model, tokenizer

In [44]:
# Use the first GPU when one is visible, otherwise fall back to CPU so the
# notebook still runs on machines without CUDA.
device = 'cuda:0' if torch.cuda.is_available() else 'cpu'
task_names = ['next_amnt_open_ended']
LM_NAME = 'google/flan-t5-small'

transactions_model, projections_maps = load_transaction_model()
dm = load_datamodule()

lm_model, tokenizer = load_language_model(language_model_name_or_path=LM_NAME)

# The checkpoint is first materialised on CPU and the model is moved to
# `device` afterwards.
# NOTE(review): torch.load unpickles the file — only load trusted checkpoints.
ckpt = torch.load("/home/jovyan/checkpoints/transactions_model/final_model_v2.ckpt", map_location='cpu')

# strict=False tolerates key mismatches; report them so a checkpoint whose
# keys do not match the model (and therefore loads nothing) is not silent.
load_result = transactions_model.load_state_dict(ckpt, strict=False)
if load_result.missing_keys or load_result.unexpected_keys:
    print(f"Checkpoint mismatch: {len(load_result.missing_keys)} missing keys, "
          f"{len(load_result.unexpected_keys)} unexpected keys.")
transactions_model = transactions_model.to(device)


tasks = load_tasks(task_names, tokenizer)

Loading Transactions model...
USING whisper
Loading Language model: `google/flan-t5-small`...


2023-06-19 17:25:12,323 - [INFO] - Tasks - (task_abstract.py).generate_question_templates(206) - Given 5 starting options and 1 ending options results in 5 total combinations.
2023-06-19 17:25:12,325 - [INFO] - Tasks - (task_abstract.py).extend_vocabulary(173) - Added to tokenizer: 2 tokens.
2023-06-19 17:25:12,326 - [INFO] - Tasks - (task_abstract.py).extend_vocabulary(179) - Notice: resize_token_embeddings of a model to adapt to the size of the new vocabulary!


Got task_names: ['next_amnt_open_ended'] with task_kwargs: [{'num_options': 6, 'floating_threshold': False, 'answer2text': True, 'use_numerical_output': False}, {'num_options': 6, 'floating_threshold': False, 'use_numerical_output': False}]
Created 1 tasks.


In [48]:
def _collect_last_embeddings(dataloader, model, task, device):
    """Run `model` over `dataloader` and keep one embedding per sequence.

    For each batch the task prepares its own view of the batch, the model
    produces per-position embeddings plus a mask, and the embedding at index
    `mask.sum(1) - 1` is kept for every sequence.

    Args:
        dataloader: iterable of batches; each batch maps keys to tensors and
            contains a 'mask' entry.
        model: object exposing `eval()` and `get_embs(batch)`.
        task: object exposing `prepare_task_batch(batch)`; the prepared batch
            must contain a 'label' entry.
        device: device the batches are moved to.

    Returns:
        `(embeddings, labels)`: two lists with one tensor per batch.
    """
    embeddings, labels = [], []
    model.eval()  # once is enough; nothing in the loop switches the mode back
    with torch.no_grad():
        for batch in tqdm.tqdm(dataloader):
            for key in batch:
                batch[key] = batch[key].to(device)

            batch_size = batch['mask'].shape[0]
            task_batch = task.prepare_task_batch(batch)
            embs, mask = model.get_embs(task_batch)
            # NOTE(review): for a sequence whose mask sums to 0 this index is
            # -1, i.e. the final position — confirm such sequences cannot
            # occur (the datamodule is configured with min_seq_len=0).
            last_index = mask.sum(1) - 1
            embeddings.append(embs[torch.arange(batch_size, device=device), last_index])
            labels.append(task_batch['label'])
    return embeddings, labels


train_data, train_labels = _collect_last_embeddings(
    dm.train_dataloader(), transactions_model, tasks[0], device
)
val_data, val_labels = _collect_last_embeddings(
    dm.val_dataloader(), transactions_model, tasks[0], device
)


16315it [02:29, 109.48it/s]
1811it [00:15, 114.05it/s]


In [49]:
def _batches_to_numpy(batches, combine):
    """Merge a list of per-batch tensors into a single NumPy array.

    `train_labels` / `val_labels` are rebound from a list of tensors to an
    array by this cell, so a second run of the cell would otherwise fail.
    Values that are already arrays are returned unchanged, which makes the
    cell safe to re-run.

    Args:
        batches: list of tensors, or an array produced by an earlier run.
        combine: function merging the list into one tensor
            (`torch.vstack` for embeddings, `torch.cat` for labels).

    Returns:
        The merged data as a NumPy array.
    """
    if isinstance(batches, np.ndarray):
        return batches
    return combine(batches).cpu().numpy()


train_embeds = _batches_to_numpy(train_data, torch.vstack)
train_labels = _batches_to_numpy(train_labels, torch.cat)

val_embeds = _batches_to_numpy(val_data, torch.vstack)
val_labels = _batches_to_numpy(val_labels, torch.cat)


### Boosting

In [53]:
#!pip install catboost

from catboost import CatBoostClassifier, CatBoostRegressor, metrics, cv, Pool

In [54]:
# Wrap the training embeddings and their labels in a CatBoost Pool; it is the
# dataset handed to `cv` in the next cell.
cv_dataset = Pool(data=train_embeds,
                  label=train_labels)

In [56]:
# Small, shallow model: this run is a quick 2-fold cross-validated MAE check
# of the embeddings.
params = {
    "iterations": 100,
    "depth": 2,
    "loss_function": "MAE",
    "verbose": False,
}

# Per-iteration cross-validation metrics for `cv_dataset`.
scores = cv(pool=cv_dataset, params=params, fold_count=2)

Training on fold [0/2]

bestTest = 0.08435387037
bestIteration = 99

Training on fold [1/2]

bestTest = 0.08446942171
bestIteration = 99



In [57]:
# Display the per-iteration cross-validation results returned by `cv` above.
scores

Unnamed: 0,iterations,test-MAE-mean,test-MAE-std,train-MAE-mean,train-MAE-std
0,0,0.397069,0.000076,0.397069,0.000085
1,1,0.385173,0.000077,0.385172,0.000080
2,2,0.373608,0.000081,0.373607,0.000076
3,3,0.362380,0.000150,0.362380,0.000141
4,4,0.351625,0.000149,0.351624,0.000135
...,...,...,...,...,...
95,95,0.084945,0.000080,0.084931,0.000004
96,96,0.084805,0.000075,0.084790,0.000010
97,97,0.084663,0.000074,0.084649,0.000012
98,98,0.084539,0.000081,0.084524,0.000008


In [55]:
# Larger regressor trained on the embeddings (1000 iterations, depth 5),
# with MAE tracked as an additional metric and a fixed seed.
# NOTE(review): no `loss_function` is passed here, while the cross-validation
# cell optimises "MAE" — confirm the objectives are meant to differ.
CatBoostModel_emb = CatBoostRegressor(
    iterations=1000,
    learning_rate=0.05,
    custom_metric=[metrics.MAE()],
    random_seed=42,
    depth=5
)

In [None]:
# Fit the regressor on the training embeddings; `plot=True` requests the
# interactive training plot.
CatBoostModel_emb.fit(
    train_embeds, train_labels,
    plot=True,
    logging_level='Verbose',  # text output is enabled; remove this argument to silence it
)

### Linear Model

In [58]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error
from sklearn.ensemble import RandomForestRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.model_selection import cross_val_score

In [59]:
# Linear-regression baseline on the embeddings.
# Note: `lm` is the linear model here, not the language model (`lm_model`).
lm = LinearRegression(n_jobs=-1)

In [60]:
# Fit the linear baseline on the full training embeddings and labels.
lm.fit(train_embeds, train_labels)

LinearRegression(n_jobs=-1)

In [61]:
# Cross-validate the linear baseline on the training set.
# NOTE(review): no `scoring` argument is passed, so these numbers are
# presumably the estimator's default score rather than MAE and are not
# directly comparable with the CatBoost MAE values — confirm.
cross_val_score(lm, train_embeds, train_labels)

array([0.23643428, 0.23822398, 0.23560267, 0.23263571, 0.23815126])

In [21]:
# Predict validation targets with the fitted linear baseline.
# NOTE(review): the saved output is a NameError for `val_embeds`; re-run this
# after the cell that builds `val_embeds`.
val_pred = lm.predict(val_embeds)

NameError: name 'val_embeds' is not defined

In [22]:
# Validation MAE of the linear baseline. The arguments follow the
# (y_true, y_pred) order; the value is the same either way because the
# absolute error is symmetric, but the conventional order avoids mistakes if
# the metric is later swapped for an asymmetric one.
mean_absolute_error(val_labels, val_pred)

NameError: name 'val_pred' is not defined

### Random Forest

In [23]:
# Random-forest baseline (despite the name `dtree`, this is a forest, not a
# single tree). `random_state` is fixed so repeated runs give the same
# model; 42 matches the seed used for the datamodule and CatBoost.
dtree = RandomForestRegressor(n_jobs=-1, verbose=1, random_state=42)

In [24]:
# Fit the random-forest baseline on the training embeddings and labels.
# NOTE(review): the saved output is a NameError for `train_embeds`; re-run
# this after the cell that builds `train_embeds`.
dtree.fit(train_embeds, train_labels)

NameError: name 'train_embeds' is not defined

### MLP

In [26]:
# MLP baseline with three hidden layers of 256 units. `random_state` is fixed
# so weight initialisation and shuffling are repeatable between runs.
mlp = MLPRegressor(hidden_layer_sizes=(256, 256, 256), random_state=42)

In [30]:
# Synthetic data for a quick smoke test of the MLP: 100 samples with 256
# features each and one target per sample. A seeded generator keeps the
# smoke test reproducible and leaves NumPy's global random state untouched.
_smoke_rng = np.random.default_rng(42)
batch = _smoke_rng.standard_normal((100, 256))
y = _smoke_rng.standard_normal(100)

In [31]:
# Smoke test: fit the MLP on the synthetic `batch` / `y` defined above (not
# on the real embeddings).
mlp.fit(batch, y)

MLPRegressor(hidden_layer_sizes=(256, 256, 256))

In [34]:
# Display the `output_size` attribute of the transactions model.
# NOTE(review): presumably the width of the embeddings fed to the regressors
# above — confirm. The saved output is a NameError, so `transactions_model`
# was not defined in the kernel when this cell last ran.
transactions_model.output_size

NameError: name 'transactions_model' is not defined

In [46]:
from catboost import cv