In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import os
import sys 
import tqdm
import numpy as np
sys.path.append('../')


from functools import partial
from collections import namedtuple
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, AutoConfig

In [3]:
# !pip install lightgbm
# !pip install catboost

In [4]:
import torch

from src.transactions_qa.tqa_model import TransactionQAModel
from src.models.components.models import TransactionsModel
from src.utils.tools import (make_time_batch, 
                   calculate_embedding_size)

from src.data.alfa.components import ( 
                             cat_features_names, 
                             num_features_names, 
                             meta_features_names)

from src.data import AlfaDataModule 
from src.transactions_qa.tqa_model import TransactionQAModel
from src.transactions_qa.utils import get_projections_maps, get_exponent_number, get_mantissa_number
from src.tasks import AbstractTask, AutoTask
from src.transactions_qa.utils import get_split_indices,  prepare_splitted_batch, collate_batch_dict

In [5]:
# Set the TOKENIZERS_PARALLELISM environment variable to 'false' for this process.
# NOTE(review): presumably this is meant to silence the tokenizers fork warning when
# the DataLoader spawns worker processes (num_workers=5 below) - confirm.
os.environ['TOKENIZERS_PARALLELISM'] = 'false'

In [6]:
def load_transaction_model(encoder_type='whisper/tiny', head_type='next'):
    """Construct a ``TransactionsModel`` and return it with its projection maps.

    Args:
        encoder_type: forwarded to the model as ``encoder_type``.
        head_type: forwarded to the model as ``head_type``.

    Returns:
        A ``(transactions_model, projections_maps)`` tuple. No checkpoint is
        loaded here; the model is returned exactly as constructed.
    """
    projections_maps = get_projections_maps(relative_folder="..")
    print("Loading Transactions model...")

    # Every feature group passes its feature names plus the matching entry of
    # the projections mapping (``.get`` yields None when the entry is absent).
    transactions_model = TransactionsModel(
        cat_features=cat_features_names,
        cat_embedding_projections=projections_maps.get('cat_embedding_projections'),
        num_features=num_features_names,
        num_embedding_projections=projections_maps.get('num_embedding_projections'),
        meta_features=meta_features_names,
        meta_embedding_projections=projections_maps.get('meta_embedding_projections'),
        encoder_type=encoder_type,
        head_type=head_type,
        embedding_dropout=0.1,
    )
    return transactions_model, projections_maps

In [7]:
def load_datamodule(data_dir='/home/jovyan/romashka/data', **overrides):
    """Create the ``AlfaDataModule`` used for embedding extraction.

    Calling it with no arguments reproduces the original fixed configuration.

    Args:
        data_dir: directory passed to the datamodule as ``data_dir``. The
            default is the original hard-coded location.
        **overrides: optional replacements for any other config entry
            (``batch_size``, ``min_seq_len``, ``max_seq_len``, ``shuffle``,
            ``num_workers``, ``pin_memory``, ``seed``) or extra keyword
            arguments for ``AlfaDataModule``.

    Returns:
        The constructed ``AlfaDataModule`` instance.
    """
    dataset_config = {
        'data_dir': data_dir,
        'batch_size': 32,
        'min_seq_len': 0,
        'max_seq_len': 250,
        # Shuffling is off so repeated runs iterate in the same order.
        'shuffle': False,
        'num_workers': 5,
        'pin_memory': True,
        'seed': 42,
    }
    dataset_config.update(overrides)

    return AlfaDataModule(**dataset_config)

In [8]:
def load_tasks(task_names, tokenizer):
    """Instantiate one task object per requested task name.

    Args:
        task_names: a list of task names, or a string holding a Python literal
            for such a list (e.g. ``"['next_amnt_open_ended']"``). A string
            holding a single quoted name is treated as a one-element list.
        tokenizer: tokenizer handed to every task that has no ``tokenizer``
            entry in its keyword arguments.

    Returns:
        List of tasks created through ``AutoTask.get``, in ``task_names`` order.

    Raises:
        ValueError: if ``task_names`` is a string that is not a valid Python
            literal.
    """
    import ast  # local import: only needed to parse string-encoded task lists

    # Per-task keyword arguments, matched to task_names by position.
    tasks_kwargs = [{"num_options": 6, "floating_threshold": False, 'answer2text': True, 'use_numerical_output': False},
    {"num_options": 6, "floating_threshold": False, 'use_numerical_output': False}] # ground truth + 5 additional options

    if isinstance(task_names, str):
        # literal_eval only accepts literals, so a crafted string cannot run
        # arbitrary code the way eval() would.
        try:
            parsed_names = ast.literal_eval(task_names)
        except (ValueError, SyntaxError) as err:
            raise ValueError(
                f"task_names must be a Python literal list of task names, got: {task_names!r}"
            ) from err
        # A lone quoted name would otherwise be iterated character by character.
        task_names = [parsed_names] if isinstance(parsed_names, str) else parsed_names
    print(f"Got task_names: {task_names} with task_kwargs: {tasks_kwargs}")

    tasks = []
    for task_i, task_name in enumerate(task_names):
        # Copy so that injecting the tokenizer does not mutate the shared defaults.
        task_kwargs = dict(tasks_kwargs[task_i]) if task_i < len(tasks_kwargs) else {}
        task_kwargs.setdefault('tokenizer', tokenizer)
        tasks.append(AutoTask.get(task_name=task_name, **task_kwargs))
    print(f"Created {len(tasks)} tasks.")
    return tasks

In [9]:
def load_language_model(language_model_name_or_path="google/flan-t5-small"):
    """Load a seq2seq language model and its tokenizer.

    Args:
        language_model_name_or_path: model identifier or local path passed to
            each ``from_pretrained`` call.

    Returns:
        A ``(lm_model, tokenizer)`` tuple.
    """
    print(f"Loading Language model: `{language_model_name_or_path}`...")

    # With return_unused_kwargs=True the result is unpacked as a pair; the
    # second element (leftover kwargs) is not used afterwards.
    config, _leftover_kwargs = AutoConfig.from_pretrained(
        language_model_name_or_path,
        use_auth_token=None,
        return_unused_kwargs=True,
    )

    tokenizer = AutoTokenizer.from_pretrained(
        language_model_name_or_path,
        use_fast=True,
        use_auth_token=None,
        do_lowercase=False,
    )

    lm_model = AutoModelForSeq2SeqLM.from_pretrained(
        language_model_name_or_path, config=config
    )
    return lm_model, tokenizer

In [10]:
# Use the first GPU when one is visible; otherwise fall back to CPU so the
# notebook can still be run top to bottom.
device = 'cuda:0' if torch.cuda.is_available() else 'cpu'
task_names = ['next_amnt_open_ended']
LM_NAME = 'google/flan-t5-small'

transactions_model, projections_maps = load_transaction_model()
dm = load_datamodule()

lm_model, tokenizer = load_language_model(language_model_name_or_path=LM_NAME)

ckpt = torch.load("/home/jovyan/checkpoints/transactions_model/final_model_v2.ckpt", map_location='cpu')
# strict=False tolerates key mismatches, so report them explicitly: a checkpoint
# whose keys do not match would otherwise leave the model silently untrained.
load_result = transactions_model.load_state_dict(ckpt, strict=False)
if load_result.missing_keys or load_result.unexpected_keys:
    print(
        f"Checkpoint key mismatch - missing: {len(load_result.missing_keys)}, "
        f"unexpected: {len(load_result.unexpected_keys)}"
    )
transactions_model = transactions_model.to(device)


tasks = load_tasks(task_names, tokenizer)

Loading Transactions model...
USING whisper
Loading Language model: `google/flan-t5-small`...


2023-06-25 16:42:14,680 - [INFO] - Tasks - (task_abstract.py).generate_question_templates(206) - Given 5 starting options and 1 ending options results in 5 total combinations.
2023-06-25 16:42:14,683 - [INFO] - Tasks - (task_abstract.py).extend_vocabulary(173) - Added to tokenizer: 2 tokens.
2023-06-25 16:42:14,684 - [INFO] - Tasks - (task_abstract.py).extend_vocabulary(179) - Notice: resize_token_embeddings of a model to adapt to the size of the new vocabulary!


Got task_names: ['next_amnt_open_ended'] with task_kwargs: [{'num_options': 6, 'floating_threshold': False, 'answer2text': True, 'use_numerical_output': False}, {'num_options': 6, 'floating_threshold': False, 'use_numerical_output': False}]
Created 1 tasks.


In [11]:
def collect_last_embeddings(dataloader, model, task, device):
    """Run ``model`` over ``dataloader`` and keep one embedding per sequence.

    For every batch the tensors are moved to ``device``, the task prepares its
    batch, and the embedding at index ``mask.sum(1) - 1`` is selected for each
    row.

    Args:
        dataloader: iterable yielding dict batches of tensors with a 'mask' entry.
        model: object exposing ``eval()`` and ``get_embs(batch) -> (embs, mask)``.
        task: object exposing ``prepare_task_batch(batch)``; the prepared batch
            must contain a 'label' entry.
        device: device the batch tensors are moved to before the forward pass.

    Returns:
        ``(embeddings, labels)``: two lists with one CPU tensor per batch.
    """
    # eval() only needs to be set once, not on every iteration.
    model.eval()
    embeddings, labels = [], []

    with torch.no_grad():
        for batch in tqdm.tqdm(dataloader):
            for key in batch:
                batch[key] = batch[key].to(device)

            batch_size = batch['mask'].shape[0]
            task_batch = task.prepare_task_batch(batch)
            embs, mask = model.get_embs(task_batch)

            # NOTE(review): assumes `mask` flags valid positions, so this is the
            # index of the last valid element; an all-zero row yields -1, i.e.
            # the final position - confirm that is acceptable.
            last_index = mask.sum(1) - 1
            row_index = torch.arange(batch_size, device=device)

            # Move results to CPU right away so GPU memory does not grow with
            # the number of batches.
            embeddings.append(embs[row_index, last_index].cpu())
            labels.append(task_batch['label'].cpu())

    return embeddings, labels


train_data, train_labels = collect_last_embeddings(
    dm.train_dataloader(), transactions_model, tasks[0], device
)
val_data, val_labels = collect_last_embeddings(
    dm.val_dataloader(), transactions_model, tasks[0], device
)


16315it [04:07, 65.80it/s] 
1811it [00:21, 85.94it/s] 


In [12]:
# Display the datamodule's `shuffle` attribute (load_datamodule passes 'shuffle': False).
dm.shuffle

False

In [13]:
# Stack the per-batch tensors into single arrays and convert them to NumPy.
# NOTE: `train_labels` / `val_labels` are rebound from lists of tensors to arrays,
# so this cell is not idempotent - re-running it without re-running the extraction
# cell will fail because torch.cat would receive an array instead of tensors.
train_embeds = torch.vstack(train_data).cpu().numpy()
train_labels = torch.cat(train_labels).cpu().numpy()

val_embeds = torch.vstack(val_data).cpu().numpy()
val_labels = torch.cat(val_labels).cpu().numpy()

In [14]:
# np.save('/home/jovyan/romashka/assets/boosting_embeds/train_embeds.npy', train_embeds)
# np.save('/home/jovyan/romashka/assets/boosting_embeds/val_embeds.npy', val_embeds)

### Boosting

In [84]:
#!pip install catboost

from sklearn.base import BaseEstimator

from catboost import CatBoostClassifier, CatBoostRegressor, metrics, cv, Pool

In [85]:
class MyCatBoost(CatBoostRegressor, BaseEstimator):
    """CatBoost regressor whose only constructor argument is ``random_state``.

    Args:
        random_state: seed forwarded to ``CatBoostRegressor`` and also kept as
            an attribute of the same name.
    """

    def __init__(self, random_state=0):
        # Forward the seed to CatBoost; previously it was only stored on the
        # instance and never reached the underlying regressor.
        super().__init__(random_state=random_state)
        self.random_state = random_state

      4        [36m0.0104[0m        [32m0.0095[0m  253.3617
      4        [36m0.0104[0m        [32m0.0096[0m  255.8253
      3        [36m0.0099[0m        [32m0.0104[0m  400.7570
      3        [36m0.0099[0m        [32m0.0105[0m  392.0843
      3        [36m0.0099[0m        [32m0.0103[0m  402.2128
      3        [36m0.0099[0m        [32m0.0103[0m  405.7684
      3        [36m0.0099[0m        [32m0.0102[0m  403.0029
      3        [36m0.0100[0m        [32m0.0104[0m  407.7536
      3        [36m0.0099[0m        [32m0.0102[0m  412.7552
      3        [36m0.0099[0m        [32m0.0102[0m  404.5129
      3        [36m0.0100[0m        [32m0.0103[0m  409.5076
      3        [36m0.0099[0m        [32m0.0102[0m  410.9874
      5        [36m0.0104[0m        [32m0.0094[0m  250.6669
      5        [36m0.0103[0m        [32m0.0095[0m  251.5339
      6        [36m0.0103[0m        [32m0.0095[0m  244.7502
      6        [36m0.0103[0m        [

In [54]:
# cv_dataset = Pool(data=train_embeds,
#                   label=train_labels)

In [None]:
# params = {"iterations": 100,
#           "depth": 2,
#           "loss_function": "MAE",
#           "verbose": False}

# scores = cv(cv_dataset,
#             params,
#             fold_count=2)

In [57]:
# Display the cross-validation results table.
# NOTE: `scores` is only assigned by the `cv(...)` call in the commented-out cell
# above; on a fresh kernel that cell must be restored and run first.
scores

Unnamed: 0,iterations,test-MAE-mean,test-MAE-std,train-MAE-mean,train-MAE-std
0,0,0.397069,0.000076,0.397069,0.000085
1,1,0.385173,0.000077,0.385172,0.000080
2,2,0.373608,0.000081,0.373607,0.000076
3,3,0.362380,0.000150,0.362380,0.000141
4,4,0.351625,0.000149,0.351624,0.000135
...,...,...,...,...,...
95,95,0.084945,0.000080,0.084931,0.000004
96,96,0.084805,0.000075,0.084790,0.000010
97,97,0.084663,0.000074,0.084649,0.000012
98,98,0.084539,0.000081,0.084524,0.000008


In [None]:
# (stray placeholder cell removed: the bare token `do` raised NameError on Run All)

In [55]:
# CatBoost regressor to be trained on the extracted embeddings.
# MAE is passed as a custom metric; the seed is fixed via random_seed=42.
CatBoostModel_emb = CatBoostRegressor(
    iterations=1000,
    learning_rate=0.05,
    custom_metric=[metrics.MAE()],
    random_seed=42,
    depth=5
)

In [None]:
# Fit on the training embeddings only; no evaluation set is passed here.
CatBoostModel_emb.fit(
    train_embeds, train_labels,
    plot=True,
    logging_level='Verbose',  # verbose text logging is enabled; remove this argument to disable it
)

### Sklearn models

In [24]:
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.metrics import mean_absolute_error
from sklearn.ensemble import RandomForestRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.model_selection import cross_val_score

In [45]:
# Imported here as well: the notebook's main `import optuna` sits in a later cell,
# so this cell would fail on a fresh top-to-bottom run without it.
import optuna

# Alternatives tried earlier. They are kept as comments because the live
# assignments were immediately overwritten by the random forest below.
# model = LinearRegression(n_jobs=-1)
# model = Ridge()
# model_params = {'alpha': optuna.distributions.UniformDistribution(low=0.1, high=100)}
# model = MLPRegressor(hidden_layer_sizes=(256, 256, 256))

model = RandomForestRegressor(n_jobs=-1)

# Hyper-parameter search space for the random forest.
model_params = {
    'n_estimators': optuna.distributions.IntUniformDistribution(50, 1000),
    'max_depth':  optuna.distributions.IntUniformDistribution(4, 50),
    # scikit-learn requires an integer min_samples_split to be >= 2; a lower
    # bound of 1 lets the sampler propose a value the estimator rejects.
    'min_samples_split': optuna.distributions.IntUniformDistribution(2, 150),
    'min_samples_leaf':  optuna.distributions.IntUniformDistribution(1, 60),
}

[32m[I 2023-06-22 12:56:25,194][0m Trial 4 finished with value: -0.06621766998259493 and parameters: {'n_estimators': 191, 'max_depth': 14, 'min_samples_split': 41, 'min_samples_leaf': 47}. Best is trial 4 with value: -0.06621766998259493.[0m


In [48]:
from optuna.integration import OptunaSearchCV

# 5-fold search over `model_params` with only 2 trials.
# NOTE(review): no `scoring` is passed, so presumably the estimator's default
# score is optimised rather than MAE - confirm this is intended.
optuna_search = OptunaSearchCV(model, model_params, cv=5, n_jobs=-1, n_trials=2)

# Fitted on the first 100 training rows only; predictions cover the full validation set.
optuna_search.fit(train_embeds[:100], train_labels[:100])
y_pred = optuna_search.predict(val_embeds)

  This is separate from the ipykernel package so we can avoid doing imports until
[32m[I 2023-06-22 13:06:10,829][0m A new study created in memory with name: no-name-4f04d92f-6666-4d10-aee4-80c4bf17b6fa[0m
[32m[I 2023-06-22 13:06:20,561][0m Trial 1 finished with value: -0.06682577887790586 and parameters: {'n_estimators': 830, 'max_depth': 35, 'min_samples_split': 105, 'min_samples_leaf': 26}. Best is trial 1 with value: -0.06682577887790586.[0m
[32m[I 2023-06-22 13:06:20,596][0m Trial 0 finished with value: -0.06801305764369374 and parameters: {'n_estimators': 961, 'max_depth': 19, 'min_samples_split': 100, 'min_samples_leaf': 10}. Best is trial 1 with value: -0.06682577887790586.[0m


In [51]:
# Validation MAE. scikit-learn's signature is mean_absolute_error(y_true, y_pred),
# so the ground-truth labels go first (the value is unchanged because MAE is symmetric).
mean_absolute_error(val_labels, y_pred)

0.09486297043245859

In [21]:
# lm.fit(train_embeds, train_labels)
# val_pred = model.predict(val_embeds)

NameError: name 'val_embeds' is not defined

### Skorch

In [15]:
# !pip install skorch

Collecting skorch
  Downloading skorch-0.13.0-py3-none-any.whl (209 kB)
     |████████████████████████████████| 209 kB 2.1 MB/s            
Installing collected packages: skorch
Successfully installed skorch-0.13.0


In [35]:
import torch.nn as nn 
import torch.nn.functional as F
import optuna

from skorch import NeuralNetRegressor

In [22]:
class SimpleMLP(nn.Module):
    """MLP regressor with one hidden layer and a single output unit.

    Args:
        inp_size: number of input features.
        hidden_size: width of the hidden layer.
        nonlin: callable applied to the hidden layer output (default ``F.relu``).
    """

    def __init__(
            self,
            inp_size,
            hidden_size,
            nonlin=F.relu,
    ):
        super().__init__()

        self.inp_size = inp_size
        self.nonlin = nonlin
        self.hidden_size = hidden_size

        self.linear1 = nn.Linear(inp_size, hidden_size)
        self.linear2 = nn.Linear(hidden_size, 1)

    def forward(self, X, **kwargs):
        """Return ``linear2(nonlin(linear1(X)))``; extra keyword arguments are ignored."""
        # Use the configured non-linearity; it was previously hard-coded to
        # F.relu, so the `nonlin` argument had no effect.
        hidden = self.nonlin(self.linear1(X))
        return self.linear2(hidden)

In [66]:
# my_nn = partial(SimpleMLP, inp_size=train_embeds.shape[1])

# Search space for the skorch net: `module__hidden_size` is routed to SimpleMLP's
# `hidden_size` argument. NOTE: this rebinds `model_params`, replacing the
# random-forest search space defined in an earlier cell.
model_params = {'module__hidden_size': optuna.distributions.IntUniformDistribution(10, 150)}

In [76]:
# skorch wrapper around SimpleMLP; `module__*` arguments are forwarded to the
# module's constructor.
net = NeuralNetRegressor(
    SimpleMLP,
    max_epochs=20,
    module__hidden_size=10,
    # Take the input width from the data instead of hard-coding 384, so the net
    # keeps matching the embeddings if the encoder changes.
    module__inp_size=train_embeds.shape[1],
    lr=0.1,
    device='cuda',  # trains on GPU; switch to 'cpu' when no CUDA device is available
)

In [None]:
# 2-fold search over `model_params` for the skorch net, 2 trials, on the first 100
# training rows; labels are reshaped into a single column.
# NOTE(review): n_jobs=-1 together with a device='cuda' net may run several trials
# against the same GPU at once - confirm this is intended.
optuna_search = optuna.integration.OptunaSearchCV(net, model_params, cv=2, n_jobs=-1, n_trials=2, verbose=1)
optuna_search.fit(train_embeds[:100], train_labels[:100].reshape(-1, 1))

In [59]:
# net.fit(train_embeds, train_labels.reshape(-1, 1))

  epoch    train_loss    valid_loss     dur
-------  ------------  ------------  ------
      1        [36m0.0103[0m        [32m0.0095[0m  7.9826


<class 'skorch.regressor.NeuralNetRegressor'>[initialized](
  module_=SimpleMLP(
    (linear1): Linear(in_features=384, out_features=10, bias=True)
    (linear2): Linear(in_features=10, out_features=1, bias=True)
  ),
)