In [None]:
# Nasty fix to make notebook load the library without pip-installing it.

import os
import sys

# Make the parent of the current working directory importable.
# Guarded so that re-running this cell does not keep appending duplicate entries to sys.path.
_parent_dir = os.path.dirname(os.getcwd())
if _parent_dir not in sys.path:
    sys.path.append(_parent_dir)

In [None]:
import pandas as pd
import torch
import typing as t
from pathlib import Path

from torch.utils.data import Dataset as TorchDataset
from iterstrat.ml_stratifiers import MultilabelStratifiedKFold

from kaggle_toolbox import device
from kaggle_toolbox.data import DatasetItem, DatasetItemCollator
from kaggle_toolbox.device import CUDADevice
from kaggle_toolbox.features.transform import contiguous_to_categorical
from kaggle_toolbox.logging import StdOutLogger
from kaggle_toolbox.loss.regression import MSELoss
from kaggle_toolbox.lr_scheduling import create_cosine_scheduler_with_warmup
from kaggle_toolbox.metrics.regression import MCRMSEMetric
from kaggle_toolbox.nlp.transformer import Backbone, StandardModel, ClsTokenPooler, TakeNthHiddenLayerOutputSqueezer, \
    create_nakama_optimizer, get_tokenizer_for_backbone, Tokenizer, TokenizerResult
from kaggle_toolbox.oof import OOFPredDict
from kaggle_toolbox.path import format_path
from kaggle_toolbox.trainer import StandardIterationTrainer, FullCycleTrainer, train_kfold_model
from kaggle_toolbox.typing import DynamicDict
from kaggle_toolbox.validation import analyze_val_strategy

In [None]:
class Dataset(TorchDataset[DatasetItem[TokenizerResult]]):
    """Torch dataset that tokenizes the ``full_text`` column of a dataframe on access.

    Every item carries the row's ``text_id``, the tokenizer output for the row's text
    and a float32 tensor holding the values of the target columns.
    """

    def __init__(
            self,
            df: pd.DataFrame,
            tokenizer: Tokenizer,
            max_len: int,
            target_list: t.List[str]):
        """
        Args:
            df: Source rows. Must contain ``full_text``, ``text_id`` and every column named
                in ``target_list``. The frame is copied, so the caller's frame is not modified.
            tokenizer: Tokenizer used to encode each text.
            max_len: Forwarded to ``tokenizer.tokenize`` as ``max_len``.
            target_list: Names of the target columns, in the order they appear in ``y``.
        """
        # Copy + fresh 0..n-1 index so later mutations stay local to this dataset.
        self._df = df.copy().reset_index(drop=True)
        self._tokenizer = tokenizer
        self._max_len = max_len
        self._target_list = target_list

    def _get_tokenizer_input(self, row: DynamicDict) -> str:
        """Return the text that is fed to the tokenizer for one row."""
        return row.get_typed_or_raise('full_text', str)

    def _get_tokenizer_input_for_series(self, row: pd.Series) -> str:
        """Wrap a raw dataframe row into a ``DynamicDict`` and extract the tokenizer input."""
        return self._get_tokenizer_input(DynamicDict(t.cast(t.Dict[str, t.Any], row)))

    def sort_by_tokenizer_input_len(self):
        """Reorder the rows by ascending length of their tokenizer input.

        The length is the number of characters of the input text (not the number of tokens)
        and is stored in the helper column ``_tok_input_len``.
        """
        # Plain iteration instead of `progress_apply`: it needs no tqdm registration and also
        # works on an empty frame. Rows are wrapped the same way `__getitem__` wraps them.
        self._df['_tok_input_len'] = [
            len(self._get_tokenizer_input_for_series(row))
            for _, row in self._df.iterrows()
        ]
        # Stable sort keeps the original relative order of equally long texts.
        self._df = self._df.sort_values('_tok_input_len', kind='stable').reset_index(drop=True)

    def __len__(self) -> int:
        """Number of rows in the dataset."""
        return len(self._df)

    def __getitem__(self, idx: int) -> DatasetItem[TokenizerResult]:
        """Tokenize the row at position ``idx`` and bundle it with its id and targets."""
        row = self._df.iloc[idx]

        tokenizer_input = self._get_tokenizer_input_for_series(row)
        item_id = str(row['text_id'])

        tokenizer_result = self._tokenizer.tokenize(
            tokenizer_input, max_len=self._max_len)
        target_tensor = torch.tensor(
            [float(row[target]) for target in self._target_list],
            dtype=torch.float32)

        # The id is wrapped in a single-element list, as in the original item layout.
        return DatasetItem(
            id=[item_id],
            x=tokenizer_result,
            y=target_tensor)

In [None]:
# --- Targets ---------------------------------------------------------------
# Columns of train.csv that the model predicts, in output order.
TARGET_LIST = ['cohesion', 'syntax', 'vocabulary', 'phraseology', 'grammar', 'conventions']

# --- Reproducibility / validation ------------------------------------------
SEED = 42
NUM_FOLDS = 5

# --- Model & optimisation --------------------------------------------------
DEVICE = CUDADevice()
BACKBONE = 'microsoft/deberta-v3-small'
MAX_LEN = 1024  # forwarded to the tokenizer as `max_len`
ENCODER_LR = 1e-5
DECODER_LR = 1e-4
BATCH_SIZE = 2
ACCUMULATE_GRADIENT_STEPS = 1
NUM_EPOCHS = 3
NUM_WORKERS = 2

# --- Filesystem layout -----------------------------------------------------
ROOT_DIR = Path('/kaggle')
DATA_DIR = ROOT_DIR.joinpath('data')
FP_ELL_DATASET_DIR = DATA_DIR.joinpath('fp-ell')
MODEL_DIR = ROOT_DIR.joinpath('models')
OOF_DIR = ROOT_DIR.joinpath('oof')

# --- Run artefacts ---------------------------------------------------------
RUN_ID = 'v1-test'
# `{fold}` stays a literal placeholder here; it is filled in per fold via `format_path`.
MODEL_PATH_TEMPLATE = MODEL_DIR.joinpath(f'{RUN_ID}-fold_{{fold}}.pt')
OOF_PATH = OOF_DIR.joinpath(f'{RUN_ID}.csv')

In [None]:
def _read_data(dataset_dir_path: Path, target_list: t.List[str], num_folds: int, seed: int) -> pd.DataFrame:
    """Load ``train.csv`` and assign every row to a validation fold.

    Args:
        dataset_dir_path: Directory that contains ``train.csv``.
        target_list: Names of the target columns used for stratification.
        num_folds: Number of folds to split the data into.
        seed: Random state for the shuffled splitter.

    Returns:
        The loaded dataframe with an additional integer ``fold`` column holding, for each
        row, the index of the fold in which that row is part of the validation split.
    """
    all_df = pd.read_csv(dataset_dir_path / 'train.csv')
    # Presumably bins the continuous targets into classes so they can be stratified on —
    # TODO confirm against `contiguous_to_categorical`.
    target_arr = contiguous_to_categorical(all_df[target_list].values)

    # Create the column up front as integers: assigning into a missing column via `.loc`
    # would create a NaN-filled float64 column and turn the fold ids into 0.0, 1.0, ...
    all_df['fold'] = -1
    fold_col_pos = all_df.columns.get_loc('fold')

    mskf = MultilabelStratifiedKFold(n_splits=num_folds, shuffle=True, random_state=seed)
    for fold_idx, (_, valid_pos) in enumerate(mskf.split(X=all_df, y=target_arr)):
        # The splitter yields positional indices, so write positionally rather than by label.
        all_df.iloc[valid_pos, fold_col_pos] = fold_idx

    return all_df

# Load the training table and attach the fold assignment; `all_df` is read by the
# per-fold training function further down.
all_df = _read_data(FP_ELL_DATASET_DIR, TARGET_LIST, num_folds=NUM_FOLDS, seed=SEED)

# Report on the resulting split before spending time on training.
analyze_val_strategy(
    all_df,
    target_list=TARGET_LIST,
    num_folds=NUM_FOLDS,
)

In [None]:
def _train_model(fold: int) -> t.Tuple[float, OOFPredDict]:
    """Train and validate one model for a single fold.

    Rows of the module-level ``all_df`` whose ``fold`` value equals ``fold`` form the
    validation set; all remaining rows form the training set.

    Args:
        fold: Index of the fold to hold out for validation.

    Returns:
        The result of ``FullCycleTrainer.do_full_cycle`` (annotated as a score and the
        out-of-fold predictions).
    """
    # --- Model -------------------------------------------------------------
    backbone = Backbone.from_huggingface_checkpoint(BACKBONE, zero_out_dropout=True)
    tokenizer = get_tokenizer_for_backbone(backbone=BACKBONE)

    pooler = ClsTokenPooler()
    squeezer = TakeNthHiddenLayerOutputSqueezer()
    # Single linear head: one output per target column.
    head = torch.nn.Sequential(
        torch.nn.Linear(backbone.out_dim_size, len(TARGET_LIST)))
    model: StandardModel[TokenizerResult] = StandardModel(
        backbone=backbone, pooler=pooler, squeezer=squeezer, dnn=head)

    # --- Optimisation ------------------------------------------------------
    optimizer = create_nakama_optimizer(
        model=model, encoder_lr=ENCODER_LR, decoder_lr=DECODER_LR)
    criterion = MSELoss()
    # NOTE(review): `num_training_steps=1` looks like a placeholder rather than the real
    # number of optimisation steps — confirm how the trainer steps this scheduler.
    scheduler = create_cosine_scheduler_with_warmup(
        optimizer=optimizer,
        num_training_steps=1,
        warmup_steps_ratio=0.0,
        num_cycles=0.5)
    iteration_trainer = StandardIterationTrainer(
        model=model,
        criterion=criterion,
        optimizer=optimizer,
        scheduler=scheduler,
        pred_quality_metric_list=[MCRMSEMetric()],
        device=DEVICE,
        accumulate_gradient_steps=ACCUMULATE_GRADIENT_STEPS)

    # --- Full-cycle trainer ------------------------------------------------
    collator = DatasetItemCollator(x_collate_fn=tokenizer.result_type.collate_fn)
    checkpoint_path = format_path(MODEL_PATH_TEMPLATE, fold=str(fold))
    trainer: FullCycleTrainer[TokenizerResult] = FullCycleTrainer(
        iteration_trainer=iteration_trainer,
        batch_size=BATCH_SIZE,
        collator=collator,
        num_epochs=NUM_EPOCHS,
        num_workers=NUM_WORKERS,
        model_comparison_metric=MCRMSEMetric.valid_name,
        model_comparison_metric_criteria=MCRMSEMetric.criteria,
        save_model_to_path=checkpoint_path,
        logger_list=[StdOutLogger()])

    # --- Data --------------------------------------------------------------
    def _make_dataset(frame: pd.DataFrame) -> Dataset:
        # Both splits share the tokenizer, sequence length and target columns.
        return Dataset(
            df=frame,
            tokenizer=tokenizer,
            max_len=MAX_LEN,
            target_list=TARGET_LIST)

    train_dataset = _make_dataset(all_df[all_df['fold'] != fold])
    valid_dataset = _make_dataset(all_df[all_df['fold'] == fold])

    return trainer.do_full_cycle(train_dataset, valid_dataset)

# Run `_train_model` once per fold index 0..NUM_FOLDS-1 and collect the results.
score_list, oof_pred_dict = train_kfold_model(
    fold_list=[*range(NUM_FOLDS)],
    train_model_fn=_train_model)

# Write the out-of-fold predictions to disk, with one `<target>_score` column name per target.
oof_pred_dict.save_to_csv(
    OOF_PATH,
    score_col_name_list=[f'{target_name}_score' for target_name in TARGET_LIST])