#### Environment initialization

In [None]:
# Kaggle bootstrap: copy the notebook secrets into the process environment and
# flag the session so that later cells install the toolbox from GitHub.
try:
    import os
    from kaggle_secrets import UserSecretsClient  # type: ignore

    secrets_client = UserSecretsClient()
    for secret_name in ('GITHUB_TOKEN', 'WANDB_TOKEN'):
        os.environ[secret_name] = secrets_client.get_secret(secret_name)

    os.environ.update({
        '__KGLTBX_INSTALL_FROM_GITHUB': '1',
        '__KGLTBX_ENVIRONMENT': 'kaggle',
    })
except Exception:
    # Best-effort by design: any failure here is reported as "not on Kaggle".
    print('Kaggle initialization failed, probably not running on Kaggle...')

In [None]:
def parse_env_line(line):
    """Parse one line of a `.env` file into a `(key, value)` pair.

    Returns `None` for lines that carry no assignment: blank lines, `#` comments,
    lines without `=` and lines with an empty key. Only the FIRST `=` separates key
    from value, so values that themselves contain `=` (e.g. padded tokens) are kept
    intact. Surrounding whitespace is stripped from both key and value.
    """
    line = line.strip()
    if not line or line.startswith('#'):
        return None
    key, sep, value = line.partition('=')
    key = key.strip()
    if not sep or not key:
        return None
    return key, value.strip()


# Colab bootstrap: mount Google Drive, export the credentials stored in the `.env`
# file and flag the session so that later cells install the toolbox from GitHub.
try:
    import os
    from google.colab import drive  # type: ignore

    drive.mount('/content/drive')

    with open('/content/drive/MyDrive/credentials/.env') as f:
        for line in f:
            parsed = parse_env_line(line)
            if parsed is None:
                # A blank/comment/malformed line must not abort the whole initialization.
                continue
            k, v = parsed
            os.environ[k] = v

    os.environ['__KGLTBX_INSTALL_FROM_GITHUB'] = '1'
    os.environ['__KGLTBX_ENVIRONMENT'] = 'colab'
except Exception:
    # Best-effort by design: any failure here is reported as "not on Colab".
    print('Colab initialization failed, probably not running on Colab...')

#### Requirements

In [None]:
%%writefile requirements.txt

iterative-stratification==0.1.7

git+https://${GITHUB_TOKEN}@github.com/andrei-papou/kaggle-toolbox.git@rc-v0.1.9#egg=kaggle_toolbox[remote,wandb]

In [None]:
# Install requirements only when running on Kaggle.
!if [ "$__KGLTBX_INSTALL_FROM_GITHUB" == "1" ]; then pip install -r requirements.txt; fi
!rm requirements.txt

#### Imports

In [None]:
import itertools
import os
import typing as t
from pathlib import Path

import pandas as pd
import torch
from iterstrat.ml_stratifiers import MultilabelStratifiedKFold
from kaggle_toolbox import device
from kaggle_toolbox.environment import Environment
from kaggle_toolbox.data import LabeledDatasetItem, DatasetItemCollator
from kaggle_toolbox.device import CUDADevice
from kaggle_toolbox.features.transform import contiguous_to_categorical
from kaggle_toolbox.iter import FixedSubsetIterPlannerBuilder, FracSubsetSize
from kaggle_toolbox.layers import SqueezeDim
from kaggle_toolbox.logging.stdout import StdOutLogger
from kaggle_toolbox.logging.wandb import WAndBLogger
from kaggle_toolbox.loss.regression import SmoothL1Loss
from kaggle_toolbox.lr_scheduling import create_cosine_scheduler_with_warmup
from kaggle_toolbox.metrics.regression import MCRMSEMetric
from kaggle_toolbox.model import Model
from kaggle_toolbox.nlp.transformer import Backbone, StandardModel, Squeezer, \
    MeanPooler, AttentionHeadPooler, TakeNthSqueezer, ConcatSqueezer, create_nakama_optimizer, \
    get_tokenizer_for_backbone, Tokenizer, TokenizerResult, TokenizerResultCollator, \
    seed_everything, standard_init_linear, standard_init_layer_norm, standard_init_module
from kaggle_toolbox.path import format_path
from kaggle_toolbox.prediction import PredDict
from kaggle_toolbox.progress import NotebookProgressBar
from kaggle_toolbox.trainer import StandardIterationTrainer, FullCycleTrainer, train_kfold_model
from kaggle_toolbox.typing import DynamicDict, filter_maybe_list
from kaggle_toolbox.validation import analyze_val_strategy, build_fold_result_df
from torch.optim import Optimizer
from torch.utils.data import Dataset as TorchDataset
from transformers.data.data_collator import DataCollatorWithPadding
from transformers.optimization import AdamW
from transformers.utils.generic import PaddingStrategy
from transformers.utils.logging import set_verbosity_error as set_transformers_verbosity_error


# `Dataset.sort_by_tokenizer_input_len` calls `DataFrame.progress_apply`; presumably this
# call is what registers that method on pandas objects (confirm in kaggle_toolbox).
NotebookProgressBar.attach_to_pandas()
# Alias of `transformers.utils.logging.set_verbosity_error` (see the import above).
set_transformers_verbosity_error()

#### Dataset

In [None]:
class Dataset(TorchDataset[LabeledDatasetItem[TokenizerResult]]):
    """Torch dataset producing one tokenized essay and its single-target label per item.

    Each item is built from one row of the data frame: the `full_text` column is
    tokenized (with `max_len` passed to the tokenizer) and the column named by
    `target` becomes a scalar float32 tensor.
    """

    def __init__(
            self,
            df: pd.DataFrame,
            tokenizer: Tokenizer,
            max_len: int,
            target: str):
        # Work on a private copy with a fresh 0..n-1 index; the caller's frame is not modified.
        self._df = df.copy().reset_index(drop=True)
        self._tokenizer = tokenizer
        self._max_len = max_len
        self._target = target

    def _get_tokenizer_input(self, row: DynamicDict) -> str:
        """Return the text that is fed to the tokenizer for `row`."""
        return row.get_typed_or_raise('full_text', str)

    def sort_by_tokenizer_input_len(self):
        """Reorder the rows by ascending length of the tokenizer input.

        The length is measured in characters of the raw text (the text is not tokenized
        here). Previously the text itself was stored in `_tok_input_len`, which made
        `sort_values` order the rows alphabetically instead of by length.
        """
        self._df['_tok_input_len'] = self._df.progress_apply(
            lambda row: len(self._get_tokenizer_input(DynamicDict(t.cast(t.Dict[str, t.Any], row)))),
            axis=1)
        # `__getitem__` uses positional `.iloc`, so the index does not need to be reset.
        self._df = self._df.sort_values('_tok_input_len')

    def __len__(self) -> int:
        return len(self._df)

    def __getitem__(self, idx: int) -> LabeledDatasetItem[TokenizerResult]:
        """Build the item at positional index `idx`."""
        row = self._df.iloc[idx]

        tokenizer_input = self._get_tokenizer_input(DynamicDict(t.cast(t.Dict[str, t.Any], row)))
        # Named `text_id` rather than `id` to avoid shadowing the builtin.
        text_id = str(row['text_id'])

        tokenizer_result = self._tokenizer.tokenize(
            tokenizer_input, max_len=self._max_len)
        target_tensor = torch.tensor(
            float(row[self._target]),
            dtype=torch.float32)

        # The id is wrapped in a list; the collator in `_train_model` concatenates these lists.
        return LabeledDatasetItem(
            id=[text_id],
            x=tokenizer_result,
            y=target_tensor)

#### Optimizer

In [None]:
def create_llrd_optimizer(
        model: StandardModel[t.Any],
        layerwise_lr: float,
        layerwise_weight_decay: float,
        layerwise_lr_decay: float,
        eps: float) -> torch.optim.Optimizer:
    """Build an AdamW optimizer with layer-wise learning rate decay (LLRD).

    Parameter groups, in order:

    * head parameters (`model.head_named_parameters`, skipping names containing
      "model"): learning rate `layerwise_lr`, no weight decay;
    * for every backbone layer, walking from the last encoder layer down to the
      embeddings: one group with `layerwise_weight_decay` and one group without
      weight decay (names containing "bias" or "LayerNorm.weight"). The last encoder
      layer uses `layerwise_lr`; every following layer multiplies the learning rate
      by `layerwise_lr_decay`.

    NOTE(review): only the head, `embeddings` and `encoder.layer[*]` are collected here;
    any backbone parameter living outside of those would not be optimized — confirm
    against the backbone architecture in use.

    :param model: model exposing `head_named_parameters` and `backbone.inner`.
    :param layerwise_lr: learning rate of the head and of the last encoder layer.
    :param layerwise_weight_decay: weight decay of the decayed backbone groups.
    :param layerwise_lr_decay: multiplicative learning rate factor applied per layer.
    :param eps: `eps` passed to AdamW.
    :return: the configured optimizer.
    """
    no_decay = ("bias", "LayerNorm.weight")

    # Task-specific head: full learning rate, no weight decay.
    param_groups: t.List[t.Dict[str, t.Any]] = [
        {
            "params": [p for n, p in model.head_named_parameters if "model" not in n],
            "weight_decay": 0.0,
            "lr": layerwise_lr,
        },
    ]

    # Use the public accessor (as `_train_model` does) instead of the private `_inner`.
    inner = model.backbone.inner
    layers = [inner.embeddings, *inner.encoder.layer]

    lr = layerwise_lr
    for layer in reversed(layers):
        # Split the layer's parameters in a single pass, preserving their order.
        decayed: t.List[t.Any] = []
        undecayed: t.List[t.Any] = []
        for name, param in layer.named_parameters():
            bucket = undecayed if any(nd in name for nd in no_decay) else decayed
            bucket.append(param)
        param_groups.append({
            "params": decayed,
            "weight_decay": layerwise_weight_decay,
            "lr": lr,
        })
        param_groups.append({
            "params": undecayed,
            "weight_decay": 0.0,
            "lr": lr,
        })
        lr *= layerwise_lr_decay

    return AdamW(
        param_groups,
        lr=layerwise_lr,
        eps=eps,
        correct_bias=True)

#### FGM

In [None]:
class FGM:
    """Fast Gradient Method style adversarial perturbation of embedding weights.

    `attack` saves a copy of every trainable parameter whose name contains `emb_name`
    and shifts it by `epsilon * grad / ||grad||`; `restore` puts the saved values back.
    """

    # The annotation is quoted: `Model` is only needed by type checkers here,
    # not at class-definition time.
    def __init__(self, model: 'Model[t.Any]'):
        self._model = model
        # Parameter name -> tensor saved by the latest `attack` call.
        self._backup: t.Dict[str, torch.Tensor] = {}

    def attack(self, epsilon: float = 1., emb_name: str = 'word_embeddings'):
        """Perturb the matching parameters in place along their current gradient.

        Gradients must already be populated (an assertion fails otherwise).
        A parameter with a zero gradient norm is backed up but left unchanged.

        :param epsilon: L2 norm of the perturbation added to each matching parameter.
        :param emb_name: substring selecting the parameters to perturb.
        """
        for name, param in self._model.named_parameters():
            if param.requires_grad and emb_name in name:
                assert param.grad is not None
                self._backup[name] = param.data.clone()
                norm = torch.norm(param.grad)
                if norm != 0:
                    r_at = epsilon * param.grad / norm
                    param.data.add_(r_at)

    def restore(self, emb_name: str = 'word_embeddings'):
        """Put back the values saved by `attack` and drop the backup.

        :param emb_name: must be the same substring that was passed to `attack`.
        """
        for name, param in self._model.named_parameters():
            if param.requires_grad and emb_name in name:
                assert name in self._backup
                param.data = self._backup[name]
        # Clear only after ALL parameters were visited. Clearing inside the loop wiped
        # the backup after the first parameter, so restoring failed unless the matching
        # parameter happened to be the very first one.
        self._backup = {}

#### Iteration Trainer

In [None]:
class IterationTrainer(StandardIterationTrainer[TokenizerResult]):
    """Standard iteration trainer extended with an FGM adversarial pass."""

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # The attacker perturbs the very model instance that is being trained.
        self._fgm = FGM(self._model)

    def _before_optimizer_step(self, x: TokenizerResult, y: torch.Tensor):
        """Accumulate the gradient of the loss computed on perturbed embeddings.

        Presumably invoked by the base trainer right before `optimizer.step()` — confirm
        in `StandardIterationTrainer`. The embeddings are restored before returning.
        """
        self._fgm.attack()
        adversarial_loss = self._criterion(self._model(x), y)
        adversarial_loss.backward()
        self._fgm.restore()

#### Parameters

In [None]:
# --- Targets -----------------------------------------------------------------
# All score columns; used for fold stratification. Only TARGET is trained here.
TARGET_LIST = ['cohesion', 'syntax', 'vocabulary', 'phraseology', 'grammar', 'conventions']
TARGET = TARGET_LIST[0]

# --- Environment -------------------------------------------------------------
# Set by the initialization cells; falls back to 'laptop' when neither of them succeeded.
ENVIRONMENT = os.environ.get('__KGLTBX_ENVIRONMENT', 'laptop')
_env = Environment(ENVIRONMENT)
IS_KAGGLE = 'KAGGLE_URL_BASE' in os.environ

# --- Run setup ---------------------------------------------------------------
IS_PRERUN = False
SEED = 42
NUM_FOLDS = 5
# A pre-run trains the first fold only.
FOLD_LIST = [0] if IS_PRERUN else [0, 1, 2, 3, 4]
DEVICE = CUDADevice()
BACKBONE = 'microsoft/deberta-v3-base'
MAX_LEN = 1428
# Learning rates of the (currently disabled) nakama optimizer:
# ENCODER_LR = 1e-5
# DECODER_LR = 1e-4

# --- Layer-wise optimizer ------------------------------------------------------
LAYERWISE_LR = 5e-5
LAYERWISE_LR_DECAY = 0.9
LAYERWISE_WEIGHT_DECAY = 0.01
LAYERWISE_ADAM_EPS = 1e-6
MAX_GRAD_NORM = 1000.0

# --- Training loop -------------------------------------------------------------
BATCH_SIZE = _env.param(kaggle=2, colab=2, laptop=1)
ACCUMULATE_GRADIENT_STEPS = _env.param(kaggle=4, colab=4, laptop=8)
NUM_EPOCHS = 4
VAL_FREQ = 0.25
NUM_WORKERS = _env.param(kaggle=2, colab=2, laptop=4)

# --- Paths ---------------------------------------------------------------------
ROOT_DIR = _env.param(kaggle=Path('/kaggle'), colab=Path('/content/drive/MyDrive'), laptop=Path('/kaggle'))
DATA_DIR = _env.param(kaggle=ROOT_DIR / 'input', colab=ROOT_DIR / 'data', laptop=ROOT_DIR / 'data')
FP_ELL_DATASET_DIR = _env.param(
    kaggle=DATA_DIR / 'feedback-prize-english-language-learning',
    colab=DATA_DIR / 'fp-ell',
    laptop=DATA_DIR / 'fp-ell')
MODEL_DIR = _env.param(kaggle=ROOT_DIR / 'working', colab=ROOT_DIR / 'models/fp-ell', laptop=ROOT_DIR / 'models')
OOF_DIR = _env.param(kaggle=ROOT_DIR / 'working', colab=ROOT_DIR / 'oof/fp-ell', laptop=ROOT_DIR / 'oof')

# --- Run artifacts ---------------------------------------------------------------
RUN_ID = f'{TARGET}-koj'
# `{fold}` is left as a placeholder that `format_path` fills in per fold.
# Model checkpoints are written on Kaggle only.
MODEL_PATH_TEMPLATE = _env.param(kaggle=MODEL_DIR / f'{RUN_ID}-fold_{{fold}}.pt', colab=None, laptop=None)
OOF_PATH = OOF_DIR / f'{RUN_ID}.csv'

In [None]:
# Report which GPU the run uses (DEVICE is the CUDADevice created in the parameters cell).
print(f'GPU model: {DEVICE.get_name()}')

#### Pinning the seed

In [None]:
# Seed once, before the data is split and any model is created
# (which RNGs are covered is defined by kaggle_toolbox's `seed_everything`).
seed_everything(seed=SEED)

#### Data loading

In [None]:
def _read_data(dataset_dir_path: Path, target_list: t.List[str], num_folds: int, seed: int) -> pd.DataFrame:
    """Load `train.csv` and tag every row with its validation fold.

    The folds come from a shuffled `MultilabelStratifiedKFold` split whose labels are the
    `target_list` columns passed through `contiguous_to_categorical`.

    :param dataset_dir_path: directory containing `train.csv`.
    :param target_list: columns used to stratify the split.
    :param num_folds: number of folds.
    :param seed: random state of the splitter.
    :return: the loaded frame with an added `fold` column.
    """
    frame = pd.read_csv(dataset_dir_path / 'train.csv')
    stratification_labels = contiguous_to_categorical(frame[target_list].values)
    splitter = MultilabelStratifiedKFold(n_splits=num_folds, shuffle=True, random_state=seed)

    # Only the validation half of each split is needed: it defines the fold membership.
    for fold_no, split in enumerate(splitter.split(X=frame, y=stratification_labels)):
        valid_idx = split[1]
        frame.loc[valid_idx, 'fold'] = fold_no

    return frame

# Load the training data once; `_train_model` slices it per fold.
all_df = _read_data(dataset_dir_path=FP_ELL_DATASET_DIR, target_list=TARGET_LIST, num_folds=NUM_FOLDS, seed=SEED)

# Kept as the last expression so that the notebook displays its result.
analyze_val_strategy(all_df, target_list=TARGET_LIST, num_folds=NUM_FOLDS)

#### Entrypoint

In [None]:
def _train_model(fold: int) -> t.Tuple[float, PredDict]:
    """Train and validate the single-target model on one fold.

    Rows of the module-level `all_df` whose `fold` differs from `fold` are used for
    training, the remaining rows for validation.

    :param fold: index of the fold held out for validation.
    :return: whatever `FullCycleTrainer.do_full_cycle` returns; annotated as the
        fold score together with the validation predictions.
    """
    backbone = Backbone.from_huggingface_checkpoint(BACKBONE, zero_out_dropout=True)
    # Applied to the last encoder layer only (presumably re-initializes its weights — confirm).
    standard_init_module(backbone.inner.encoder.layer[-1])
    # No padding at tokenization time; batches are padded by DataCollatorWithPadding in the collator below.
    tokenizer = get_tokenizer_for_backbone(backbone=BACKBONE, padding_strategy=PaddingStrategy.DO_NOT_PAD)
    model: StandardModel[TokenizerResult] = StandardModel(
        backbone=backbone,
        squeezer=TakeNthSqueezer(),
        pooler=AttentionHeadPooler(backbone.out_dim_size),
        dnn=torch.nn.Sequential(
            # torch.nn.LayerNorm(backbone.out_dim_size),
            # torch.nn.Linear(backbone.out_dim_size, len(TARGET_LIST)),
            # standard_init_layer_norm(
            #     torch.nn.LayerNorm(backbone.out_dim_size)),
            # Single-output linear head; falls back to std=0.02 when the backbone reports no initializer range.
            standard_init_linear(
                torch.nn.Linear(backbone.out_dim_size, 1),
                std=backbone.initializer_range if backbone.initializer_range is not None else 0.02),
            SqueezeDim(),
        ))
    # optimizer = create_nakama_optimizer(
    #     model=model,
    #     encoder_lr=ENCODER_LR,
    #     decoder_lr=DECODER_LR)
    optimizer = create_llrd_optimizer(
        model=model,
        layerwise_lr=LAYERWISE_LR,
        layerwise_lr_decay=LAYERWISE_LR_DECAY,
        layerwise_weight_decay=LAYERWISE_WEIGHT_DECAY,
        eps=LAYERWISE_ADAM_EPS)

    # Hold out the requested fold for validation.
    train_df, valid_df = all_df[all_df['fold'] != fold], all_df[all_df['fold'] == fold]

    train_dataset = Dataset(
        df=train_df,
        tokenizer=tokenizer,
        max_len=MAX_LEN,
        target=TARGET)
    valid_dataset = Dataset(
        df=valid_df,
        tokenizer=tokenizer,
        max_len=MAX_LEN,
        target=TARGET)
    # Only the validation set is reordered; the training set keeps its order.
    valid_dataset.sort_by_tokenizer_input_len()

    # Estimated number of optimizer steps (floor division): samples * epochs / effective batch size.
    num_training_steps = (len(train_dataset) * NUM_EPOCHS) // (BATCH_SIZE * ACCUMULATE_GRADIENT_STEPS)

    trainer: FullCycleTrainer[TokenizerResult] = FullCycleTrainer(
        iteration_trainer=IterationTrainer(
            model=model,
            criterion=SmoothL1Loss(),
            optimizer=optimizer,
            scheduler=create_cosine_scheduler_with_warmup(
                optimizer=optimizer,
                num_training_steps=num_training_steps,
                warmup_steps_ratio=0.0,
                num_cycles=0.5),
            pred_quality_metric_list=[
                MCRMSEMetric(),
            ],
            device=DEVICE,
            max_grad_norm=MAX_GRAD_NORM,
            accumulate_gradient_steps=ACCUMULATE_GRADIENT_STEPS,
            progress_bar=NotebookProgressBar()),
        train_iter_planner_builder=FixedSubsetIterPlannerBuilder(FracSubsetSize(VAL_FREQ)),
        batch_size=BATCH_SIZE,
        collator=DatasetItemCollator(
            # Each item carries its id as a one-element list; concatenate them into a flat list.
            id_collate_fn=lambda x: sum(x, []),
            x_collate_fn=TokenizerResultCollator(DataCollatorWithPadding(tokenizer.tokenizer))),
        num_epochs=NUM_EPOCHS,
        num_workers=NUM_WORKERS,
        model_comparison_metric=MCRMSEMetric.valid_name(),
        model_comparison_metric_criteria=MCRMSEMetric.criteria,
        # No path is passed when no model path template is configured for the environment.
        save_model_to_path=format_path(MODEL_PATH_TEMPLATE, fold=str(fold)) \
            if MODEL_PATH_TEMPLATE is not None else None,
        # Both loggers are replaced by None during a pre-run; `filter_maybe_list` presumably drops the None entries.
        logger_list=filter_maybe_list([
            StdOutLogger() if not IS_PRERUN else None,
            WAndBLogger(
                user_name='andrei-papou',
                project='fp-ell',
                run_id=RUN_ID,
                metric_prefix=f'f{fold}'
            ) if not IS_PRERUN else None,
        ]))

    return trainer.do_full_cycle(train_dataset, valid_dataset)

# Create the OOF destination up front: a missing directory should fail (or be fixed)
# before the long training run, not when the predictions are saved afterwards.
OOF_PATH.parent.mkdir(parents=True, exist_ok=True)

# Train every configured fold and collect the per-fold scores and out-of-fold predictions.
score_list, oof_pred_dict = train_kfold_model(
    train_model_fn=_train_model,
    fold_list=FOLD_LIST)
oof_pred_dict.save_to_csv(
    OOF_PATH,
    score_col_name_list=[f'{TARGET}_score'])
# Kept as the last expression so that the notebook displays the per-fold result table.
build_fold_result_df(fold_list=FOLD_LIST, score_list=score_list)

#### Environment shutdown

In [None]:
# Runs on Colab only; `runtime.unassign()` is the last call of the notebook.
if ENVIRONMENT == 'colab':
    # Imported here because `google.colab` is only importable on Colab.
    from google.colab import runtime  # type: ignore

    runtime.unassign()