#### Environment initialization

In [None]:
import os
import sys

# The presence of KAGGLE_URL_BASE in the environment is treated as the signal
# that this notebook is executing on Kaggle.
if os.environ.get('KAGGLE_URL_BASE') is not None:
    print('Running on Kaggle so initializing the environment...')
    # Extend the import search path with the attached dataset directory
    # (presumably where the `kaggle_toolbox` package lives).
    sys.path.append('/kaggle/input/kaggle-toolbox')

#### Imports

In [None]:
import functools
import os
import typing as t
from pathlib import Path

import pandas as pd
import torch
from kaggle_toolbox.data import DatasetItem, Movable
from kaggle_toolbox.device import CUDADevice
from kaggle_toolbox.ensembling import EnsemblingStrategy, MeanEnsemblingStrategy
from kaggle_toolbox.environment import Environment
from kaggle_toolbox.nlp.transformer import Backbone, Model as TransformerModel, StandardModel, \
    MeanPooler, TakeNthSqueezer, get_tokenizer_for_backbone, \
    Tokenizer, TokenizerResult, TokenizerResultCollator, seed_everything
from kaggle_toolbox.prediction import PredDict
from kaggle_toolbox.predictor import StandardPredictor
from kaggle_toolbox.progress import NotebookProgressBar
from kaggle_toolbox.typing import DynamicDict
from torch.utils.data import Dataset as TorchDataset, default_collate as default_collate_fn
from transformers.data.data_collator import DataCollatorWithPadding
from transformers.utils.generic import PaddingStrategy
from transformers.utils.logging import set_verbosity_error as set_transformers_verbosity_error


# Hook the notebook progress bar into pandas; presumably this is what provides
# the `progress_apply` method used on DataFrames further down — TODO confirm.
NotebookProgressBar.attach_to_pandas()
# Limit the `transformers` library's logging output to errors.
set_transformers_verbosity_error()

#### Collator

In [None]:
_X = t.TypeVar('_X', bound=Movable)


class DatasetItemCollator(t.Generic[_X]):
    """Callable that merges a list of ``DatasetItem`` objects into a single one.

    The ``id`` and ``x`` fields are collated independently by the two
    functions supplied at construction time.
    """

    def __init__(
            self,
            x_collate_fn: t.Callable[[t.List[_X]], _X],
            id_collate_fn: t.Callable[[t.List[t.List[str]]], t.List[str]] = default_collate_fn):
        """Remember the collate functions.

        Args:
            x_collate_fn: Combines the ``x`` payloads of all items into one.
            id_collate_fn: Combines the per-item id lists into a single list;
                defaults to torch's ``default_collate``.
        """
        self._id_collate_fn = id_collate_fn
        self._x_collate_fn = x_collate_fn

    def __call__(self, item_list: t.List[DatasetItem[_X]]) -> DatasetItem[_X]:
        """Collate ``item_list`` into one ``DatasetItem``."""
        # ids are collated first, then the payloads.
        id_parts = [entry.id for entry in item_list]
        merged_id = self._id_collate_fn(id_parts)
        x_parts = [entry.x for entry in item_list]
        merged_x = self._x_collate_fn(x_parts)
        return DatasetItem(id=merged_id, x=merged_x)

#### Dataset

In [None]:
class Dataset(TorchDataset[DatasetItem[TokenizerResult]]):
    """Torch dataset that tokenizes the ``full_text`` column of a DataFrame on access.

    Each item carries the row's ``text_id`` (as a one-element list) together
    with the tokenizer output for that row's text.
    """

    def __init__(
            self,
            df: pd.DataFrame,
            tokenizer: Tokenizer,
            max_len: int):
        """Create the dataset.

        Args:
            df: Source rows; the columns read are ``full_text`` and ``text_id``.
            tokenizer: Tokenizer applied to every text.
            max_len: Forwarded to ``tokenizer.tokenize`` as ``max_len``.
        """
        # Work on a private copy with a fresh positional index so the caller's
        # frame is never modified (sorting below adds a helper column).
        self._df = df.copy().reset_index(drop=True)
        self._tokenizer = tokenizer
        self._max_len = max_len

    def _get_tokenizer_input(self, row: DynamicDict) -> str:
        """Return the text of ``row`` that is fed to the tokenizer."""
        return row.get_typed_or_raise('full_text', str)

    def sort_by_tokenizer_input_len(self) -> None:
        """Reorder the rows by ascending tokenizer-input length (in characters).

        The computed length is kept in a helper column named ``_tok_input_len``.
        """
        # Store the *length* of the text, not the text itself: sorting on the
        # raw string would order the rows alphabetically instead of by size.
        # NOTE(review): `progress_apply` is not a stock pandas method; it is
        # presumably installed by the progress-bar integration — confirm.
        self._df['_tok_input_len'] = self._df.progress_apply(
            lambda row: len(self._get_tokenizer_input(DynamicDict(t.cast(t.Dict[str, t.Any], row)))), axis=1)
        self._df = self._df.sort_values('_tok_input_len')

    def __len__(self) -> int:
        """Return the number of rows."""
        return len(self._df)

    def __getitem__(self, idx: int) -> DatasetItem[TokenizerResult]:
        """Tokenize and return the row at position ``idx``."""
        # Positional lookup, so it stays valid after sorting (which leaves the
        # index labels out of order).
        row = self._df.iloc[idx]

        tokenizer_input = self._get_tokenizer_input(DynamicDict(t.cast(t.Dict[str, t.Any], row)))
        text_id = str(row['text_id'])

        tokenizer_result = self._tokenizer.tokenize(
            tokenizer_input, max_len=self._max_len)

        # The id is wrapped in a list so that per-item id lists can be
        # concatenated when items are collated into a batch.
        return DatasetItem(
            id=[text_id],
            x=tokenizer_result)

#### Parameters

In [None]:
# Name of the runtime environment; 'laptop' is used when the
# __KGLTBX_ENVIRONMENT variable is not set.
ENVIRONMENT = os.getenv('__KGLTBX_ENVIRONMENT', 'laptop')
# `_env.param(...)` is given one value per environment name; presumably it
# returns the one matching ENVIRONMENT — confirm against kaggle_toolbox.
_env = Environment(ENVIRONMENT)

# Names of the predicted targets. The list length sets the width of the
# model's output layer and its order sets the score columns of the output CSV.
TARGET_LIST = [
    'cohesion',
    'syntax',
    'vocabulary',
    'phraseology',
    'grammar',
    'conventions',
]

SEED = 42
# Number of per-fold checkpoints that are loaded and ensembled.
NUM_FOLDS = 5
DEVICE = CUDADevice()
# Passed to the tokenizer as `max_len`.
MAX_LEN = 1024
# NOTE(review): no 'colab' value is supplied here although NUM_WORKERS has
# one — confirm how `param` behaves for an environment without a value.
BATCH_SIZE = _env.param(
    kaggle=8,
    # colab=4,
    laptop=2)
NUM_WORKERS = _env.param(kaggle=2, colab=2, laptop=4)

# Both environments currently use the same root directory.
ROOT_DIR = _env.param(
    kaggle=Path('/kaggle'),
    laptop=Path('/kaggle'))
DATA_DIR = _env.param(
    kaggle=ROOT_DIR / 'input',
    laptop=ROOT_DIR / 'data')
# On Kaggle the model checkpoints are looked up under the data directory.
MODEL_DIR = _env.param(
    kaggle=DATA_DIR,
    laptop=ROOT_DIR / 'models')
FP_ELL_DATASET_DIR = _env.param(
    kaggle=DATA_DIR / 'feedback-prize-english-language-learning',
    laptop=DATA_DIR / 'fp-ell')
# Kaggle reads test.csv, the laptop configuration reads train.csv.
INPUT_CSV_PATH = _env.param(
    kaggle=FP_ELL_DATASET_DIR / 'test.csv',
    laptop=FP_ELL_DATASET_DIR / 'train.csv')
OUTPUT_CSV_PATH = _env.param(
    kaggle=ROOT_DIR / 'working/submission.csv',
    laptop=ROOT_DIR / 'submission/submission.csv')

# Backbone identifier: a local directory on Kaggle, a model name on the laptop.
BACKBONE = _env.param(
    kaggle=str(DATA_DIR / 'deberta-v3-base'),
    laptop='microsoft/deberta-v3-base')


In [None]:
# Print the name reported by the configured device, labelled as the GPU model.
print(f'GPU model: {DEVICE.get_name()}')

#### Pinning the seed

In [None]:
# Seed with the fixed SEED constant; presumably this covers every random
# number generator the run relies on — TODO confirm which ones.
seed_everything(seed=SEED)

#### Data loading

In [None]:
def _read_data() -> pd.DataFrame:
    """Load the CSV located at ``INPUT_CSV_PATH`` into a DataFrame."""
    frame = pd.read_csv(INPUT_CSV_PATH)
    return frame


# Keep the whole table for prediction and preview its first three rows.
all_df = _read_data()
all_df.head(3)

#### Entrypoint

In [None]:
def _predict_by_model(df: pd.DataFrame, model_builder: t.Callable[[], TransformerModel[TokenizerResult]]) -> PredDict:
    """Run inference over ``df`` with a single freshly built model.

    Args:
        df: Rows to predict on; handed to ``Dataset``.
        model_builder: Zero-argument factory returning the model to use.

    Returns:
        The ``PredDict`` produced by the predictor for the whole dataset.
    """
    # Padding is disabled at tokenization time; DataCollatorWithPadding pads
    # later, when the items are collated into batches.
    tokenizer = get_tokenizer_for_backbone(backbone=BACKBONE, padding_strategy=PaddingStrategy.DO_NOT_PAD)
    predictor = StandardPredictor(
        model=model_builder(),
        batch_size=BATCH_SIZE,
        num_workers=NUM_WORKERS,
        collator=DatasetItemCollator(
            # Flatten the per-item id lists in one linear pass; repeated list
            # concatenation via sum(..., []) copies the accumulator each step.
            id_collate_fn=lambda id_lists: [item_id for id_list in id_lists for item_id in id_list],
            x_collate_fn=TokenizerResultCollator(DataCollatorWithPadding(tokenizer.tokenizer))),
        device=DEVICE,
        progress_bar=NotebookProgressBar())

    dataset = Dataset(df, tokenizer=tokenizer, max_len=MAX_LEN)
    return predictor.predict(dataset)


def _predict(
        df: pd.DataFrame,
        model_builder_list: t.List[t.Callable[[], TransformerModel[TokenizerResult]]],
        ensembling_strategy: EnsemblingStrategy) -> PredDict:
    """Predict with every model in turn and combine the results.

    Args:
        df: Rows to predict on.
        model_builder_list: Factories, one per model, called one after another.
        ensembling_strategy: Object whose ``ensemble`` method merges the
            per-model predictions.

    Returns:
        The ensembled predictions.
    """
    ensemble = ensembling_strategy.ensemble
    per_model_pred_list = []
    for build_model in model_builder_list:
        per_model_pred_list.append(_predict_by_model(df=df, model_builder=build_model))
    return ensemble(per_model_pred_list)


def _build_multi_target_fold_model(fold: int) -> TransformerModel[TokenizerResult]:
    """Build the multi-target model and load the trained weights of one fold.

    Args:
        fold: Fold number used to select the checkpoint file.

    Returns:
        The model with the checkpoint's state dict loaded.
    """
    backbone = Backbone.from_huggingface_checkpoint(BACKBONE, zero_out_dropout=True)
    model = StandardModel(
        backbone=backbone,
        squeezer=TakeNthSqueezer(),
        pooler=MeanPooler(),
        dnn=torch.nn.Sequential(
            torch.nn.LayerNorm(backbone.out_dim_size),
            # One output unit per entry of TARGET_LIST.
            torch.nn.Linear(backbone.out_dim_size, len(TARGET_LIST)),
        ))
    checkpoint_path = MODEL_DIR / f'fp-ell-transformer-training-multi-target/v1-layer_norm-5fold-fold_{fold}.pt'
    # Deserialize onto the CPU: without map_location the tensors are restored
    # to the device they were saved from, which fails when that device is not
    # present and otherwise holds a second copy of the weights on the GPU.
    # load_state_dict copies the values into the model's own parameters.
    # NOTE: torch.load unpickles the file, so only trusted checkpoints belong here.
    state_dict = torch.load(checkpoint_path, map_location='cpu')
    model.load_state_dict(state_dict)
    return model


# Predict with one model per fold, average the predictions and lay them out as
# one row per id with one score column per target.
# NOTE(review): the input identifier column is read as 'text_id' while the
# output column is named 'id' — confirm the header expected by the consumer.
pred_df = pd.DataFrame([
    {'id': text_id, **dict(zip(TARGET_LIST, score_list))}
    for text_id, score_list in _predict(
        df=all_df,
        model_builder_list=[
            functools.partial(_build_multi_target_fold_model, fold=fold)
            for fold in range(NUM_FOLDS)
        ],
        ensembling_strategy=MeanEnsemblingStrategy()).items()
])
# Create the target directory if needed so the finished predictions are not
# lost to a missing-folder error at write time.
OUTPUT_CSV_PATH.parent.mkdir(parents=True, exist_ok=True)
pred_df.to_csv(OUTPUT_CSV_PATH, index=False)

In [None]:
# Show the final prediction table as the cell output.
pred_df