### Environment initialization

In [1]:
import os
import sys

# The presence of KAGGLE_URL_BASE in the process environment is treated as "we are running on Kaggle".
_is_running_on_kaggle = os.environ.get('KAGGLE_URL_BASE') is not None

if _is_running_on_kaggle:
    print('Running on Kaggle so initializing the environment...')

    # Make the attached toolbox dataset importable and record the environment name
    # that the parameters cell reads back through `__KGLTBX_ENVIRONMENT`.
    sys.path.append('/kaggle/input/kaggle-toolbox')
    os.environ['__KGLTBX_ENVIRONMENT'] = 'kaggle'

Running on Kaggle so initializing the environment...


### Imports

In [2]:
import functools
import math
import os
import typing as t
from pathlib import Path

import numpy as np
import pandas as pd
import torch
from kaggle_toolbox.data import DatasetItem, Movable
from kaggle_toolbox.device import CUDADevice
from kaggle_toolbox.ensembling import EnsemblingStrategy, MeanEnsemblingStrategy
from kaggle_toolbox.environment import Environment
from kaggle_toolbox.layers import SqueezeDim
from kaggle_toolbox.nlp.transformer import Backbone, Model as TransformerModel, StandardModel, \
    MeanPooler, AttentionHeadPooler, TakeNthSqueezer, ConcatSqueezer, get_tokenizer_for_backbone, \
    Tokenizer, TokenizerResult, TokenizerResultCollator, seed_everything
from kaggle_toolbox.path import format_path
from kaggle_toolbox.prediction import PredDict
from kaggle_toolbox.predictor import StandardPredictor
from kaggle_toolbox.progress import NotebookProgressBar
from kaggle_toolbox.typing import DynamicDict
from torch.utils.data import Dataset as TorchDataset, default_collate as default_collate_fn
from transformers import AutoConfig, AutoModel
from transformers.data.data_collator import DataCollatorWithPadding
from transformers.utils.generic import PaddingStrategy
from transformers.utils.logging import set_verbosity_error as set_transformers_verbosity_error


# Hook the toolbox progress bar into pandas.
# NOTE(review): presumably this is what provides `DataFrame.progress_apply`, which `Dataset` uses — confirm.
NotebookProgressBar.attach_to_pandas()
# Restrict transformers logging to errors so that warnings do not flood the cell outputs.
set_transformers_verbosity_error()

### Collator

In [3]:
_X = t.TypeVar('_X', bound=Movable)


class DatasetItemCollator(t.Generic[_X]):
    """Turns a list of `DatasetItem`s into one batched `DatasetItem`.

    The ids and the model inputs are batched independently, each by its own collate function.
    """

    def __init__(
            self,
            x_collate_fn: t.Callable[[t.List[_X]], _X],
            id_collate_fn: t.Callable[[t.List[t.List[str]]], t.List[str]] = default_collate_fn):
        """
        :param x_collate_fn: merges the per-item inputs (`item.x`) into a single batched input.
        :param id_collate_fn: merges the per-item id lists (`item.id`) into a single id list.
        """
        self._id_collate_fn = id_collate_fn
        self._x_collate_fn = x_collate_fn

    def __call__(self, item_list: t.List[DatasetItem[_X]]) -> DatasetItem[_X]:
        """Collates `item_list` into one `DatasetItem` holding the batched ids and inputs."""
        id_list_batch = []
        x_batch = []
        for item in item_list:
            id_list_batch.append(item.id)
            x_batch.append(item.x)

        collated_id = self._id_collate_fn(id_list_batch)
        collated_x = self._x_collate_fn(x_batch)
        return DatasetItem(id=collated_id, x=collated_x)

### Dataset

In [4]:
class Dataset(TorchDataset[DatasetItem[TokenizerResult]]):
    """Torch dataset that tokenizes the `full_text` column of a dataframe on access.

    Every item carries the row's `text_id` (wrapped into a one-element list) and the tokenizer result.
    """

    def __init__(
            self,
            df: pd.DataFrame,
            tokenizer: Tokenizer,
            max_len: int):
        """
        :param df: frame with at least the `text_id` and `full_text` columns; it is copied, not modified.
        :param tokenizer: tokenizer whose `tokenize(text, max_len=...)` is called for every item.
        :param max_len: forwarded to the tokenizer as `max_len`.
        """
        # Private copy with a clean 0..n-1 index so later column assignments never touch the caller's frame.
        self._df = df.copy().reset_index(drop=True)
        self._tokenizer = tokenizer
        self._max_len = max_len

    def _get_tokenizer_input(self, row: DynamicDict) -> str:
        """Returns the text of the given row that is fed to the tokenizer."""
        return row.get_typed_or_raise('full_text', str)

    def sort_by_tokenizer_input_len(self):
        """Sorts the rows in ascending order of the character length of their tokenizer input.

        The helper column `_tok_input_len` is kept in the internal frame afterwards.
        """
        # Store the LENGTH of the text; storing the text itself would make the sort lexicographic.
        self._df['_tok_input_len'] = self._df.progress_apply(
            lambda row: len(self._get_tokenizer_input(DynamicDict(t.cast(t.Dict[str, t.Any], row)))), axis=1)
        self._df = self._df.sort_values('_tok_input_len')

    def __len__(self) -> int:
        return len(self._df)

    def __getitem__(self, idx: int) -> DatasetItem[TokenizerResult]:
        """Tokenizes the row at position `idx` and returns it together with its id."""
        # Positional lookup: stays valid after `sort_by_tokenizer_input_len` reorders the index labels.
        row = self._df.iloc[idx]

        tokenizer_input = self._get_tokenizer_input(DynamicDict(t.cast(t.Dict[str, t.Any], row)))
        text_id = str(row['text_id'])

        tokenizer_result = self._tokenizer.tokenize(
            tokenizer_input, max_len=self._max_len)

        return DatasetItem(
            id=[text_id],
            x=tokenizer_result)

### Parameters

In [5]:
# Name of the runtime environment. The first cell sets `__KGLTBX_ENVIRONMENT` to 'kaggle' when running
# on Kaggle; if the variable is not set at all, 'laptop' is assumed.
ENVIRONMENT = os.getenv('__KGLTBX_ENVIRONMENT', 'laptop')
# NOTE(review): `_env.param(...)` presumably returns the keyword value matching ENVIRONMENT — confirm in kaggle_toolbox.
_env = Environment(ENVIRONMENT)

# Targets to predict; the order also defines the column order of the multi-target model outputs.
TARGET_LIST = [
    'cohesion',
    'syntax',
    'vocabulary',
    'phraseology',
    'grammar',
    'conventions',
]
# Targets that additionally get a classifier + rounding step (currently none is enabled).
CLASSIFIER_TARGET_LIST: t.List[str] = [
    # 'cohesion',
]

SEED = 42
# Number of fold checkpoints available per model family.
NUM_FOLDS_ANDREI = 5
NUM_FOLDS_KOJ = 4
NUM_FOLDS_DIMA = 10
NUM_FOLDS_CLASSIFIER = 5
CLASSIFIER_NUM_CLASSES = 9
DEVICE = CUDADevice()
# Maximum tokenizer lengths per model family.
SINGLE_TARGET_BY_ANDREI_MAX_LEN = 1024
SINGLE_TARGET_BY_DIMA_MAX_LEN = 512
OWN_MULTI_TARGET_MAX_LEN = 1428
KOJ_MULTI_TARGET_MAX_LEN = 1428
CLASSIFIER_MAX_LEN = 512
BATCH_SIZE = _env.param(
    kaggle=8,
    # colab=4,
    laptop=2)
NUM_WORKERS = _env.param(kaggle=2, colab=2, laptop=4)

# Directory layout per environment.
ROOT_DIR = _env.param(
    kaggle=Path('/kaggle'),
    laptop=Path('/kaggle'))
DATA_DIR = _env.param(
    kaggle=ROOT_DIR / 'input',
    laptop=ROOT_DIR / 'data')
MODEL_DIR = _env.param(
    kaggle=DATA_DIR,
    laptop=ROOT_DIR / 'models')
FP_ELL_DATASET_DIR = _env.param(
    kaggle=DATA_DIR / 'feedback-prize-english-language-learning',
    laptop=DATA_DIR / 'fp-ell')
# On Kaggle the test set is scored; on the laptop the train set is used as input instead.
INPUT_CSV_PATH = _env.param(
    kaggle=FP_ELL_DATASET_DIR / 'test.csv',
    laptop=FP_ELL_DATASET_DIR / 'train.csv')
OUTPUT_CSV_PATH = _env.param(
    kaggle=ROOT_DIR / 'working/submission.csv',
    laptop=ROOT_DIR / 'submission/submission.csv')

# Backbone identifiers and where their checkpoints are read from (a local dataset copy on Kaggle).
BACKBONE_BASE = 'microsoft/deberta-v3-base'
BACKBONE_BASE_PATH = _env.param(
    kaggle=str(DATA_DIR / 'deberta-v3-base/deberta-v3-base'),
    laptop='microsoft/deberta-v3-base')
BACKBONE_LARGE = 'microsoft/deberta-v3-large'
BACKBONE_LARGE_PATH = _env.param(
    kaggle=str(DATA_DIR / 'deberta-v3-large/deberta-v3-large'),
    laptop='microsoft/deberta-v3-large')


In [6]:
# Log the name reported by `DEVICE.get_name()` so the hardware used for a run is visible in the output.
print(f'GPU model: {DEVICE.get_name()}')

GPU model: tesla_p100_pcie_16gb


### Pinning the seed

In [7]:
# Pin the seed once, before any data or model is touched.
# NOTE(review): which RNGs get seeded is decided inside `seed_everything` — confirm in kaggle_toolbox.
seed_everything(seed=SEED)

### Data loading

In [8]:
def _read_data() -> pd.DataFrame:
    """Reads the input essays from `INPUT_CSV_PATH`.

    :return: the parsed CSV; `text_id` and `full_text` are read as strings.
    """
    # Pin the dtypes: ids are later used as string merge keys, so they must never be
    # turned into numbers by pandas' dtype inference.
    return pd.read_csv(INPUT_CSV_PATH, dtype={'text_id': str, 'full_text': str})


all_df = _read_data()
all_df.head(3)

Unnamed: 0,text_id,full_text
0,0000C359D63E,when a person has no experience on a job their...
1,000BAD50D026,Do you think students would benefit from being...
2,00367BB2546B,"Thomas Jefferson once states that ""it is wonde..."


### Model builders

#### Single Target by Andrei

##### Cohesion

In [9]:
def _build_cohesion_fold_model(fold: int) -> TransformerModel[TokenizerResult]:
    """Builds the single-target cohesion model and restores the weights of one fold.

    The head is attention-head pooling followed by LayerNorm and a one-output linear layer.

    :param fold: index of the fold whose checkpoint is loaded.
    :return: the model with the fold's state dict loaded.
    """
    backbone = Backbone.from_huggingface_checkpoint(BACKBONE_BASE_PATH, zero_out_dropout=True)
    model = StandardModel(
        backbone=backbone,
        squeezer=TakeNthSqueezer(),
        pooler=AttentionHeadPooler(backbone.out_dim_size),
        dnn=torch.nn.Sequential(
            torch.nn.LayerNorm(backbone.out_dim_size),
            torch.nn.Linear(backbone.out_dim_size, 1),
            SqueezeDim(),
        ))
    checkpoint_path = MODEL_DIR / \
        f'fp-ell-transformer-training-cohesion/cohesion-v1-layer_norm-ep_4-valfreq_0p25-pooler_att-full-fold_{fold}.pt'
    # Deserialize onto the CPU: `load_state_dict` copies the values into the model's own parameters,
    # so restoring the tensors to the device they were saved from would only cost extra GPU memory.
    model.load_state_dict(torch.load(checkpoint_path, map_location='cpu'))
    return model

##### Syntax

In [10]:
def _build_syntax_fold_model(fold: int) -> TransformerModel[TokenizerResult]:
    """Builds the single-target syntax model and restores the weights of one fold.

    The head is mean pooling followed by LayerNorm and a one-output linear layer.

    :param fold: index of the fold whose checkpoint is loaded.
    :return: the model with the fold's state dict loaded.
    """
    backbone = Backbone.from_huggingface_checkpoint(BACKBONE_BASE_PATH, zero_out_dropout=True)
    model = StandardModel(
        backbone=backbone,
        squeezer=TakeNthSqueezer(),
        pooler=MeanPooler(),
        dnn=torch.nn.Sequential(
            torch.nn.LayerNorm(backbone.out_dim_size),
            torch.nn.Linear(backbone.out_dim_size, 1),
            SqueezeDim(),
        ))
    checkpoint_path = MODEL_DIR / \
        f'fp-ell-transformer-training-syntax/syntax-v1-layer_norm-ep_3-valfreq_0p25-full-fold_{fold}.pt'
    # Deserialize onto the CPU: `load_state_dict` copies the values into the model's own parameters,
    # so restoring the tensors to the device they were saved from would only cost extra GPU memory.
    model.load_state_dict(torch.load(checkpoint_path, map_location='cpu'))
    return model

##### Vocabulary

In [11]:
def _build_vocabulary_fold_model(fold: int) -> TransformerModel[TokenizerResult]:
    """Builds the single-target vocabulary model and restores the weights of one fold.

    The head is mean pooling followed by LayerNorm and a one-output linear layer.

    :param fold: index of the fold whose checkpoint is loaded.
    :return: the model with the fold's state dict loaded.
    """
    backbone = Backbone.from_huggingface_checkpoint(BACKBONE_BASE_PATH, zero_out_dropout=True)
    model = StandardModel(
        backbone=backbone,
        squeezer=TakeNthSqueezer(),
        pooler=MeanPooler(),
        dnn=torch.nn.Sequential(
            torch.nn.LayerNorm(backbone.out_dim_size),
            torch.nn.Linear(backbone.out_dim_size, 1),
            SqueezeDim(),
        ))
    checkpoint_path = MODEL_DIR / \
        f'fp-ell-transformer-training-vocabulary/vocabulary-v1-layer_norm-ep_3-valfreq_0p25-std_init-fold_{fold}.pt'
    # Deserialize onto the CPU: `load_state_dict` copies the values into the model's own parameters,
    # so restoring the tensors to the device they were saved from would only cost extra GPU memory.
    model.load_state_dict(torch.load(checkpoint_path, map_location='cpu'))
    return model

##### Phraseology

In [12]:
def _build_phraseology_fold_model(fold: int) -> TransformerModel[TokenizerResult]:
    """Builds the single-target phraseology model and restores the weights of one fold.

    The head is mean pooling followed by LayerNorm and a one-output linear layer.

    :param fold: index of the fold whose checkpoint is loaded.
    :return: the model with the fold's state dict loaded.
    """
    backbone = Backbone.from_huggingface_checkpoint(BACKBONE_BASE_PATH, zero_out_dropout=True)
    model = StandardModel(
        backbone=backbone,
        squeezer=TakeNthSqueezer(),
        pooler=MeanPooler(),
        dnn=torch.nn.Sequential(
            torch.nn.LayerNorm(backbone.out_dim_size),
            torch.nn.Linear(backbone.out_dim_size, 1),
            SqueezeDim(),
        ))
    checkpoint_path = MODEL_DIR / \
        f'fp-ell-transformer-training-phraseology/phraseology-v1-layer_norm-ep_3-valfreq_0p25-std_init-full-fold_{fold}.pt'
    # Deserialize onto the CPU: `load_state_dict` copies the values into the model's own parameters,
    # so restoring the tensors to the device they were saved from would only cost extra GPU memory.
    model.load_state_dict(torch.load(checkpoint_path, map_location='cpu'))
    return model

##### Grammar

In [13]:
def _build_grammar_fold_model(fold: int) -> TransformerModel[TokenizerResult]:
    """Builds the single-target grammar model and restores the weights of one fold.

    Unlike the other single-target models, the squeezer is `ConcatSqueezer([9, 10, 11, 12])`, and the
    head (LayerNorm + one-output linear layer) is therefore four times the backbone output size wide.

    :param fold: index of the fold whose checkpoint is loaded.
    :return: the model with the fold's state dict loaded.
    """
    backbone = Backbone.from_huggingface_checkpoint(BACKBONE_BASE_PATH, zero_out_dropout=True)
    head_in_dim_size = backbone.out_dim_size * 4
    model = StandardModel(
        backbone=backbone,
        squeezer=ConcatSqueezer([9, 10, 11, 12]),
        pooler=MeanPooler(),
        dnn=torch.nn.Sequential(
            torch.nn.LayerNorm(head_in_dim_size),
            torch.nn.Linear(head_in_dim_size, 1),
            SqueezeDim(),
        ))
    checkpoint_path = MODEL_DIR / \
        f'fp-ell-transformer-training-grammar/grammar-v1-lnorm-ep_4-valfreq_0p25-sqzr_cat_9_to_12-full-fold_{fold}.pt'
    # Deserialize onto the CPU: `load_state_dict` copies the values into the model's own parameters,
    # so restoring the tensors to the device they were saved from would only cost extra GPU memory.
    model.load_state_dict(torch.load(checkpoint_path, map_location='cpu'))
    return model

##### Conventions

In [14]:
def _build_conventions_fold_model(fold: int) -> TransformerModel[TokenizerResult]:
    """Builds the single-target conventions model and restores the weights of one fold.

    The head is mean pooling followed by LayerNorm and a one-output linear layer.

    :param fold: index of the fold whose checkpoint is loaded.
    :return: the model with the fold's state dict loaded.
    """
    backbone = Backbone.from_huggingface_checkpoint(BACKBONE_BASE_PATH, zero_out_dropout=True)
    model = StandardModel(
        backbone=backbone,
        squeezer=TakeNthSqueezer(),
        pooler=MeanPooler(),
        dnn=torch.nn.Sequential(
            torch.nn.LayerNorm(backbone.out_dim_size),
            torch.nn.Linear(backbone.out_dim_size, 1),
            SqueezeDim(),
        ))
    checkpoint_path = MODEL_DIR / \
        f'fp-ell-transformer-training-conventions/conventions-v1-layer_norm-ep_3-valfreq_0p25-full-fold_{fold}.pt'
    # Deserialize onto the CPU: `load_state_dict` copies the values into the model's own parameters,
    # so restoring the tensors to the device they were saved from would only cost extra GPU memory.
    model.load_state_dict(torch.load(checkpoint_path, map_location='cpu'))
    return model

#### Single Target by Dima

In [15]:
class AttentionPool(torch.nn.Module):
    """Attention pooling over dimension 1 of the hidden states followed by a linear scoring layer."""

    def __init__(self, hidden_size: int, dropout: float, out_size: int = 1):
        """
        :param hidden_size: size of the last dimension of the hidden states.
        :param dropout: dropout probability applied to the pooled vector (0.0 makes it a no-op).
        :param out_size: number of outputs of the scoring layer.
        """
        super().__init__()

        # Produces one unnormalized attention score per position (the last Linear maps to size 1).
        self.attention = torch.nn.Sequential(
            torch.nn.Linear(hidden_size, hidden_size),
            torch.nn.LayerNorm(hidden_size),
            torch.nn.GELU(),
            torch.nn.Linear(hidden_size, 1))
        # Dropout holds no parameters or buffers, so the attribute name does not appear in state dicts.
        self.dropout = torch.nn.Dropout(p=dropout)
        self.to_score = torch.nn.Linear(hidden_size, out_size)

    def forward(self, last_hidden_state: torch.Tensor, mask: torch.Tensor) -> torch.Tensor:
        """Pools `last_hidden_state` with learned attention weights and scores the pooled vector.

        :param last_hidden_state: hidden states; assumed to be (batch, seq, hidden) — TODO confirm.
        :param mask: attention mask, 0 marks positions to ignore; assumed to be (batch, seq) — TODO confirm.
        :return: output of the scoring layer for the pooled vector.
        """
        w = self.attention(last_hidden_state).float()
        # Masked positions get -inf so that they receive exactly zero weight after the softmax.
        w[mask == 0] = float('-inf')
        w = torch.softmax(w, 1)
        pooled = torch.sum(w * last_hidden_state, dim=1)
        # The dropout module is applied unconditionally: with p=0.0 or in eval mode it is the identity.
        # (The former `module != 0.0` guard compared a Module with a float and was always true.)
        pooled = self.dropout(pooled)
        return self.to_score(pooled)


class FeedBackModel(TransformerModel[TokenizerResult]):
    """Transformer encoder built from a saved config, topped with an `AttentionPool` scoring head."""

    def __init__(self, config_checkpoint: str, dropout: float):
        """
        :param config_checkpoint: path of a `torch.save`d transformer config object.
        :param dropout: dropout probability handed to the `AttentionPool` head.
        """
        super().__init__()
        # NOTE: `torch.load` unpickles the config object, so only trusted files must be passed here.
        transformer_config = torch.load(config_checkpoint)
        # The encoder is created from the config only; its weights come from the state dict loaded later.
        self.transformer = AutoModel.from_config(config=transformer_config)
        self.attention = AttentionPool(transformer_config.hidden_size, dropout)

    def forward(self, x: TokenizerResult) -> torch.Tensor:
        """Encodes the tokenized input and returns the output of the attention-pooling head."""
        attention_mask = x.attention_mask
        encoder_output = self.transformer(
            input_ids=x.input_ids,
            attention_mask=attention_mask)
        return self.attention(encoder_output.last_hidden_state, attention_mask)


class SingleTargetByDimaModelBuilder:
    """Callable that builds a `FeedBackModel` and restores the checkpoint of the requested fold."""

    def __init__(
            self,
            config_checkpoint: str,
            dropout: float,
            checkpoint_path_template: Path) -> None:
        """
        :param config_checkpoint: path of the saved transformer config passed to `FeedBackModel`.
        :param dropout: dropout probability passed to `FeedBackModel`.
        :param checkpoint_path_template: checkpoint path containing a `{fold}` placeholder.
        """
        self._config_checkpoint = config_checkpoint
        self._dropout = dropout
        self._checkpoint_path_template = checkpoint_path_template

    def __call__(self, fold: int) -> TransformerModel[TokenizerResult]:
        """Builds the model for `fold` and loads the `'model'` entry of its checkpoint."""
        model = FeedBackModel(config_checkpoint=self._config_checkpoint, dropout=self._dropout)
        checkpoint_path = format_path(self._checkpoint_path_template, fold=str(fold))
        # Deserialize onto the CPU: `load_state_dict` copies the values into the model's own parameters,
        # so restoring the tensors to the device they were saved from would only cost extra GPU memory.
        checkpoint = torch.load(checkpoint_path, map_location='cpu')
        model.load_state_dict(checkpoint['model'])
        return model

#### KOJ Multi Target Model builder

In [16]:
class AttentionPooling(torch.nn.Module):
    """Pools hidden states along dimension 1 using learned, mask-aware attention weights."""

    def __init__(self, in_dim: int):
        """
        :param in_dim: size of the last dimension of the hidden states.
        """
        super().__init__()
        scorer_layer_list = [
            torch.nn.Linear(in_dim, in_dim),
            torch.nn.LayerNorm(in_dim),
            torch.nn.GELU(),
            torch.nn.Linear(in_dim, 1),
        ]
        # One unnormalized attention score per position.
        self.attention = torch.nn.Sequential(*scorer_layer_list)

    def forward(self, last_hidden_state: torch.Tensor, attention_mask: torch.Tensor):
        """Returns the attention-weighted sum of `last_hidden_state` over dimension 1.

        Positions where `attention_mask` is 0 get zero weight.
        """
        score = self.attention(last_hidden_state).float()
        is_masked_out = attention_mask == 0
        score[is_masked_out] = float('-inf')
        weight = torch.softmax(score, 1)
        return torch.sum(weight * last_hidden_state, dim=1)


class KOJMultiTargetModel(TransformerModel[TokenizerResult]):
    """Transformer encoder + `AttentionPooling` + one linear layer with an output per entry of `TARGET_LIST`."""

    def __init__(self, backbone_checkpoint: str):
        """
        :param backbone_checkpoint: name or path handed to `AutoConfig` / `AutoModel.from_pretrained`.
        """
        super().__init__()
        backbone_config = AutoConfig.from_pretrained(backbone_checkpoint)
        hidden_size = backbone_config.hidden_size
        self.model = AutoModel.from_pretrained(backbone_checkpoint, config=backbone_config)
        self.pool = AttentionPooling(hidden_size)
        self.fc = torch.nn.Linear(hidden_size, len(TARGET_LIST))

    def feature(self, x: TokenizerResult) -> torch.Tensor:
        """Returns the attention-pooled representation of the tokenized input."""
        # The first element of the encoder output is used as the last hidden states.
        last_hidden_states = self.model(**x.tensor_dict)[0]
        return self.pool(last_hidden_states, x.attention_mask)

    def forward(self, x: TokenizerResult) -> torch.Tensor:
        """Returns one score per target for the tokenized input."""
        return self.fc(self.feature(x))


def _build_koj_multi_target_fold_model(fold: int) -> TransformerModel[TokenizerResult]:
    """Builds a `KOJMultiTargetModel` and restores the `'model'` entry of the given fold's checkpoint.

    :param fold: index of the fold whose checkpoint is loaded.
    :return: the model with the fold's state dict loaded.
    """
    model = KOJMultiTargetModel(backbone_checkpoint=BACKBONE_BASE_PATH)
    checkpoint_path = MODEL_DIR / f'fp-ell-koj-awp-v3-base/microsoft-deberta-v3-base_fold{fold}_best.pth'
    # Deserialize onto the CPU: `load_state_dict` copies the values into the model's own parameters,
    # so restoring the tensors to the device they were saved from would only cost extra GPU memory.
    checkpoint = torch.load(checkpoint_path, map_location='cpu')
    model.load_state_dict(checkpoint['model'])
    return model

#### Own Multi Target Model builder

In [17]:
def _build_own_multi_target_fold_model(fold: int) -> TransformerModel[TokenizerResult]:
    """Builds the multi-target model (one output per entry of `TARGET_LIST`) and restores one fold.

    The head is mean pooling followed by LayerNorm and a linear layer with `len(TARGET_LIST)` outputs.

    :param fold: index of the fold whose checkpoint is loaded.
    :return: the model with the fold's state dict loaded.
    """
    backbone = Backbone.from_huggingface_checkpoint(BACKBONE_BASE_PATH, zero_out_dropout=True)
    model = StandardModel(
        backbone=backbone,
        squeezer=TakeNthSqueezer(),
        # Alternative pooler that was tried: AttentionHeadPooler(backbone.out_dim_size)
        pooler=MeanPooler(),
        dnn=torch.nn.Sequential(
            torch.nn.LayerNorm(backbone.out_dim_size),
            torch.nn.Linear(backbone.out_dim_size, len(TARGET_LIST))
        ))
    checkpoint_path = MODEL_DIR / f'fp-ell-models-multi-target/v1-layer_norm-5fold-fold_{fold}.pt'
    # Deserialize onto the CPU: `load_state_dict` copies the values into the model's own parameters,
    # so restoring the tensors to the device they were saved from would only cost extra GPU memory.
    model.load_state_dict(torch.load(checkpoint_path, map_location='cpu'))
    return model

#### Classifier

In [18]:
class ClassifierModel(TransformerModel[TokenizerResult]):
    """Transformer encoder with an `AttentionPool` head that emits `num_classes` logits per input."""

    def __init__(
            self,
            config_checkpoint: str,
            num_classes: int,
            dropout: float):
        """
        :param config_checkpoint: path of a `torch.save`d transformer config object.
        :param num_classes: number of logits produced by the head.
        :param dropout: dropout probability handed to the `AttentionPool` head.
        """
        super().__init__()
        self.num_classes = num_classes

        # NOTE: `torch.load` unpickles the config object, so only trusted files must be passed here.
        transformer_config = torch.load(config_checkpoint)
        transformer_config.output_hidden_states = True

        self.transformer = AutoModel.from_config(config=transformer_config)
        self.attention = AttentionPool(transformer_config.hidden_size, dropout, self.num_classes)

    def forward(self, x: TokenizerResult) -> torch.Tensor:
        """Returns the class logits reshaped to `(-1, num_classes)`."""
        input_ids = x.input_ids.long()
        attention_mask = x.attention_mask.long()
        encoder_output = self.transformer(input_ids=input_ids, attention_mask=attention_mask)
        logits = self.attention(encoder_output.last_hidden_state, attention_mask)
        return logits.view(-1, self.num_classes)


class ClassifierModelBuilder:
    """Callable that builds a `ClassifierModel` and restores the checkpoint of the requested fold."""

    def __init__(
            self,
            config_checkpoint: str,
            dropout: float,
            checkpoint_path_template: Path,
            num_classes: int) -> None:
        """
        :param config_checkpoint: path of the saved transformer config passed to `ClassifierModel`.
        :param dropout: dropout probability passed to `ClassifierModel`.
        :param checkpoint_path_template: checkpoint path containing a `{fold}` placeholder.
        :param num_classes: number of classes passed to `ClassifierModel`.
        """
        self._config_checkpoint = config_checkpoint
        self._dropout = dropout
        self._checkpoint_path_template = checkpoint_path_template
        self._num_classes = num_classes

    def __call__(self, fold: int) -> TransformerModel[TokenizerResult]:
        """Builds the classifier for `fold` and loads the `'model'` entry of its checkpoint."""
        model = ClassifierModel(
            config_checkpoint=self._config_checkpoint,
            dropout=self._dropout,
            num_classes=self._num_classes)
        checkpoint_path = format_path(self._checkpoint_path_template, fold=str(fold))
        # Deserialize onto the CPU: `load_state_dict` copies the values into the model's own parameters,
        # so restoring the tensors to the device they were saved from would only cost extra GPU memory.
        checkpoint = torch.load(checkpoint_path, map_location='cpu')
        model.load_state_dict(checkpoint['model'])
        return model

### Prediction

#### LVL1

##### Utils

In [19]:
def _predict_by_model(
        df: pd.DataFrame,
        model_builder: t.Callable[[], TransformerModel[TokenizerResult]],
        max_len: int,
        tokenizer: Tokenizer) -> PredDict:
    """Builds a single model and runs it over every row of `df`.

    :param df: frame with the `text_id` and `full_text` columns.
    :param model_builder: zero-argument factory of the model to predict with.
    :param max_len: maximum length handed to the tokenizer.
    :param tokenizer: tokenizer used both for the dataset and for padding batches.
    :return: the predictions produced by the `StandardPredictor`.
    """

    def _concat_id_lists(id_list_batch: t.List[t.List[str]]) -> t.List[str]:
        # Every dataset item carries its id as a one-element list; flatten them into one list per batch.
        return sum(id_list_batch, [])

    padding_collator = DataCollatorWithPadding(tokenizer.tokenizer)
    item_collator = DatasetItemCollator(
        id_collate_fn=_concat_id_lists,
        x_collate_fn=TokenizerResultCollator(padding_collator))

    model = model_builder()
    predictor = StandardPredictor(
        model=model,
        batch_size=BATCH_SIZE,
        num_workers=NUM_WORKERS,
        collator=item_collator,
        device=DEVICE,
        progress_bar=NotebookProgressBar())

    return predictor.predict(Dataset(df, tokenizer=tokenizer, max_len=max_len))


def _predict(
        df: pd.DataFrame,
        model_builder_list: t.List[t.Callable[[], TransformerModel[TokenizerResult]]],
        ensembling_strategy: EnsemblingStrategy,
        max_len: int,
        tokenizer: Tokenizer) -> PredDict:
    """Predicts with every model of `model_builder_list` in turn and ensembles the results.

    :param df: frame with the `text_id` and `full_text` columns.
    :param model_builder_list: zero-argument model factories, one per ensemble member.
    :param ensembling_strategy: combines the per-model predictions into one.
    :param max_len: maximum length handed to the tokenizer.
    :param tokenizer: tokenizer shared by all ensemble members.
    :return: the ensembled predictions.
    """
    pred_dict_list = []
    for model_builder in model_builder_list:
        # Models are built one after another, each right before it is used for prediction.
        pred_dict = _predict_by_model(
            df=df,
            model_builder=model_builder,
            max_len=max_len,
            tokenizer=tokenizer)
        pred_dict_list.append(pred_dict)
    return ensembling_strategy.ensemble(pred_dict_list)

##### Multi Target by Andrei

In [20]:
# Fold-averaged predictions of the own multi-target model: text id -> one score per target.
_own_multi_target_pred_dict = _predict(
    df=all_df,
    model_builder_list=[
        functools.partial(_build_own_multi_target_fold_model, fold=fold)
        for fold in range(NUM_FOLDS_ANDREI)
    ],
    ensembling_strategy=MeanEnsemblingStrategy(),
    max_len=OWN_MULTI_TARGET_MAX_LEN,
    tokenizer=get_tokenizer_for_backbone(
        backbone=BACKBONE_BASE,
        checkpoint=BACKBONE_BASE_PATH,
        padding_strategy=PaddingStrategy.DO_NOT_PAD))

# One row per text; scores are matched to targets by their position in TARGET_LIST.
own_multi_target_df = pd.DataFrame([
    {
        'text_id': text_id,
        **{f'{target_name}_lvl1_score': score for target_name, score in zip(TARGET_LIST, score_list)},
    }
    for text_id, score_list in _own_multi_target_pred_dict.items()
])

own_multi_target_df.head(3)

  "The sentencepiece tokenizer that you are converting to a fast tokenizer uses the byte fallback option"


Predicting.:   0%|          | 0/1 [00:00<?, ?it/s]

Predicting.:   0%|          | 0/1 [00:00<?, ?it/s]

Predicting.:   0%|          | 0/1 [00:00<?, ?it/s]

Predicting.:   0%|          | 0/1 [00:00<?, ?it/s]

Predicting.:   0%|          | 0/1 [00:00<?, ?it/s]

Unnamed: 0,text_id,cohesion_lvl1_score,syntax_lvl1_score,vocabulary_lvl1_score,phraseology_lvl1_score,grammar_lvl1_score,conventions_lvl1_score
0,0000C359D63E,2.971017,2.79666,3.169676,3.023754,2.735281,2.723406
1,000BAD50D026,2.60231,2.378233,2.697639,2.374195,2.18506,2.610459
2,00367BB2546B,3.522926,3.356434,3.579874,3.526404,3.42529,3.354217


##### Multi Target by KOJ

In [21]:
# Fold-averaged predictions of the KOJ multi-target model: text id -> one score per target.
_koj_multi_target_pred_dict = _predict(
    df=all_df,
    model_builder_list=[
        functools.partial(_build_koj_multi_target_fold_model, fold=fold)
        for fold in range(NUM_FOLDS_KOJ)
    ],
    ensembling_strategy=MeanEnsemblingStrategy(),
    max_len=KOJ_MULTI_TARGET_MAX_LEN,
    tokenizer=get_tokenizer_for_backbone(
        backbone=BACKBONE_BASE,
        checkpoint=BACKBONE_BASE_PATH,
        padding_strategy=PaddingStrategy.DO_NOT_PAD))

# One row per text; scores are matched to targets by their position in TARGET_LIST.
koj_multi_target_df = pd.DataFrame([
    {
        'text_id': text_id,
        **{f'{target_name}_lvl1_score': score for target_name, score in zip(TARGET_LIST, score_list)},
    }
    for text_id, score_list in _koj_multi_target_pred_dict.items()
])

koj_multi_target_df.head(3)

##### Single Target by Andrei

In [22]:
def _predict_single_target_by_andrei(
        target: str,
        fold_model_builder: t.Callable[..., TransformerModel[TokenizerResult]]) -> pd.DataFrame:
    """Runs all folds of one single-target model over `all_df` and averages them.

    :param target: target name; the score column is named `{target}_lvl1_score`.
    :param fold_model_builder: model factory accepting the fold index as the `fold` keyword.
    :return: frame with the `text_id` and `{target}_lvl1_score` columns, one row per predicted text.
    """
    pred_dict = _predict(
        df=all_df,
        model_builder_list=[
            functools.partial(fold_model_builder, fold=fold)
            for fold in range(NUM_FOLDS_ANDREI)
        ],
        ensembling_strategy=MeanEnsemblingStrategy(),
        max_len=SINGLE_TARGET_BY_ANDREI_MAX_LEN,
        tokenizer=get_tokenizer_for_backbone(
            backbone=BACKBONE_BASE,
            checkpoint=BACKBONE_BASE_PATH,
            padding_strategy=PaddingStrategy.DO_NOT_PAD))
    # Each prediction of a single-target model is a one-element sequence, hence the `(score,)` unpacking.
    return pd.DataFrame([
        {'text_id': text_id, f'{target}_lvl1_score': score}
        for text_id, (score,) in pred_dict.items()
    ])


cohesion_df = _predict_single_target_by_andrei('cohesion', _build_cohesion_fold_model)
syntax_df = _predict_single_target_by_andrei('syntax', _build_syntax_fold_model)
vocabulary_df = _predict_single_target_by_andrei('vocabulary', _build_vocabulary_fold_model)
phraseology_df = _predict_single_target_by_andrei('phraseology', _build_phraseology_fold_model)
grammar_df = _predict_single_target_by_andrei('grammar', _build_grammar_fold_model)
conventions_df = _predict_single_target_by_andrei('conventions', _build_conventions_fold_model)

# Join the per-target frames on the text id, in the same order as the targets were predicted.
single_target_by_andrei_df = functools.reduce(
    lambda lhs_df, rhs_df: lhs_df.merge(rhs_df, left_on='text_id', right_on='text_id'),
    [cohesion_df, syntax_df, vocabulary_df, phraseology_df, grammar_df, conventions_df])

single_target_by_andrei_df.head(3)

##### Single Target Base by Dima

In [23]:
# Start from the bare id column and attach one `{target}_lvl1_score` column per target.
single_target_base_by_dima_df = all_df[['text_id']].copy()
for target in TARGET_LIST:
    # One builder per target; the `{fold}` placeholder of the template is filled in per fold.
    _fold_model_builder = SingleTargetByDimaModelBuilder(
        config_checkpoint=str(MODEL_DIR / 'v7-dbbase/config.pth'),
        dropout=0.0,
        checkpoint_path_template=MODEL_DIR / f'v7-dbbase-{target}/DBbS42F{{fold}}.pth')
    _pred_dict = _predict(
        df=all_df,
        model_builder_list=[
            functools.partial(_fold_model_builder, fold)
            for fold in range(NUM_FOLDS_DIMA)
        ],
        ensembling_strategy=MeanEnsemblingStrategy(),
        max_len=SINGLE_TARGET_BY_DIMA_MAX_LEN,
        tokenizer=get_tokenizer_for_backbone(
            backbone=BACKBONE_BASE,
            checkpoint=str(MODEL_DIR / 'v7-dbbase/tokenizer'),
            padding_strategy=PaddingStrategy.MAX_LENGTH))
    target_df = pd.DataFrame([
        {'text_id': text_id, f'{target}_lvl1_score': score}
        for text_id, (score,) in _pred_dict.items()
    ])
    single_target_base_by_dima_df = single_target_base_by_dima_df.merge(
        target_df, left_on='text_id', right_on='text_id')

single_target_base_by_dima_df.head(3)

##### Single Target Large by Dima

In [24]:
# Start from the bare id column and attach one `{target}_lvl1_score` column per target.
single_target_large_by_dima_df = all_df[['text_id']].copy()
for target in TARGET_LIST:
    # One builder per target; the `{fold}` placeholder of the template is filled in per fold.
    _fold_model_builder = SingleTargetByDimaModelBuilder(
        config_checkpoint=str(MODEL_DIR / 'v2-dblarge/config.pth'),
        dropout=0.2,
        checkpoint_path_template=MODEL_DIR / f'v2-dblarge-{target}/DBlS42F{{fold}}.pth')
    _pred_dict = _predict(
        df=all_df,
        model_builder_list=[
            functools.partial(_fold_model_builder, fold)
            for fold in range(NUM_FOLDS_DIMA)
        ],
        ensembling_strategy=MeanEnsemblingStrategy(),
        max_len=SINGLE_TARGET_BY_DIMA_MAX_LEN,
        tokenizer=get_tokenizer_for_backbone(
            backbone=BACKBONE_LARGE,
            checkpoint=str(MODEL_DIR / 'v2-dblarge/tokenizer'),
            padding_strategy=PaddingStrategy.MAX_LENGTH))
    target_df = pd.DataFrame([
        {'text_id': text_id, f'{target}_lvl1_score': score}
        for text_id, (score,) in _pred_dict.items()
    ])
    single_target_large_by_dima_df = single_target_large_by_dima_df.merge(
        target_df, left_on='text_id', right_on='text_id')

single_target_large_by_dima_df.head(3)

##### Classifier

In [25]:
def _softmax(x_list: t.List[float]) -> t.List[float]:
    exp_x_list = [math.exp(x) for x in x_list]
    exp_x_sum = sum(exp_x_list)
    return [e / exp_x_sum for e in exp_x_list]


# Start from the bare id column and attach `{target}_{class}_prob` columns for every classifier target.
classifier_df = all_df[['text_id']].copy()
for target in CLASSIFIER_TARGET_LIST:
    # One builder per target; the `{fold}` placeholder of the template is filled in per fold.
    _fold_model_builder = ClassifierModelBuilder(
        config_checkpoint=str(MODEL_DIR / 'dbbase-round/config.pth'),
        dropout=0.2,
        checkpoint_path_template=MODEL_DIR / f'dbbase-{target}-round/DBbS42F{{fold}}.pth',
        num_classes=CLASSIFIER_NUM_CLASSES)
    _pred_dict = _predict(
        df=all_df,
        model_builder_list=[
            functools.partial(_fold_model_builder, fold)
            for fold in range(NUM_FOLDS_CLASSIFIER)
        ],
        ensembling_strategy=MeanEnsemblingStrategy(),
        max_len=CLASSIFIER_MAX_LEN,
        tokenizer=get_tokenizer_for_backbone(
            backbone=BACKBONE_BASE,
            checkpoint=str(MODEL_DIR / 'dbbase-round/tokenizer'),
            padding_strategy=PaddingStrategy.MAX_LENGTH))

    _row_list = []
    for text_id, logit_list in _pred_dict.items():
        # The fold-averaged logits are turned into probabilities; at most CLASSIFIER_NUM_CLASSES are kept.
        _prob_list = _softmax(logit_list)[:CLASSIFIER_NUM_CLASSES]
        _row = {'text_id': text_id}
        for c, val in enumerate(_prob_list):
            _row[f'{target}_{c}_prob'] = val
        _row_list.append(_row)
    target_df = pd.DataFrame(_row_list)

    classifier_df = classifier_df.merge(target_df, left_on='text_id', right_on='text_id')

classifier_df.head(3)

Predicting.:   0%|          | 0/1 [00:00<?, ?it/s]

Predicting.:   0%|          | 0/1 [00:00<?, ?it/s]

Predicting.:   0%|          | 0/1 [00:00<?, ?it/s]

Predicting.:   0%|          | 0/1 [00:00<?, ?it/s]

Predicting.:   0%|          | 0/1 [00:00<?, ?it/s]

### Ensembling

In [44]:
def _ensemble_target(df_list: t.List[pd.DataFrame], target: str, weight_list: t.List[float]) -> np.ndarray:
    return np.stack(
        [
            t.cast(np.ndarray, (df[f'{target}_lvl1_score'] * w).values)
            for df, w in zip(df_list, weight_list)
        ], axis=0).sum(axis=0)


# Level-1 prediction frames; the weight lists below refer to them by position.
_pred_df_list = [
    single_target_base_by_dima_df,
    single_target_large_by_dima_df,
    single_target_by_andrei_df,
    own_multi_target_df,
    koj_multi_target_df,
]

# Blending weights per target, in the order of `_pred_df_list`.
_TARGET_TO_WEIGHT_LIST_DICT = {
    'cohesion': [0.1709, 0.3723, 0.3312, 0.05, 0.0764],
    # Alternative weights for syntax that were tried: [0.2227, 0.3260, 0.2428, 0.1266, 0.0811]
    'syntax': [0.25, 0.25, 0.25, 0.125, 0.125],
    'vocabulary': [0.25, 0.25, 0.25, 0.125, 0.125],
    'phraseology': [0.25, 0.25, 0.25, 0.125, 0.125],
    'grammar': [0.1849, 0.4070, 0.2543, 0.0889, 0.0649],
    'conventions': [0.1812, 0.4367, 0.1944, 0.1143, 0.0722],
}

# The classifier frame supplies `text_id` (and class probabilities, if any); blended scores are added as columns.
ensembled_df = classifier_df.copy()
for _target, _weight_list in _TARGET_TO_WEIGHT_LIST_DICT.items():
    ensembled_df[_target] = _ensemble_target(
        _pred_df_list,
        target=_target,
        weight_list=_weight_list)

ensembled_df.head(3)

Unnamed: 0,text_id,cohesion_0_prob,cohesion_1_prob,cohesion_2_prob,cohesion_3_prob,cohesion_4_prob,cohesion_5_prob,cohesion_6_prob,cohesion_7_prob,cohesion_8_prob,cohesion,syntax,vocabulary,phraseology,grammar,conventions
0,0000C359D63E,0.001284,0.002429,0.09835,0.325039,0.36177,0.177779,0.031553,0.001098,0.000698,2.971017,2.79666,3.169676,3.023754,2.735281,2.723406
1,000BAD50D026,0.002746,0.007989,0.225343,0.431216,0.250128,0.073607,0.008219,0.00038,0.000373,2.60231,2.378233,2.697639,2.374195,2.18506,2.610459
2,00367BB2546B,0.001596,0.001426,0.011765,0.055138,0.180461,0.317616,0.351959,0.061899,0.018139,3.522926,3.356434,3.579874,3.526404,3.42529,3.354217


### Rounding

In [45]:
class _Rounder:

    def __init__(self, th: float, round_delta_lhs: float, round_delta_rhs: float):
        self._th = th
        self._round_delta_lhs = round_delta_lhs
        self._round_delta_rhs = round_delta_rhs

    def round(self, score: float, cls_prob_arr: np.ndarray) -> float:
        max_prob_idx = np.argmax(cls_prob_arr)
        cls_prob = cls_prob_arr[max_prob_idx]
        if cls_prob > self._th:
            preds_by_cls_score = (max_prob_idx / 2 + 1).item()
            if preds_by_cls_score - self._round_delta_lhs < score < preds_by_cls_score + self._round_delta_rhs:
                return preds_by_cls_score
        return score

# Rounding configuration per classifier target.
# Previously tried for cohesion: _Rounder(th=0.35, round_delta_lhs=0.06, round_delta_rhs=0.07)
_TARGET_TO_ROUNDER_DICT = {
    'cohesion': _Rounder(th=0.374, round_delta_lhs=0.09, round_delta_rhs=0.084),
}

for target in CLASSIFIER_TARGET_LIST:
    rounder = _TARGET_TO_ROUNDER_DICT[target]
    _prob_col_list = [f'{target}_{c}_prob' for c in range(CLASSIFIER_NUM_CLASSES)]

    def _round_row(row, rounder=rounder, target=target, prob_col_list=_prob_col_list):
        # Defaults bind the current loop values; the row's class probabilities drive the rounding decision.
        cls_prob_arr = np.array([float(row[col]) for col in prob_col_list])
        return rounder.round(score=float(row[target]), cls_prob_arr=cls_prob_arr)

    ensembled_df[target] = ensembled_df.apply(_round_row, axis=1)

In [46]:
# Keep only the id and the target columns listed in TARGET_LIST; any class-probability columns are dropped.
pred_df = ensembled_df[['text_id', *TARGET_LIST]]

In [47]:
# Preview the first rows of the final predictions before they are written out.
pred_df.head(3)

Unnamed: 0,text_id,cohesion,syntax,vocabulary,phraseology,grammar,conventions
0,0000C359D63E,3.0,2.79666,3.169676,3.023754,2.735281,2.723406
1,000BAD50D026,2.60231,2.378233,2.697639,2.374195,2.18506,2.610459
2,00367BB2546B,3.522926,3.356434,3.579874,3.526404,3.42529,3.354217


In [None]:
# `to_csv` does not create missing directories, so make sure the output directory exists first
# (e.g. the laptop path points into a `submission` directory that may not have been created yet).
Path(OUTPUT_CSV_PATH).parent.mkdir(parents=True, exist_ok=True)
# Write the predictions without the dataframe index.
pred_df.to_csv(OUTPUT_CSV_PATH, index=False)