### Imports

In [1]:
from __future__ import annotations

import numpy as np
import random
import typing as t

import pandas as pd
import torch
import torch.nn.functional as torch_f
import typing_extensions as t_ext
from torch.utils.data import Dataset, DataLoader
from tqdm.notebook import tqdm
from transformers.models.auto.modeling_auto import AutoModel
from transformers.models.auto.tokenization_auto import AutoTokenizer

### Seed

In [2]:
def seed_everything(seed: int) -> None:
    """Seed the Python, NumPy and PyTorch global random number generators.

    Args:
        seed: Value passed unchanged to each library's seeding function.
    """
    # Same order as before: stdlib `random`, then NumPy, then PyTorch.
    for seed_fn in (random.seed, np.random.seed, torch.manual_seed):
        seed_fn(seed)


seed_everything(42)

### Datasets

#### Old dataset

In [3]:
class _TokenizedText(t_ext.TypedDict):
    """Tokenizer output reduced to the two tensors passed to the models in this notebook."""
    # Token ids as a tensor (built from the tokenizer's 'input_ids' entry).
    input_ids: torch.Tensor
    # Attention mask as a tensor (built from the tokenizer's 'attention_mask' entry).
    attention_mask: torch.Tensor


def _preprocess_tokenizer_output(output: t.Dict[str, t.Any]) -> _TokenizedText:
    return {
        'input_ids': torch.tensor(output['input_ids']),
        'attention_mask': torch.tensor(output['attention_mask']),
    }


def _split_str_to_chunk_list(s: str, chunk_size: int) -> t.List[str]:
    chunk_list = []
    chunk = []
    for token in s.split(' '):
        chunk.append(token)
        if len(chunk) >= chunk_size:
            chunk_list.append(' '.join(chunk))
            chunk.clear()
    if chunk:
        chunk_list.append(' '.join(chunk))
    return chunk_list


def valid_collate_fn(
        sample_list: t.List[t.Tuple[int, _TokenizedText, _TokenizedText]]
        ) -> t.Tuple[t.List[int], _TokenizedText, _TokenizedText, t.List[slice], t.List[slice]]:
    """Merge per-sample chunk stacks into one flat batch per side.

    Each sample is ``(idx, more, less)``, where ``more`` and ``less`` hold
    tensors whose first dimension is that sample's number of chunks. The
    chunks of all samples are concatenated along dim 0, and a slice per
    sample records which rows of the concatenated tensors belong to it.

    Args:
        sample_list: Items as produced by ``ValidDataset.__getitem__``.

    Returns:
        ``(idx_list, more_batch, less_batch, more_slices, less_slices)``.
    """
    def _flatten(side):
        # `side` is the tuple position of the text variant: 1 = more, 2 = less.
        parts = [sample[side] for sample in sample_list]
        spans: t.List[slice] = []
        offset = 0
        for part in parts:
            rows = part['input_ids'].shape[0]
            spans.append(slice(offset, offset + rows))
            offset += rows
        merged: _TokenizedText = {
            'input_ids': torch.cat([part['input_ids'] for part in parts], dim=0),
            'attention_mask': torch.cat([part['attention_mask'] for part in parts], dim=0),
        }
        return merged, spans

    idx_list: t.List[int] = [sample[0] for sample in sample_list]
    more_tokenized_collated, more_slice_list = _flatten(1)
    less_tokenized_collated, less_slice_list = _flatten(2)
    return idx_list, more_tokenized_collated, less_tokenized_collated, more_slice_list, less_slice_list


class ValidDataset(Dataset):
    """Dataset of (more_toxic, less_toxic) text pairs, tokenized chunk by chunk.

    Every text is cut into word chunks by ``_split_str_to_chunk_list`` and
    each chunk is tokenized separately, so one sample contributes a stack of
    rows (one per chunk) for each of its two texts.
    """

    def __init__(self, df: pd.DataFrame, tokenizer: AutoTokenizer, max_len: int) -> None:
        """Store the frame, the tokenizer and the chunk / padding length.

        Args:
            df: Frame with 'more_toxic' and 'less_toxic' columns.
            tokenizer: Callable tokenizer applied to every chunk.
            max_len: Words per chunk and also the tokenizer's ``max_length``.
        """
        super().__init__()
        self._df = df
        self._tokenizer = tokenizer
        self._max_len = max_len

    def __len__(self) -> int:
        """Return the number of rows in the wrapped frame."""
        return len(self._df)

    def _tokenize_text(self, text: str) -> _TokenizedText:
        """Tokenize every word chunk of ``text`` and stack the results row-wise."""
        # NOTE(review): chunks are cut at `max_len` *words* while the tokenizer
        # truncates to `max_len` *tokens*; a chunk whose encoding is longer
        # would lose its tail — confirm this is intended.
        ids_rows, mask_rows = [], []
        for chunk in _split_str_to_chunk_list(text, chunk_size=self._max_len):
            encoded = _preprocess_tokenizer_output(self._tokenizer(
                chunk,
                add_special_tokens=True,
                truncation=True,
                padding='max_length',
                max_length=self._max_len,
                return_attention_mask=True))  # type: ignore
            ids_rows.append(encoded['input_ids'])
            mask_rows.append(encoded['attention_mask'])
        return {
            'input_ids': torch.stack(ids_rows, dim=0),
            'attention_mask': torch.stack(mask_rows, dim=0),
        }

    def __getitem__(self, idx: int) -> t.Tuple[int, _TokenizedText, _TokenizedText]:
        """Return ``(idx, tokenized more_toxic text, tokenized less_toxic text)``."""
        row = self._df.iloc[idx]
        text_more = str(row['more_toxic'])
        text_less = str(row['less_toxic'])
        tokenized_more = self._tokenize_text(text_more)
        tokenized_less = self._tokenize_text(text_less)
        return idx, tokenized_more, tokenized_less

#### New dataset

In [4]:
class _TokenizedText(t_ext.TypedDict):
    """Tokenizer output reduced to the two tensors passed to the models in this notebook."""
    # Token ids as a tensor (built from the tokenizer's 'input_ids' entry).
    input_ids: torch.Tensor
    # Attention mask as a tensor (built from the tokenizer's 'attention_mask' entry).
    attention_mask: torch.Tensor


def _preprocess_tokenizer_output(output: t.Dict[str, t.Any]) -> _TokenizedText:
    return {
        'input_ids': torch.tensor(output['input_ids']),
        'attention_mask': torch.tensor(output['attention_mask']),
    }


def _split_str_to_chunk_list(s: str, chunk_size: int) -> t.List[str]:
    chunk_list = []
    chunk = []
    for token in s.split(' '):
        chunk.append(token)
        if len(chunk) >= chunk_size:
            chunk_list.append(' '.join(chunk))
            chunk.clear()
    if chunk:
        chunk_list.append(' '.join(chunk))
    return chunk_list


def valid_collate_fn(
        sample_list: t.List[t.Tuple[int, _TokenizedText, _TokenizedText]]
        ) -> t.Tuple[t.List[int], _TokenizedText, _TokenizedText, t.List[slice], t.List[slice]]:
    """Merge per-sample chunk stacks into one flat batch per side.

    Each sample is ``(idx, more, less)``, where ``more`` and ``less`` hold
    tensors whose first dimension is that sample's number of chunks. The
    chunks of all samples are concatenated along dim 0, and a slice per
    sample records which rows of the concatenated tensors belong to it.

    Args:
        sample_list: Items as produced by ``ValidDataset.__getitem__``.

    Returns:
        ``(idx_list, more_batch, less_batch, more_slices, less_slices)``.
    """
    def _flatten(side):
        # `side` is the tuple position of the text variant: 1 = more, 2 = less.
        parts = [sample[side] for sample in sample_list]
        spans: t.List[slice] = []
        offset = 0
        for part in parts:
            rows = part['input_ids'].shape[0]
            spans.append(slice(offset, offset + rows))
            offset += rows
        merged: _TokenizedText = {
            'input_ids': torch.cat([part['input_ids'] for part in parts], dim=0),
            'attention_mask': torch.cat([part['attention_mask'] for part in parts], dim=0),
        }
        return merged, spans

    idx_list: t.List[int] = [sample[0] for sample in sample_list]
    more_tokenized_collated, more_slice_list = _flatten(1)
    less_tokenized_collated, less_slice_list = _flatten(2)
    return idx_list, more_tokenized_collated, less_tokenized_collated, more_slice_list, less_slice_list


class ValidDataset(Dataset):
    """Dataset of (more_toxic, less_toxic) text pairs, tokenized chunk by chunk.

    Every text is cut into word chunks by ``_split_str_to_chunk_list`` and
    each chunk is tokenized separately, so one sample contributes a stack of
    rows (one per chunk) for each of its two texts.
    """

    def __init__(self, df: pd.DataFrame, tokenizer: AutoTokenizer, max_len: int) -> None:
        """Store the frame, the tokenizer and the chunk / padding length.

        Args:
            df: Frame with 'more_toxic' and 'less_toxic' columns.
            tokenizer: Callable tokenizer applied to every chunk.
            max_len: Words per chunk and also the tokenizer's ``max_length``.
        """
        super().__init__()
        self._df = df
        self._tokenizer = tokenizer
        self._max_len = max_len

    def __len__(self) -> int:
        """Return the number of rows in the wrapped frame."""
        return len(self._df)

    def _tokenize_text(self, text: str) -> _TokenizedText:
        """Tokenize every word chunk of ``text`` and stack the results row-wise."""
        # NOTE(review): chunks are cut at `max_len` *words* while the tokenizer
        # truncates to `max_len` *tokens*; a chunk whose encoding is longer
        # would lose its tail — confirm this is intended.
        ids_rows, mask_rows = [], []
        for chunk in _split_str_to_chunk_list(text, chunk_size=self._max_len):
            encoded = _preprocess_tokenizer_output(self._tokenizer(
                chunk,
                add_special_tokens=True,
                truncation=True,
                padding='max_length',
                max_length=self._max_len,
                return_attention_mask=True))  # type: ignore
            ids_rows.append(encoded['input_ids'])
            mask_rows.append(encoded['attention_mask'])
        return {
            'input_ids': torch.stack(ids_rows, dim=0),
            'attention_mask': torch.stack(mask_rows, dim=0),
        }

    def __getitem__(self, idx: int) -> t.Tuple[int, _TokenizedText, _TokenizedText]:
        """Return ``(idx, tokenized more_toxic text, tokenized less_toxic text)``."""
        row = self._df.iloc[idx]
        text_more = str(row['more_toxic'])
        text_less = str(row['less_toxic'])
        tokenized_more = self._tokenize_text(text_more)
        tokenized_less = self._tokenize_text(text_less)
        return idx, tokenized_more, tokenized_less

### Models

#### Base model

In [5]:
class Model(torch.nn.Module):
    """Common base class for the scoring networks defined in this notebook."""

    def predict_scores(self, input_ids: torch.Tensor, attention_mask: torch.Tensor) -> torch.Tensor:
        """Return a score tensor for tokenized input; subclasses must override.

        Raises:
            NotImplementedError: Always, in this base class.
        """
        raise NotImplementedError()


class ModelConfig(t.NamedTuple):
    """Bundle returned by the ``load_*`` functions: a model, its tokenizer and a label."""
    # Label for the model; used as the progress-bar description during validation.
    name: str
    # Network exposing `predict_scores`.
    model: Model
    # Tokenizer handed to ValidDataset for this model.
    tokenizer: AutoTokenizer


def import_checkpoint(model: torch.nn.Module, checkpoint: str, device: str):
    """Load the state dict stored at ``checkpoint`` into ``model`` in place.

    Args:
        model: Module whose parameters are overwritten.
        checkpoint: Path of a file written with ``torch.save``.
        device: Passed to ``torch.load`` as ``map_location``.
    """
    # NOTE(review): torch.load unpickles the file — only load trusted checkpoints.
    state_dict = torch.load(checkpoint, map_location=device)
    model.load_state_dict(state_dict)

#### CCC 2017

In [6]:
class _WeightedAverageLinearRegressor(torch.nn.Linear):

    def __init__(self, in_features: int, device: t.Optional[str] = None, dtype: t.Optional[str] = None):
        super().__init__(in_features=in_features, out_features=1, bias=False, device=device, dtype=dtype)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        return torch_f.linear(x, torch_f.softmax(self.weight, dim=1), self.bias)


class _CCC2017Model(Model):
    """Encoder + multi-label head + weighted-average score regressor.

    Architecture for the checkpoint
    ccc-2017-multilabel-harder-cls-loss_0p5-v2-valfreq_dynamic_v1.
    """

    def __init__(self, checkpoint: str, output_logits: int, num_classes: int):
        """Build the encoder and both heads.

        Args:
            checkpoint: Name or path given to ``AutoModel.from_pretrained``.
            output_logits: Width of the encoder's pooled output.
            num_classes: Number of label logits produced by the classifier.
        """
        super().__init__()
        self.encoder = AutoModel.from_pretrained(checkpoint, return_dict=False)
        # A LayerNorm in front of the head was left disabled in the original code.
        hidden_layer = torch.nn.Linear(output_logits, 128)
        label_layer = torch.nn.Linear(128, num_classes)
        self.classifier = torch.nn.Sequential(hidden_layer, torch.nn.ReLU(), label_layer)
        self.regressor = _WeightedAverageLinearRegressor(in_features=num_classes)

    def forward_scores(self, label_preds: torch.Tensor) -> torch.Tensor:
        """Collapse per-label predictions into a single score per row."""
        return self.regressor(label_preds)

    def forward(self, input_ids: torch.Tensor, attention_mask: torch.Tensor) -> t.Tuple[torch.Tensor, torch.Tensor]:
        """Return ``(label logits, score)`` for a tokenized batch."""
        _sequence_output, pooled = self.encoder(
            input_ids=input_ids,
            attention_mask=attention_mask)
        label_logits = self.classifier(pooled)
        # The score regressor consumes label probabilities, not raw logits.
        label_probs = torch.sigmoid(label_logits)
        return label_logits, self.forward_scores(label_probs)

    def predict_scores(self, input_ids: torch.Tensor, attention_mask: torch.Tensor) -> torch.Tensor:
        """Return only the score part of ``forward``."""
        _label_logits, scores = self.forward(input_ids, attention_mask)
        return scores


def load_ccc2017(device: str) -> ModelConfig:
    """Build the CCC-2017 model, load its checkpoint and pair it with its tokenizer.

    Args:
        device: ``map_location`` used when reading the checkpoint file.
    """
    network = _CCC2017Model('roberta-base', 768, 6)
    import_checkpoint(
        network,
        '/home/jovyan/jigsaw-toxic/models/ccc-2017-multilabel-harder-cls-loss_0p5-v2-valfreq_dynamic_v1.pt',
        device=device)
    tokenizer = AutoTokenizer.from_pretrained('roberta-base')
    return ModelConfig(
        name='ccc-2017-multilabel-harder-cls-loss_0p5-v2-valfreq_dynamic_v1',
        model=network,
        tokenizer=tokenizer)

#### UBTC

In [7]:
class _UBTCModel(Model):
    """Encoder + sigmoid feature head + sigmoid score head.

    Architecture for the checkpoint
    ubtc-multireg-w50-cos_warmup-opt-2ep-ut_roberta-seed_42.
    """

    def __init__(self, checkpoint: str, output_logits: int, num_classes: int):
        """Build the encoder and both regression heads.

        Args:
            checkpoint: Name or path given to ``AutoModel.from_pretrained``.
            output_logits: Width of the encoder's pooled output.
            num_classes: Number of intermediate features fed to the score head.
        """
        super().__init__()
        self.encoder = AutoModel.from_pretrained(checkpoint, return_dict=False)
        # A LayerNorm in front of each head was left disabled in the original code.
        feature_layers = [
            torch.nn.Linear(output_logits, 128),
            torch.nn.ReLU(),
            torch.nn.Linear(128, num_classes),
            torch.nn.Sigmoid(),
        ]
        self.feature_regressor = torch.nn.Sequential(*feature_layers)
        score_layers = [
            torch.nn.Linear(num_classes, 1),
            torch.nn.Sigmoid(),
        ]
        self.score_regressor = torch.nn.Sequential(*score_layers)

    def forward(self, input_ids: torch.Tensor, attention_mask: torch.Tensor) -> t.Tuple[torch.Tensor, torch.Tensor]:
        """Return ``(features, score)`` for a tokenized batch."""
        _sequence_output, pooled = self.encoder(
            input_ids=input_ids,
            attention_mask=attention_mask)
        features = self.feature_regressor(pooled)
        scores = self.score_regressor(features)
        return features, scores

    def predict_scores(self, input_ids: torch.Tensor, attention_mask: torch.Tensor) -> torch.Tensor:
        """Return only the score part of ``forward``."""
        _features, scores = self.forward(input_ids, attention_mask)
        return scores


def load_ubtc(device: str) -> ModelConfig:
    """Build the UBTC model, load its checkpoint and pair it with its tokenizer.

    Args:
        device: ``map_location`` used when reading the checkpoint file.
    """
    network = _UBTCModel('unitary/unbiased-toxic-roberta', 768, 7)
    import_checkpoint(
        network,
        '/home/jovyan/jigsaw-toxic/models/ubtc-multireg-w50-cos_warmup-opt-2ep-ut_roberta-seed_42.pt',
        device=device)
    tokenizer = AutoTokenizer.from_pretrained('unitary/unbiased-toxic-roberta')
    return ModelConfig(
        name='ubtc-multireg-w50-cos_warmup-opt-2ep-ut_roberta-seed_42',
        model=network,
        tokenizer=tokenizer)

#### Ruddit

In [8]:
class _RudditModel(Model):
    """Encoder followed by a single sigmoid regression unit.

    Architecture for the checkpoint ruddit-v3-mse-2ep-pure_reg.
    """

    def __init__(self, checkpoint: str, output_logits: int, dropout: float):
        """Build the encoder and the regression head.

        Args:
            checkpoint: Name or path given to ``AutoModel.from_pretrained``.
            output_logits: Width of the encoder's pooled output.
            dropout: Accepted for signature compatibility; not used by this
                inference-only definition.
        """
        # Zero-argument super() follows the MRO through `Model`; the previous
        # `super(Model, self)` skipped `Model` and would bypass any
        # initialisation added there later.
        super().__init__()
        # Attribute names are kept as-is because they form the state-dict keys.
        self.bert = AutoModel.from_pretrained(checkpoint, return_dict=False)
        self.regressor = torch.nn.Sequential(
            # A LayerNorm in front of the head was left disabled in the original code.
            torch.nn.Linear(output_logits, 1),
            torch.nn.Sigmoid(),
        )

    def forward(self, input_ids: torch.Tensor, attention_mask: torch.Tensor) -> torch.Tensor:
        """Return the sigmoid score computed from the encoder's pooled output."""
        _, pooled_output = self.bert(
            input_ids=input_ids,
            attention_mask=attention_mask)
        return self.regressor(pooled_output)

    def predict_scores(self, input_ids: torch.Tensor, attention_mask: torch.Tensor) -> torch.Tensor:
        """Return the same tensor as ``forward``."""
        return self.forward(input_ids, attention_mask)


def load_ruddit(device: str) -> ModelConfig:
    """Build the Ruddit model, load its checkpoint and pair it with its tokenizer.

    Args:
        device: ``map_location`` used when reading the checkpoint file.
    """
    network = _RudditModel('roberta-base', 768, 0.6)
    import_checkpoint(
        network,
        '/home/jovyan/jigsaw-toxic/models/ruddit-v3-mse-2ep-pure_reg.pt',
        device=device)
    tokenizer = AutoTokenizer.from_pretrained('roberta-base')
    return ModelConfig(
        name='ruddit-v3-mse-2ep-pure_reg',
        model=network,
        tokenizer=tokenizer)

#### Offenseval 2020

In [9]:
class _OffensevalModel(Model):
    """Encoder followed by a single sigmoid regression unit.

    Architecture for the checkpoint
    offenseval-2020-v2-pure_reg-mse-1_ep-64_valcycles-lr_2e5-backbone_utr.
    """

    def __init__(self, checkpoint: str, output_logits: int, dropout: float):
        """Build the encoder and the regression head.

        Args:
            checkpoint: Name or path given to ``AutoModel.from_pretrained``.
            output_logits: Width of the encoder's pooled output.
            dropout: Accepted for signature compatibility; not used by this
                inference-only definition.
        """
        # Zero-argument super() follows the MRO through `Model`; the previous
        # `super(Model, self)` skipped `Model` and would bypass any
        # initialisation added there later.
        super().__init__()
        # Attribute names are kept as-is because they form the state-dict keys.
        self.bert = AutoModel.from_pretrained(checkpoint, return_dict=False)
        self.regressor = torch.nn.Sequential(
            # A LayerNorm in front of the head was left disabled in the original code.
            torch.nn.Linear(output_logits, 1),
            torch.nn.Sigmoid(),
        )

    def forward(self, input_ids: torch.Tensor, attention_mask: torch.Tensor) -> torch.Tensor:
        """Return the sigmoid score computed from the encoder's pooled output."""
        _, pooled_output = self.bert(
            input_ids=input_ids,
            attention_mask=attention_mask)
        return self.regressor(pooled_output)

    def predict_scores(self, input_ids: torch.Tensor, attention_mask: torch.Tensor) -> torch.Tensor:
        """Return the same tensor as ``forward``."""
        return self.forward(input_ids, attention_mask)


def load_offenseval(device: str) -> ModelConfig:
    """Build the Offenseval model, load its checkpoint and pair it with its tokenizer.

    Args:
        device: ``map_location`` used when reading the checkpoint file.
    """
    # NOTE(review): the encoder skeleton comes from 'roberta-base' while the
    # tokenizer comes from 'unitary/unbiased-toxic-roberta' — confirm that the
    # mismatch is intended.
    network = _OffensevalModel('roberta-base', 768, 0.6)
    import_checkpoint(
        network,
        '/home/jovyan/jigsaw-toxic/models/offenseval-2020-v2-pure_reg-mse-1_ep-64_valcycles-lr_2e5-backbone_utr.pt',
        device=device)
    tokenizer = AutoTokenizer.from_pretrained('unitary/unbiased-toxic-roberta')
    return ModelConfig(
        name='offenseval-2020-v2-pure_reg-mse-1_ep-64_valcycles-lr_2e5-backbone_utr',
        model=network,
        tokenizer=tokenizer)

In [10]:
def do_valid_iteration(
        data_loader: DataLoader,
        model: Model,
        model_name: str,
        device: str) -> t.Tuple[torch.Tensor, torch.Tensor]:
    """Score every pair served by ``data_loader`` with ``model``.

    The model scores each chunk; a text's score is the maximum over its
    chunks, located through the slices produced by the collate function.

    Args:
        data_loader: Loader yielding ``valid_collate_fn`` batches.
        model: Network in use; switched to eval mode here.
        model_name: Label shown on the progress bar.
        device: Device the input tensors are moved to.

    Returns:
        ``(more_scores, less_scores)`` as 1-D tensors, one entry per pair.
    """
    def _predict(tokenized):
        return model.predict_scores(
            tokenized['input_ids'].to(device),
            tokenized['attention_mask'].to(device),)

    def _max_per_sample(chunk_scores, slice_list):
        # Reduce each sample's chunk rows to one row holding their maximum.
        reduced = [torch.max(chunk_scores[s], dim=0, keepdim=True)[0] for s in slice_list]
        return torch.cat(reduced, dim=0).cpu().flatten().tolist()

    model.eval()
    more_score_list, less_score_list = [], []
    with torch.no_grad():
        progress = tqdm(data_loader, desc=model_name)
        for _, tokenized_more, tokenized_less, slices_more, slices_less in progress:
            raw_more = _predict(tokenized_more)
            raw_less = _predict(tokenized_less)
            more_score_list.extend(_max_per_sample(raw_more, slices_more))
            less_score_list.extend(_max_per_sample(raw_less, slices_less))

    return torch.tensor(more_score_list), torch.tensor(less_score_list)


def ensemble_scores(score_list: t.List[torch.Tensor]) -> torch.Tensor:
    """Return the element-wise mean of the given score tensors.

    Args:
        score_list: Tensors of matching (or broadcastable) shape.

    Raises:
        ZeroDivisionError: If ``score_list`` is empty.
    """
    total = 0
    for scores in score_list:
        total = total + scores
    return total / len(score_list)

In [11]:
def predict_by_model(
        valid_df: pd.DataFrame,
        batch_size: int,
        model_getter: t.Callable[[str], ModelConfig],
        max_len: int,
        num_workers: int,
        device: str) -> t.Tuple[torch.Tensor, torch.Tensor]:
    """Load one model and score every validation pair with it.

    Args:
        valid_df: Frame with 'more_toxic' and 'less_toxic' columns.
        batch_size: Number of pairs per loader batch.
        model_getter: One of the ``load_*`` functions; called with ``device``.
        max_len: Chunk size and tokenizer ``max_length`` for ``ValidDataset``.
        num_workers: Worker processes for the ``DataLoader``.
        device: Device for the model and the input tensors.

    Returns:
        ``(more_scores, less_scores)`` from ``do_valid_iteration``.
    """
    config = model_getter(device)
    network = config.model.to(device)
    loader = DataLoader(
        dataset=ValidDataset(
            df=valid_df,
            tokenizer=config.tokenizer,
            max_len=max_len),
        batch_size=batch_size,
        shuffle=False,  # keep the frame's row order in the returned scores
        collate_fn=valid_collate_fn,  # type: ignore
        num_workers=num_workers,
        pin_memory=device.startswith('cuda'))
    return do_valid_iteration(data_loader=loader, model=network, model_name=config.name, device=device)

In [12]:
# Validation pairs; ValidDataset reads the 'more_toxic' / 'less_toxic' columns.
valid_df = pd.read_csv(
    '/home/jovyan/jigsaw-toxic/data/jigsaw-toxic-severity-rating/valid.csv')

In [13]:
# Score all validation pairs with the CCC-2017 model.
ccc2017_more, ccc2017_less = predict_by_model(
    valid_df=valid_df,
    batch_size=8,
    model_getter=load_ccc2017,
    max_len=256,
    num_workers=8,
    device='cuda',
)

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModel: ['lm_head.layer_norm.weight', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


ccc-2017-multilabel-harder-cls-loss_0p5-v2-valfreq_dynamic_v1:   0%|          | 0/1264 [00:00<?, ?it/s]

In [15]:
# Score all validation pairs with the UBTC model.
ubtc_more, ubtc_less = predict_by_model(
    valid_df=valid_df,
    batch_size=8,
    model_getter=load_ubtc,
    max_len=256,
    num_workers=8,
    device='cuda',
)

Some weights of the model checkpoint at unitary/unbiased-toxic-roberta were not used when initializing RobertaModel: ['classifier.out_proj.bias', 'classifier.out_proj.weight', 'classifier.dense.weight', 'classifier.dense.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaModel were not initialized from the model checkpoint at unitary/unbiased-toxic-roberta and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


ubtc-multireg-w50-cos_warmup-opt-2ep-ut_roberta-seed_42:   0%|          | 0/1264 [00:00<?, ?it/s]

In [16]:
# Score all validation pairs with the Ruddit model.
ruddit_more, ruddit_less = predict_by_model(
    valid_df=valid_df,
    batch_size=8,
    model_getter=load_ruddit,
    max_len=256,
    num_workers=8,
    device='cuda',
)

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModel: ['lm_head.layer_norm.weight', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


ruddit-v3-mse-2ep-pure_reg:   0%|          | 0/1264 [00:00<?, ?it/s]

In [17]:
# Score all validation pairs with the Offenseval model.
offenseval_more, offenseval_less = predict_by_model(
    valid_df=valid_df,
    batch_size=8,
    model_getter=load_offenseval,
    max_len=256,
    num_workers=8,
    device='cuda',
)

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModel: ['lm_head.layer_norm.weight', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


offenseval-2020-v2-pure_reg-mse-1_ep-64_valcycles-lr_2e5-backbone_utr:   0%|          | 0/1264 [00:00<?, ?it/s…

In [33]:
# Average the per-model scores. The Offenseval predictions (offenseval_more /
# offenseval_less) are computed above but left out of this ensemble.
more_score = ensemble_scores([ccc2017_more, ubtc_more, ruddit_more])
less_score = ensemble_scores([ccc2017_less, ubtc_less, ruddit_less])

# Fraction of pairs whose 'more' score is strictly greater than the 'less' score.
(more_score > less_score).float().mean()

tensor(0.7546)

In [21]:
# Fraction of pairs whose 'more' score is strictly greater than the 'less' score.
more_score.gt(less_score).float().mean()

tensor(0.7546)