### Imports

In [None]:
from __future__ import annotations

import functools
import operator
import random
import statistics
import typing as t

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import torch
import torch.nn.functional as torch_f
import typing_extensions as t_ext
from torch.utils.data import Dataset, DataLoader
from tqdm.notebook import tqdm
from transformers.models.auto.modeling_auto import AutoModel
from transformers.models.auto.tokenization_auto import AutoTokenizer

### Seed

In [None]:
def seed_everything(seed: int):
    """Seed the Python, NumPy and PyTorch random number generators.

    Args:
        seed: Value handed unchanged to ``random.seed``, ``np.random.seed``
            and ``torch.manual_seed``, in that order.
    """
    for seed_fn in (random.seed, np.random.seed, torch.manual_seed):
        seed_fn(seed)

# Fix the seed once, at notebook start-up.
seed_everything(42)

### Datasets

In [None]:
class _TokenizedText(t_ext.TypedDict):
    """Typed dictionary holding the two tokenizer outputs used in this notebook."""
    # Tensor built from the tokenizer's 'input_ids' output.
    input_ids: torch.Tensor
    # Tensor built from the tokenizer's 'attention_mask' output.
    attention_mask: torch.Tensor


def _preprocess_tokenizer_output(output: t.Dict[str, t.Any]) -> _TokenizedText:
    return {
        'input_ids': torch.tensor(output['input_ids']),
        'attention_mask': torch.tensor(output['attention_mask']),
    }


def _split_str_to_chunk_list(s: str, chunk_size: int) -> t.List[str]:
    chunk_list = []
    chunk = []
    for token in s.split(' '):
        chunk.append(token)
        if len(chunk) >= chunk_size:
            chunk_list.append(' '.join(chunk))
            chunk.clear()
    if chunk:
        chunk_list.append(' '.join(chunk))
    return chunk_list


def predict_collate_fn(
        sample_list: t.List[t.Tuple[str, _TokenizedText]]
        ) -> t.Tuple[t.List[str], _TokenizedText, t.List[slice]]:
    """Collate samples whose tensors may differ in their first dimension.

    The per-sample tensors are concatenated along dimension 0, and a slice is
    recorded for every sample so its rows can be located in the result.

    Args:
        sample_list: ``(identifier, tokenized text)`` pairs.

    Returns:
        A tuple ``(identifiers, concatenated tensors, slices)`` where
        ``slices[i]`` selects the rows of the concatenated tensors that came
        from ``sample_list[i]``.
    """
    idx_list: t.List[str] = [sample_id for sample_id, _ in sample_list]
    input_ids_list = [tokenized['input_ids'] for _, tokenized in sample_list]
    attention_mask_list = [tokenized['attention_mask'] for _, tokenized in sample_list]

    # Walk the samples once, tracking the running row offset.
    slice_list: t.List[slice] = []
    start = 0
    for input_ids in input_ids_list:
        stop = start + input_ids.shape[0]
        slice_list.append(slice(start, stop))
        start = stop

    tokenized_collated: _TokenizedText = {
        'input_ids': torch.cat(input_ids_list, dim=0),
        'attention_mask': torch.cat(attention_mask_list, dim=0),
    }
    return idx_list, tokenized_collated, slice_list


class PredictDataset(Dataset):
    """Dataset yielding ``(comment_id, tokenized chunks)`` for each data frame row.

    Every text is split into chunks of at most ``max_len`` space-separated
    words, and every chunk is tokenized separately, padded and truncated to
    ``max_len`` tokens. A chunk that tokenizes to more than ``max_len`` tokens
    therefore loses its tail.
    """

    def __init__(self, df: pd.DataFrame, tokenizer: AutoTokenizer, max_len: int) -> None:
        """Store the data frame, the tokenizer and the chunk/sequence length.

        Args:
            df: Frame with ``'comment_id'`` and ``'text'`` columns.
            tokenizer: Callable tokenizer applied to every chunk.
            max_len: Used both as words per chunk and as the tokenizer's
                ``max_length``.
        """
        super().__init__()
        self._df = df
        self._tokenizer = tokenizer
        self._max_len = max_len

    def __len__(self) -> int:
        """Return the number of rows in the wrapped data frame."""
        return len(self._df)

    def _tokenize_chunk(self, chunk: str) -> _TokenizedText:
        """Tokenize one chunk to a fixed length and convert the result to tensors."""
        raw_output = self._tokenizer(
            chunk,
            add_special_tokens=True,
            truncation=True,
            padding='max_length',
            max_length=self._max_len,
            return_attention_mask=True)  # type: ignore
        return _preprocess_tokenizer_output(raw_output)

    def __getitem__(self, idx: int) -> t.Tuple[str, _TokenizedText]:
        """Return the comment id and the stacked per-chunk tensors of row ``idx``.

        Rows are addressed by position (``iloc``), not by index label.
        """
        row = self._df.iloc[idx]
        comment_id = str(row['comment_id'])
        text = str(row['text'])

        tokenized_chunks = [
            self._tokenize_chunk(chunk)
            for chunk in _split_str_to_chunk_list(text, chunk_size=self._max_len)
        ]

        # Stack the chunk tensors along a new leading dimension.
        tokenized_text: _TokenizedText = {
            'input_ids': torch.stack([c['input_ids'] for c in tokenized_chunks], dim=0),
            'attention_mask': torch.stack([c['attention_mask'] for c in tokenized_chunks], dim=0),
        }
        return comment_id, tokenized_text

### Models

#### Base model

In [None]:
class Model(torch.nn.Module):
    """Base class for the scoring models defined in this notebook.

    Subclasses override :meth:`predict_scores`.
    """

    def predict_scores(self, input_ids: torch.Tensor, attention_mask: torch.Tensor) -> torch.Tensor:
        """Return scores for a batch of tokenized inputs; must be overridden.

        Raises:
            NotImplementedError: Always, in this base class.
        """
        raise NotImplementedError()


class ModelConfig(t.NamedTuple):
    """Bundle returned by the ``load_*`` functions: a model with its name and tokenizer."""
    # Identifier of the checkpoint; shown as the progress-bar description during prediction.
    name: str
    # Model instance into which the checkpoint weights were loaded.
    model: Model
    # Tokenizer used to encode the text fed to this model.
    tokenizer: AutoTokenizer


def import_checkpoint(model: torch.nn.Module, checkpoint: str, device: str):
    """Load the state dict stored at ``checkpoint`` into ``model`` in place.

    Args:
        model: Module whose parameters are overwritten.
        checkpoint: Path of a file readable by ``torch.load``.
        device: Passed to ``torch.load`` as ``map_location``.
    """
    # NOTE(review): torch.load unpickles the file; only load trusted checkpoints.
    state_dict = torch.load(checkpoint, map_location=device)
    model.load_state_dict(state_dict)

#### CCC 2017

In [None]:
class _WeightedAverageLinearRegressor(torch.nn.Linear):

    def __init__(self, in_features: int, device: t.Optional[str] = None, dtype: t.Optional[str] = None):
        super().__init__(in_features=in_features, out_features=1, bias=False, device=device, dtype=dtype)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        return torch_f.linear(x, torch_f.softmax(self.weight, dim=1), self.bias)


class _CCC2017Model(Model):
    """
    ccc-2017-multilabel-harder-cls-loss_0p5-v2-valfreq_dynamic_v1

    Pretrained encoder followed by a two-layer classifier head and a
    softmax-weighted regressor that reduces the sigmoid-activated label
    predictions to one score per input row.
    """

    def __init__(self, checkpoint: str, output_logits: int, num_classes: int):
        """Build the encoder, the classifier head and the score regressor.

        Args:
            checkpoint: Identifier passed to ``AutoModel.from_pretrained``.
            output_logits: Input width of the classifier head (the size of
                the encoder's pooled output).
            num_classes: Number of label predictions produced by the head.
        """
        super().__init__()
        # Attribute names and the layer order inside Sequential determine the
        # state-dict keys, so they must match the saved checkpoint.
        self.encoder = AutoModel.from_pretrained(checkpoint, return_dict=False)
        self.classifier = torch.nn.Sequential(
            # torch.nn.LayerNorm(output_logits),
            torch.nn.Linear(output_logits, 128),
            torch.nn.ReLU(),
            torch.nn.Linear(128, num_classes))
        self.regressor = _WeightedAverageLinearRegressor(in_features=num_classes)

    def forward_scores(self, label_preds: torch.Tensor) -> torch.Tensor:
        """Reduce label predictions to a score with the weighted-average regressor."""
        return self.regressor(label_preds)

    def forward(self, input_ids: torch.Tensor, attention_mask: torch.Tensor) -> t.Tuple[torch.Tensor, torch.Tensor]:
        """Return ``(label_preds, scores)`` for a batch.

        ``label_preds`` are the raw classifier outputs; the sigmoid is applied
        only to the copy that feeds the score regressor.
        """
        # With return_dict=False the encoder returns a tuple; its second item is used.
        _, pooled_output = self.encoder(
            input_ids=input_ids,
            attention_mask=attention_mask)
        label_preds = self.classifier(pooled_output)
        scores = self.forward_scores(torch.sigmoid(label_preds))
        return label_preds, scores

    def predict_scores(self, input_ids: torch.Tensor, attention_mask: torch.Tensor) -> torch.Tensor:
        """Return only the score tensor of :meth:`forward`."""
        return self.forward(input_ids, attention_mask)[1]


def load_ccc2017(device: str) -> ModelConfig:
    """Build the CCC 2017 model, load its checkpoint and pair it with its tokenizer.

    Args:
        device: Used as ``map_location`` when the checkpoint is read.

    Returns:
        The named model/tokenizer bundle.
    """
    ccc_model = _CCC2017Model(checkpoint='roberta-base', output_logits=768, num_classes=6)
    import_checkpoint(
        ccc_model,
        '/home/jovyan/jigsaw-toxic/models/ccc-2017-multilabel-harder-cls-loss_0p5-v2-valfreq_dynamic_v1.pt',
        device=device)
    roberta_tokenizer = AutoTokenizer.from_pretrained('roberta-base')
    return ModelConfig(
        name='ccc-2017-multilabel-harder-cls-loss_0p5-v2-valfreq_dynamic_v1',
        model=ccc_model,
        tokenizer=roberta_tokenizer)

#### UBTC

In [None]:
class _UBTCModel(Model):
    """
    ubtc-multireg-w50-cos_warmup-opt-2ep-ut_roberta-seed_42

    Pretrained encoder followed by a sigmoid feature regressor and a second
    sigmoid regressor that maps those features to a single score.
    """

    def __init__(self, checkpoint: str, output_logits: int, num_classes: int):
        """Build the encoder and the two regression heads.

        Args:
            checkpoint: Identifier passed to ``AutoModel.from_pretrained``.
            output_logits: Input width of the feature regressor (the size of
                the encoder's pooled output).
            num_classes: Number of intermediate features.
        """
        super().__init__()
        # Attribute names and the layer order inside Sequential determine the
        # state-dict keys, so they must match the saved checkpoint.
        self.encoder = AutoModel.from_pretrained(checkpoint, return_dict=False)
        self.feature_regressor = torch.nn.Sequential(
            # torch.nn.LayerNorm(output_logits),
            torch.nn.Linear(output_logits, 128),
            torch.nn.ReLU(),
            torch.nn.Linear(128, num_classes),
            torch.nn.Sigmoid())
        self.score_regressor = torch.nn.Sequential(
            # torch.nn.LayerNorm(output_logits),
            torch.nn.Linear(num_classes, 1),
            torch.nn.Sigmoid())

    def forward(self, input_ids: torch.Tensor, attention_mask: torch.Tensor) -> t.Tuple[torch.Tensor, torch.Tensor]:
        """Return ``(features, scores)`` for a batch; both have passed through a sigmoid."""
        # With return_dict=False the encoder returns a tuple; its second item is used.
        _, pooled_output = self.encoder(
            input_ids=input_ids,
            attention_mask=attention_mask)
        features = self.feature_regressor(pooled_output)
        return features, self.score_regressor(features)

    def predict_scores(self, input_ids: torch.Tensor, attention_mask: torch.Tensor) -> torch.Tensor:
        """Return only the score tensor of :meth:`forward`."""
        return self.forward(input_ids, attention_mask)[1]


def load_ubtc(device: str) -> ModelConfig:
    """Build the UBTC model, load its checkpoint and pair it with its tokenizer.

    Args:
        device: Used as ``map_location`` when the checkpoint is read.

    Returns:
        The named model/tokenizer bundle.
    """
    ubtc_model = _UBTCModel(checkpoint='unitary/unbiased-toxic-roberta', output_logits=768, num_classes=7)
    import_checkpoint(
        ubtc_model,
        '/home/jovyan/jigsaw-toxic/models/ubtc-multireg-w50-cos_warmup-opt-2ep-ut_roberta-seed_42.pt',
        device=device)
    ubtc_tokenizer = AutoTokenizer.from_pretrained('unitary/unbiased-toxic-roberta')
    return ModelConfig(
        name='ubtc-multireg-w50-cos_warmup-opt-2ep-ut_roberta-seed_42',
        model=ubtc_model,
        tokenizer=ubtc_tokenizer)

#### Ruddit

In [None]:
class _RudditModel(Model):
    """Pretrained encoder with a single sigmoid regression head.

    The head maps the encoder's pooled output to one value in ``(0, 1)`` per
    input row.
    """

    def __init__(self, checkpoint: str, output_logits: int, dropout: float):
        """Build the encoder and the regression head.

        Args:
            checkpoint: Identifier passed to ``AutoModel.from_pretrained``.
            output_logits: Input width of the regression head (the size of
                the encoder's pooled output).
            dropout: Accepted for signature compatibility; not used by this
                implementation.
        """
        # Zero-argument super() walks the full MRO (including Model), matching
        # the sibling model classes; super(Model, self) skipped Model itself.
        super().__init__()
        # Attribute names determine the state-dict keys, so they must match
        # the saved checkpoint.
        self.bert = AutoModel.from_pretrained(checkpoint, return_dict=False)
        self.regressor = torch.nn.Sequential(
            torch.nn.Linear(output_logits, 1),
            torch.nn.Sigmoid(),
        )

    def forward(self, input_ids: torch.Tensor, attention_mask: torch.Tensor) -> torch.Tensor:
        """Return the sigmoid regression output for a batch."""
        # With return_dict=False the encoder returns a tuple; its second item is used.
        _, pooled_output = self.bert(
            input_ids=input_ids,
            attention_mask=attention_mask)
        return self.regressor(pooled_output)

    def predict_scores(self, input_ids: torch.Tensor, attention_mask: torch.Tensor) -> torch.Tensor:
        """Return the output of :meth:`forward` unchanged."""
        return self.forward(input_ids, attention_mask)


def load_ruddit(device: str) -> ModelConfig:
    """Build the Ruddit model, load its checkpoint and pair it with its tokenizer.

    Args:
        device: Used as ``map_location`` when the checkpoint is read.

    Returns:
        The named model/tokenizer bundle.
    """
    ruddit_model = _RudditModel(checkpoint='roberta-base', output_logits=768, dropout=0.6)
    import_checkpoint(
        ruddit_model,
        '/home/jovyan/jigsaw-toxic/models/ruddit-v3-mse-2ep-pure_reg.pt',
        device=device)
    roberta_tokenizer = AutoTokenizer.from_pretrained('roberta-base')
    return ModelConfig(
        name='ruddit-v3-mse-2ep-pure_reg',
        model=ruddit_model,
        tokenizer=roberta_tokenizer)

#### Offenseval 2020

In [None]:
class _OffensevalModel(Model):
    """Pretrained encoder with a single sigmoid regression head.

    The head maps the encoder's pooled output to one value in ``(0, 1)`` per
    input row.
    """

    def __init__(self, checkpoint: str, output_logits: int, dropout: float):
        """Build the encoder and the regression head.

        Args:
            checkpoint: Identifier passed to ``AutoModel.from_pretrained``.
            output_logits: Input width of the regression head (the size of
                the encoder's pooled output).
            dropout: Accepted for signature compatibility; not used by this
                implementation.
        """
        # Zero-argument super() walks the full MRO (including Model), matching
        # the sibling model classes; super(Model, self) skipped Model itself.
        super().__init__()
        # Attribute names determine the state-dict keys, so they must match
        # the saved checkpoint.
        self.bert = AutoModel.from_pretrained(checkpoint, return_dict=False)
        self.regressor = torch.nn.Sequential(
            torch.nn.Linear(output_logits, 1),
            torch.nn.Sigmoid(),
        )

    def forward(self, input_ids: torch.Tensor, attention_mask: torch.Tensor) -> torch.Tensor:
        """Return the sigmoid regression output for a batch."""
        # With return_dict=False the encoder returns a tuple; its second item is used.
        _, pooled_output = self.bert(
            input_ids=input_ids,
            attention_mask=attention_mask)
        return self.regressor(pooled_output)

    def predict_scores(self, input_ids: torch.Tensor, attention_mask: torch.Tensor) -> torch.Tensor:
        """Return the output of :meth:`forward` unchanged."""
        return self.forward(input_ids, attention_mask)


def load_offenseval(device: str) -> ModelConfig:
    """Build the Offenseval model, load its checkpoint and pair it with its tokenizer.

    Args:
        device: Used as ``map_location`` when the checkpoint is read.

    Returns:
        The named model/tokenizer bundle.
    """
    # NOTE(review): the encoder is instantiated from 'roberta-base' while the
    # tokenizer comes from 'unitary/unbiased-toxic-roberta' -- confirm this
    # pairing is intended.
    offenseval_model = _OffensevalModel(checkpoint='roberta-base', output_logits=768, dropout=0.6)
    import_checkpoint(
        offenseval_model,
        '/home/jovyan/jigsaw-toxic/models/offenseval-2020-v2-pure_reg-mse-1_ep-64_valcycles-lr_2e5-backbone_utr.pt',
        device=device)
    utr_tokenizer = AutoTokenizer.from_pretrained('unitary/unbiased-toxic-roberta')
    return ModelConfig(
        name='offenseval-2020-v2-pure_reg-mse-1_ep-64_valcycles-lr_2e5-backbone_utr',
        model=offenseval_model,
        tokenizer=utr_tokenizer)

In [None]:
def do_predict_iteration(
        data_loader: DataLoader,
        model: Model,
        model_name: str,
        device: str) -> torch.Tensor:
    """Score every sample of ``data_loader`` and keep the maximum over its chunks.

    Args:
        data_loader: Yields ``(ids, tokenized batch, slices)`` triples, where
            each slice selects the rows of the batch belonging to one sample.
        model: Model providing ``predict_scores``; switched to eval mode.
        model_name: Description shown on the progress bar.
        device: Device the batch tensors are moved to before scoring.

    Returns:
        A one-dimensional tensor holding all per-sample scores in loader order.
    """
    model.eval()
    collected_scores = []
    progress = tqdm(data_loader, desc=model_name)
    with torch.no_grad():
        for _, batch, chunk_slices in progress:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            chunk_scores = model.predict_scores(input_ids, attention_mask)
            # One sample may span several rows (chunks); reduce them with max.
            per_sample_max = [
                chunk_scores[rows].max(dim=0, keepdim=True).values
                for rows in chunk_slices
            ]
            batch_scores = torch.cat(per_sample_max, dim=0)
            collected_scores.extend(batch_scores.cpu().flatten().tolist())
    return torch.tensor(collected_scores)

### Score generation

In [None]:
def predict_by_model(
        valid_df: pd.DataFrame,
        batch_size: int,
        model_getter: t.Callable[[str], ModelConfig],
        max_len: int,
        num_workers: int,
        device: str) -> torch.Tensor:
    """Load a model through ``model_getter`` and score every row of ``valid_df``.

    Args:
        valid_df: Frame with the comments to score.
        batch_size: Number of samples per loader batch.
        model_getter: Factory receiving ``device`` and returning a ``ModelConfig``.
        max_len: Chunk and sequence length handed to ``PredictDataset``.
        num_workers: Number of ``DataLoader`` worker processes.
        device: Device for the model and the batches; memory is pinned when
            the name starts with ``'cuda'``.

    Returns:
        The score tensor produced by ``do_predict_iteration``, in row order
        (the loader does not shuffle).
    """
    config = model_getter(device)
    model_on_device = config.model.to(device)
    predict_dataset = PredictDataset(df=valid_df, tokenizer=config.tokenizer, max_len=max_len)
    use_pinned_memory = device.startswith('cuda')
    loader = DataLoader(
        dataset=predict_dataset,
        batch_size=batch_size,
        shuffle=False,
        collate_fn=predict_collate_fn,  # type: ignore
        num_workers=num_workers,
        pin_memory=use_pinned_memory)
    return do_predict_iteration(
        data_loader=loader,
        model=model_on_device,
        model_name=config.name,
        device=device)

In [None]:
# Load the comments to score, then shuffle all rows. sample() gets no
# random_state and the original index labels are kept (no reset_index).
comments_to_score_df = t.cast(pd.DataFrame, pd.read_csv('/home/jovyan/jigsaw-toxic/data/jigsaw-toxic-severity-rating/comments_to_score.csv'))
comments_to_score_df = t.cast(pd.DataFrame, comments_to_score_df.sample(frac=1.0))

In [None]:
# Score every comment with the CCC 2017 model. Scores come back in the
# positional row order of comments_to_score_df.
ccc2017_score_tensor = predict_by_model(
    valid_df=comments_to_score_df,
    batch_size=8,
    model_getter=load_ccc2017,
    num_workers=8,
    max_len=256,
    device='cuda')

In [None]:
# Distribution of the CCC 2017 scores.
plt.hist(ccc2017_score_tensor.tolist(), bins=50);

In [None]:
# Score every comment with the UBTC model.
ubtc_score_tensor = predict_by_model(
    valid_df=comments_to_score_df,
    batch_size=8,
    model_getter=load_ubtc,
    num_workers=8,
    max_len=256,
    device='cuda')

In [None]:
# Distribution of the UBTC scores.
plt.hist(ubtc_score_tensor.tolist(), bins=50);

In [None]:
# Score every comment with the Ruddit model.
ruddit_score_tensor = predict_by_model(
    valid_df=comments_to_score_df,
    batch_size=8,
    model_getter=load_ruddit,
    num_workers=8,
    max_len=256,
    device='cuda')

In [None]:
# Distribution of the Ruddit scores.
plt.hist(ruddit_score_tensor.tolist(), bins=50);

In [None]:
# Offenseval scoring is disabled; its scores are not used in the cells below.
# offenseval_score_tensor = predict_by_model(
#     valid_df=comments_to_score_df,
#     batch_size=8,
#     model_getter=load_offenseval,
#     num_workers=8,
#     max_len=256,
#     device='cuda')

In [None]:
# Convert the three score tensors to NumPy arrays for pair mining.
ccc2017_score_arr, ubtc_score_arr, ruddit_score_arr = ccc2017_score_tensor.numpy(), ubtc_score_tensor.numpy(), ruddit_score_tensor.numpy()

In [None]:
def split_to_k_folds(df: pd.DataFrame, k: int) -> pd.DataFrame:
    """Return a copy of ``df`` with an integer ``'fold'`` column in ``[0, k)``.

    Folds are assigned by row position, so the result does not depend on the
    frame's index labels (for example after shuffling without
    ``reset_index``). Rows are split into ``k`` contiguous, near-equal groups:
    when ``len(df)`` is not divisible by ``k`` the fold sizes differ by at
    most one, instead of all leftover rows landing in fold 0.

    Args:
        df: Frame to split; it is not modified.
        k: Number of folds; must be at least 1.

    Returns:
        A copy of ``df`` with the added ``'fold'`` column.

    Raises:
        ValueError: If ``k`` is smaller than 1.
    """
    if k < 1:
        raise ValueError(f'k must be a positive integer, got {k}.')
    df = t.cast(pd.DataFrame, df.copy())
    n_rows = len(df)
    # position * k // n_rows maps positions 0..n_rows-1 onto folds 0..k-1 in
    # contiguous, balanced runs; max() guards the empty-frame division.
    df['fold'] = np.arange(n_rows) * k // max(n_rows, 1)
    return df

In [None]:
# Add a 'fold' column with five folds to the shuffled comments.
comments_to_score_5fold_df = split_to_k_folds(comments_to_score_df, k=5)

In [None]:
# Visual check of how many comments each fold received.
plt.hist(comments_to_score_5fold_df['fold'], bins=5);

In [None]:
def mine_pairs_folded(df: pd.DataFrame, score_arr_list: t.List[t.Tuple[str, np.ndarray]], max_pairs_per_sample: int) -> pd.DataFrame:
    """Mine (less toxic, more toxic) text pairs inside each fold.

    A row B is a "more toxic" candidate for row A when both share the same
    fold and every score of B is strictly greater than the matching score of
    A. For each row, up to ``max_pairs_per_sample`` candidates are drawn at
    random.

    Args:
        df: Frame with ``'text'`` and ``'fold'`` columns; it is copied, not
            modified.
        score_arr_list: ``(column name, scores)`` pairs. Each array is
            assigned as a new column of the copy, so it must hold one score
            per row of ``df``.
        max_pairs_per_sample: Upper bound on the pairs emitted per row.

    Returns:
        A frame with one row per mined pair and the columns ``'less_toxic'``,
        ``'more_toxic'``, ``'fold'`` and ``'score_diff_mean'``.
    """
    df = t.cast(pd.DataFrame, df.copy())
    score_col_list = []
    # Attach every score array as a named column of the working copy.
    for score_col, score_arr in score_arr_list:
        df[score_col] = score_arr
        score_col_list.append(score_col)
    pair_row_list: t.List[t.Dict[str, t.Union[str, float]]] = []
    it = tqdm(df.iterrows(), total=len(df))
    for i, row in it:
        less_text = str(row['text'])
        fold = int(row['fold'])
        # AND together "score strictly greater" for all score columns, seeded
        # with a mask that excludes the current row by index label.
        more_mask = functools.reduce(operator.iand, [df[score_col] > row[score_col] for score_col in score_col_list], df.index != i)
        # Candidates must also come from the same fold as the current row.
        more_candidate_mask = (df['fold'] == fold) & more_mask
        more_candidate_df = df[more_candidate_mask]
        if len(more_candidate_df) == 0:
            continue
        
        # sample() gets no random_state, so the draw is not seeded here.
        for _, more_row in more_candidate_df.sample(n=min(len(more_candidate_df), max_pairs_per_sample)).iterrows():
            more_text = str(more_row['text'])
            # Mean score margin across all scorers; positive by construction of the mask.
            score_diff_list = [more_row[score_col] - row[score_col] for score_col in score_col_list]
            score_diff_mean = statistics.mean(score_diff_list)
            pair_row_list.append({
                'less_toxic': less_text,
                'more_toxic': more_text,
                'fold': fold,
                'score_diff_mean': score_diff_mean,
            })
        it.set_description(f'Pairs generated: {len(pair_row_list)}.')
    return pd.DataFrame(pair_row_list)

In [None]:
# Mine up to three pseudo-labelled pairs per comment. A pair is kept only
# when all three scorers rank the "more toxic" text strictly higher.
pair_df = mine_pairs_folded(
    comments_to_score_5fold_df,
    [
        ('ccc2017', ccc2017_score_arr),
        ('ubtc', ubtc_score_arr),
        ('ruddit', ruddit_score_arr),
    ],
    max_pairs_per_sample=3)

In [None]:
# Display the mined pairs.
pair_df

In [None]:
# Number of mined pairs per fold.
plt.hist(pair_df['fold'], bins=5);

In [None]:
# Distribution of the mean score margin between the two texts of a pair.
plt.hist(pair_df['score_diff_mean'], bins=50);

In [None]:
# Persist the pseudo-labelled pairs without the frame index.
pair_df.to_csv('/home/jovyan/jigsaw-toxic/data/datasets/jigsaw-2021-kfold/pseudo_labeled_5fold.csv', index=False)