### Imports

In [1]:
from __future__ import annotations

import functools
import itertools
import random
import statistics
import typing as t

import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
import torch
import torch.nn.functional as torch_f
import typing_extensions as t_ext
from textaugment import Wordnet
from torch.utils.data import Dataset, DataLoader
from tqdm.notebook import tqdm
from transformers.models.auto.configuration_auto import AutoConfig
from transformers.models.auto.modeling_auto import AutoModel
from transformers.models.auto.tokenization_auto import AutoTokenizer
from scipy.stats import rankdata

In [None]:
# Download the NLTK resources named below. Presumably they are required by the
# textaugment Wordnet augmenter imported above — TODO confirm.
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')

### Seed

In [None]:
def seed_everything(seed: int):
    """Seed the Python, NumPy and PyTorch random number generators with ``seed``."""
    for seed_fn in (random.seed, np.random.seed, torch.manual_seed):
        seed_fn(seed)

seed_everything(42)

### Datasets

In [None]:
class _TokenizedText(t_ext.TypedDict):
    """Typed dict holding the two tokenizer fields this notebook passes around as tensors."""
    # Token ids as a tensor.
    input_ids: torch.Tensor
    # Attention mask as a tensor, paired with ``input_ids``.
    attention_mask: torch.Tensor


def _preprocess_tokenizer_output(output: t.Dict[str, t.Any]) -> _TokenizedText:
    return {
        'input_ids': torch.tensor(output['input_ids']),
        'attention_mask': torch.tensor(output['attention_mask']),
    }


def _split_str_to_chunk_list(s: str, chunk_size: int) -> t.List[str]:
    chunk_list = []
    chunk = []
    for token in s.split(' '):
        chunk.append(token)
        if len(chunk) >= chunk_size:
            chunk_list.append(' '.join(chunk))
            chunk.clear()
    if chunk:
        chunk_list.append(' '.join(chunk))
    return chunk_list


def predict_collate_fn(
        sample_list: t.List[t.Tuple[str, _TokenizedText]]
        ) -> t.Tuple[t.List[str], _TokenizedText, t.List[slice]]:
    """Merge per-sample chunk tensors into one batch.

    Each sample's ``input_ids`` / ``attention_mask`` tensors are concatenated
    along dim 0. The returned slices record which rows of the concatenated
    tensors belong to which sample, in input order.

    Returns:
        A tuple ``(ids, tokenized, slices)``: the first element of every
        sample, the concatenated tensors, and one row slice per sample.
    """
    idx_list: t.List[str] = [sample[0] for sample in sample_list]
    ids_per_sample = [sample[1]['input_ids'] for sample in sample_list]
    mask_per_sample = [sample[1]['attention_mask'] for sample in sample_list]

    slice_list: t.List[slice] = []
    start = 0
    for ids in ids_per_sample:
        stop = start + ids.shape[0]
        slice_list.append(slice(start, stop))
        start = stop

    tokenized_collated: _TokenizedText = {
        'input_ids': torch.cat(ids_per_sample, dim=0),
        'attention_mask': torch.cat(mask_per_sample, dim=0),
    }
    return idx_list, tokenized_collated, slice_list


class PredictDataset(Dataset):
    """Dataset yielding ``(comment_id, tokenized chunks)`` for each row of a frame.

    Each row's ``text`` is optionally augmented, split into space-separated
    chunks of ``max_len`` pieces, and every chunk is tokenized separately; the
    per-chunk tensors are stacked along a new leading dimension.

    NOTE(review): chunks are sized in space-separated words, while the
    tokenizer truncates each chunk to ``max_len`` tokens. If a chunk produces
    more than ``max_len`` tokens, its tail is presumably dropped — confirm
    this is intended.
    """

    def __init__(
            self,
            df: pd.DataFrame,
            tokenizer: AutoTokenizer,
            max_len: int,
            augmentation_list: t.Optional[t.List[t.Callable[[str], str]]] = None) -> None:
        super().__init__()
        self._df = df
        self._tokenizer = tokenizer
        self._max_len = max_len
        self._augmentation_list = [] if augmentation_list is None else augmentation_list

    def __len__(self) -> int:
        """Number of rows in the wrapped frame."""
        return len(self._df)

    def _tokenize_chunk(self, chunk: str) -> _TokenizedText:
        """Tokenize one chunk, padded and truncated to ``max_len``, as tensors."""
        encoded = self._tokenizer(
            chunk,
            add_special_tokens=True,
            truncation=True,
            padding='max_length',
            max_length=self._max_len,
            return_attention_mask=True)  # type: ignore
        return _preprocess_tokenizer_output(encoded)

    def __getitem__(self, idx: int) -> t.Tuple[str, _TokenizedText]:
        """Return the stringified ``comment_id`` and stacked chunk tensors of row ``idx``."""
        record = self._df.iloc[idx]
        comment_id = str(record['comment_id'])
        text = str(record['text'])

        # Augmentations are applied in list order, each one seeing the previous result.
        for augment in self._augmentation_list:
            text = augment(text)

        tokenized_chunks = [
            self._tokenize_chunk(chunk)
            for chunk in _split_str_to_chunk_list(text, chunk_size=self._max_len)
        ]

        tokenized_text: _TokenizedText = {
            'input_ids': torch.stack([c['input_ids'] for c in tokenized_chunks], dim=0),
            'attention_mask': torch.stack([c['attention_mask'] for c in tokenized_chunks], dim=0),
        }
        return comment_id, tokenized_text


class NoChunksPredictDataset(Dataset):
    """Dataset yielding ``(comment_id, tokenized text)`` with one tokenizer call per row.

    The whole ``text`` is tokenized at once, padded and truncated to
    ``max_len``; no chunking or augmentation is applied.
    """

    def __init__(self, df: pd.DataFrame, tokenizer: AutoTokenizer, max_len: int) -> None:
        super().__init__()
        self._df = df
        self._tokenizer = tokenizer
        self._max_len = max_len

    def __len__(self) -> int:
        """Number of rows in the wrapped frame."""
        return len(self._df)

    def __getitem__(self, idx: int) -> t.Tuple[str, _TokenizedText]:
        """Return the stringified ``comment_id`` and the tokenized text of row ``idx``."""
        record = self._df.iloc[idx]
        comment_id = str(record['comment_id'])
        text = str(record['text'])

        encoded = self._tokenizer(
            text,
            add_special_tokens=True,
            truncation=True,
            padding='max_length',
            max_length=self._max_len,
            return_attention_mask=True)  # type: ignore

        return comment_id, _preprocess_tokenizer_output(encoded)

### Models

#### Base model

In [None]:
class Model(torch.nn.Module):
    """Common base class of the scoring networks defined in this notebook."""

    def predict_scores(self, input_ids: torch.Tensor, attention_mask: torch.Tensor) -> torch.Tensor:
        """Return a score tensor for the given batch; subclasses must override this.

        Raises:
            NotImplementedError: always, in this base class.
        """
        raise NotImplementedError


class ModelConfig(t.NamedTuple):
    """Bundle returned by the ``load_*`` functions: a model plus its label and tokenizer."""
    # Label for the model; used as the progress-bar description during prediction.
    name: str
    # The network whose ``predict_scores`` is called during inference.
    model: Model
    # Tokenizer used to encode text for ``model``.
    tokenizer: AutoTokenizer


def import_checkpoint(model: torch.nn.Module, checkpoint: str, device: str):
    """Load the state dict stored at ``checkpoint`` into ``model`` in place.

    Args:
        model: Module whose parameters are overwritten.
        checkpoint: Path of the file passed to ``torch.load``.
        device: Passed to ``torch.load`` as ``map_location``.
    """
    # NOTE(review): torch.load unpickles the file; only load checkpoints from
    # trusted sources.
    model.load_state_dict(torch.load(checkpoint, map_location=device))

#### CCC 2017

##### ccc_2017_m1

In [None]:
class _AttentionRegressor(torch.nn.Module):

    def __init__(self, in_features: int) -> None:
        super().__init__()
        self.attention = torch.nn.Linear(in_features=in_features, out_features=in_features, bias=False)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        weight = self.attention(x)
        return (x * torch_f.softmax(weight, dim=1)).sum(dim=1)


class _CCC2017M1Model(Model):
    """Encoder with a blind regressor, a multi-label classifier and an attention score head.

    Args:
        checkpoint: Name or path handed to ``AutoModel.from_pretrained``.
        output_logits: Input width of both heads (size of the encoder's pooled output).
        num_classes: Number of label logits produced by the classifier.
    """

    def __init__(self, checkpoint: str, output_logits: int, num_classes: int):
        # Zero-argument super() follows the MRO starting from this class; the
        # earlier super(Model, self) form explicitly skipped Model in the chain.
        super().__init__()
        self.encoder = AutoModel.from_pretrained(checkpoint, return_dict=False)
        self.blind_regressor = torch.nn.Sequential(
            torch.nn.Linear(output_logits, 128),
            torch.nn.ReLU(),
            torch.nn.Linear(128, 1),
            torch.nn.Sigmoid())
        self.classifier = torch.nn.Sequential(
            # torch.nn.LayerNorm(output_logits),
            torch.nn.Linear(output_logits, 128),
            torch.nn.ReLU(),
            torch.nn.Linear(128, num_classes))
        # The score head sees the class probabilities plus the blind regressor output.
        self.regressor = _AttentionRegressor(in_features=num_classes + 1)

    def forward_scores(self, blind_reg_output: torch.Tensor, label_preds: torch.Tensor) -> torch.Tensor:
        """Combine sigmoid-ed label logits and the blind regressor output into scores."""
        return self.regressor(torch.cat([torch.sigmoid(label_preds), blind_reg_output], dim=1))

    def forward(self, input_ids: torch.Tensor, attention_mask: torch.Tensor) -> t.Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
        """Return ``(blind_reg_output, label_preds, scores)`` for the batch.

        ``label_preds`` are the raw classifier outputs; the sigmoid is applied
        only inside ``forward_scores``.
        """
        # With return_dict=False the encoder returns a tuple; its second
        # element is used as the pooled representation.
        _, pooled_output = self.encoder(
            input_ids=input_ids,
            attention_mask=attention_mask)
        blind_reg_output = self.blind_regressor(pooled_output)
        label_preds = self.classifier(pooled_output)
        scores = self.forward_scores(blind_reg_output, label_preds)
        return blind_reg_output, label_preds, scores

    def predict_scores(self, input_ids: torch.Tensor, attention_mask: torch.Tensor) -> torch.Tensor:
        """Return only the final scores (third element of ``forward``)."""
        return self.forward(input_ids, attention_mask)[2]


def load_ccc2017_m1(device: str) -> ModelConfig:
    """Build the ccc_2017_m1 network, restore its checkpoint and pair it with its tokenizer.

    ``device`` is used as the ``map_location`` when the checkpoint is read.
    """
    backbone = 'roberta-base'
    network = _CCC2017M1Model(backbone, 768, 6)
    checkpoint_path = '/home/jovyan/jigsaw-toxic/models/ccc-2017-multilabel-v3-cls-att-blind-reg.pt'
    import_checkpoint(network, checkpoint_path, device=device)
    tokenizer = AutoTokenizer.from_pretrained(backbone)
    return ModelConfig(
        name='ccc-2017-multilabel-v3-cls-att-blind-reg',
        model=network,
        tokenizer=tokenizer)

##### ccc_2017_m2

In [None]:
class _M2WeightedAverageLinearRegressor(torch.nn.Linear):

    def __init__(self, in_features: int, device: t.Optional[str] = None, dtype: t.Optional[str] = None):
        super().__init__(in_features=in_features, out_features=1, bias=False, device=device, dtype=dtype)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        return torch_f.linear(x, torch_f.softmax(self.weight, dim=1), self.bias)


class _CCC2017M2Model(Model):
    """Encoder with a multi-label classifier whose sigmoid outputs are averaged into a score.

    Args:
        checkpoint: Name or path handed to ``AutoModel.from_pretrained``.
        output_logits: Input width of the classifier (size of the encoder's pooled output).
        num_classes: Number of label logits produced by the classifier.
    """

    def __init__(self, checkpoint: str, output_logits: int, num_classes: int):
        # Zero-argument super() follows the MRO starting from this class; the
        # earlier super(Model, self) form explicitly skipped Model in the chain.
        super().__init__()
        self.encoder = AutoModel.from_pretrained(checkpoint, return_dict=False)
        self.classifier = torch.nn.Sequential(
            # torch.nn.LayerNorm(output_logits),
            torch.nn.Linear(output_logits, 128),
            torch.nn.ReLU(),
            torch.nn.Linear(128, num_classes))
        self.regressor = _M2WeightedAverageLinearRegressor(in_features=num_classes)

    def forward_scores(self, label_preds: torch.Tensor) -> torch.Tensor:
        """Reduce per-label values to one score with the weighted-average regressor."""
        return self.regressor(label_preds)

    def forward(self, input_ids: torch.Tensor, attention_mask: torch.Tensor) -> t.Tuple[torch.Tensor, torch.Tensor]:
        """Return ``(label_preds, scores)``; ``label_preds`` are raw classifier outputs."""
        # With return_dict=False the encoder returns a tuple; its second
        # element is used as the pooled representation.
        _, pooled_output = self.encoder(
            input_ids=input_ids,
            attention_mask=attention_mask)
        label_preds = self.classifier(pooled_output)
        # Scores are computed from the sigmoid of the logits, not the logits themselves.
        scores = self.forward_scores(torch.sigmoid(label_preds))
        return label_preds, scores

    def predict_scores(self, input_ids: torch.Tensor, attention_mask: torch.Tensor) -> torch.Tensor:
        """Return only the scores (second element of ``forward``)."""
        return self.forward(input_ids, attention_mask)[1]


def load_ccc2017_m2(device: str) -> ModelConfig:
    """Build the ccc_2017_m2 network, restore its checkpoint and pair it with its tokenizer.

    ``device`` is used as the ``map_location`` when the checkpoint is read.
    """
    backbone = 'roberta-base'
    network = _CCC2017M2Model(checkpoint=backbone, output_logits=768, num_classes=6)
    checkpoint_path = '/home/jovyan/jigsaw-toxic/models/ccc-2017-multilabel-harder-cls-loss_0p5-v2.pt'
    import_checkpoint(network, checkpoint_path, device=device)
    tokenizer = AutoTokenizer.from_pretrained(backbone)
    return ModelConfig(
        name='ccc-2017-multilabel-harder-cls-loss_0p5-v2',
        model=network,
        tokenizer=tokenizer)

##### ccc_2017_m3

In [None]:
class _WeightedAverageLinearRegressor(torch.nn.Linear):

    def __init__(self, in_features: int, device: t.Optional[str] = None, dtype: t.Optional[str] = None):
        super().__init__(in_features=in_features, out_features=1, bias=False, device=device, dtype=dtype)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        return torch_f.linear(x, torch_f.softmax(self.weight, dim=1), self.bias)


class _CCC2017M3Model(Model):
    """Network for ccc-2017-multilabel-harder-cls-loss_0p5-v2-valfreq_dynamic_v1.

    An encoder feeds a two-layer classifier; the sigmoid of the classifier
    outputs is reduced to one score by a softmax-weighted average.
    """

    def __init__(self, checkpoint: str, output_logits: int, num_classes: int):
        super().__init__()
        self.encoder = AutoModel.from_pretrained(checkpoint, return_dict=False)
        hidden_width = 128
        self.classifier = torch.nn.Sequential(
            torch.nn.Linear(output_logits, hidden_width),
            torch.nn.ReLU(),
            torch.nn.Linear(hidden_width, num_classes))
        self.regressor = _WeightedAverageLinearRegressor(in_features=num_classes)

    def forward_scores(self, label_preds: torch.Tensor) -> torch.Tensor:
        """Reduce per-label values to one score with the weighted-average regressor."""
        return self.regressor(label_preds)

    def forward(self, input_ids: torch.Tensor, attention_mask: torch.Tensor) -> t.Tuple[torch.Tensor, torch.Tensor]:
        """Return ``(label_preds, scores)``; ``label_preds`` are raw classifier outputs."""
        # The second element of the encoder's output tuple is used as the
        # pooled representation.
        _sequence_output, pooled = self.encoder(
            input_ids=input_ids,
            attention_mask=attention_mask)
        label_preds = self.classifier(pooled)
        label_probs = torch.sigmoid(label_preds)
        return label_preds, self.forward_scores(label_probs)

    def predict_scores(self, input_ids: torch.Tensor, attention_mask: torch.Tensor) -> torch.Tensor:
        """Return only the scores (second element of ``forward``)."""
        _label_preds, scores = self.forward(input_ids, attention_mask)
        return scores


def load_ccc2017_m3(device: str) -> ModelConfig:
    """Build the ccc_2017_m3 network, restore its checkpoint and pair it with its tokenizer.

    ``device`` is used as the ``map_location`` when the checkpoint is read.
    """
    backbone = 'roberta-base'
    network = _CCC2017M3Model(checkpoint=backbone, output_logits=768, num_classes=6)
    checkpoint_path = '/home/jovyan/jigsaw-toxic/models/ccc-2017-multilabel-harder-cls-loss_0p5-v2-valfreq_dynamic_v1.pt'
    import_checkpoint(network, checkpoint_path, device=device)
    tokenizer = AutoTokenizer.from_pretrained(backbone)
    return ModelConfig(
        name='ccc-2017-multilabel-harder-cls-loss_0p5-v2-valfreq_dynamic_v1',
        model=network,
        tokenizer=tokenizer)

##### ccc_2017_m4

In [None]:
class _CCC2017M4WeightedAverageLinearRegressor(torch.nn.Linear):

    def __init__(self, in_features: int, device: t.Optional[str] = None, dtype: t.Optional[str] = None):
        super().__init__(in_features=in_features, out_features=1, bias=False, device=device, dtype=dtype)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        return torch_f.linear(x, torch_f.softmax(self.weight, dim=1), self.bias)


class _CCC2017M4Model(Model):
    """Encoder with a multi-label classifier whose sigmoid outputs are averaged into a score.

    Args:
        checkpoint: Name or path handed to ``AutoModel.from_pretrained``.
        output_logits: Input width of the classifier (size of the encoder's pooled output).
        num_classes: Number of label logits produced by the classifier.
    """

    def __init__(self, checkpoint: str, output_logits: int, num_classes: int):
        # Zero-argument super() follows the MRO starting from this class; the
        # earlier super(Model, self) form explicitly skipped Model in the chain.
        super().__init__()
        self.encoder = AutoModel.from_pretrained(checkpoint, return_dict=False)
        self.classifier = torch.nn.Sequential(
            # torch.nn.LayerNorm(output_logits),
            torch.nn.Linear(output_logits, 128),
            torch.nn.ReLU(),
            torch.nn.Linear(128, num_classes))
        self.regressor = _CCC2017M4WeightedAverageLinearRegressor(in_features=num_classes)

    def forward_scores(self, label_preds: torch.Tensor) -> torch.Tensor:
        """Reduce per-label values to one score with the weighted-average regressor."""
        return self.regressor(label_preds)

    def forward(self, input_ids: torch.Tensor, attention_mask: torch.Tensor) -> t.Tuple[torch.Tensor, torch.Tensor]:
        """Return ``(label_preds, scores)``; ``label_preds`` are raw classifier outputs."""
        # With return_dict=False the encoder returns a tuple; its second
        # element is used as the pooled representation.
        _, pooled_output = self.encoder(
            input_ids=input_ids,
            attention_mask=attention_mask)
        label_preds = self.classifier(pooled_output)
        # Scores are computed from the sigmoid of the logits, not the logits themselves.
        scores = self.forward_scores(torch.sigmoid(label_preds))
        return label_preds, scores

    def predict_scores(self, input_ids: torch.Tensor, attention_mask: torch.Tensor) -> torch.Tensor:
        """Return only the scores (second element of ``forward``)."""
        return self.forward(input_ids, attention_mask)[1]


def load_ccc2017_m4(device: str) -> ModelConfig:
    """Build the ccc_2017_m4 network, restore its checkpoint and pair it with its tokenizer.

    ``device`` is used as the ``map_location`` when the checkpoint is read.
    """
    backbone = 'roberta-base'
    network = _CCC2017M4Model(checkpoint=backbone, output_logits=768, num_classes=6)
    checkpoint_path = '/home/jovyan/jigsaw-toxic/models/ccc-2017-multilabel-harder-cls-loss_0p5_pow4-margin_0p3-seed_42.pt'
    import_checkpoint(network, checkpoint_path, device=device)
    tokenizer = AutoTokenizer.from_pretrained(backbone)
    return ModelConfig(
        name='ccc-2017-multilabel-harder-cls-loss_0p5_pow4-margin_0p3-seed_42',
        model=network,
        tokenizer=tokenizer)

#### UBTC

##### ubtc_m1

In [None]:
class _UBTCM1Model(Model):
    """Encoder with a sigmoid feature head followed by a sigmoid score head.

    Args:
        checkpoint: Name or path handed to ``AutoModel.from_pretrained``.
        output_logits: Input width of the feature head (size of the encoder's pooled output).
        num_classes: Number of features produced by the feature head.
    """

    def __init__(self, checkpoint: str, output_logits: int, num_classes: int):
        # Zero-argument super() follows the MRO starting from this class; the
        # earlier super(Model, self) form explicitly skipped Model in the chain.
        super().__init__()
        self.encoder = AutoModel.from_pretrained(checkpoint, return_dict=False)
        self.feature_regressor = torch.nn.Sequential(
            torch.nn.Linear(output_logits, 128),
            torch.nn.ReLU(),
            torch.nn.Linear(128, num_classes),
            torch.nn.Sigmoid())
        self.score_regressor = torch.nn.Sequential(
            torch.nn.Linear(num_classes, 1),
            torch.nn.Sigmoid())

    def forward(self, input_ids: torch.Tensor, attention_mask: torch.Tensor) -> t.Tuple[torch.Tensor, torch.Tensor]:
        """Return ``(features, scores)``, where scores are computed from the features."""
        # With return_dict=False the encoder returns a tuple; its second
        # element is used as the pooled representation.
        _, pooled_output = self.encoder(
            input_ids=input_ids,
            attention_mask=attention_mask)
        features = self.feature_regressor(pooled_output)
        return features, self.score_regressor(features)

    def predict_scores(self, input_ids: torch.Tensor, attention_mask: torch.Tensor) -> torch.Tensor:
        """Return only the scores (second element of ``forward``)."""
        return self.forward(input_ids, attention_mask)[1]


def load_ubtc_m1(device: str) -> ModelConfig:
    """Build the ubtc_m1 network, restore its checkpoint and pair it with its tokenizer.

    ``device`` is used as the ``map_location`` when the checkpoint is read.
    """
    backbone = 'unitary/unbiased-toxic-roberta'
    network = _UBTCM1Model(checkpoint=backbone, output_logits=768, num_classes=7)
    checkpoint_path = '/home/jovyan/jigsaw-toxic/models/ubtc-multireg-w50-cos_warmup-opt-2ep-ut_roberta-valfreq_dynamic_v1-seed_42.pt'
    import_checkpoint(network, checkpoint_path, device=device)
    tokenizer = AutoTokenizer.from_pretrained(backbone)
    # NOTE(review): unlike the other loaders, the name below is not the
    # checkpoint file stem (it lacks "-valfreq_dynamic_v1") — confirm intended.
    return ModelConfig(
        name='ubtc-multireg-w50-cos_warmup-opt-2ep-ut_roberta-seed_42',
        model=network,
        tokenizer=tokenizer)

##### ubtc_m2

In [None]:
class _UBTCM2Model(Model):
    """Encoder whose last-layer hidden states are attention-pooled and regressed to a score.

    Args:
        checkpoint: Name or path used for both ``AutoConfig`` and ``AutoModel``.
        output_logits: Hidden size of the encoder (input width of both heads).
        dropout: Accepted for interface compatibility; not used by this class.
    """

    def __init__(self, checkpoint: str, output_logits: int, dropout: float):
        super().__init__()

        # Request per-layer hidden states and override dropout / layer-norm epsilon.
        encoder_config = AutoConfig.from_pretrained(checkpoint)
        encoder_config.update({
            "output_hidden_states": True,
            "hidden_dropout_prob": 0.0,
            "layer_norm_eps": 1e-7,
        })
        self.roberta = AutoModel.from_pretrained(checkpoint, config=encoder_config)

        # Produces one weight per position; the softmax runs over dim 1.
        self.attention = torch.nn.Sequential(
            torch.nn.Linear(output_logits, 512),
            torch.nn.Tanh(),
            torch.nn.Linear(512, 1),
            torch.nn.Softmax(dim=1),
        )

        self.regressor = torch.nn.Sequential(
            torch.nn.Linear(output_logits, 1),
            torch.nn.Tanh(),
        )

    def forward(self, input_ids: torch.Tensor, attention_mask: torch.Tensor) -> torch.Tensor:
        """Return the regressor output for the attention-pooled hidden states.

        ``attention_mask`` is passed to the encoder only; the pooling weights
        are computed over every position of the hidden states.
        """
        encoder_output = self.roberta(
            input_ids=input_ids,
            attention_mask=attention_mask)

        # Hidden states of the final entry in ``hidden_states``.
        # Assumes shape (batch, seq_len, hidden) — TODO confirm.
        final_hidden = encoder_output.hidden_states[-1]

        # One weight per position, normalised over dim 1.
        position_weights = self.attention(final_hidden)

        # Weighted sum over dim 1 collapses the positions into one context vector.
        context_vector = (position_weights * final_hidden).sum(dim=1)

        return self.regressor(context_vector)

    def predict_scores(self, input_ids: torch.Tensor, attention_mask: torch.Tensor) -> torch.Tensor:
        """Return the output of ``forward`` unchanged."""
        return self.forward(input_ids, attention_mask)


def load_ubtc_m2(device: str) -> ModelConfig:
    """Build the ubtc_m2 network, restore its checkpoint and pair it with its tokenizer.

    ``device`` is used as the ``map_location`` when the checkpoint is read.
    """
    backbone = 'unitary/unbiased-toxic-roberta'
    network = _UBTCM2Model(checkpoint=backbone, output_logits=768, dropout=0.6)
    checkpoint_path = '/home/jovyan/jigsaw-toxic/models/ubtc-pure_reg-w50-cos_warmup_opt-2ep-att-ut_roberta-seed_42.pt'
    import_checkpoint(network, checkpoint_path, device=device)
    tokenizer = AutoTokenizer.from_pretrained(backbone)
    return ModelConfig(
        name='ubtc-pure_reg-w50-cos_warmup_opt-2ep-att-ut_roberta-seed_42',
        model=network,
        tokenizer=tokenizer)

#### Ruddit

##### ruddit_m1

In [None]:
class _RudditM1Model(Model):
    """Encoder followed by a single linear layer and a sigmoid.

    Args:
        checkpoint: Name or path handed to ``AutoModel.from_pretrained``.
        output_logits: Input width of the regressor (size of the encoder's pooled output).
        dropout: Accepted for interface compatibility; not used by this class.
    """

    def __init__(self, checkpoint: str, output_logits: int, dropout: float):
        super().__init__()
        self.bert = AutoModel.from_pretrained(checkpoint, return_dict=False)
        self.regressor = torch.nn.Sequential(
            torch.nn.Linear(output_logits, 1),
            torch.nn.Sigmoid(),
        )

    def forward(self, input_ids: torch.Tensor, attention_mask: torch.Tensor) -> torch.Tensor:
        """Return the regressor output for the encoder's pooled representation."""
        # The second element of the encoder's output tuple is used as the
        # pooled representation.
        _sequence_output, pooled = self.bert(
            input_ids=input_ids,
            attention_mask=attention_mask)
        return self.regressor(pooled)

    def predict_scores(self, input_ids: torch.Tensor, attention_mask: torch.Tensor) -> torch.Tensor:
        """Return the output of ``forward`` unchanged."""
        return self.forward(input_ids, attention_mask)


def load_ruddit_m1(device: str) -> ModelConfig:
    """Build the ruddit_m1 network, restore its checkpoint and pair it with its tokenizer.

    ``device`` is used as the ``map_location`` when the checkpoint is read.
    """
    backbone = 'roberta-base'
    network = _RudditM1Model(checkpoint=backbone, output_logits=768, dropout=0.6)
    checkpoint_path = '/home/jovyan/jigsaw-toxic/models/ruddit-v3-mse-2ep-pure_reg.pt'
    import_checkpoint(network, checkpoint_path, device=device)
    tokenizer = AutoTokenizer.from_pretrained(backbone)
    return ModelConfig(
        name='ruddit-v3-mse-2ep-pure_reg',
        model=network,
        tokenizer=tokenizer)

##### ruddit_m2

In [None]:
class _RudditM2Model(Model):
    """Encoder followed by a two-layer (tanh, then sigmoid) regressor.

    Args:
        checkpoint: Name or path handed to ``AutoModel.from_pretrained``.
        output_logits: Input width of the regressor (size of the encoder's pooled output).
        dropout: Accepted for interface compatibility; not used by this class.
    """

    def __init__(self, checkpoint: str, output_logits: int, dropout: float):
        super().__init__()
        self.bert = AutoModel.from_pretrained(checkpoint, return_dict=False)
        hidden_width = 256
        self.regressor = torch.nn.Sequential(
            torch.nn.Linear(output_logits, hidden_width),
            torch.nn.Tanh(),
            torch.nn.Linear(hidden_width, 1),
            torch.nn.Sigmoid(),
        )

    def forward(self, input_ids: torch.Tensor, attention_mask: torch.Tensor) -> torch.Tensor:
        """Return the regressor output for the encoder's pooled representation."""
        # The second element of the encoder's output tuple is used as the
        # pooled representation.
        _sequence_output, pooled = self.bert(
            input_ids=input_ids,
            attention_mask=attention_mask)
        return self.regressor(pooled)

    def predict_scores(self, input_ids: torch.Tensor, attention_mask: torch.Tensor) -> torch.Tensor:
        """Return the output of ``forward`` unchanged."""
        return self.forward(input_ids, attention_mask)


def load_ruddit_m2(device: str) -> ModelConfig:
    """Build the ruddit_m2 network, restore its checkpoint and pair it with its tokenizer.

    ``device`` is used as the ``map_location`` when the checkpoint is read.
    """
    backbone = 'unitary/unbiased-toxic-roberta'
    network = _RudditM2Model(checkpoint=backbone, output_logits=768, dropout=0.6)
    checkpoint_path = '/home/jovyan/jigsaw-toxic/models/ruddit-v3-mse-2ep-pure_reg-unbiased_toxic_roberta-2layer_reg.pt'
    import_checkpoint(network, checkpoint_path, device=device)
    tokenizer = AutoTokenizer.from_pretrained(backbone)
    return ModelConfig(
        name='ruddit-v3-mse-2ep-pure_reg-unbiased_toxic_roberta-2layer_reg',
        model=network,
        tokenizer=tokenizer)

#### Wiki Talk Labels

##### wiki_talk_labels_m1

In [None]:
class _WikiTalkLabelsM1Model(Model):
    """Encoder followed by a two-layer (tanh, then sigmoid) regressor.

    Args:
        checkpoint: Name or path handed to ``AutoModel.from_pretrained``.
        output_logits: Input width of the regressor (size of the encoder's pooled output).
        dropout: Accepted for interface compatibility; not used by this class.
    """

    def __init__(self, checkpoint: str, output_logits: int, dropout: float):
        # Zero-argument super() follows the MRO starting from this class; the
        # earlier super(Model, self) form explicitly skipped Model in the chain.
        super().__init__()
        self.bert = AutoModel.from_pretrained(checkpoint, return_dict=False)
        self.regressor = torch.nn.Sequential(
            torch.nn.Linear(output_logits, 256),
            torch.nn.Tanh(),
            torch.nn.Linear(256, 1),
            torch.nn.Sigmoid(),
        )

    def forward(self, input_ids: torch.Tensor, attention_mask: torch.Tensor) -> torch.Tensor:
        """Return the regressor output for the encoder's pooled representation."""
        # With return_dict=False the encoder returns a tuple; its second
        # element is used as the pooled representation.
        _, pooled_output = self.bert(
            input_ids=input_ids,
            attention_mask=attention_mask)
        return self.regressor(pooled_output)

    def predict_scores(self, input_ids: torch.Tensor, attention_mask: torch.Tensor) -> torch.Tensor:
        """Return the output of ``forward`` unchanged."""
        return self.forward(input_ids, attention_mask)


def load_wiki_talk_labels_m1(device: str) -> ModelConfig:
    """Build the wiki_talk_labels_m1 network, restore its checkpoint and pair it with its tokenizer.

    ``device`` is used as the ``map_location`` when the checkpoint is read.
    """
    backbone = 'roberta-base'
    network = _WikiTalkLabelsM1Model(checkpoint=backbone, output_logits=768, dropout=0.6)
    checkpoint_path = '/home/jovyan/jigsaw-toxic/models/wiki-talk-labels-v1-1ep.pt'
    import_checkpoint(network, checkpoint_path, device=device)
    tokenizer = AutoTokenizer.from_pretrained(backbone)
    return ModelConfig(
        name='wiki-talk-labels-v1-1ep',
        model=network,
        tokenizer=tokenizer)

##### wiki_talk_labels_m2

In [None]:
class _WikiTalkLabelsM2Model(Model):
    """Encoder whose last-layer hidden states are attention-pooled and regressed to a score.

    Args:
        checkpoint: Name or path used for both ``AutoConfig`` and ``AutoModel``.
        output_logits: Hidden size of the encoder (input width of both heads).
        dropout: Accepted for interface compatibility; not used by this class.
    """

    def __init__(self, checkpoint: str, output_logits: int, dropout: float):
        super().__init__()

        # Request per-layer hidden states and override dropout / layer-norm epsilon.
        encoder_config = AutoConfig.from_pretrained(checkpoint)
        encoder_config.update({
            "output_hidden_states": True,
            "hidden_dropout_prob": 0.0,
            "layer_norm_eps": 1e-7,
        })
        self.roberta = AutoModel.from_pretrained(checkpoint, config=encoder_config)

        # Produces one weight per position; the softmax runs over dim 1.
        self.attention = torch.nn.Sequential(
            torch.nn.Linear(output_logits, 512),
            torch.nn.Tanh(),
            torch.nn.Linear(512, 1),
            torch.nn.Softmax(dim=1),
        )

        self.regressor = torch.nn.Sequential(
            torch.nn.Linear(output_logits, 1),
            torch.nn.Tanh(),
        )

    def forward(self, input_ids: torch.Tensor, attention_mask: torch.Tensor) -> torch.Tensor:
        """Return the regressor output for the attention-pooled hidden states.

        ``attention_mask`` is passed to the encoder only; the pooling weights
        are computed over every position of the hidden states.
        """
        encoder_output = self.roberta(
            input_ids=input_ids,
            attention_mask=attention_mask)

        # Hidden states of the final entry in ``hidden_states``.
        # Assumes shape (batch, seq_len, hidden) — TODO confirm.
        final_hidden = encoder_output.hidden_states[-1]

        # One weight per position, normalised over dim 1.
        position_weights = self.attention(final_hidden)

        # Weighted sum over dim 1 collapses the positions into one context vector.
        context_vector = (position_weights * final_hidden).sum(dim=1)

        return self.regressor(context_vector)

    def predict_scores(self, input_ids: torch.Tensor, attention_mask: torch.Tensor) -> torch.Tensor:
        """Return the output of ``forward`` unchanged."""
        return self.forward(input_ids, attention_mask)


def load_wiki_talk_labels_m2(device: str) -> ModelConfig:
    """Build the wiki_talk_labels_m2 network, restore its checkpoint and pair it with its tokenizer.

    ``device`` is used as the ``map_location`` when the checkpoint is read.
    """
    backbone = 'roberta-base'
    network = _WikiTalkLabelsM2Model(checkpoint=backbone, output_logits=768, dropout=0.6)
    checkpoint_path = '/home/jovyan/jigsaw-toxic/models/wiki-talk-labels-v1-1ep-att.pt'
    import_checkpoint(network, checkpoint_path, device=device)
    tokenizer = AutoTokenizer.from_pretrained(backbone)
    return ModelConfig(
        name='wiki-talk-labels-v1-1ep-att',
        model=network,
        tokenizer=tokenizer)

#### Offenseval

##### offenseval_m1

In [None]:
class _OffensevalM1Model(Model):
    """Encoder followed by a single linear layer and a sigmoid.

    Args:
        checkpoint: Name or path handed to ``AutoModel.from_pretrained``.
        output_logits: Input width of the regressor (size of the encoder's pooled output).
        dropout: Accepted for interface compatibility; not used by this class.
    """

    def __init__(self, checkpoint: str, output_logits: int, dropout: float):
        super().__init__()
        self.bert = AutoModel.from_pretrained(checkpoint, return_dict=False)
        self.regressor = torch.nn.Sequential(
            torch.nn.Linear(output_logits, 1),
            torch.nn.Sigmoid(),
        )

    def forward(self, input_ids: torch.Tensor, attention_mask: torch.Tensor) -> torch.Tensor:
        """Return the regressor output for the encoder's pooled representation."""
        # The second element of the encoder's output tuple is used as the
        # pooled representation.
        _sequence_output, pooled = self.bert(
            input_ids=input_ids,
            attention_mask=attention_mask)
        return self.regressor(pooled)

    def predict_scores(self, input_ids: torch.Tensor, attention_mask: torch.Tensor) -> torch.Tensor:
        """Return the output of ``forward`` unchanged."""
        return self.forward(input_ids, attention_mask)


def load_offenseval_m1(device: str) -> ModelConfig:
    """Build the offenseval_m1 network, restore its checkpoint and pair it with its tokenizer.

    ``device`` is used as the ``map_location`` when the checkpoint is read.
    """
    backbone = 'roberta-base'
    network = _OffensevalM1Model(checkpoint=backbone, output_logits=768, dropout=0.6)
    checkpoint_path = '/home/jovyan/jigsaw-toxic/models/offenseval-2020-v2-pure_reg-mse-1_ep-64_valcycles-lr_2e5.pt'
    import_checkpoint(network, checkpoint_path, device=device)
    tokenizer = AutoTokenizer.from_pretrained(backbone)
    return ModelConfig(
        name='offenseval-2020-v2-pure_reg-mse-1_ep-64_valcycles-lr_2e5',
        model=network,
        tokenizer=tokenizer)

##### offenseval_m2

In [None]:
class _OffensevalM2Model(Model):
    """Encoder followed by a single linear layer and a sigmoid.

    Args:
        checkpoint: Name or path handed to ``AutoModel.from_pretrained``.
        output_logits: Input width of the regressor (size of the encoder's pooled output).
        dropout: Accepted for interface compatibility; not used by this class.
    """

    def __init__(self, checkpoint: str, output_logits: int, dropout: float):
        # Zero-argument super() follows the MRO starting from this class; the
        # earlier super(Model, self) form explicitly skipped Model in the chain.
        super().__init__()
        self.bert = AutoModel.from_pretrained(checkpoint, return_dict=False)
        self.regressor = torch.nn.Sequential(
            # torch.nn.LayerNorm(output_logits),
            torch.nn.Linear(output_logits, 1),
            torch.nn.Sigmoid(),
        )

    def forward(self, input_ids: torch.Tensor, attention_mask: torch.Tensor) -> torch.Tensor:
        """Return the regressor output for the encoder's pooled representation."""
        # With return_dict=False the encoder returns a tuple; its second
        # element is used as the pooled representation.
        _, pooled_output = self.bert(
            input_ids=input_ids,
            attention_mask=attention_mask)
        return self.regressor(pooled_output)

    def predict_scores(self, input_ids: torch.Tensor, attention_mask: torch.Tensor) -> torch.Tensor:
        """Return the output of ``forward`` unchanged."""
        return self.forward(input_ids, attention_mask)


def load_offenseval_m2(device: str) -> ModelConfig:
    """Build the offenseval_m2 network, restore its checkpoint and pair it with its tokenizer.

    ``device`` is used as the ``map_location`` when the checkpoint is read.
    """
    backbone = 'unitary/unbiased-toxic-roberta'
    network = _OffensevalM2Model(checkpoint=backbone, output_logits=768, dropout=0.6)
    checkpoint_path = '/home/jovyan/jigsaw-toxic/models/offenseval-2020-v2-pure_reg-mse-1_ep-64_valcycles-lr_2e5-backbone_utr.pt'
    import_checkpoint(network, checkpoint_path, device=device)
    tokenizer = AutoTokenizer.from_pretrained(backbone)
    return ModelConfig(
        name='offenseval-2020-v2-pure_reg-mse-1_ep-64_valcycles-lr_2e5-backbone_utr',
        model=network,
        tokenizer=tokenizer)

#### C3

##### c3_m1

In [None]:
class _C3M1Model(Model):
    """Encoder followed by a two-layer (tanh, then sigmoid) regressor.

    Args:
        checkpoint: Name or path handed to ``AutoModel.from_pretrained``.
        output_logits: Input width of the regressor (size of the encoder's pooled output).
        dropout: Accepted for interface compatibility; not used by this class.
    """

    def __init__(self, checkpoint: str, output_logits: int, dropout: float):
        # Zero-argument super() follows the MRO starting from this class; the
        # earlier super(Model, self) form explicitly skipped Model in the chain.
        super().__init__()
        self.bert = AutoModel.from_pretrained(checkpoint, return_dict=False)
        self.regressor = torch.nn.Sequential(
            # torch.nn.LayerNorm(output_logits),
            torch.nn.Linear(output_logits, 256),
            torch.nn.Tanh(),
            torch.nn.Linear(256, 1),
            torch.nn.Sigmoid(),
        )

    def forward(self, input_ids: torch.Tensor, attention_mask: torch.Tensor) -> torch.Tensor:
        """Return the regressor output for the encoder's pooled representation."""
        # With return_dict=False the encoder returns a tuple; its second
        # element is used as the pooled representation.
        _, pooled_output = self.bert(
            input_ids=input_ids,
            attention_mask=attention_mask)
        return self.regressor(pooled_output)

    def predict_scores(self, input_ids: torch.Tensor, attention_mask: torch.Tensor) -> torch.Tensor:
        """Return the output of ``forward`` unchanged."""
        return self.forward(input_ids, attention_mask)


def load_c3_m1(device: str) -> ModelConfig:
    """Build the c3_m1 network, restore its checkpoint and pair it with its tokenizer.

    ``device`` is used as the ``map_location`` when the checkpoint is read.
    """
    backbone = 'roberta-base'
    network = _C3M1Model(checkpoint=backbone, output_logits=768, dropout=0.6)
    checkpoint_path = '/home/jovyan/jigsaw-toxic/models/c3-v1-2ep.pt'
    import_checkpoint(network, checkpoint_path, device=device)
    tokenizer = AutoTokenizer.from_pretrained(backbone)
    return ModelConfig(
        name='c3-v1-2ep',
        model=network,
        tokenizer=tokenizer)

##### c3_m2

In [None]:
class _C3M2Model(Model):
    """Encoder followed by a two-layer (tanh, then sigmoid) regressor.

    Args:
        checkpoint: Name or path handed to ``AutoModel.from_pretrained``.
        output_logits: Input width of the regressor (size of the encoder's pooled output).
        dropout: Accepted for interface compatibility; not used by this class.
    """

    def __init__(self, checkpoint: str, output_logits: int, dropout: float):
        # Zero-argument super() follows the MRO starting from this class; the
        # earlier super(Model, self) form explicitly skipped Model in the chain.
        super().__init__()
        self.bert = AutoModel.from_pretrained(checkpoint, return_dict=False)
        self.regressor = torch.nn.Sequential(
            # torch.nn.LayerNorm(output_logits),
            torch.nn.Linear(output_logits, 256),
            torch.nn.Tanh(),
            torch.nn.Linear(256, 1),
            torch.nn.Sigmoid(),
        )

    def forward(self, input_ids: torch.Tensor, attention_mask: torch.Tensor) -> torch.Tensor:
        """Return the regressor output for the encoder's pooled representation."""
        # With return_dict=False the encoder returns a tuple; its second
        # element is used as the pooled representation.
        _, pooled_output = self.bert(
            input_ids=input_ids,
            attention_mask=attention_mask)
        return self.regressor(pooled_output)

    def predict_scores(self, input_ids: torch.Tensor, attention_mask: torch.Tensor) -> torch.Tensor:
        """Return the output of ``forward`` unchanged."""
        return self.forward(input_ids, attention_mask)


def load_c3_m2(device: str) -> ModelConfig:
    """Build the c3_m2 network, restore its checkpoint and pair it with its tokenizer.

    ``device`` is used as the ``map_location`` when the checkpoint is read.
    """
    backbone = 'roberta-base'
    network = _C3M2Model(checkpoint=backbone, output_logits=768, dropout=0.6)
    checkpoint_path = '/home/jovyan/jigsaw-toxic/models/c3-v1-2ep-valfreq_10.pt'
    import_checkpoint(network, checkpoint_path, device=device)
    tokenizer = AutoTokenizer.from_pretrained(backbone)
    return ModelConfig(
        name='c3-v1-2ep-valfreq_10',
        model=network,
        tokenizer=tokenizer)

### Inference

In [None]:
def do_predict_iteration(
        data_loader: DataLoader,
        model: Model,
        model_name: str,
        device: str) -> np.ndarray:
    """Run one inference pass and return one score per sample.

    The loader is expected to yield ``(ids, tokenized, slices)`` batches as
    produced by ``predict_collate_fn``. All chunks of a batch are scored
    together, then each sample's score is the maximum over its own chunks.

    Args:
        data_loader: Source of collated batches.
        model: Network providing ``predict_scores``; switched to eval mode here.
        model_name: Label shown on the progress bar.
        device: Device the input tensors are moved to.

    Returns:
        1-D array of scores in loader order.
    """
    model.eval()
    score_list = []
    with torch.no_grad():
        for _, tokenized_text, slice_list in tqdm(data_loader, desc=model_name):
            input_ids = tokenized_text['input_ids'].to(device)
            attention_mask = tokenized_text['attention_mask'].to(device)
            chunk_scores = model.predict_scores(input_ids, attention_mask)
            # Max over dim 0 of each sample's rows reduces its chunks to one score.
            per_sample_max = [
                torch.max(chunk_scores[rows], dim=0, keepdim=True)[0]
                for rows in slice_list
            ]
            batch_scores = torch.cat(per_sample_max, dim=0)
            score_list.extend(batch_scores.cpu().flatten().tolist())
    return torch.tensor(score_list).numpy()

In [None]:
def predict_by_model(
        valid_df: pd.DataFrame,
        batch_size: int,
        model_getter: t.Callable[[str], ModelConfig],
        max_len: int,
        num_workers: int,
        device: str,
        num_iterations: int = 3) -> np.ndarray:
    """Score every row of ``valid_df`` with one model, averaged over several passes.

    The first ``num_iterations - 1`` passes run on WordNet-augmented text and
    the final pass on the unmodified text; the per-pass scores are averaged.

    Args:
        valid_df: Frame with ``comment_id`` and ``text`` columns.
        batch_size: Number of samples per DataLoader batch.
        model_getter: Callable that builds the ``ModelConfig`` for ``device``.
        max_len: Chunk size and tokenizer ``max_length``.
        num_workers: DataLoader worker count.
        device: Device for the model and inputs; pinned memory is enabled when
            it starts with ``'cuda'``.
        num_iterations: Total number of passes; must be at least 1.

    Returns:
        1-D array with the mean score of each row across all passes.

    Raises:
        ValueError: If ``num_iterations`` is smaller than 1.
    """
    # Fail before loading the model; previously this only surfaced later as a
    # ValueError from np.stack on an empty list.
    if num_iterations < 1:
        raise ValueError(f'num_iterations must be at least 1, got {num_iterations}')

    model_config = model_getter(device)
    model = model_config.model.to(device)
    wordnet = Wordnet(v=True, n=True, p=0.5)

    score_arr_list = []
    for i in range(num_iterations):
        # Every pass except the last one is augmented. The bound method is
        # passed directly (rather than a lambda) so the dataset can be pickled
        # for DataLoader workers started with "spawn" — provided the Wordnet
        # instance itself pickles.
        aug_list = [wordnet.augment] if i < num_iterations - 1 else []
        dataset = PredictDataset(
            df=valid_df,
            tokenizer=model_config.tokenizer,
            max_len=max_len,
            augmentation_list=aug_list)
        data_loader = DataLoader(
            dataset=dataset,
            batch_size=batch_size,
            shuffle=False,
            collate_fn=predict_collate_fn,  # type: ignore
            num_workers=num_workers,
            pin_memory=device.startswith('cuda'))
        score_arr_list.append(do_predict_iteration(
            data_loader=data_loader,
            model=model,
            model_name=model_config.name,
            device=device))
    return np.stack(score_arr_list, axis=0).mean(axis=0)

In [None]:
# Load the validation CSV; the cast only informs the type checker that the
# result is a DataFrame.
valid_df = t.cast(pd.DataFrame, pd.read_csv('/home/jovyan/jigsaw-toxic/data/jigsaw-toxic-severity-rating/valid.csv'))

In [None]:
def _stable_comment_id(text: str) -> int:
    """Return a deterministic signed 64-bit identifier for ``text``.

    The built-in ``hash()`` is salted per interpreter process for ``str``, so it
    yields different ids in every session. A keyed-less BLAKE2b digest is stable
    across runs, and 8 bytes interpreted as a signed integer fit an int64 column.
    """
    import hashlib  # local import keeps this cell self-contained

    digest = hashlib.blake2b(text.encode('utf-8', 'surrogatepass'), digest_size=8).digest()
    return int.from_bytes(digest, byteorder='big', signed=True)


def valid_to_eval_format(valid_df: pd.DataFrame) -> t.Tuple[pd.DataFrame, pd.DataFrame]:
    """Split a pairwise validation frame into an id-pair frame and a unique-text frame.

    Args:
        valid_df: Frame with ``more_toxic`` and ``less_toxic`` text columns.

    Returns:
        A tuple ``(pair_df, eval_df)``:

        * ``pair_df`` has one row per input row with the columns
          ``more_toxic_id`` and ``less_toxic_id``;
        * ``eval_df`` has one row per distinct text (first-seen order) with the
          columns ``comment_id`` and ``text``.

    Raises:
        ValueError: If two different texts map to the same comment id.
    """
    comment_id_dict: t.Dict[int, str] = {}
    valid_row_list = []
    more_toxic_iter = map(str, valid_df['more_toxic'].tolist())
    less_toxic_iter = map(str, valid_df['less_toxic'].tolist())
    for more_toxic, less_toxic in zip(more_toxic_iter, less_toxic_iter):
        pair_ids = []
        # Register the more-toxic text first so eval_df keeps the original ordering.
        for text in (more_toxic, less_toxic):
            comment_id = _stable_comment_id(text)
            if comment_id_dict.setdefault(comment_id, text) != text:
                raise ValueError(f'Comment id collision for id {comment_id}')
            pair_ids.append(comment_id)
        valid_row_list.append({
            'more_toxic_id': pair_ids[0],
            'less_toxic_id': pair_ids[1],
        })
    eval_row_list = [{'comment_id': comment_id, 'text': text} for comment_id, text in comment_id_dict.items()]
    # Explicit column lists keep the schema intact even when there are no rows.
    return (
        pd.DataFrame(valid_row_list, columns=['more_toxic_id', 'less_toxic_id']),
        pd.DataFrame(eval_row_list, columns=['comment_id', 'text']),
    )

In [None]:
# Rebind valid_df to the id-pair frame and create eval_df with one row per
# distinct comment text.
valid_df, eval_df = valid_to_eval_format(valid_df)

#### Score generation

##### ccc_2017_m1

In [None]:
# Score every row of eval_df with the ccc2017_m1 model; predict_by_model
# averages the per-pass scores (num_iterations is left at its default).
ccc2017_m1_score_arr = predict_by_model(
    valid_df=eval_df,
    batch_size=8,
    model_getter=load_ccc2017_m1,
    num_workers=8,
    max_len=256,
    device='cuda')

##### ccc_2017_m2

In [None]:
# ccc2017_m2_score_arr = predict_by_model(
#     valid_df=eval_df,
#     batch_size=8,
#     model_getter=load_ccc2017_m2,
#     num_workers=8,
#     max_len=256,
#     device='cuda')

##### ccc_2017_m3

In [None]:
# Score every row of eval_df with the ccc2017_m3 model; predict_by_model
# averages the per-pass scores (num_iterations is left at its default).
ccc2017_m3_score_arr = predict_by_model(
    valid_df=eval_df,
    batch_size=8,
    model_getter=load_ccc2017_m3,
    num_workers=8,
    max_len=256,
    device='cuda')

##### ccc_2017_m4

In [None]:
# ccc2017_m4_score_arr = predict_by_model(
#     valid_df=eval_df,
#     batch_size=8,
#     model_getter=load_ccc2017_m4,
#     num_workers=8,
#     max_len=256,
#     device='cuda')

##### ubtc_m1

In [None]:
# ubtc_m1_score_arr = predict_by_model(
#     valid_df=eval_df,
#     batch_size=8,
#     model_getter=load_ubtc_m1,
#     num_workers=8,
#     max_len=256,
#     device='cuda')

##### ubtc_m2

In [None]:
# ubtc_m2_score_arr = predict_by_model(
#     valid_df=eval_df,
#     batch_size=8,
#     model_getter=load_ubtc_m2,
#     num_workers=8,
#     max_len=256,
#     device='cuda')

##### ruddit_m1

In [None]:
# Score every row of eval_df with the ruddit_m1 model; predict_by_model
# averages the per-pass scores (num_iterations is left at its default).
ruddit_m1_score_arr = predict_by_model(
    valid_df=eval_df,
    batch_size=8,
    model_getter=load_ruddit_m1,
    num_workers=8,
    max_len=256,
    device='cuda')

##### ruddit_m2

In [None]:
# Score every row of eval_df with the ruddit_m2 model; predict_by_model
# averages the per-pass scores (num_iterations is left at its default).
ruddit_m2_score_arr = predict_by_model(
    valid_df=eval_df,
    batch_size=8,
    model_getter=load_ruddit_m2,
    num_workers=8,
    max_len=256,
    device='cuda')

##### wiki_talk_labels_m1

In [None]:
# Score every row of eval_df with the wiki_talk_labels_m1 model; predict_by_model
# averages the per-pass scores (num_iterations is left at its default).
wiki_talk_labels_m1_score_arr = predict_by_model(
    valid_df=eval_df,
    batch_size=8,
    model_getter=load_wiki_talk_labels_m1,
    num_workers=8,
    max_len=256,
    device='cuda')

##### wiki_talk_labels_m2

In [None]:
# wiki_talk_labels_m2_score_arr = predict_by_model(
#     valid_df=eval_df,
#     batch_size=8,
#     model_getter=load_wiki_talk_labels_m2,
#     num_workers=8,
#     max_len=256,
#     device='cuda')

##### offenseval_m1

In [None]:
# offenseval_m1_score_arr = predict_by_model(
#     valid_df=eval_df,
#     batch_size=8,
#     model_getter=load_offenseval_m1,
#     num_workers=8,
#     max_len=256,
#     device='cuda')

##### offenseval_m2

In [None]:
# offenseval_m2_score_arr = predict_by_model(
#     valid_df=eval_df,
#     batch_size=8,
#     model_getter=load_offenseval_m2,
#     num_workers=8,
#     max_len=256,
#     device='cuda')

##### c3_m1

In [None]:
# c3_m1_score_arr = predict_by_model(
#     valid_df=eval_df,
#     batch_size=8,
#     model_getter=load_c3_m1,
#     num_workers=8,
#     max_len=256,
#     device='cuda')

##### c3_m2

In [None]:
# c3_m2_score_arr = predict_by_model(
#     valid_df=eval_df,
#     batch_size=8,
#     model_getter=load_c3_m2,
#     num_workers=8,
#     max_len=256,
#     device='cuda')

#### Score backup

In [None]:
# Attach each model's score array to eval_df as its own column so the scores can
# be persisted. Lines for models that were not scored in this run stay commented
# out; un-comment them together with the matching "Score generation" cell.
# NOTE: the offenseval_* / c3_* column names carry no 'score_' prefix; the
# "Score loading" cell reads them under exactly these names.
eval_df['score_ccc2017_m1'] = ccc2017_m1_score_arr
# eval_df['score_ccc2017_m2'] = ccc2017_m2_score_arr
eval_df['score_ccc2017_m3'] = ccc2017_m3_score_arr
# eval_df['score_ccc2017_m4'] = ccc2017_m4_score_arr
# eval_df['score_ubtc_m1'] = ubtc_m1_score_arr
# eval_df['score_ubtc_m2'] = ubtc_m2_score_arr
eval_df['score_ruddit_m1'] = ruddit_m1_score_arr
eval_df['score_ruddit_m2'] = ruddit_m2_score_arr
eval_df['score_wiki_talk_labels_m1'] = wiki_talk_labels_m1_score_arr
# eval_df['score_wiki_talk_labels_m2'] = wiki_talk_labels_m2_score_arr
# eval_df['offenseval_m1'] = offenseval_m1_score_arr
# eval_df['offenseval_m2'] = offenseval_m2_score_arr
# eval_df['c3_m1'] = c3_m1_score_arr
# eval_df['c3_m2'] = c3_m2_score_arr

In [None]:
# Persist the scored frame (row index omitted).
# NOTE(review): the "Score loading" cell reads 'external_datasets_20220207.csv'
# (no '_translate' suffix) — confirm which file is the intended one.
eval_df.to_csv('/home/jovyan/jigsaw-toxic/scores/external_datasets_20220207_translate.csv', index=False)

#### Score loading

In [2]:
# Restore eval_df and the per-model score arrays from a previously saved CSV so
# the ensembling cells can run without re-scoring. Every array is converted with
# .to_numpy() so all *_score_arr variables share the same type.
eval_df = t.cast(pd.DataFrame, pd.read_csv('/home/jovyan/jigsaw-toxic/scores/external_datasets_20220207.csv'))
ccc2017_m1_score_arr = eval_df['score_ccc2017_m1'].to_numpy()
ccc2017_m2_score_arr = eval_df['score_ccc2017_m2'].to_numpy()
ccc2017_m3_score_arr = eval_df['score_ccc2017_m3'].to_numpy()
ccc2017_m4_score_arr = eval_df['score_ccc2017_m4'].to_numpy()
ubtc_m1_score_arr = eval_df['score_ubtc_m1'].to_numpy()
# Prefer a dedicated 'score_ubtc_m2' column. The file previewed in this notebook
# only contains 'score_ubtc_m1' (the backup cell's commented-out ubtc_m2 line
# targets that same column), so fall back to it for such legacy files.
_ubtc_m2_column = 'score_ubtc_m2' if 'score_ubtc_m2' in eval_df.columns else 'score_ubtc_m1'
ubtc_m2_score_arr = eval_df[_ubtc_m2_column].to_numpy()
ruddit_m1_score_arr = eval_df['score_ruddit_m1'].to_numpy()
ruddit_m2_score_arr = eval_df['score_ruddit_m2'].to_numpy()
wiki_talk_labels_m1_score_arr = eval_df['score_wiki_talk_labels_m1'].to_numpy()
wiki_talk_labels_m2_score_arr = eval_df['score_wiki_talk_labels_m2'].to_numpy()
offenseval_m1_score_arr = eval_df['offenseval_m1'].to_numpy()
offenseval_m2_score_arr = eval_df['offenseval_m2'].to_numpy()
c3_m1_score_arr = eval_df['c3_m1'].to_numpy()
c3_m2_score_arr = eval_df['c3_m2'].to_numpy()

In [3]:
def process_valid(valid_df: pd.DataFrame, eval_df: pd.DataFrame) -> pd.DataFrame:
    """Attach comment ids from ``eval_df`` to the text pairs in ``valid_df``.

    Args:
        valid_df: Frame with ``more_toxic`` and ``less_toxic`` text columns.
            It is not modified; a copy is returned.
        eval_df: Frame with ``text`` and ``comment_id`` columns.

    Returns:
        A copy of ``valid_df`` with the added integer columns ``more_toxic_id``
        and ``less_toxic_id``.

    Raises:
        KeyError: If a text in ``valid_df`` has no entry in ``eval_df``.
    """
    valid_df = valid_df.copy()
    # Column-wise iteration avoids the per-row Series construction of iterrows().
    text_to_comment_id = {
        str(text): int(comment_id)
        for text, comment_id in zip(eval_df['text'].tolist(), eval_df['comment_id'].tolist())
    }
    for side in ('more_toxic', 'less_toxic'):
        # Keys are built with str(), so the lookup must normalise the same way;
        # otherwise non-str cells (e.g. NaN or numbers parsed by read_csv) miss.
        valid_df[f'{side}_id'] = [text_to_comment_id[str(text)] for text in valid_df[side].tolist()]
    return valid_df

In [4]:
# Reload the raw pairwise validation file; process_valid() below adds the
# comment-id columns to it.
valid_df = t.cast(pd.DataFrame, pd.read_csv('/home/jovyan/jigsaw-toxic/data/jigsaw-toxic-severity-rating/valid.csv'))

In [5]:
valid_df

Unnamed: 0,less_toxic,more_toxic,certainty
0,"I think people say its not so absurd, because ...",Re Vandalism. . . . that is because the user ...,0.666667
1,You must have the lowest IQ of anyone on wikip...,Wha's a reliable source mike? Something that c...,1.000000
2,"Thanks for removing more of his anonymous, un...","To Bad\nyou dont have the balls to sign in, I ...",1.000000
3,"""\n\nActually, it clearly does not involve pow...",", 18 November 2006 (UTC)\n\nWell comrade. Look...",0.666667
4,You are not cool \n\nYou are the most hated f...,"""== Hey Fuck-head==\nHey, just wanted to say ""...",1.000000
...,...,...,...
10103,Heeeeeeeeeeeeeeyyyyyyyyyyyy dude! Sup!,you deleted my page \nand i am extrememly mad ...,1.000000
10104,ATTENTION:''''''\n\nANYONE WHO OPPOSES MY OPIN...,suck dick u disrespectful swearing wank!!,1.000000
10105,"Oh yah, and Bayerischermann is another guy try...",Thanks. Also thanks for freezing the article. ...,1.000000
10106,I had to chop out a section specifically compa...,"""\n\nHorrifyingly enough, """"ritualistic penis ...",0.666667


In [6]:
# Add 'more_toxic_id' / 'less_toxic_id' by looking each text up in eval_df.
valid_df = process_valid(valid_df, eval_df)

In [7]:
valid_df

Unnamed: 0,less_toxic,more_toxic,certainty,more_toxic_id,less_toxic_id
0,"I think people say its not so absurd, because ...",Re Vandalism. . . . that is because the user ...,0.666667,8871839934638558331,-4761684140996904817
1,You must have the lowest IQ of anyone on wikip...,Wha's a reliable source mike? Something that c...,1.000000,1637230891836237423,-7956336680269787100
2,"Thanks for removing more of his anonymous, un...","To Bad\nyou dont have the balls to sign in, I ...",1.000000,-4417861673610340204,-9111912784081641002
3,"""\n\nActually, it clearly does not involve pow...",", 18 November 2006 (UTC)\n\nWell comrade. Look...",0.666667,538743044374089943,1009349673192730117
4,You are not cool \n\nYou are the most hated f...,"""== Hey Fuck-head==\nHey, just wanted to say ""...",1.000000,-8395351920140636842,-8465358288158148543
...,...,...,...,...,...
10103,Heeeeeeeeeeeeeeyyyyyyyyyyyy dude! Sup!,you deleted my page \nand i am extrememly mad ...,1.000000,-587030008757665900,-6157675788695320718
10104,ATTENTION:''''''\n\nANYONE WHO OPPOSES MY OPIN...,suck dick u disrespectful swearing wank!!,1.000000,6392442731289747528,-2454273551981799926
10105,"Oh yah, and Bayerischermann is another guy try...",Thanks. Also thanks for freezing the article. ...,1.000000,3818525241629339148,7225573479073611155
10106,I had to chop out a section specifically compa...,"""\n\nHorrifyingly enough, """"ritualistic penis ...",0.666667,-2636152035767601275,-3759500075766815347


In [8]:
eval_df

Unnamed: 0,comment_id,text,score_ccc2017_m1,score_ccc2017_m2,score_ccc2017_m3,score_ccc2017_m4,score_ubtc_m1,score_ruddit_m1,score_ruddit_m2,score_wiki_talk_labels_m1,score_wiki_talk_labels_m2,offenseval_m1,offenseval_m2,c3_m1,c3_m2
0,8871839934638558331,Re Vandalism. . . . that is because the user ...,0.130588,0.240643,0.290199,0.264418,0.280005,0.549056,0.417141,0.315535,0.257190,0.571000,0.493509,0.346553,0.296056
1,-4761684140996904817,"I think people say its not so absurd, because ...",0.085516,0.103158,0.090866,0.187912,0.263704,0.540882,0.606414,0.292525,0.227664,0.529195,0.516643,0.353258,0.299012
2,1637230891836237423,Wha's a reliable source mike? Something that c...,0.546212,0.623776,0.574130,0.583871,0.871073,0.892588,0.869146,0.895831,0.915215,0.765286,0.786490,0.481011,0.445201
3,-7956336680269787100,You must have the lowest IQ of anyone on wikip...,0.304286,0.387672,0.405968,0.419521,0.802019,0.679649,0.741014,0.765444,0.822827,0.726740,0.722767,0.462803,0.440642
4,-4417861673610340204,"To Bad\nyou dont have the balls to sign in, I ...",0.475488,0.442890,0.586314,0.485561,0.758346,0.780608,0.765925,0.752058,0.718816,0.660960,0.719132,0.472417,0.430983
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14246,-3367261662900040048,"Guess you never heard of Gaylord Perry, one of...",0.123053,0.143000,0.100866,0.207610,0.232839,0.500420,0.466802,0.231475,0.199985,0.427846,0.438729,0.326421,0.272859
14247,-587030008757665900,you deleted my page \nand i am extrememly mad ...,0.135024,0.176959,0.156531,0.180241,0.169857,0.377321,0.353516,0.284193,0.174090,0.412900,0.449750,0.276390,0.235702
14248,6392442731289747528,suck dick u disrespectful swearing wank!!,0.595961,0.646268,0.638236,0.659873,0.867191,0.916613,0.880175,0.915201,0.916077,0.941515,0.804194,0.439001,0.393956
14249,-2454273551981799926,ATTENTION:''''''\n\nANYONE WHO OPPOSES MY OPIN...,0.135181,0.296563,0.280414,0.361201,0.423764,0.631425,0.516870,0.520577,0.349602,0.456775,0.487076,0.369993,0.312742


In [9]:
!mkdir /home/jovyan/jigsaw-toxic/data/datasets/external_20220207_stacking

In [10]:
# Export the stacking dataset: pair.csv holds the validation pairs with their
# comment ids, feature.csv holds one row per comment with the model scores.
valid_df.to_csv('/home/jovyan/jigsaw-toxic/data/datasets/external_20220207_stacking/pair.csv', index=False)
eval_df.to_csv('/home/jovyan/jigsaw-toxic/data/datasets/external_20220207_stacking/feature.csv', index=False)

#### Ensembling

In [None]:
def ensemble_predictions_by_mean(
        valid_df: pd.DataFrame,
        eval_df: pd.DataFrame,
        score_arr_list: t.List[np.ndarray],
        apply_rankdata: bool = False) -> t.Tuple[np.ndarray, np.ndarray]:
    """Average several models' scores and look them up for each validation pair.

    Args:
        valid_df: Frame with ``more_toxic_id`` and ``less_toxic_id`` columns.
        eval_df: Frame with a ``comment_id`` column; row ``i`` corresponds to
            element ``i`` of every array in ``score_arr_list``.
        score_arr_list: One score array per model, all of the same shape.
        apply_rankdata: When true, each array is replaced by its ordinal ranks
            (``scipy.stats.rankdata``) before averaging.

    Returns:
        ``(more_toxic_scores, less_toxic_scores)``: two arrays with one ensembled
        score per row of ``valid_df``.

    Raises:
        ValueError: If the number of scores differs from the number of rows in
            ``eval_df`` (or ``score_arr_list`` is empty / has mismatched shapes).
        KeyError: If a pair references an id that is missing from ``eval_df``.
    """
    if apply_rankdata:
        score_arr_list = [rankdata(score_arr, method='ordinal') for score_arr in score_arr_list]
    score_arr = np.stack(score_arr_list, axis=0).mean(axis=0).flatten()
    if score_arr.shape[0] != len(eval_df):
        # zip() below would silently truncate and mis-assign scores otherwise.
        raise ValueError(
            f'Got {score_arr.shape[0]} scores for {len(eval_df)} eval_df rows')
    comment_id_to_score_dict = dict(zip(eval_df['comment_id'].tolist(), score_arr.tolist()))

    # Read the id columns directly: DataFrame.iterrows() upcasts all-numeric rows
    # that contain a float column to float64, which corrupts 64-bit ids.
    more_toxic_score_list = [
        comment_id_to_score_dict[int(comment_id)] for comment_id in valid_df['more_toxic_id'].tolist()]
    less_toxic_score_list = [
        comment_id_to_score_dict[int(comment_id)] for comment_id in valid_df['less_toxic_id'].tolist()]
    return np.array(more_toxic_score_list), np.array(less_toxic_score_list)

In [None]:
def powerset(iterable: t.Iterable[str]) -> t.Iterable[t.Tuple[str, ...]]:
    """Lazily yield every subset of ``iterable`` as a tuple, smallest first.

    The empty tuple comes first and the full set last; within one size the
    order is that of ``itertools.combinations``.
    """
    items = tuple(iterable)
    subsets_per_size = [
        itertools.combinations(items, size)
        for size in range(0, len(items) + 1)
    ]
    return itertools.chain(*subsets_per_size)

##### Score dict

In [None]:
# Candidate models for the ensemble search, keyed by the name of the variable
# that holds their scores. Commented-out entries are excluded from the search.
score_dict = {
    'ccc2017_m1_score_arr': ccc2017_m1_score_arr,
    # 'ccc2017_m2_score_arr': ccc2017_m2_score_arr,
    'ccc2017_m3_score_arr': ccc2017_m3_score_arr,
    # 'ccc2017_m4_score_arr': ccc2017_m4_score_arr,
    # 'ubtc_m1_score_arr': ubtc_m1_score_arr,
    # 'ubtc_m2_score_arr': ubtc_m2_score_arr,
    'ruddit_m1_score_arr': ruddit_m1_score_arr,
    'ruddit_m2_score_arr': ruddit_m2_score_arr,
    'wiki_talk_labels_m1_score_arr': wiki_talk_labels_m1_score_arr,
    # 'wiki_talk_labels_m2_score_arr': wiki_talk_labels_m2_score_arr,
    # 'offenseval_m1': offenseval_m1_score_arr,
    # 'offenseval_m2': offenseval_m2_score_arr,
    # 'c3_m1': c3_m1_score_arr,
    # 'c3_m2': c3_m2_score_arr,
}

##### Mean

In [None]:
# Evaluate the rank-averaged ensemble of the five listed models on the
# validation pairs.
more_toxic_score_arr, less_toxic_score_arr = ensemble_predictions_by_mean(
    valid_df=valid_df,
    eval_df=eval_df,
    score_arr_list=[
        ccc2017_m1_score_arr,
        ccc2017_m3_score_arr,
        ruddit_m1_score_arr,
        ruddit_m2_score_arr,
        wiki_talk_labels_m1_score_arr,
    ],
    apply_rankdata=True)
# Fraction of pairs in which the "more toxic" comment received the strictly
# higher ensembled score (ties count as misses).
(more_toxic_score_arr > less_toxic_score_arr).astype(np.float32).mean()

In [None]:
# mean_ensemble_result_dict: t.Dict[str, float] = {}
# _it = tqdm(sorted([sub for sub in powerset(score_dict.keys()) if len(sub) >= 4], key=len, reverse=True))
# for score_key_set in _it:
#     if not score_key_set:
#         continue
#     more_toxic_score_arr, less_toxic_score_arr = ensemble_predictions_by_mean(
#         valid_df=valid_df,
#         eval_df=eval_df,
#         score_arr_list=[score_dict[score_key] for score_key in list(score_key_set)],
#         apply_rankdata=True,
#     )
#     key = ' '.join(sorted(score_key_set))
#     score = (more_toxic_score_arr > less_toxic_score_arr).astype(np.float32).mean()
#     mean_ensemble_result_dict[key] = score
#     best_key, best_score = max(mean_ensemble_result_dict.items(), key=lambda x: x[1])
#     _it.set_description(f'Best score: {best_score:.5f}, best key: {best_key}')

Best score: 0.76059
Best models:
 - ccc2017_m1_score_arr
 - ccc2017_m3_score_arr 
 - ruddit_m1_score_arr
 - ruddit_m2_score_arr
 - wiki_talk_labels_m1_score_arr

##### Sort

In [None]:
# sort_ensemble_result_dict: t.Dict[str, float] = {}
# _it = tqdm(sorted([sub for sub in powerset(score_dict.keys()) if len(sub) >= 4], key=len, reverse=True))
# for score_key_set in _it:
#     if not score_key_set:
#         continue
#     more_toxic_score_arr, less_toxic_score_arr = ensemble_predictions_by_sort(
#         valid_df=valid_df,
#         eval_df=eval_df,
#         score_arr_list=[score_dict[score_key] for score_key in list(score_key_set)],
#         distribution=PowerDistribution(min=0.0, max=1.0, power=1.0),
#         apply_rankdata=False)
#     key = ' '.join(sorted(score_key_set))
#     score = (more_toxic_score_arr > less_toxic_score_arr).astype(np.float32).mean()
#     sort_ensemble_result_dict[key] = score
#     best_key, best_score = max(sort_ensemble_result_dict.items(), key=lambda x: x[1])
#     _it.set_description(f'Best score: {best_score:.5f}, best key: {best_key}')