In [1]:
from __future__ import annotations

import re
import typing as t

import pandas as pd
import torch
import typing_extensions as t_ext
from bs4 import BeautifulSoup
from torch.utils.data import Dataset, DataLoader
from transformers.models.auto.modeling_auto import AutoModel
from transformers.models.auto.tokenization_auto import AutoTokenizer

In [2]:
class TextCleaner:
    """
    Reduces raw comment text to ASCII letters and digits separated by single spaces.

    The compiled patterns are class attributes, so they are built once and
    shared by every instance.
    """

    # http:// or https:// links, and bare www. links, up to the next whitespace.
    _RE_WEBSITE_LINK = re.compile(r'https?://\S+|www\.\S+')
    # Runs of characters from the listed code-point ranges.
    # NOTE(review): the last range (U+24C2 through U+1F251) is very wide and also
    # matches non-emoji characters such as CJK ideographs.
    _RE_EMOJI = re.compile(
        '['
        '\U0001F600-\U0001F64F'  # emoticons
        '\U0001F300-\U0001F5FF'  # symbols & pictographs
        '\U0001F680-\U0001F6FF'  # transport & map symbols
        '\U0001F1E0-\U0001F1FF'  # flags (iOS)
        '\U00002702-\U000027B0'
        '\U000024C2-\U0001F251'
        ']+',
        flags=re.UNICODE)
    # Any single character that is not an ASCII letter or a digit.
    _RE_SPECIAL_CHARACTERS = re.compile(r'[^a-zA-Z\d]')
    # One or more consecutive space characters.
    _RE_EXTRA_SPACES = re.compile(r' +')

    def __init__(self):
        """Nothing to set up: the cleaner keeps no per-instance state."""

    def clean(self, text: str) -> str:
        """
        Cleans text into a basic form for NLP. The steps run in this order:
        1. Remove embedded URL links
        2. Remove HTML tags
        3. Remove emojis
        4. Replace special characters like &, #, etc. with a space
        5. Collapse runs of spaces and trim both ends

        text - Text piece to be cleaned.

        Returns the cleaned text.
        """
        without_links = self._RE_WEBSITE_LINK.sub('', text)

        # Parsing the markup and keeping only its text content drops HTML tags.
        without_markup = BeautifulSoup(without_links, 'lxml').get_text()

        # Emojis are deleted outright, so the characters around them end up adjacent.
        without_emoji = self._RE_EMOJI.sub('', without_markup)

        # Every other special character becomes a space, which keeps words apart.
        spaced_out = self._RE_SPECIAL_CHARACTERS.sub(' ', without_emoji)
        single_spaced = self._RE_EXTRA_SPACES.sub(' ', spaced_out)

        return single_spaced.strip()

In [3]:
class _TokenizedText(t_ext.TypedDict):
    """Tensor form of one tokenizer result, as built by ``_preprocess_tokenizer_output``."""
    # Token ids of the encoded text (presumably one id per sequence position -- TODO confirm shape).
    input_ids: torch.Tensor
    # Tokenizer's attention mask (presumably 1 for real tokens, 0 for padding -- TODO confirm).
    attention_mask: torch.Tensor
    # Tokenizer's segment ids, requested via ``return_token_type_ids=True`` in InferenceDataset.
    token_type_ids: torch.Tensor


def _preprocess_tokenizer_output(output: t.Dict[str, t.Any]) -> _TokenizedText:
    return {
        'input_ids': torch.tensor(output['input_ids']),
        'attention_mask': torch.tensor(output['attention_mask']),
        'token_type_ids': torch.tensor(output['token_type_ids']),
    }


class InferenceDataset(Dataset):
    """
    Map-style dataset that cleans and tokenizes the ``text`` column of a frame on demand.

    The frame must provide a ``text`` column (read by ``__getitem__``) and a
    ``comment_id`` column (read by ``get_comment_id_iter``). Rows are addressed
    by position, so item ``i`` corresponds to the ``i``-th id returned by
    ``get_comment_id_iter``.
    """

    def __init__(self, df: pd.DataFrame, tokenizer: AutoTokenizer, max_len: int) -> None:
        """
        df - Frame holding the comments to score.
        tokenizer - Callable tokenizer applied to each cleaned text.
        max_len - Length every text is padded or truncated to.
        """
        super().__init__()
        self._df = df
        self._tokenizer = tokenizer
        self._max_len = max_len
        self._cleaner = TextCleaner()

    def __len__(self) -> int:
        """Returns the number of rows in the frame."""
        return len(self._df)

    def __getitem__(self, idx: int) -> _TokenizedText:
        """
        Cleans and tokenizes the text at position ``idx``.

        Returns the tokenizer's input_ids, attention_mask and token_type_ids as tensors.
        """
        # Read the column first, then the position: this avoids building a
        # whole-row Series per item and keeps the column's own dtype.
        raw_text = str(self._df['text'].iloc[idx])
        tokenized_text = _preprocess_tokenizer_output(self._tokenizer(
            self._cleaner.clean(raw_text),
            add_special_tokens=True,
            truncation=True,
            padding='max_length',
            max_length=self._max_len,
            return_attention_mask=True,
            return_token_type_ids=True))  # type: ignore
        return tokenized_text

    def get_comment_id_iter(self) -> t.Iterable[str]:
        """
        Returns the ``comment_id`` of every row as a string, in row order.

        The ids are read straight from the column. Iterating with ``iterrows``
        would coerce each row to a single dtype, so integer ids sitting next to
        float columns would come back as '11.0' instead of '11'.
        """
        return [str(comment_id) for comment_id in self._df['comment_id'].tolist()]

In [4]:
class Model(torch.nn.Module):
    """
    Pretrained encoder followed by a small regression head that emits one score per input.

    The head applies LayerNorm and dropout to the encoder's pooled output, then
    Linear(output_logits, 128) -> SiLU -> Dropout -> Linear(128, 1).
    """

    def __init__(self, checkpoint: str, output_logits: int, dropout: float):
        """
        checkpoint - Name or directory passed to ``AutoModel.from_pretrained``.
        output_logits - Width of the encoder's pooled output; it sizes the
                        LayerNorm and the first Linear layer.
        dropout - Dropout probability used before and inside the dense head.
        """
        super().__init__()
        # Attribute names and the layer order inside ``dense`` determine the
        # state_dict keys, so they must stay as they are for saved weights to load.
        self.bert = AutoModel.from_pretrained(checkpoint, return_dict=False)
        self.layer_norm = torch.nn.LayerNorm(output_logits)
        self.dropout = torch.nn.Dropout(dropout)
        self.dense = torch.nn.Sequential(
            torch.nn.Linear(output_logits, 128),
            torch.nn.SiLU(),
            torch.nn.Dropout(dropout),
            torch.nn.Linear(128, 1)
        )

    def forward(self, input_ids: torch.Tensor, token_type_ids: torch.Tensor, attention_mask: torch.Tensor) -> torch.Tensor:
        """
        Scores a batch of tokenized texts.

        Returns the output of the dense head, whose last dimension has size 1.
        """
        encoder_outputs = self.bert(input_ids=input_ids, token_type_ids=token_type_ids, attention_mask=attention_mask)
        # With return_dict=False the encoder returns a tuple whose second element
        # is the pooled output. Indexing (rather than unpacking exactly two values)
        # keeps this working when the encoder config appends extra outputs such as
        # hidden states or attentions.
        pooled_output = encoder_outputs[1]
        pooled_output = self.layer_norm(pooled_output)
        pooled_output = self.dropout(pooled_output)
        preds = self.dense(pooled_output)
        return preds

In [5]:
def main(
        df: pd.DataFrame,
        dst_path: str,
        transformer_dir_path: str,
        model_checkpoint: str,
        output_logits: int,
        dropout: float,
        max_len: int,
        batch_size: int,
        num_workers: int,
        device: str) -> None:
    """
    Scores every comment in ``df`` with a saved model and writes the result as CSV.

    df - Frame with a ``comment_id`` and a ``text`` column.
    dst_path - Where the CSV with columns ``comment_id`` and ``score`` is written.
    transformer_dir_path - Name or directory of the pretrained encoder and tokenizer.
    model_checkpoint - File holding the state_dict of ``Model``.
    output_logits - Width of the encoder's pooled output (see ``Model``).
    dropout - Dropout probability the model is built with (inactive in eval mode).
    max_len - Length every text is padded or truncated to.
    batch_size - Number of comments scored per forward pass.
    num_workers - Worker processes used by the DataLoader.
    device - Torch device string, e.g. 'cpu' or 'cuda'.
    """
    model = Model(checkpoint=transformer_dir_path, output_logits=output_logits, dropout=dropout)
    # NOTE(security): torch.load unpickles the file, so only load checkpoints
    # from a trusted source.
    model.load_state_dict(torch.load(model_checkpoint, map_location=device))
    # load_state_dict copies the weights into the existing (CPU) parameters, so
    # the model must be moved explicitly; the batches below are moved to
    # ``device`` and would otherwise not match it.
    model.to(device)
    model.eval()

    tokenizer = AutoTokenizer.from_pretrained(transformer_dir_path)
    dataset = InferenceDataset(df=df, tokenizer=tokenizer, max_len=max_len)
    data_loader = DataLoader(
        dataset=dataset,
        batch_size=batch_size,
        num_workers=num_workers,
        shuffle=False,  # scores are paired with comment ids by position
        pin_memory=device.startswith('cuda'))

    score_list: t.List[float] = []
    with torch.no_grad():
        for tokenized_text in data_loader:
            score_tensor = model(
                input_ids=tokenized_text['input_ids'].to(device),
                token_type_ids=tokenized_text['token_type_ids'].to(device),
                attention_mask=tokenized_text['attention_mask'].to(device))
            score_list.extend(score_tensor.flatten().tolist())

    records = [
        {'comment_id': comment_id, 'score': score}
        for comment_id, score in zip(dataset.get_comment_id_iter(), score_list)
    ]
    # Naming the columns explicitly keeps the header row even when there are no records.
    pd.DataFrame(records, columns=['comment_id', 'score']).to_csv(dst_path, index=False)

In [6]:
# CSV of comments to score; read with pd.read_csv and passed to main() as `df`.
INPUT_CSV_PATH = '/kaggle/input/jigsaw-toxic-severity-rating/comments_to_score.csv'
# Destination of the CSV that main() writes (columns: comment_id, score).
OUTPUT_CSV_PATH = '/kaggle/working/submission.csv'
# Saved weights passed to main() as `model_checkpoint` and loaded there with torch.load.
MODEL_PATH = '/kaggle/input/jigsaw-toxic-offenseval-2020-regression-output/models/offenseval-2020-regression.pt'
# Directory passed to main() as `transformer_dir_path`; used for both AutoModel and AutoTokenizer.
TRANSFORMER_DIR_PATH = '/kaggle/input/roberta-base'

In [None]:
# Score the comments in INPUT_CSV_PATH and write the results to OUTPUT_CSV_PATH.
main(
    df=pd.read_csv(INPUT_CSV_PATH),
    dst_path=OUTPUT_CSV_PATH,
    transformer_dir_path=TRANSFORMER_DIR_PATH,
    model_checkpoint=MODEL_PATH,
    # Sizes Model's LayerNorm and first Linear layer, so it must equal the width of
    # the encoder's pooled output (presumably the hidden size of roberta-base -- confirm).
    output_logits=768,
    # Only used to build the Dropout layers; main() puts the model in eval mode.
    dropout=0.2,
    # Every text is padded or truncated to this length by the tokenizer.
    max_len=256,
    batch_size=64,
    num_workers=2,
    device='cpu')