# Jigsaw Multilingual Benchmark 
> Benchmark model for Jigsaw Multilingual Toxic Comment Classification.
- toc: true
- badges: true
- comments: true
- author: Aman Arora

In this post, we implement our first multilingual model using mBERT and make a submission to establish the baseline score for a multilingual toxic comment classifier. 

The idea was first presented in [my first introduction post](https://amaarora.github.io/jigsaw/2020/04/10/JigsawIntro.html).
> We could find multiple datasets outside of the one provided and train/fine-tune a multilingual model to classify for toxicity.

In this post we make use of this [dataset](https://www.kaggle.com/miklgr500/jigsaw-train-multilingual-coments-google-api#jigsaw-toxic-comment-train-google-fr-cleaned.csv) where the author has translated the training dataset into 6 languages ('es', 'fr', 'it', 'pt', 'ru', 'tr').

In [1]:
from transformers import BertForSequenceClassification, BertConfig, BertTokenizer
from transformers import AdamW, get_linear_schedule_with_warmup, get_constant_schedule
from tqdm.notebook import tqdm
import torch.nn as nn
import torch
import pandas as pd
from sklearn.metrics import roc_auc_score
import numpy as np

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [2]:
import logging
# Surface INFO-level records (including those emitted by imported libraries) in the notebook.
logger = logging.getLogger()  # no name given -> the root logger
logger.setLevel("INFO")
# Smoke test: the module-level helper logs on the root logger.
logging.log(logging.INFO, "test")

INFO:root:test


We follow the [jigsaw benchmark](https://amaarora.github.io/jigsaw/2020/04/11/Jigsaw-Benchmark.html) model to create the same `Tokenizer`, `Bert Model` and only make some changes to the `Dataset`.

## Tokenizer

In [3]:
class Tokenizer:
    """Callable wrapper around a pretrained `BertTokenizer`."""

    def __init__(self, model_name):
        # `model_name` is forwarded unchanged to `BertTokenizer.from_pretrained`.
        self.tokenizer = BertTokenizer.from_pretrained(model_name)

    def __call__(self, input, **kwargs):
        """Encode `input` with `encode_plus`, forwarding every keyword option as-is."""
        encode = self.tokenizer.encode_plus
        return encode(input, **kwargs)

## Bert Model

In [4]:
class BertModel:
    """Wrapper around `BertForSequenceClassification` with a resizable output head.

    Args:
        model_name: checkpoint name/path passed to `from_pretrained`.
        no: number of output units wanted from the classification head.

    Calling the wrapper forwards to the wrapped model (a `target` keyword, if
    present, is dropped first). Any attribute not defined on the wrapper is
    looked up on the wrapped model.
    """

    def __init__(self, model_name, no=1):
        self.model = BertForSequenceClassification.from_pretrained(model_name)
        head = self.model.classifier
        if head.out_features != no:
            # Size the new head from the existing one instead of hard-coding
            # 768 -> 1, so `no` and other hidden sizes are honoured.
            self.model.classifier = nn.Linear(head.in_features, no, bias=True)

    def __call__(self, *args, **kwargs):
        # `target` is consumed by the loss function, not by the model itself.
        kwargs.pop('target', None)
        return self.model(*args, **kwargs)

    def __getattr__(self, attr):
        # Only reached when normal lookup fails. If `model` itself is missing
        # (e.g. instance created without __init__), delegating would recurse
        # forever, so fail with a regular AttributeError instead.
        if attr == 'model':
            raise AttributeError(attr)
        return getattr(self.model, attr)

In [5]:
# Load the multilingual cased BERT checkpoint; `no` is left at its default.
model = BertModel(model_name='bert-base-multilingual-cased')

INFO:transformers.configuration_utils:loading configuration file https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-cased-config.json from cache at /home/ubuntu/.cache/torch/transformers/45629519f3117b89d89fd9c740073d8e4c1f0a70f9842476185100a8afe715d1.893eae5c77904d1e9175faf98909639d3eb20cc7e13e2be395de9a0d8a0dad52
INFO:transformers.configuration_utils:Model config BertConfig {
  "_num_labels": 2,
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bad_words_ids": null,
  "bos_token_id": null,
  "decoder_start_token_id": null,
  "directionality": "bidi",
  "do_sample": false,
  "early_stopping": false,
  "eos_token_id": null,
  "finetuning_task": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "is_decoder": false,
  "is_encoder_decoder": false,
  "label2id": {
    "LABEL_0"

## Dataset

In this case, our dataframe for training is a concatenation of the 6 different datasets provided [here](https://www.kaggle.com/miklgr500/jigsaw-train-multilingual-coments-google-api#jigsaw-toxic-comment-train-google-fr-cleaned.csv).

In [6]:
def get_training_df(path, usecols):
    """Load a CSV and return a class-balanced frame of toxic / non-toxic rows.

    Args:
        path: CSV file to read.
        usecols: columns to load; must include `toxic`.

    Returns:
        DataFrame with a fresh 0..n-1 index: every row labelled 1 first,
        followed by an equally sized random sample (seed 123) of rows
        labelled 0. `toxic` is an integer column.

    Raises:
        ValueError: from `DataFrame.sample` when there are fewer non-toxic
            rows than toxic rows.
    """
    df = pd.read_csv(path, usecols=usecols)

    # Coerce labels to numbers before filtering. Comparing against the strings
    # '0'/'1' only works when pandas happened to infer a string column; an
    # integer (or mixed int/str) column would silently lose rows.
    labels = pd.to_numeric(df['toxic'], errors='coerce')
    keep = labels.isin([0, 1])  # unparsable labels became NaN and are dropped
    df = df.loc[keep].copy()
    df['toxic'] = labels.loc[keep].astype(int)

    toxic = df.loc[df['toxic'] == 1].reset_index(drop=True)
    # Down-sample the non-toxic class to the size of the toxic class.
    _toxic = (
        df.loc[df['toxic'] == 0]
        .sample(len(toxic), random_state=123)
        .reset_index(drop=True)
    )
    return pd.concat([toxic, _toxic]).reset_index(drop=True)

In [7]:
# One frame per translated language, each loaded with only the text and label columns.
_LANG_CSV = "./multilingual_data/jigsaw-toxic-comment-train-google-{}.csv"
df_es, df_fr, df_it, df_pt, df_ru, df_tr = (
    get_training_df(_LANG_CSV.format(_lang), ['comment_text', 'toxic'])
    for _lang in ('es', 'fr', 'it', 'pt', 'ru', 'tr')
)

In [8]:
# Stack the six per-language frames; `ignore_index` gives the result a fresh 0..n-1 index.
df = pd.concat(
    [df_es, df_fr, df_it, df_pt, df_ru, df_tr],
    ignore_index=True,
)
df.shape

(256496, 2)

We are now ready to create the `BertDataset`.

In [9]:
class BertDataset():
    """Indexable dataset that tokenizes one comment per item and pads it to `max_len`.

    Args:
        shuffle: shuffle the rows once at construction time.
        csv_path: CSV to read when `df` is not given.
        usecols: columns to load from `csv_path` (all columns when None).
        model_name: name passed to `Tokenizer`.
        max_len: length every returned sequence is padded (or clipped) to.
        textcol: name of the column holding the comment text.
        train: when True, items also carry the `toxic` label as `target`.
        df: an already loaded DataFrame; takes precedence over `csv_path`.

    Raises:
        ValueError: if neither `df` nor `csv_path` is provided.
    """

    def __init__(self, shuffle=False, csv_path=None, usecols=None, model_name='bert-base-multilingual-cased', max_len=512, textcol='translated', train=True, df=None):
        if df is None:
            if csv_path is None:
                # Fail early with a clear message instead of an obscure read_csv error.
                raise ValueError("BertDataset needs either `df` or `csv_path`")
            df = pd.read_csv(csv_path, usecols=usecols)
        self.df = df.sample(frac=1) if shuffle else df
        self.tok = Tokenizer(model_name)
        self.max_len = max_len
        self.textcol = textcol
        self.train = train

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        """Return a dict of `max_len`-long tensors (plus `target` when `train` is set)."""
        row = self.df.iloc[idx]  # positional lookup, done once per item

        # encode comment
        t_out = self.tok(row[self.textcol], add_special_tokens=True, max_length=self.max_len)

        item = {}
        for key in ('input_ids', 'attention_mask', 'token_type_ids'):
            # Clip first: if the tokenizer did not truncate, uneven lengths
            # would make the items impossible to stack into a batch.
            values = list(t_out[key])[:self.max_len]
            # pad sequences with zeros up to max_len
            values = values + [0] * (self.max_len - len(values))
            item[key] = torch.tensor(values, dtype=torch.long)

        if self.train:
            item['target'] = torch.tensor(row['toxic'], dtype=torch.float)
        return item

In [10]:
# Training dataset over the concatenated frame; rows are shuffled once at construction.
dataset = BertDataset(
    shuffle=True,
    df=df,
    textcol='comment_text',
)

INFO:transformers.tokenization_utils:loading file https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-cased-vocab.txt from cache at /home/ubuntu/.cache/torch/transformers/96435fa287fbf7e469185f1062386e05a075cadbf6838b74da22bf64b080bc32.99bcd55fc66f4f3360bc49ba472b940b8dcf223ea6a345deb969d607ca900729


## Dataloader

In [11]:
# Serve the training dataset in batches of 8.
dataloader = torch.utils.data.DataLoader(
    dataset=dataset,
    batch_size=8,
)

## Train

We keep the training loop essentially the same as before, with one small change: we compute the validation AUC score (`valid_auc_score`) every 1000 batches.

In [12]:
def loss_fn(outputs, targets):
    """Mean binary cross-entropy on raw logits; `targets` is reshaped to a single column."""
    column_targets = targets.view(-1, 1)
    return nn.functional.binary_cross_entropy_with_logits(outputs, column_targets)

In [13]:
# Two optimizer groups: parameters whose name contains any `no_decay` fragment
# get weight_decay 0.0, everything else gets 0.001.
param_optimizer = list(model.named_parameters())
no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
_with_decay, _without_decay = [], []
for _name, _param in param_optimizer:
    _exempt = any(nd in _name for nd in no_decay)
    (_without_decay if _exempt else _with_decay).append(_param)
optimizer_grouped_parameters = [
    {'params': _with_decay, 'weight_decay': 0.001},
    {'params': _without_decay, 'weight_decay': 0.0},
]

In [14]:
# Scheduler horizon: dataset size times 5, divided by 8 and rounded down
# (8 is the DataLoader batch size, 5 the number of epochs in the training loop).
num_train_steps = (len(dataset) * 5) // 8

In [15]:
# Device every batch (and the model) is moved to.
device = 'cuda'
# AdamW over the two parameter groups, with a linear schedule and no warm-up steps.
optimizer = AdamW(optimizer_grouped_parameters, lr=3e-5)
scheduler = get_linear_schedule_with_warmup(
    optimizer, num_warmup_steps=0, num_training_steps=num_train_steps
)

In [16]:
# `BertModel` defines no `to` of its own, so this resolves through `__getattr__`
# to the wrapped model's `to`; `model` is rebound to whatever that call returns.
model = model.to(device)

In [17]:
def get_valid_preds(model, val_dataloader):
    """Run `model` over `val_dataloader` and collect its raw outputs.

    Args:
        model: the model to evaluate; called with `input_ids`,
            `attention_mask` and `token_type_ids` keyword tensors.
        val_dataloader: yields dict batches containing those three keys.

    Returns:
        A list with one entry per example: the first element of the model's
        output, converted to (nested) Python lists.

    The model is put in eval mode for the pass and restored to its previous
    train/eval mode afterwards, so calling this in the middle of training
    does not leave dropout disabled.
    """
    was_training = model.training
    # Use the model's own device so inputs always land where the weights are.
    device = next(model.parameters()).device
    preds = []
    model.eval()
    try:
        with torch.no_grad():
            for d in tqdm(val_dataloader, total=len(val_dataloader)):
                outputs = model(
                    input_ids=d["input_ids"].to(device, dtype=torch.long),
                    attention_mask=d["attention_mask"].to(device, dtype=torch.long),
                    token_type_ids=d["token_type_ids"].to(device, dtype=torch.long),
                )[0]
                preds.extend(outputs.cpu().tolist())
    finally:
        # Restore the caller's mode even if inference fails part-way.
        model.train(was_training)
    return preds

In [18]:
def get_valid_loss(val_dataloader, ys):
    """Return the ROC AUC of the global `model` on `val_dataloader` against labels `ys`.

    Despite the name, the value returned is an AUC score, not a loss.
    """
    scores = np.array(get_valid_preds(model, val_dataloader))
    return roc_auc_score(ys, scores)

In [19]:
# Labels are taken from the full validation CSV; the dataset re-reads the same
# file with only the two columns it needs and yields unlabeled items (train=False).
val_df = pd.read_csv("./validation.csv")
ys = np.array(val_df["toxic"])
val_dataset = BertDataset(
    csv_path="./validation.csv",
    usecols=['comment_text', 'toxic'],
    textcol='comment_text',
    train=False,
)
val_dataloader = torch.utils.data.DataLoader(dataset=val_dataset, batch_size=8)

INFO:transformers.tokenization_utils:loading file https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-cased-vocab.txt from cache at /home/ubuntu/.cache/torch/transformers/96435fa287fbf7e469185f1062386e05a075cadbf6838b74da22bf64b080bc32.99bcd55fc66f4f3360bc49ba472b940b8dcf223ea6a345deb969d607ca900729


In [20]:
#slow
model.train()

# Five passes over the training data; every 1000 batches the validation AUC is
# logged, and a checkpoint is written at the end of each epoch.
for epoch in range(5):
    for bi, d in tqdm(enumerate(dataloader), total=len(dataloader)):
        input_ids = d["input_ids"].to(device, dtype=torch.long)
        attention_mask = d["attention_mask"].to(device, dtype=torch.long)
        token_type_ids = d["token_type_ids"].to(device, dtype=torch.long)
        targets = d["target"].to(device, dtype=torch.float)

        optimizer.zero_grad()
        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids
        )[0]

        loss = loss_fn(outputs, targets)
        if bi%1000==0: 
            val_score = get_valid_loss(val_dataloader, ys)
            # Validation runs the model in eval mode; make sure training
            # resumes in train mode so dropout stays active.
            model.train()
            logger.info(f"epoch: {epoch}, train loss: {loss}, val auc score: {val_score}")

        loss.backward()
        optimizer.step()
        if scheduler is not None:
            scheduler.step()
    torch.save(model.state_dict(), f"epoch_{epoch}.bin")

## Predictions

Now that the model is trained, we are ready to make predictions: we create a test dataset from the `jigsaw_miltilingual_test_translated.csv` file and make predictions on it. 

We follow the same baseline approach and get a score of `.8680` on the test dataset.