In [None]:
# Mount Google Drive into the Colab filesystem at /content/drive so the
# data and model folders referenced later in the notebook are reachable.
from google.colab import drive
drive.mount('/content/drive')


In [None]:
# Install the third-party libraries imported further down (transformers,
# datasets). sentencepiece is presumably required by the XLM-RoBERTa
# tokenizer — TODO confirm and pin versions for reproducibility.
!pip3 install transformers
!pip3 install datasets
!pip install sentencepiece
# Load the TensorBoard notebook extension.
%load_ext tensorboard

# Code

In [None]:
import os
import pandas as pd
from sklearn.model_selection import train_test_split

from datasets import Dataset
from transformers import DataCollatorWithPadding, Trainer, TrainingArguments
from transformers import XLMRobertaTokenizer, XLMRobertaForSequenceClassification

from transformers import AutoTokenizer, AutoModelForMaskedLM

In [None]:
# Project folders on the mounted Drive. The paths are relative, so they are
# resolved against the notebook's current working directory.
_PROJECT_DIR = './drive/MyDrive/ML/Nuage/jigsaw'
DATA_PATH = f'{_PROJECT_DIR}/data'
MODEL_PATH = f'{_PROJECT_DIR}/models'

# Sanity check: show which files are available in the data folder.
os.listdir(DATA_PATH)

In [None]:
label_cols = ['toxic','severe_toxic','obscene','threat','insult','identity_hate']

# --- Source 1: toxic-comment training set -----------------------------------
# Rename the text column, shuffle deterministically, and pack the six label
# columns into one float list per row (the multi-label target).
df1 = pd.read_csv(f'{DATA_PATH}/jigsaw-toxic-comment-train.csv').rename(columns={"comment_text": "text"}).sample(frac=1, random_state=42)
df1['labels']= df1[label_cols].values.astype(float).tolist()

# --- Source 2: unintended-bias training set ---------------------------------
# Column names are aligned with source 1; each label column is binarised
# with a > 0.5 cut-off before being packed the same way.
df2 = pd.read_csv(f'{DATA_PATH}/jigsaw-unintended-bias-train.csv').rename(columns={
    "comment_text": "text",
    "severe_toxicity":"severe_toxic",
    "identity_attack":"identity_hate"
}).sample(frac=1, random_state=42)
for col in label_cols:
  df2[col] = (df2[col] > 0.5).astype(float)
df2['labels']= df2[label_cols].values.tolist()

# --- Sampling and merge -----------------------------------------------------
df = pd.concat((df1, df2))
# Rows without text cannot be tokenized later on; drop them up front.
df = df.dropna(subset=["text"])

# A row is "positive" when ANY of the six labels is set and "negative" when
# NONE is. Deriving both sets from one mask keeps them disjoint (the previous
# negative filter ignored `severe_toxic`).
has_any_label = df[label_cols].eq(1).any(axis=1)
df1 = df[has_any_label]
# Keep every positive row and half as many negative rows.
df2 = df[~has_any_label][:len(df1)//2]

# Shuffle after merging: downstream cells split `df` by position, and without
# this the tail (the held-out slice) would contain only negative rows.
# reset_index removes the duplicated index produced by the concatenations.
df = (
    pd.concat((df1, df2))[["text", "labels"]]
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)

print(f"Total: {len(df):,}")
df.head()

In [None]:
# Two-way lookup between class index and label name, later handed to the
# model configuration.
id2label = dict(enumerate(label_cols))
label2id = {name: idx for idx, name in id2label.items()}
id2label

In [None]:
# The first 95% of the frame (by position) feeds training/evaluation, the
# remaining 5% is kept aside as a test set.
split_index = int(len(df)*0.95)
# preserve_index=False stops the pandas index from being stored as an extra
# `__index_level_0__` column in the resulting datasets.
train_dataset = Dataset.from_pandas(df[["text","labels"]][:split_index], preserve_index=False)
test_dataset = Dataset.from_pandas(df[["text","labels"]][split_index:], preserve_index=False)

# Split again into "train"/"test" (used as the evaluation set by the Trainer).
# A fixed seed makes the partition identical on every run.
train_dataset = train_dataset.train_test_split(test_size=0.05, seed=42)
print(train_dataset)
print(test_dataset)

In [None]:
# Checkpoint shared by the tokenizer and the classifier.
ROBERTA_MODEL = "xlm-roberta-base"
tokenizer = AutoTokenizer.from_pretrained(ROBERTA_MODEL)

# Sequence classifier with one output per label, configured for multi-label
# classification and carrying the index <-> name mappings in its config.
model = XLMRobertaForSequenceClassification.from_pretrained(
    ROBERTA_MODEL,
    problem_type="multi_label_classification",
    num_labels=len(label_cols),
    id2label=id2label,
    label2id=label2id,
)

In [None]:
def preprocess_function(examples):
    """Tokenize the ``text`` field of ``examples`` with truncation enabled.

    Returns whatever the module-level ``tokenizer`` produces for that input.
    """
    texts = examples["text"]
    encoded = tokenizer(texts, truncation=True)
    return encoded

# Collator built from the tokenizer; it is handed to the Trainer later.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Tokenize both datasets in batched mode.
_map_options = dict(batched=True)
train_dataset = train_dataset.map(preprocess_function, **_map_options)
test_dataset = test_dataset.map(preprocess_function, **_map_options)

In [None]:
from sklearn.metrics import f1_score, roc_auc_score, accuracy_score
from transformers import EvalPrediction
import torch
import numpy as np
    
# source: https://jesusleal.io/2021/04/21/Longformer-multilabel-classification/
def multi_label_metrics(predictions, labels, threshold=0.5):
    """Compute multi-label F1, ROC AUC and accuracy from raw model logits.

    Args:
        predictions: Array-like of logits, expected to be shaped
            (n_samples, num_labels).
        labels: Ground-truth indicator matrix aligned with ``predictions``.
        threshold: Probability at or above which a label counts as predicted.

    Returns:
        dict with the keys ``f1_macro``, ``f1_micro``, ``roc_auc_micro``,
        ``roc_auc_macro`` and ``accuracy``. A ROC AUC value falls back to 0
        when scikit-learn reports it as undefined for the given labels.
    """
    # Sigmoid turns logits into independent per-label probabilities; convert
    # to NumPy so everything below works on plain arrays.
    probs = torch.sigmoid(
        torch.as_tensor(predictions, dtype=torch.float32)
    ).cpu().numpy()
    # Hard 0/1 decisions are only needed for F1 and accuracy.
    y_pred = (probs >= threshold).astype(float)
    y_true = labels

    # ROC AUC is a ranking metric, so it is scored on the probabilities rather
    # than on the thresholded decisions. Each average is computed on its own
    # so that one undefined value does not wipe out the other.
    roc_auc = {}
    for average in ('micro', 'macro'):
        try:
            roc_auc[average] = roc_auc_score(y_true, probs, average=average)
        except ValueError:
            # Raised by scikit-learn when the score is undefined, e.g. a label
            # column in y_true that contains a single class only.
            roc_auc[average] = 0

    # return as dictionary
    metrics = {'f1_macro': f1_score(y_true=y_true, y_pred=y_pred, average='macro'),
               'f1_micro': f1_score(y_true=y_true, y_pred=y_pred, average='micro'),
               'roc_auc_micro': roc_auc['micro'],
               'roc_auc_macro': roc_auc['macro'],
               'accuracy': accuracy_score(y_true, y_pred)}
    return metrics

def compute_metrics(p: EvalPrediction):
    """Adapter between the Trainer's evaluation output and ``multi_label_metrics``.

    When ``p.predictions`` is a tuple only its first element is used;
    ``p.label_ids`` is passed through as the ground truth.
    """
    raw_predictions = p.predictions
    if isinstance(raw_predictions, tuple):
        raw_predictions = raw_predictions[0]
    return multi_label_metrics(predictions=raw_predictions, labels=p.label_ids)

In [None]:
training_args = TrainingArguments(
    # --- where artefacts are written ---
    output_dir='./checkpoints',
    logging_dir='./logs',
    logging_steps=50,
    # --- optimisation schedule ---
    num_train_epochs=10,
    learning_rate=2e-5,
    warmup_steps=2000,
    weight_decay=0.01,
    # --- batch sizes (per device) ---
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    # --- evaluate and checkpoint once per epoch ---
    # NOTE(review): recent transformers releases rename `evaluation_strategy`
    # to `eval_strategy` — confirm against the installed version.
    evaluation_strategy="epoch",
    save_strategy="epoch",
)
# Display the device the Trainer will run on.
training_args.device

In [None]:
from torch.nn import BCEWithLogitsLoss

def calculate_pos_weights(class_counts, num_samples=None):
  """Return per-class ``pos_weight`` values (negatives / positives).

  Args:
    class_counts: Array-like with the number of positive samples per class.
    num_samples: Total number of samples the counts were taken from. When
      omitted, ``len(y_train)`` of the module-level ``y_train`` is used,
      which is the behaviour existing callers rely on.

  Returns:
    A float32 ``torch.Tensor`` with one weight per class.
  """
  if num_samples is None:
    num_samples = len(y_train)

  # Force a float dtype: with integer counts the weights would otherwise be
  # truncated to whole numbers.
  pos_counts = np.asarray(class_counts, dtype=np.float64)
  neg_counts = num_samples - pos_counts
  # The small epsilon avoids a division by zero for classes with no positives.
  pos_weights = neg_counts / (pos_counts + 1e-5)

  return torch.as_tensor(pos_weights, dtype=torch.float)

# Label matrix of the first `split_index` rows of `df` (the slice that the
# training/evaluation datasets were built from).
y_train = np.array(df["labels"].iloc[:split_index].to_list())
# Place the weights on the device the Trainer uses instead of hard-coding
# 'cuda', which raises on a CPU-only runtime.
weights = calculate_pos_weights(y_train.sum(axis=0)).to(training_args.device)
weights

In [None]:
# BCE-with-logits loss where each class's positive term is scaled by `weights`.
custom_loss = BCEWithLogitsLoss(pos_weight=weights)
class CustomTrainer(Trainer):
    """Trainer that swaps the model's own loss for the weighted BCE loss."""

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Run a forward pass and score the logits with ``custom_loss``.

        ``num_items_in_batch`` is accepted (and ignored) because recent
        ``Trainer`` versions pass it as a keyword; without it the override
        raises ``TypeError`` on those versions.

        Returns the loss, or ``(loss, outputs)`` when ``return_outputs`` is true.
        """
        target = inputs.get("labels")
        outputs = model(**inputs)
        logits = outputs.get("logits")

        loss = custom_loss(logits, target)
        return (loss, outputs) if return_outputs else loss

In [None]:
# Wire everything together: model and hyper-parameters, the train/eval
# partitions created earlier, and the tokenization/metric helpers.
trainer = CustomTrainer(
    args=training_args,
    model=model,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    train_dataset=train_dataset["train"],
    eval_dataset=train_dataset["test"],
)

In [None]:
# Fine-tune, then write the resulting model under MODEL_PATH.
trainer.train()
trainer.save_model(f"{MODEL_PATH}/trained_v1")