In [None]:
# Update Notes by Ertugrul:
# - FocalLoss
# - Objective metric
# - data_collator 
# - Hyperparameter search
# - Deterministic

In [None]:
import os

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import numpy as np
import torch
import torch.nn as nn


import datasets
from datasets import load_dataset, load_metric
from sklearn.metrics import classification_report, f1_score
from scipy.special import expit as sigmoid

import optuna
import random

from transformers import TrainingArguments, Trainer, AutoModelForSequenceClassification, AutoTokenizer


os.environ["WANDB_DISABLED"] = "true"
os.environ['TRANSFORMERS_NO_ADVISORY_WARNINGS'] = 'true'

In [None]:
def set_seed(seed: int = 42):
    '''Seed every random number generator used by the notebook.

    Seeds NumPy's global generator, Python's ``random`` module and PyTorch
    (CPU and all CUDA devices), switches cuDNN to deterministic mode and
    exports ``PYTHONHASHSEED``.

    Args:
        seed: Integer seed shared by all generators. Defaults to 42.

    Returns:
        A ``np.random.RandomState`` instance seeded with ``seed``.
    '''
    np.random.seed(seed)
    random_state = np.random.RandomState(seed)
    random.seed(seed)
    torch.manual_seed(seed)
    # Seed every visible GPU, not only the current one; a no-op without CUDA.
    torch.cuda.manual_seed_all(seed)
    # Trade cuDNN autotuning speed for run-to-run reproducibility.
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    # NOTE: hash randomization of the running interpreter is fixed at start-up,
    # so this only influences processes launched after this call.
    os.environ['PYTHONHASHSEED'] = str(seed)
    return random_state


# Seed all RNGs once; the seeded RandomState returned by set_seed is kept in `random_state`.
random_state = set_seed(42)
# NOTE(review): presumably required so cuBLAS can run deterministically on CUDA
# (used together with full_determinism=True below) — confirm against the
# PyTorch reproducibility notes.
%env CUBLAS_WORKSPACE_CONFIG=:4096:8

In [None]:
# Download the multi-label configuration of the intent classification dataset.
intent = load_dataset(
    "deprem-private/deprem_intent_classification",
    "intent_multilabel",
)

In [None]:
# Class name -> class index, in the order used by the dataset.
name2ix = {
    'Alakasiz': 0,
    'Barinma': 1,
    'Elektronik': 2,
    'Giysi': 3,
    'Kurtarma': 4,
    'Lojistik': 5,
    'Saglik': 6,
    'Su': 7,
    'Yagma': 8,
    'Yemek': 9,
}
# Inverse lookup: class index -> class name.
ix2name = dict(zip(name2ix.values(), name2ix.keys()))

In [None]:
# Materialise each dataset split as a pandas DataFrame.
df_train, df_val, df_test = (
    pd.DataFrame.from_records(list(intent[split]))
    for split in ("train", "validation", "test")
)

In [None]:
label_col = "labels"
text_col = "text_cleaned"


def _keep_labelled(frame):
    """Return `frame` without rows whose label is null, re-indexed from 0."""
    has_label = frame[label_col].notna()
    return frame.loc[has_label].reset_index(drop=True)


df_train = _keep_labelled(df_train)
df_val = _keep_labelled(df_val)
df_test = _keep_labelled(df_test)

In [None]:
# Peek at one cleaned training text.
df_train.loc[1, text_col]

In [None]:
from sklearn.preprocessing import MultiLabelBinarizer

# Multi-hot encode the training labels with one column per class index in name2ix.
mlb = MultiLabelBinarizer(classes=[*name2ix.values()])
mlb_labels = mlb.fit_transform(df_train[label_col].tolist())

In [None]:
# Inspect the multi-hot label matrix produced by the binarizer above.
mlb_labels

In [None]:
# Sorted list of every distinct label that occurs in the training split.
labels = sorted({l for row_labels in df_train.labels.values for l in row_labels})
print(labels)

# Position of each label in the sorted list; used to build multi-hot targets.
label2idx = dict(zip(labels, range(len(labels))))
print(label2idx)

In [None]:

from transformers import DataCollatorWithPadding

# Pretrained checkpoint shared by the tokenizer and the classifier.
model_name = "dbmdz/bert-base-turkish-128k-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Collator configured with padding="max_length" and max_length=128.
data_collator = DataCollatorWithPadding(
    tokenizer,
    padding="max_length",
    max_length=128,
)

In [None]:
# Token-count distribution of the training texts, truncated at the same
# max_length (128) that IntentDataset uses.
# The counts are taken from `text_col` ("text_cleaned"), i.e. the column the
# model is actually trained on, rather than the raw "text" column.
token_counts = [
    len(tokenizer.encode(text, max_length=128, truncation=True))
    for text in df_train[text_col]
]

ax = sns.histplot(token_counts)
ax.set(xlabel="tokens per example", title="Token counts (train, truncated at 128)")

plt.xlim([0, 128]);

In [None]:
class IntentDataset(torch.utils.data.Dataset):
    """Torch dataset over a DataFrame with `text_cleaned` and `labels` columns.

    Each item is tokenized on access with the module-level `tokenizer`
    (truncated to 128 tokens) and paired with a multi-hot float label vector.
    """

    def __init__(self, df, num_classes=len(labels)):
        self.df = df
        self.num_classes = num_classes

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        sample = self.df.iloc[idx]
        target = self._encode_label(sample.labels)
        tokens = tokenizer(sample.text_cleaned, max_length=128, truncation=True)
        # Tokenizer outputs become int64 tensors; the target is float32.
        item = {
            name: torch.tensor(values, dtype=torch.int64)
            for name, values in tokens.items()
        }
        item["labels"] = torch.tensor(target, dtype=torch.float32)
        return item

    def _encode_label(self, input_labels):
        """Return a multi-hot vector of length `num_classes` for `input_labels`."""
        multi_hot = np.zeros(self.num_classes)
        for name in input_labels:
            position = label2idx[name]
            multi_hot[position] = 1.0
        return multi_hot

In [None]:
# Build a dataset over the training frame.
# NOTE(review): `ds` is not referenced again in the visible notebook — confirm it is still needed.
ds = IntentDataset(df_train)

In [None]:
def model_init():
    """Build a fresh multi-label sequence classifier from `model_name`.

    Passed to the Trainer as `model_init`, so it is called whenever a new
    model instance is needed (e.g. once per hyperparameter-search trial).

    Returns:
        The pretrained model with a classification head of `len(labels)`
        outputs, placed on the GPU when one is available and on the CPU
        otherwise.
    """
    # Avoid a hard-coded 'cuda' so the notebook also runs on CPU-only machines.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = AutoModelForSequenceClassification.from_pretrained(
        model_name,
        problem_type="multi_label_classification",
        num_labels=len(labels),
        label2id=name2ix,
        id2label=ix2name,
    )
    return model.to(device)

In [None]:
# Positive count per class over the training rows.
occs = mlb_labels[df_train.index].sum(axis=0)

# Inverse-frequency ratios, normalised so the smallest ratio is 1,
# then dampened with a cube root.
label_counts = mlb_labels.sum(axis=0)
occ_ratios = mlb_labels.sum() / label_counts
occ_ratios = occ_ratios / occ_ratios.min()
occ_ratios = occ_ratios ** (1 / 3)

class_weights = {
    class_ix: weight
    for class_ix, weight in zip(np.arange(mlb_labels.shape[1]), occ_ratios)
}
class_weights

In [None]:
class FocalLoss(nn.Module):
    """Focal loss on raw logits for multi-label classification.

    The per-element loss is ``alpha * (1 - pt) ** gamma * BCE`` with
    ``pt = exp(-BCE)``. Elements whose target is 1 are additionally scaled by
    the matching entry of ``pos_weight`` (elements with target 0 keep weight 1).

    Args:
        pos_weight: 1-D tensor (or sequence) with one weight per class, or
            ``None`` to disable class weighting. It is moved to the device of
            the logits on every call, so it may live on any device.
        alpha: Global scaling factor of the loss.
        gamma: Focusing exponent; larger values down-weight easy examples more.
        reduction: ``'mean'``, ``'sum'``, or anything else for no reduction.
    """

    def __init__(self, pos_weight, alpha=0.1, gamma=2., reduction='mean'):
        super().__init__()
        self.alpha = alpha
        self.gamma = gamma
        self.reduction = reduction
        if pos_weight is not None:
            pos_weight = torch.as_tensor(pos_weight, dtype=torch.float32)
        # A buffer follows the module on .to(device) and is not a trainable parameter.
        self.register_buffer("pos_weight", pos_weight)

    def forward(self, inputs, targets):
        """Return the focal loss between `inputs` (logits) and `targets`."""
        bce = nn.functional.binary_cross_entropy_with_logits(
            inputs, targets, reduction='none'
        )
        # pt is the probability the model assigns to the true outcome.
        pt = torch.exp(-bce)
        focal = self.alpha * (1 - pt) ** self.gamma * bce

        if self.pos_weight is not None:
            weight = self.pos_weight.to(device=focal.device, dtype=focal.dtype)
            # Equals `weight` where target == 1 and 1 where target == 0.
            focal = focal * (1 + (weight - 1) * targets)

        if self.reduction == 'mean':
            return focal.mean()
        if self.reduction == 'sum':
            return focal.sum()
        return focal

In [None]:
class ImbalancedTrainer(Trainer):
    """Trainer that replaces the model's loss with a class-weighted FocalLoss.

    Args:
        inp_class_weights: Mapping of class index to weight; the values, in
            iteration order, are used as the per-class `pos_weight`.
        *args, **kwargs: Forwarded unchanged to `Trainer`.
    """

    def __init__(self, inp_class_weights, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # Keep the weights on the device the Trainer runs on instead of
        # assuming CUDA is present.
        self.class_weights = torch.tensor(
            list(inp_class_weights.values()), dtype=torch.float32
        ).to(self.args.device)
        # Built once; the loss has no per-step state.
        self.criterion = FocalLoss(pos_weight=self.class_weights)

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Compute the training loss for one batch.

        When a label smoother is configured it is used as in the stock
        Trainer; otherwise the focal loss is computed from the model logits
        and `inputs['labels']`.

        Args:
            model: Model to run the forward pass on.
            inputs: Batch dict passed to the model as keyword arguments.
            return_outputs: If True, return `(loss, outputs)` instead of `loss`.
            num_items_in_batch: Accepted so that Trainer versions which pass
                this argument can call the override; it is not used.
        """
        if self.label_smoother is not None and "labels" in inputs:
            labels = inputs.pop("labels")
        else:
            labels = None
        outputs = model(**inputs)
        # Save past state if it exists
        if self.args.past_index >= 0:
            self._past = outputs[self.args.past_index]

        if labels is not None:
            loss = self.label_smoother(outputs, labels)
        else:
            # Custom part: ignore the loss returned by the model and score
            # the logits with the focal loss instead.
            logits = outputs['logits']
            loss = self.criterion(logits, inputs['labels'])

        return (loss, outputs) if return_outputs else loss

In [None]:
def compute_metrics(pred):
    """Return micro and macro F1 for a Trainer evaluation result.

    Logits in `pred.predictions` are passed through a sigmoid and thresholded
    at 0.5 before being compared with `pred.label_ids`.
    """
    probabilities = sigmoid(pred.predictions)
    hard_preds = (probabilities > 0.5).astype(float)
    report = classification_report(
        pred.label_ids,
        hard_preds,
        zero_division=0,
        output_dict=True,
    )
    micro_f1 = report["micro avg"]["f1-score"]
    macro_f1 = report["macro avg"]["f1-score"]
    return {"micro f1": micro_f1, "macro f1": macro_f1}

In [None]:
batch_size = 16
# Number of optimisation steps between evaluations: a quarter of the batches
# in one epoch (at `batch_size`), minus one.
batches_per_epoch = np.ceil(len(df_train) / batch_size)
step_size = int(batches_per_epoch / 4) - 1

In [None]:
# Set to False to skip the Optuna search. `best_run` is only defined when the
# search has run.
HP_SEARCH = True

if HP_SEARCH:
    # Learning rate, train batch size, epochs, weight decay, scheduler and
    # warm-up are left unset here because hp_space() supplies them per trial.
    basic_args = TrainingArguments(
        f"turkish_multilabel_intent_{model_name.split('/')[-1]}",
        fp16=True,
        evaluation_strategy="epoch",
        save_strategy="no",
        per_device_eval_batch_size=batch_size*2,
        load_best_model_at_end=False,
        metric_for_best_model="macro f1",
        seed=42,
        data_seed=42,
        dataloader_num_workers=0,
        full_determinism=True,
        group_by_length=True,
    )

    trainer = ImbalancedTrainer(
        inp_class_weights=class_weights,
        model_init=model_init,
        args=basic_args,
        data_collator=data_collator,
        train_dataset=IntentDataset(df_train),
        eval_dataset=IntentDataset(df_val),
        compute_metrics=compute_metrics,
    )

    def hp_space(trial):
        """Sample one set of training hyperparameters from an Optuna trial."""
        return {
            "num_train_epochs": trial.suggest_int("num_train_epochs", 2, 5),
            "learning_rate": trial.suggest_float("learning_rate", 1e-7, 1e-4),
            "weight_decay": trial.suggest_float("weight_decay", 0.001, 0.1),
            "lr_scheduler_type": trial.suggest_categorical("lr_scheduler_type", ["linear", "cosine"]),
            # `step` is passed by keyword: positional use is deprecated in Optuna 3.
            "per_device_train_batch_size": trial.suggest_int("per_device_train_batch_size", 8, 32, step=8),
            "warmup_steps": trial.suggest_int("warmup_steps", 0, 150, step=10),
        }

    def compute_objective(metrics):
        """Objective to maximise: macro F1 on the evaluation set."""
        return metrics["eval_macro f1"]

    best_run = trainer.hyperparameter_search(
        n_trials=10,
        direction="maximize",
        hp_space=hp_space,
        compute_objective=compute_objective,
    )

In [None]:
# Show the best trial returned by the hyperparameter search
# (`best_run` only exists when HP_SEARCH was True).
best_run

In [None]:
# Read the tuned values by field name: indexing with [-1] assumes that
# `hyperparameters` is the last field of BestRun, which does not hold in
# transformers versions that append further fields to it.
best_hp = best_run.hyperparameters

# NOTE(review): the search also tunes `num_train_epochs`, but the final run
# keeps a fixed 4 epochs — confirm this is intentional.
# NOTE(review): `step_size` was derived from `batch_size`, not from the tuned
# per_device_train_batch_size — confirm the evaluation cadence is as intended.
training_args = TrainingArguments(
    f"turkish_multilabel_intent_{model_name.split('/')[-1]}",
    fp16=True,
    evaluation_strategy="steps",
    save_strategy="steps",
    learning_rate=best_hp['learning_rate'],
    per_device_train_batch_size=best_hp['per_device_train_batch_size'],
    per_device_eval_batch_size=batch_size*2,
    num_train_epochs=4,
    load_best_model_at_end=True,
    metric_for_best_model="macro f1",
    eval_steps=step_size,
    save_steps=step_size,
    logging_steps=step_size,
    seed=42,
    data_seed=42,
    dataloader_num_workers=0,
    lr_scheduler_type=best_hp['lr_scheduler_type'],
    warmup_steps=best_hp['warmup_steps'],    # number of warmup steps for learning rate scheduler
    weight_decay=best_hp['weight_decay'],    # strength of weight decay
    save_total_limit=1,                      # limit the total amount of checkpoints. Deletes the older checkpoints.
    full_determinism=True,
    group_by_length=True,
)

trainer = ImbalancedTrainer(
    inp_class_weights=class_weights,
    model_init=model_init,
    args=training_args,
    data_collator=data_collator,
    train_dataset=IntentDataset(df_train),
    eval_dataset=IntentDataset(df_val),
    compute_metrics=compute_metrics,
)

trainer.train()

In [None]:
# Run the trained model on the test split; `preds` is used by the evaluation cells below.
preds = trainer.predict(IntentDataset(df_test))

In [None]:
# Tune the decision threshold on the validation split. Selecting it on the
# test split and then reporting test scores with it would leak test labels
# into model selection and make the reported score optimistic.
val_preds = trainer.predict(IntentDataset(df_val))
val_true = val_preds.label_ids.astype(int)
val_probs = sigmoid(val_preds.predictions)

# Fall back to the conventional 0.5 if no candidate scores above zero.
best_thr = 0.5
best_score = 0.

for threshold in np.arange(.1, 1., .03):
    score = f1_score(
        val_true,
        (val_probs > threshold).astype(int),
        average="macro",
        zero_division=0,  # classes with no predictions score 0 without warnings
    )
    if score > best_score:
        best_score = score
        best_thr = threshold

# best_score is the macro F1 on the validation split at best_thr.
best_thr, best_score

In [None]:
# Per-class report on the test predictions, binarised at the selected threshold.
test_true = preds.label_ids.astype(int)
test_pred = (sigmoid(preds.predictions) > best_thr).astype(int)
print(classification_report(test_true, test_pred, target_names=name2ix.keys()))