In [1]:
import os
os.environ["WANDB_DISABLED"] = "true"

In [2]:
!pip install gurobipy[matrixapi]==11.0.2 -q
!pip install numpy==1.26.4 -q
!pip install scipy==1.13.1 -q

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [4]:
import pandas as pd

train_df = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/data/semeval_translated_csv/train.csv')
val_df = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/data/semeval_translated_csv/developed.csv')
test_df = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/data/semeval_translated_csv/testing.csv')

In [5]:
from datasets import load_dataset, DatasetDict, Dataset

dataset = DatasetDict({
    'train': Dataset.from_pandas(train_df),
    'validation': Dataset.from_pandas(val_df),
    'test': Dataset.from_pandas(test_df),
})

print(dataset)

DatasetDict({
    train: Dataset({
        features: ['teks', 'amarah', 'antisipasi', 'menjijikkan', 'takut', 'sukacita', 'Cinta', 'optimisme', 'pesimisme', 'kesedihan', 'kejutan', 'memercayai'],
        num_rows: 6838
    })
    validation: Dataset({
        features: ['teks', 'amarah', 'antisipasi', 'menjijikkan', 'takut', 'sukacita', 'Cinta', 'optimisme', 'pesimisme', 'kesedihan', 'kejutan', 'memercayai'],
        num_rows: 886
    })
    test: Dataset({
        features: ['teks', 'amarah', 'antisipasi', 'menjijikkan', 'takut', 'sukacita', 'Cinta', 'optimisme', 'pesimisme', 'kesedihan', 'kejutan', 'memercayai'],
        num_rows: 3259
    })
})


In [6]:
example = dataset['train'][0]
example

{'teks': ' khawatir masalah pembayaran mungkin tidak pernah joyce meyer motivasi kepemimpinan khawatir',
 'amarah': 0,
 'antisipasi': 1,
 'menjijikkan': 0,
 'takut': 0,
 'sukacita': 0,
 'Cinta': 0,
 'optimisme': 1,
 'pesimisme': 0,
 'kesedihan': 0,
 'kejutan': 0,
 'memercayai': 1}

In [7]:
labels = [label for label in dataset['train'].features.keys() if label not in ['teks']]
id2label = {idx:label for idx, label in enumerate(labels)}
label2id = {label:idx for idx, label in enumerate(labels)}
labels

['amarah',
 'antisipasi',
 'menjijikkan',
 'takut',
 'sukacita',
 'Cinta',
 'optimisme',
 'pesimisme',
 'kesedihan',
 'kejutan',
 'memercayai']

In [8]:
model = "indobenchmark/indobert-large-p1"

In [9]:
from transformers import AutoTokenizer
import numpy as np

tokenizer = AutoTokenizer.from_pretrained(model)

def preprocess_data(examples):
  # take a batch of texts
  text = examples["teks"]
  # encode them
  encoding = tokenizer(text, padding="max_length", truncation=True, max_length=128)
  # add labels
  labels_batch = {k: examples[k] for k in examples.keys() if k in labels}
  # create numpy array of shape (batch_size, num_labels)
  labels_matrix = np.zeros((len(text), len(labels)))
  # fill numpy array
  for idx, label in enumerate(labels):
    labels_matrix[:, idx] = labels_batch[label]

  encoding["labels"] = labels_matrix.tolist()

  return encoding

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [10]:
encoded_dataset = dataset.map(preprocess_data, batched=True, remove_columns=dataset['train'].column_names)

Map:   0%|          | 0/6838 [00:00<?, ? examples/s]

Map:   0%|          | 0/886 [00:00<?, ? examples/s]

Map:   0%|          | 0/3259 [00:00<?, ? examples/s]

In [11]:
example = encoded_dataset['train'][0]
print(example.keys())

dict_keys(['input_ids', 'token_type_ids', 'attention_mask', 'labels'])


In [12]:
tokenizer.decode(example['input_ids'])

'[CLS] khawatir masalah pembayaran mungkin tidak pernah joyce meyer motivasi kepemimpinan khawatir [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD]'

In [13]:
example['labels']

[0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0]

In [14]:
[id2label[idx] for idx, label in enumerate(example['labels']) if label == 1.0]

['antisipasi', 'optimisme', 'memercayai']

In [15]:
encoded_dataset.set_format("torch")

## Define model


In [16]:
from transformers import AutoModelForSequenceClassification

model = AutoModelForSequenceClassification.from_pretrained(model,
                                                           problem_type="multi_label_classification",
                                                           num_labels=len(labels),
                                                           id2label=id2label,
                                                           label2id=label2id)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at indobenchmark/indobert-large-p1 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


## Train the model!

In [17]:
batch_size = 8
metric_name = "f1"

In [18]:
from transformers import TrainingArguments, Trainer

args = TrainingArguments(
    f"bert-finetuned-sem_eval-indo",
    eval_strategy = "epoch",
    save_strategy = "epoch",
    learning_rate=1e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=3,
    weight_decay=0.01,
    load_best_model_at_end=True,
    metric_for_best_model=metric_name,
    report_to="none",
    fp16=True,
    #push_to_hub=True,
)

In [19]:
from sklearn.metrics import f1_score, roc_auc_score, accuracy_score
from transformers import EvalPrediction
import torch

# source: https://jesusleal.io/2021/04/21/Longformer-multilabel-classification/
def multi_label_metrics(predictions, labels, threshold=0.5):
    # first, apply sigmoid on predictions which are of shape (batch_size, num_labels)
    sigmoid = torch.nn.Sigmoid()
    probs = sigmoid(torch.Tensor(predictions))
    # next, use threshold to turn them into integer predictions
    y_pred = np.zeros(probs.shape)
    y_pred[np.where(probs >= threshold)] = 1
    # finally, compute metrics
    y_true = labels
    f1_micro_average = f1_score(y_true=y_true, y_pred=y_pred, average='micro')
    roc_auc = roc_auc_score(y_true, y_pred, average = 'micro')
    accuracy = accuracy_score(y_true, y_pred)
    # return as dictionary
    metrics = {'f1': f1_micro_average,
               'roc_auc': roc_auc,
               'accuracy': accuracy}
    return metrics

def compute_metrics(p: EvalPrediction):
    preds = p.predictions[0] if isinstance(p.predictions,
            tuple) else p.predictions
    result = multi_label_metrics(
        predictions=preds,
        labels=p.label_ids)
    return result

In [20]:
encoded_dataset['train'][0]['labels'].type()

'torch.FloatTensor'

In [21]:
encoded_dataset['train']['input_ids'][0]

tensor([    2,  4079,   805,  2857,   647,   119,   746, 17900,  2523, 16772,
            7,  3933,  5158,  4079,     3,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0])

In [22]:
#forward pass
outputs = model(input_ids=encoded_dataset['train']['input_ids'][0].unsqueeze(0), labels=encoded_dataset['train'][0]['labels'].unsqueeze(0))
outputs

We strongly recommend passing in an `attention_mask` since your input_ids may be padded. See https://huggingface.co/docs/transformers/troubleshooting#incorrect-output-when-padding-tokens-arent-masked.


SequenceClassifierOutput(loss=tensor(0.7088, grad_fn=<BinaryCrossEntropyWithLogitsBackward0>), logits=tensor([[ 0.0368, -0.5629,  0.0130, -0.2670,  0.3499, -0.3690, -0.0226,  0.1000,
         -0.3795, -0.4256, -0.4143]], grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)

In [23]:
trainer = Trainer(
    model,
    args,
    train_dataset=encoded_dataset["train"],
    eval_dataset=encoded_dataset["validation"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

  trainer = Trainer(


In [24]:
trainer.train()

Epoch,Training Loss,Validation Loss,F1,Roc Auc,Accuracy
1,0.4216,0.33598,0.647589,0.756965,0.26298
2,0.3054,0.332711,0.653179,0.761099,0.259594
3,0.2717,0.326685,0.67024,0.774526,0.257336


TrainOutput(global_step=2565, training_loss=0.32290644431904275, metrics={'train_runtime': 317.7843, 'train_samples_per_second': 64.553, 'train_steps_per_second': 8.072, 'total_flos': 4779555334414848.0, 'train_loss': 0.32290644431904275, 'epoch': 3.0})

## Evaluate

In [25]:
trainer.evaluate()

{'eval_loss': 0.3266851603984833,
 'eval_f1': 0.6702399183256764,
 'eval_roc_auc': 0.7745262468820893,
 'eval_accuracy': 0.25733634311512416,
 'eval_runtime': 3.207,
 'eval_samples_per_second': 276.274,
 'eval_steps_per_second': 34.612,
 'epoch': 3.0}

In [28]:
# Save the trained model to Google Drive
model_save_path = '/content/drive/MyDrive/Colab Notebooks/bert-large-p2-finetuned-sem_eval-indo.pth'
torch.save(model.state_dict(), model_save_path)
print(f"Model saved to {model_save_path}")

Model saved to /content/drive/MyDrive/Colab Notebooks/bert-large-p2-finetuned-sem_eval-indo.pth


## Inference

In [27]:
text = "Dan apabila dikatakan kepada mereka, Marilah kepada apa yang telah diturunkan Allah dan kepada Rasul, kamu akan melihat orang-orang munafik berpaling darimu karena takut."

encoding = tokenizer(text, return_tensors="pt")
encoding = {k: v.to(trainer.model.device) for k,v in encoding.items()}

outputs = trainer.model(**encoding)

logits = outputs.logits
logits.shape
# apply sigmoid + threshold
sigmoid = torch.nn.Sigmoid()
probs = sigmoid(logits.squeeze().cpu())
predictions = np.zeros(probs.shape)
predictions[np.where(probs >= 0.5)] = 1
# turn predicted id's into actual label names
predicted_labels = [id2label[idx] for idx, label in enumerate(predictions) if label == 1.0]
print(predicted_labels)

['takut']
