# dataset

In [1]:
import torch
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
import evaluate

In [2]:
dataset = load_dataset("uitnlp/vietnamese_students_feedback")

dataset

DatasetDict({
    train: Dataset({
        features: ['sentence', 'sentiment', 'topic'],
        num_rows: 11426
    })
    validation: Dataset({
        features: ['sentence', 'sentiment', 'topic'],
        num_rows: 1583
    })
    test: Dataset({
        features: ['sentence', 'sentiment', 'topic'],
        num_rows: 3166
    })
})

In [3]:
model_name = "vinai/phobert-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)


def preprocess_function(examples):
    tokenized = tokenizer(examples["sentence"], truncation=True, max_length=256)
    tokenized['labels'] = examples["sentiment"]
    return tokenized

encoded_dataset = dataset.map(preprocess_function, batched=True, remove_columns=["sentence", "sentiment", "topic"])

In [4]:
encoded_dataset

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 11426
    })
    validation: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 1583
    })
    test: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 3166
    })
})

# Train

In [5]:
from transformers import DataCollatorWithPadding

data_collator = DataCollatorWithPadding(tokenizer=tokenizer, padding="longest")

In [6]:
id2label = {0: "negative", 1: "neutral", 2: "positive"}
label2id = {"negative": 0, "neutral": 1, "positive": 2}
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=3,
    id2label=id2label,
    label2id=label2id,
)

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at vinai/phobert-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [7]:
accuracy = evaluate.load("accuracy")
precision = evaluate.load("precision")
recall = evaluate.load("recall")
f1 = evaluate.load("f1")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = logits.argmax(axis=-1)
    return {
        "accuracy": accuracy.compute(predictions=preds, references=labels)["accuracy"],
        "precision": precision.compute(predictions=preds, references=labels, average="weighted")["precision"],
        "recall": recall.compute(predictions=preds, references=labels, average="weighted")["recall"],
        "f1": f1.compute(predictions=preds, references=labels, average="weighted")["f1"],
    }

In [8]:
training_args = TrainingArguments(
    output_dir="phobert-vn-student-feedback-sentiment",     
    eval_strategy="epoch",
    save_strategy="epoch",
    logging_strategy="epoch",
    metric_for_best_model="eval_loss",
    learning_rate=2e-5,
    per_device_train_batch_size=128,
    per_device_eval_batch_size=128,
    num_train_epochs=10,
    weight_decay=0.05,
    save_total_limit=1,
    bf16=True,
    report_to="none",
    lr_scheduler_type="cosine"
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=encoded_dataset['train'],
    eval_dataset=encoded_dataset['validation'],
    processing_class=tokenizer,
    compute_metrics=compute_metrics,
    data_collator=data_collator
)

In [9]:
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.4461,0.265263,0.917246,0.875346,0.917246,0.895614
2,0.2248,0.202529,0.935565,0.932736,0.935565,0.9334
3,0.1747,0.195603,0.942514,0.93937,0.942514,0.94036
4,0.1432,0.205318,0.939356,0.936007,0.939356,0.935709
5,0.114,0.20803,0.938092,0.934747,0.938092,0.935575
6,0.0973,0.219273,0.943146,0.940211,0.943146,0.940231
7,0.084,0.22517,0.941883,0.939411,0.941883,0.940098
8,0.0752,0.231822,0.943146,0.940879,0.943146,0.940506
9,0.0664,0.23523,0.942514,0.93989,0.942514,0.939735
10,0.0668,0.233579,0.943778,0.941315,0.943778,0.941191


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


TrainOutput(global_step=900, training_loss=0.14925370163387722, metrics={'train_runtime': 138.0976, 'train_samples_per_second': 827.386, 'train_steps_per_second': 6.517, 'total_flos': 4001831749228320.0, 'train_loss': 0.14925370163387722, 'epoch': 10.0})

In [11]:
torch.cuda.empty_cache()

In [12]:
trainer.push_to_hub("binhphap5/phobert-vn-student-feedback-sentiment")

model.safetensors:   0%|          | 0.00/540M [00:00<?, ?B/s]

Upload 2 LFS files:   0%|          | 0/2 [00:00<?, ?it/s]

training_args.bin:   0%|          | 0.00/5.30k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/binhphap5/phobert-vn-student-feedback-sentiment/commit/158982b9d1ff9627d232b226de54cae5db91e218', commit_message='binhphap5/phobert-vn-student-feedback-sentiment', commit_description='', oid='158982b9d1ff9627d232b226de54cae5db91e218', pr_url=None, repo_url=RepoUrl('https://huggingface.co/binhphap5/phobert-vn-student-feedback-sentiment', endpoint='https://huggingface.co', repo_type='model', repo_id='binhphap5/phobert-vn-student-feedback-sentiment'), pr_revision=None, pr_num=None)