In [1]:
import json
from datasets import load_from_disk

# Load the train and test splits that are stored on disk in `datasets` format.
dataset_train_loaded = load_from_disk("./topic_classification/train_dataset")
dataset_test_loaded = load_from_disk("./topic_classification/test_dataset")

print("Dataset Train loaded:", dataset_train_loaded)
print("Dataset Test loaded:", dataset_test_loaded)

# Read the JSON side files as UTF-8 explicitly so decoding does not depend on
# the platform's default locale encoding.
with open("label_mapping.json", "r", encoding="utf-8") as f:
    label_mapping_loaded = json.load(f)

print("Label Mapping loaded:", label_mapping_loaded)

with open("class_weight.json", "r", encoding="utf-8") as f:
    class_weight_loaded = json.load(f)

# JSON object keys are always strings; re-cast them to integer label ids.
class_weight_dict = {int(k): v for k, v in class_weight_loaded.items()}

# Fail early, with a readable message, if any label id has no class weight;
# otherwise the gap only surfaces later as a bare KeyError when the weight
# tensor is built.
missing_weight_ids = set(label_mapping_loaded.values()) - set(class_weight_dict)
if missing_weight_ids:
    raise ValueError(
        f"class_weight.json has no weight for label ids: {sorted(missing_weight_ids)}"
    )

print("Class Weights loaded (re-casted):", class_weight_dict)

  from .autonotebook import tqdm as notebook_tqdm


Dataset Train loaded: Dataset({
    features: ['final_text', 'label'],
    num_rows: 86941
})
Dataset Test loaded: Dataset({
    features: ['final_text', 'label'],
    num_rows: 21736
})
Label Mapping loaded: {'astro-ph': 0, 'cond-mat': 1, 'cs': 2, 'econ': 3, 'eess': 4, 'gr-qc': 5, 'hep-ex': 6, 'hep-lat': 7, 'hep-ph': 8, 'hep-th': 9, 'math': 10, 'math-ph': 11, 'nlin': 12, 'nucl-ex': 13, 'nucl-th': 14, 'physics': 15, 'q-bio': 16, 'q-fin': 17, 'quant-ph': 18, 'stat': 19}
Class Weights loaded (re-casted): {0: 0.9724944071588367, 1: 0.7553518679409209, 2: 0.19485633600788918, 3: 12.673615160349854, 4: 1.8127814845704755, 5: 6.792265625, 6: 6.792265625, 7: 6.792265625, 8: 6.813557993730408, 9: 6.792265625, 10: 0.2343423180592992, 11: 6.792265625, 12: 1.552517857142857, 13: 6.802895148669797, 14: 6.792265625, 15: 0.3266003005259204, 16: 0.8681945276612743, 17: 1.1803013847407005, 18: 6.802895148669797, 19: 1.69806640625}


In [2]:
import pandas as pd

# Materialise the first five rows of each split as DataFrames so they render
# as rich tables. The printed headings are Indonesian for "First 5 rows".
df_sample_train = pd.DataFrame(dataset_train_loaded[:5])
df_sample_test = pd.DataFrame(dataset_test_loaded[:5])

for _heading, _preview in (
    ("\n5 Data Pertama (Train)", df_sample_train),
    ("\n5 Data Pertama (Test)", df_sample_test),
):
    print(_heading)
    display(_preview)


5 Data Pertama (Train)


Unnamed: 0,final_text,label
0,ante forecast outcome interpreted counterfactu...,17
1,gaming customizing individual character create...,2
2,framework european research project meteomet l...,15
3,software product quality defined feature chara...,2
4,optimizing communication imperative large scal...,2



5 Data Pertama (Test)


Unnamed: 0,final_text,label
0,work propose use dropout bayesian estimator in...,4
1,given simple polygon [eq] consisting [eq] vert...,2
2,pattern stored within pre trained deep neural ...,2
3,ride sharing service gaining popularity crucia...,2
4,perform experiment phase simulation ring netwo...,1


In [3]:
from datasets import ClassLabel

# Hold-out fraction and RNG seed of the train/validation split, named so they
# can be tuned in one place.
VAL_FRACTION = 0.2
SPLIT_SEED = 42

num_classes = len(label_mapping_loaded)

# `train_test_split(stratify_by_column=...)` needs a ClassLabel column, so the
# integer `label` column is cast before splitting.
dataset_train_loaded = dataset_train_loaded.cast_column(
    "label",
    ClassLabel(num_classes=num_classes)
)

print("Tipe kolom label sekarang:", dataset_train_loaded.features['label'])

# Stratify on the label so both parts keep the class proportions.
split_result = dataset_train_loaded.train_test_split(
    test_size=VAL_FRACTION,
    seed=SPLIT_SEED,
    stratify_by_column="label"
)

dataset_train_final = split_result['train']
dataset_val_final = split_result['test']

# The two parts must partition the original training set exactly.
assert len(dataset_train_final) + len(dataset_val_final) == len(dataset_train_loaded), \
    "train/validation split lost or duplicated rows"

print("="*30)
print(f"Original Train: {len(dataset_train_loaded)}")
print(f"New Train (80%): {len(dataset_train_final)}")
print(f"New Val   (20%): {len(dataset_val_final)}")
print(f"Original Test : {len(dataset_test_loaded)}")
print("="*30)

Tipe kolom label sekarang: ClassLabel(names=['0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12', '13', '14', '15', '16', '17', '18', '19'])
Original Train: 86941
New Train (80%): 69552
New Val   (20%): 17389
Original Test : 21736


In [4]:
import transformers
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from peft import LoraConfig, get_peft_model, TaskType
import torch, os




In [5]:
# Short model key -> checkpoint name whose tokenizer is loaded below.
tokenizer_map = dict(
    bert="bert-base-uncased",
    roberta="roberta-base",
    distilbert="distilbert-base-uncased",
)

# Eagerly instantiate one tokenizer per key, in the mapping's insertion order.
# NOTE(review): the training loop further down loads its own tokenizer per
# model, so this dict looks unused in the visible cells; confirm before removing.
tokenizers = dict(
    zip(tokenizer_map, map(AutoTokenizer.from_pretrained, tokenizer_map.values()))
)

In [6]:
# Per-model list of module names handed to `LoraConfig(target_modules=...)`
# in the training loop; keyed by the same short names as `models_to_train`.
target_modules_map = dict(
    bert=["query", "value"],
    roberta=["query", "value"],
    distilbert=["q_lin", "v_lin"],
)

In [7]:
# Short model key -> checkpoint name to fine-tune; the training loop iterates
# over this mapping in insertion order.
models_to_train = dict(
    bert="bert-base-uncased",
    roberta="roberta-base",
    distilbert="distilbert-base-uncased",
)

In [8]:
# Order the per-class weights by label id (0 .. num_classes - 1). The raw JSON
# dict is keyed by the id's string form, hence the `str` conversion.
weights_list = list(
    map(class_weight_loaded.__getitem__, map(str, range(num_classes)))
)
# float32 tensor whose entry i is the weight of label id i.
weights_tensor = torch.as_tensor(weights_list, dtype=torch.float32)

In [9]:
# Show the class-weight tensor built in the previous cell (entry i belongs to label id i).
print("Weights Tensor loaded:", weights_tensor)

Weights Tensor loaded: tensor([ 0.9725,  0.7554,  0.1949, 12.6736,  1.8128,  6.7923,  6.7923,  6.7923,
         6.8136,  6.7923,  0.2343,  6.7923,  1.5525,  6.8029,  6.7923,  0.3266,
         0.8682,  1.1803,  6.8029,  1.6981])


In [10]:
from torch import nn

class WeightedTrainer(Trainer):
    """`Trainer` subclass that replaces the loss with class-weighted cross-entropy.

    Args:
        class_weights: tensor passed as `weight` to `nn.CrossEntropyLoss`;
            entry i is the weight of label id i.
        *args, **kwargs: forwarded unchanged to `Trainer.__init__`.
    """

    def __init__(self, class_weights, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.class_weights = class_weights

    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        """Run the model on `inputs` and return the weighted cross-entropy loss.

        Args:
            model: the (possibly wrapped) model to call.
            inputs: batch dict; must contain a "labels" entry.
            return_outputs: when true, return `(loss, outputs)` instead of `loss`.
            **kwargs: accepted and ignored, so extra keyword arguments passed
                by the caller do not break the override.

        Raises:
            ValueError: if the batch carries no "labels".
        """
        labels = inputs.get("labels")
        if labels is None:
            # Without this guard the failure is an opaque AttributeError on None.
            raise ValueError("WeightedTrainer.compute_loss requires 'labels' in the batch")

        outputs = model(**inputs)
        logits = outputs.get("logits")

        # Take the device from the logits instead of `model.device`: the model
        # passed in may be a wrapper (e.g. torch.nn.DataParallel on multi-GPU)
        # that does not expose a `.device` attribute.
        if self.class_weights.device != logits.device:
            # Cache the moved tensor so the copy happens once, not every step.
            self.class_weights = self.class_weights.to(logits.device)

        loss_fct = nn.CrossEntropyLoss(weight=self.class_weights)
        loss = loss_fct(logits.view(-1, self.model.config.num_labels), labels.view(-1))

        return (loss, outputs) if return_outputs else loss

In [11]:
import numpy as np
from sklearn.metrics import accuracy_score, f1_score

def compute_metrics(eval_pred):
    """Score one evaluation pass.

    Args:
        eval_pred: pair that unpacks into `(logits, labels)`.

    Returns:
        dict with "accuracy" and "f1", where F1 is the support-weighted
        average over classes. Predictions are the argmax over the last
        axis of the logits.
    """
    raw_scores, gold = eval_pred
    predicted = np.argmax(raw_scores, axis=-1)
    return dict(
        accuracy=accuracy_score(gold, predicted),
        f1=f1_score(gold, predicted, average='weighted'),
    )

In [14]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments, EarlyStoppingCallback, DataCollatorWithPadding
from peft import LoraConfig, get_peft_model, TaskType
import torch, os
import gc

MAX_LEN = 256
BATCH_SIZE = 16
TRAIN_EPOCHS = 15
OUTPUT_BASE = "./hf_finetune_results_lora_baru"
# Set to True to train with WeightedTrainer (class-weighted cross-entropy,
# using `weights_tensor`) instead of the stock Trainer. The default keeps the
# unweighted loss.
USE_CLASS_WEIGHTS = False
os.makedirs(OUTPUT_BASE, exist_ok=True)

# shortname -> {"model_name", "eval", "save_dir"} for every model trained below.
results = {}

# Loop-invariant values, computed once instead of once per model.
num_labels = dataset_train_final.features['label'].num_classes
cols_to_keep = ["input_ids", "attention_mask", "label"]
use_fp16 = torch.cuda.is_available()

for shortname, model_name in models_to_train.items():
    print(f"\n{'='*60}")
    print(f"Fine-tuning {shortname} with LoRA ({model_name})")

    tokenizer = AutoTokenizer.from_pretrained(model_name)

    # LoRA adapter on the per-architecture target modules; the base weights
    # stay frozen (see the trainable-parameter summary printed below).
    lora_config = LoraConfig(
        task_type=TaskType.SEQ_CLS,
        r=16,
        lora_alpha=32,
        lora_dropout=0.1,
        bias="none",
        target_modules=target_modules_map[shortname]
    )

    model = AutoModelForSequenceClassification.from_pretrained(
        model_name,
        num_labels=num_labels
    )

    model = get_peft_model(model, lora_config)
    model.print_trainable_parameters()

    def tokenize_function(examples):
        """Tokenize a batch of `final_text`, truncating to MAX_LEN tokens.

        No padding happens here: the data collator pads each batch to its
        longest sequence, so short texts are not blown up to MAX_LEN.
        """
        return tokenizer(examples["final_text"], truncation=True, max_length=MAX_LEN)

    print("Tokenizing datasets...")
    tokenized_train = dataset_train_final.map(tokenize_function, batched=True)
    tokenized_val = dataset_val_final.map(tokenize_function, batched=True)

    # Keep only the columns the model consumes (drops the raw text and any
    # extra tokenizer outputs).
    tokenized_train = tokenized_train.remove_columns([c for c in tokenized_train.column_names if c not in cols_to_keep])
    tokenized_val = tokenized_val.remove_columns([c for c in tokenized_val.column_names if c not in cols_to_keep])

    tokenized_train.set_format("torch")
    tokenized_val.set_format("torch")

    # Dynamic per-batch padding; a multiple of 8 is used when fp16 is enabled.
    data_collator = DataCollatorWithPadding(
        tokenizer=tokenizer,
        pad_to_multiple_of=8 if use_fp16 else None
    )

    training_args = TrainingArguments(
        output_dir=os.path.join(OUTPUT_BASE, shortname),
        num_train_epochs=TRAIN_EPOCHS,
        per_device_train_batch_size=BATCH_SIZE,
        per_device_eval_batch_size=BATCH_SIZE,
        eval_strategy="epoch",
        logging_strategy="steps",
        logging_steps=50,
        save_strategy="epoch",
        learning_rate=2e-4,
        fp16=use_fp16,
        save_total_limit=2,
        metric_for_best_model="accuracy",
        load_best_model_at_end=True,
        greater_is_better=True,
        report_to=[]
    )

    # NOTE(review): the recorded run output shows a warning at the Trainer
    # construction, presumably the `tokenizer=` deprecation in favour of
    # `processing_class`; confirm against the installed transformers version
    # before renaming the argument.
    trainer_kwargs = dict(
        model=model,
        args=training_args,
        train_dataset=tokenized_train,
        eval_dataset=tokenized_val,
        tokenizer=tokenizer,
        data_collator=data_collator,
        compute_metrics=compute_metrics,
        # Stop when validation accuracy has not improved for 3 evaluations.
        callbacks=[EarlyStoppingCallback(early_stopping_patience=3)]
    )

    if USE_CLASS_WEIGHTS:
        trainer = WeightedTrainer(class_weights=weights_tensor, **trainer_kwargs)
    else:
        trainer = Trainer(**trainer_kwargs)

    trainer.train()
    # `load_best_model_at_end=True`, so this evaluates the best checkpoint.
    eval_res = trainer.evaluate()
    print(f"Eval results for {shortname}:", eval_res)

    save_dir = os.path.join(OUTPUT_BASE, shortname, "lora_model")
    model.save_pretrained(save_dir)
    tokenizer.save_pretrained(save_dir)
    print(f"LoRA model saved to: {save_dir}")

    results[shortname] = {
        "model_name": model_name,
        "eval": eval_res,
        "save_dir": save_dir
    }

    # Drop the references and release cached GPU memory before the next model.
    del model
    del trainer
    del tokenizer

    gc.collect()

    if torch.cuda.is_available():
        torch.cuda.empty_cache()

    print("Memory cleaned. Moving to next model...\n")


Fine-tuning bert with LoRA (bert-base-uncased)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


trainable params: 605,204 || all params: 110,102,824 || trainable%: 0.5497
Tokenizing datasets...


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.7155,0.74963,0.754845,0.739696
2,0.7505,0.678268,0.777388,0.76909
3,0.6215,0.660733,0.778883,0.776652
4,0.6383,0.642162,0.787107,0.7764
5,0.5674,0.622501,0.793893,0.788006
6,0.54,0.628467,0.791305,0.781334
7,0.5533,0.634377,0.78935,0.787166
8,0.5675,0.640683,0.797228,0.792306
9,0.4441,0.652289,0.79142,0.789576
10,0.4415,0.653185,0.799471,0.795319


Eval results for bert: {'eval_loss': 0.6669399738311768, 'eval_accuracy': 0.8001035137155673, 'eval_f1': 0.7938357584784669, 'eval_runtime': 69.0177, 'eval_samples_per_second': 251.95, 'eval_steps_per_second': 15.75, 'epoch': 14.0}
LoRA model saved to: ./hf_finetune_results_lora_baru\bert\lora_model
Memory cleaned. Moving to next model...


Fine-tuning roberta with LoRA (roberta-base)


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


trainable params: 1,195,796 || all params: 125,856,808 || trainable%: 0.9501
Tokenizing datasets...


Map: 100%|██████████| 69552/69552 [00:09<00:00, 7362.12 examples/s]
Map: 100%|██████████| 17389/17389 [00:02<00:00, 7901.51 examples/s]
  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.7064,0.706525,0.762033,0.751367
2,0.7553,0.657006,0.776985,0.770139
3,0.5816,0.658303,0.774628,0.774339
4,0.6229,0.628491,0.786934,0.780786
5,0.5842,0.602436,0.79602,0.790673
6,0.5326,0.611981,0.796135,0.788371
7,0.5495,0.615564,0.79349,0.791946
8,0.5395,0.618119,0.800391,0.796021
9,0.4392,0.629794,0.794525,0.793502
10,0.4176,0.623446,0.800219,0.797508


Eval results for roberta: {'eval_loss': 0.6181185841560364, 'eval_accuracy': 0.8003910518143654, 'eval_f1': 0.7960207895554434, 'eval_runtime': 68.7845, 'eval_samples_per_second': 252.804, 'eval_steps_per_second': 15.803, 'epoch': 11.0}
LoRA model saved to: ./hf_finetune_results_lora_baru\roberta\lora_model
Memory cleaned. Moving to next model...


Fine-tuning distilbert with LoRA (distilbert-base-uncased)


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


trainable params: 900,884 || all params: 67,869,736 || trainable%: 1.3274
Tokenizing datasets...


Map: 100%|██████████| 69552/69552 [00:09<00:00, 6978.85 examples/s]
Map: 100%|██████████| 17389/17389 [00:02<00:00, 7642.92 examples/s]
  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.6475,0.680203,0.769222,0.761052
2,0.6842,0.628379,0.786244,0.779889
3,0.5407,0.624778,0.786934,0.785292
4,0.6115,0.597923,0.796135,0.788947
5,0.5563,0.586558,0.801886,0.797399
6,0.4786,0.602516,0.800391,0.793325
7,0.4885,0.609154,0.799701,0.795568
8,0.46,0.625849,0.801081,0.797072


Eval results for distilbert: {'eval_loss': 0.5865576267242432, 'eval_accuracy': 0.8018862499281155, 'eval_f1': 0.7973991076830566, 'eval_runtime': 36.9492, 'eval_samples_per_second': 470.619, 'eval_steps_per_second': 29.419, 'epoch': 8.0}
LoRA model saved to: ./hf_finetune_results_lora_baru\distilbert\lora_model
Memory cleaned. Moving to next model...



In [None]:
# Test

import gc
import json
import os

import numpy as np
import torch
from datasets import load_from_disk, ClassLabel
from peft import PeftModel, PeftConfig
from sklearn.metrics import accuracy_score, f1_score, classification_report
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments

OUTPUT_BASE = "./hf_finetune_results_lora_baru"
MAX_LEN = 256
BATCH_SIZE = 16
models_to_test = ["bert", "roberta", "distilbert"]

print("Loading Test Dataset...")
dataset_test_loaded = load_from_disk("./topic_classification/test_dataset")

# Read as UTF-8 explicitly so decoding does not depend on the platform default.
with open("label_mapping.json", "r", encoding="utf-8") as f:
    label_mapping = json.load(f)

num_labels_fixed = len(label_mapping)
print(f"Number of labels identified: {num_labels_fixed}")

dataset_test_loaded = dataset_test_loaded.cast_column(
    "label",
    ClassLabel(num_classes=num_labels_fixed)
)

# Loop-invariant report metadata: class names ordered by label id, plus the
# full id list so the report always has one row per class.
target_names = [k for k, v in sorted(label_mapping.items(), key=lambda item: item[1])]
report_label_ids = list(range(num_labels_fixed))

device = "cuda" if torch.cuda.is_available() else "cpu"


def compute_metrics(eval_pred):
    """Return accuracy and support-weighted F1 for a `(logits, labels)` pair.

    Redefined in this cell so the test section does not rely on a definition
    from another cell.
    """
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    acc = accuracy_score(labels, predictions)
    f1 = f1_score(labels, predictions, average='weighted')
    return {"accuracy": acc, "f1": f1}


# shortname -> metrics dict returned by `trainer.predict`.
test_results = {}

for shortname in models_to_test:
    print(f"\n{'='*60}")
    print(f"TESTING MODEL: {shortname}")

    adapter_path = os.path.join(OUTPUT_BASE, shortname, "lora_model")

    # Skip models whose adapter was never saved instead of aborting the run.
    if not os.path.exists(adapter_path):
        print(f"Path not found: {adapter_path}")
        continue

    peft_config = PeftConfig.from_pretrained(adapter_path)
    tokenizer = AutoTokenizer.from_pretrained(adapter_path)

    # Rebuild the base model with a head of the right size, then attach the
    # saved adapter from `adapter_path` on top of it.
    base_model = AutoModelForSequenceClassification.from_pretrained(
        peft_config.base_model_name_or_path,
        num_labels=num_labels_fixed,
        ignore_mismatched_sizes=True
    )

    model = PeftModel.from_pretrained(base_model, adapter_path)

    model.to(device)
    model.eval()

    print(f"Tokenizing test data for {shortname}...")
    def tokenize_fn(examples):
        """Tokenize a batch of `final_text`, truncated/padded to MAX_LEN."""
        return tokenizer(examples["final_text"], truncation=True, padding="max_length", max_length=MAX_LEN)

    test_tokenized = dataset_test_loaded.map(tokenize_fn, batched=True)

    cols_to_keep = ["input_ids", "attention_mask", "label"]
    test_tokenized = test_tokenized.remove_columns([c for c in test_tokenized.column_names if c not in cols_to_keep])
    test_tokenized.set_format("torch")

    # Explicit arguments so BATCH_SIZE is actually applied, temporary files
    # stay under OUTPUT_BASE, and no external reporting is enabled.
    eval_args = TrainingArguments(
        output_dir=os.path.join(OUTPUT_BASE, shortname, "test_eval"),
        per_device_eval_batch_size=BATCH_SIZE,
        report_to=[]
    )

    trainer = Trainer(
        model=model,
        args=eval_args,
        tokenizer=tokenizer,
        compute_metrics=compute_metrics
    )

    print(f"Running prediction on {len(test_tokenized)} samples...")
    predictions_output = trainer.predict(test_tokenized)

    # `predict` prefixes metric names with "test_".
    metrics = predictions_output.metrics

    y_preds = np.argmax(predictions_output.predictions, axis=1)
    y_true = predictions_output.label_ids

    print(f"\nRESULT FOR {shortname.upper()}:")
    print(f"Accuracy: {metrics['test_accuracy']:.4f}")
    print(f"F1 Score: {metrics['test_f1']:.4f}")
    print("-" * 30)

    print("Classification Report:")
    # `labels=` pins the row set to all classes, so `target_names` always
    # lines up even if a class is absent from both y_true and y_preds.
    print(classification_report(y_true, y_preds, labels=report_label_ids, target_names=target_names))

    test_results[shortname] = metrics

    # Release the model and cached GPU memory before loading the next one.
    del model
    del base_model
    del trainer
    del tokenizer
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()

print("\nAll Testing Finished!")
print(test_results)

Loading Test Dataset...
Number of labels identified: 20


Casting the dataset: 100%|██████████| 21736/21736 [00:00<00:00, 724499.48 examples/s]


🧪 TESTING MODEL: bert



Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Tokenizing test data for bert...


Map: 100%|██████████| 21736/21736 [00:03<00:00, 6746.94 examples/s]
  trainer = Trainer(


Running prediction on 21736 samples...



📊 RESULT FOR BERT:
Accuracy: 0.8019
F1 Score: 0.7956
------------------------------
Classification Report:
              precision    recall  f1-score   support

    astro-ph       0.87      0.90      0.89      1117
    cond-mat       0.76      0.78      0.77      1439
          cs       0.82      0.89      0.85      5578
        econ       0.35      0.16      0.22        86
        eess       0.56      0.34      0.42       599
       gr-qc       0.68      0.54      0.60       160
      hep-ex       0.80      0.76      0.78       160
     hep-lat       0.78      0.85      0.81       160
      hep-ph       0.70      0.61      0.65       160
      hep-th       0.68      0.58      0.63       160
        math       0.88      0.90      0.89      4637
     math-ph       0.37      0.12      0.19       160
        nlin       0.68      0.66      0.67       700
     nucl-ex       0.71      0.65      0.68       160
     nucl-th       0.59      0.57      0.58       160
     physics       0.78 

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Tokenizing test data for roberta...


Map: 100%|██████████| 21736/21736 [00:02<00:00, 8803.52 examples/s]
  trainer = Trainer(


Running prediction on 21736 samples...



📊 RESULT FOR ROBERTA:
Accuracy: 0.7991
F1 Score: 0.7950
------------------------------
Classification Report:
              precision    recall  f1-score   support

    astro-ph       0.88      0.90      0.89      1117
    cond-mat       0.75      0.77      0.76      1439
          cs       0.83      0.87      0.85      5578
        econ       0.54      0.29      0.38        86
        eess       0.51      0.43      0.47       599
       gr-qc       0.57      0.79      0.66       160
      hep-ex       0.81      0.74      0.77       160
     hep-lat       0.85      0.80      0.83       160
      hep-ph       0.66      0.58      0.62       160
      hep-th       0.71      0.59      0.64       160
        math       0.88      0.90      0.89      4637
     math-ph       0.35      0.09      0.14       160
        nlin       0.64      0.70      0.67       700
     nucl-ex       0.66      0.65      0.66       160
     nucl-th       0.58      0.61      0.60       160
     physics       0.

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Tokenizing test data for distilbert...


Map: 100%|██████████| 21736/21736 [00:02<00:00, 8023.84 examples/s]
  trainer = Trainer(


Running prediction on 21736 samples...



📊 RESULT FOR DISTILBERT:
Accuracy: 0.8026
F1 Score: 0.7985
------------------------------
Classification Report:
              precision    recall  f1-score   support

    astro-ph       0.89      0.91      0.90      1117
    cond-mat       0.80      0.71      0.75      1439
          cs       0.84      0.86      0.85      5578
        econ       0.57      0.29      0.38        86
        eess       0.55      0.42      0.48       599
       gr-qc       0.62      0.63      0.63       160
      hep-ex       0.72      0.82      0.76       160
     hep-lat       0.81      0.78      0.79       160
      hep-ph       0.58      0.69      0.63       160
      hep-th       0.68      0.60      0.64       160
        math       0.88      0.91      0.89      4637
     math-ph       0.36      0.17      0.23       160
        nlin       0.66      0.67      0.67       700
     nucl-ex       0.68      0.58      0.63       160
     nucl-th       0.71      0.51      0.59       160
     physics      