In [1]:
from datasets import load_dataset

# Login using e.g. `huggingface-cli login` to access this dataset
ds = load_dataset("kilian-group/arxiv-classifier", "default")

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Convert to pandas
train_df = ds["train"].to_pandas()
test_df  = ds["test"].to_pandas()

In [3]:
train_df = train_df[["field", "abstract"]]
test_df  = test_df[["field", "abstract"]]

In [4]:
train_df.head()

Unnamed: 0,field,abstract
0,cond-mat,An electric current controlled spin-wave logic...
1,cond-mat,We investigate nanoelectromechanical systems n...
2,cond-mat,We have investigated the polarization dependen...
3,cond-mat,The erasure of a classical bit of information ...
4,cond-mat,While mesoscopic conducting loops are sensitiv...


In [5]:
labels = sorted(train_df['field'].unique())
label2id = {lab: i for i, lab in enumerate(labels)}
id2label = {i: lab for lab, i in label2id.items()}
train_df['label_id'] = train_df['field'].map(label2id)

In [6]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.stem import WordNetLemmatizer

In [7]:
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words("english"))

In [8]:
def preprocess(text):
  text = str(text).lower()

  tokens = word_tokenize(text)

  tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]

  return " ".join(tokens)

In [9]:
train_df["new_abstract"] = train_df["abstract"].apply(preprocess)

In [10]:
test_df["new_abstract"] = test_df["abstract"].apply(preprocess)

In [17]:
test_df['label_id'] = test_df['field'].map(label2id)

In [18]:
train_df.head()

Unnamed: 0,field,abstract,label_id,new_abstract
0,cond-mat,An electric current controlled spin-wave logic...,1,electric current controlled spin-wave logic ga...
1,cond-mat,We investigate nanoelectromechanical systems n...,1,investigate nanoelectromechanical system near ...
2,cond-mat,We have investigated the polarization dependen...,1,investigated polarization dependence generatio...
3,cond-mat,The erasure of a classical bit of information ...,1,erasure classical bit information dissipative ...
4,cond-mat,While mesoscopic conducting loops are sensitiv...,1,mesoscopic conducting loop sensitive external ...


In [19]:
test_df.head()

Unnamed: 0,field,abstract,new_abstract,label_id
0,cond-mat,Proximity-induced magnetic effects on the surf...,proximity-induced magnetic effect surface dira...,1
1,cond-mat,We present the first experimental microwave re...,present first experimental microwave realizati...,1
2,cond-mat,We report on the effect of the lateral confine...,report effect lateral confinement perpendicula...,1
3,cond-mat,Measurement of gravitational Hawking radiation...,measurement gravitational hawking radiation bl...,1
4,cond-mat,We study the non-equilibrium evolution of conc...,study non-equilibrium evolution concurrence be...,1


In [20]:
train_df.to_csv("train_df_new.csv", index=False)
test_df.to_csv("test_df_new.csv", index=False)

In [14]:
import transformers
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from peft import LoraConfig, get_peft_model, TaskType
import torch, os




In [15]:
tokenizer_map = {
    "bert": "bert-base-uncased",
    "roberta": "roberta-base",
    "distilbert": "distilbert-base-uncased"
}

tokenizers = {k: AutoTokenizer.from_pretrained(v) for k, v in tokenizer_map.items()}

In [21]:
train_df.shape, test_df.shape

((108696, 4), (27178, 4))

In [22]:
from sklearn.model_selection import train_test_split

train_texts, val_texts, train_labels, val_labels = train_test_split(
    train_df["new_abstract"].astype(str).values,
    train_df["label_id"].values,
    test_size=0.2,
    random_state=42,
    stratify=train_df["label_id"].values
)

In [23]:
import numpy as np

def tokenizer_stats(sample_texts, tokenizers, max_length=40, n_samples=1000):
    sample = list(sample_texts)[:n_samples]
    stats = {}
    for name, tok in tokenizers.items():
        counts = []
        n_trunc = 0
        for t in sample:
            enc = tok(t, add_special_tokens=True)
            length = len(enc["input_ids"])
            counts.append(length)
            if length > max_length:
                n_trunc += 1
        stats[name] = {
            "avg_tokens": float(np.mean(counts)),
            "median_tokens": float(np.median(counts)),
            "max_tokens": int(np.max(counts)),
            "min_tokens": int(np.min(counts)),
            "percent_truncated_at_{}": 100.0 * n_trunc / len(sample)
        }
    return stats

tok_stats = tokenizer_stats(train_texts, tokenizers, max_length=200000, n_samples=min(3000, len(train_texts)))
pd.set_option('display.width', 120)
print(pd.DataFrame(tok_stats).T)

Token indices sequence length is longer than the specified maximum sequence length for this model (573 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (578 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (573 > 512). Running this sequence through the model will result in indexing errors


            avg_tokens  median_tokens  max_tokens  min_tokens  percent_truncated_at_{}
bert        142.904333          132.0       788.0         9.0                      0.0
roberta     141.450667          131.0       791.0         9.0                      0.0
distilbert  142.904333          132.0       788.0         9.0                      0.0


In [24]:
import datasets
from datasets import Dataset

def prepare_hf_dataset(texts, labels, tokenizer, max_length=40):
    ds = Dataset.from_dict({"text": list(texts), "label": list(labels)})
    def tokenize_fn(batch):
        return tokenizer(batch["text"], truncation=True, padding="max_length", max_length=max_length)
    ds = ds.map(tokenize_fn, batched=True)
    ds = ds.remove_columns([c for c in ds.column_names if c not in ("input_ids", "attention_mask", "label")])
    ds.set_format(type="torch")
    return ds

In [25]:
import os
import evaluate

metric_accuracy = evaluate.load("accuracy")
metric_f1 = evaluate.load("f1")
metric_precision = evaluate.load("precision")
metric_recall = evaluate.load("recall")

def compute_metrics(pred):
    labels = pred.label_ids
    preds = np.argmax(pred.predictions, axis=1)

    acc = metric_accuracy.compute(predictions=preds, references=labels)["accuracy"]
    f1w = metric_f1.compute(predictions=preds, references=labels, average="weighted")["f1"]
    precw = metric_precision.compute(predictions=preds, references=labels, average="weighted")["precision"]
    recw = metric_recall.compute(predictions=preds, references=labels, average="weighted")["recall"]

    return {"accuracy": acc, "precision": precw, "recall": recw, "f1": f1w}

In [26]:
import torch
print(torch.cuda.is_available())
print(torch.cuda.device_count())
print(torch.cuda.get_device_name(0) if torch.cuda.is_available() else "No GPU")

True
1
NVIDIA GeForce RTX 3060 Laptop GPU


In [27]:
models_to_train = {
    "bert": "bert-base-uncased",
    "roberta": "roberta-base",
}

In [28]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments, EarlyStoppingCallback
from peft import LoraConfig, get_peft_model, TaskType
import torch, os

MAX_LEN = 256
BATCH_SIZE = 16
TRAIN_EPOCHS = 10
OUTPUT_BASE = "./hf_finetune_results_lora"
os.makedirs(OUTPUT_BASE, exist_ok=True)

lora_config = LoraConfig(
    task_type=TaskType.SEQ_CLS,
    r=16,
    lora_alpha=32,
    lora_dropout=0.1,
    bias="none",
    target_modules=["query", "value"]
)

results = {}

for shortname, model_name in models_to_train.items():
    print(f"\n{'='*60}")
    print(f"Fine-tuning {shortname} with LoRA ({model_name})")

    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForSequenceClassification.from_pretrained(
        model_name,
        num_labels=len(labels)
    )

    model = get_peft_model(model, lora_config)
    model.print_trainable_parameters()

    ds_train = prepare_hf_dataset(train_texts, train_labels, tokenizer, max_length=MAX_LEN)
    ds_val   = prepare_hf_dataset(val_texts, val_labels, tokenizer, max_length=MAX_LEN)

    training_args = TrainingArguments(
        output_dir=os.path.join(OUTPUT_BASE, shortname),
        num_train_epochs=TRAIN_EPOCHS,
        per_device_train_batch_size=BATCH_SIZE,
        per_device_eval_batch_size=BATCH_SIZE,
        eval_strategy="epoch",
        logging_strategy="steps",
        logging_steps=50,
        save_strategy="epoch",
        learning_rate=2e-4,
        fp16=torch.cuda.is_available(),
        save_total_limit=2,
        metric_for_best_model="accuracy",
        load_best_model_at_end=True,
        greater_is_better=True,
        report_to=[]
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=ds_train,
        eval_dataset=ds_val,
        tokenizer=tokenizer,
        compute_metrics=compute_metrics,
        callbacks=[EarlyStoppingCallback(early_stopping_patience=2)]
    )

    trainer.train()
    eval_res = trainer.evaluate()
    print(f"Eval results for {shortname}:", eval_res)

    save_dir = os.path.join(OUTPUT_BASE, shortname, "lora_model")
    model.save_pretrained(save_dir)
    tokenizer.save_pretrained(save_dir)
    print(f"LoRA model saved to: {save_dir}")

    results[shortname] = {
        "model_name": model_name,
        "eval": eval_res,
        "save_dir": save_dir,
        "tokenizer": tokenizer
    }


Fine-tuning bert with LoRA (bert-base-uncased)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


trainable params: 605,204 || all params: 110,102,824 || trainable%: 0.5497


Map: 100%|██████████| 86956/86956 [00:25<00:00, 3451.14 examples/s]
Map: 100%|██████████| 21740/21740 [00:06<00:00, 3147.66 examples/s]
  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.8187,0.714693,0.764259,0.749723,0.764259,0.75132
2,0.6576,0.657484,0.778703,0.774051,0.778703,0.773024
3,0.5921,0.62692,0.786937,0.789634,0.786937,0.785747
4,0.5904,0.615041,0.79172,0.791565,0.79172,0.788816
5,0.5426,0.603629,0.798298,0.795184,0.798298,0.79348
6,0.5202,0.620735,0.801748,0.794798,0.801748,0.7948
7,0.4581,0.608116,0.800368,0.802468,0.800368,0.799877
8,0.4804,0.613065,0.80207,0.798134,0.80207,0.798755
9,0.4341,0.615886,0.803542,0.800311,0.803542,0.800609
10,0.4485,0.620778,0.804002,0.800638,0.804002,0.801487


  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


Eval results for bert: {'eval_loss': 0.6207776069641113, 'eval_accuracy': 0.804001839926403, 'eval_precision': 0.8006378045311333, 'eval_recall': 0.804001839926403, 'eval_f1': 0.8014869463100291, 'eval_runtime': 88.4036, 'eval_samples_per_second': 245.918, 'eval_steps_per_second': 15.373, 'epoch': 10.0}
LoRA model saved to: ./hf_finetune_results_lora\bert\lora_model

Fine-tuning roberta with LoRA (roberta-base)


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


trainable params: 1,195,796 || all params: 125,856,808 || trainable%: 0.9501


Map: 100%|██████████| 86956/86956 [00:19<00:00, 4556.03 examples/s]
Map: 100%|██████████| 21740/21740 [00:04<00:00, 4534.62 examples/s]
  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.8107,0.68506,0.766513,0.75833,0.766513,0.753315
2,0.6804,0.64306,0.779853,0.776004,0.779853,0.775114
3,0.6309,0.61117,0.786569,0.791824,0.786569,0.786448
4,0.5982,0.603888,0.797148,0.794186,0.797148,0.792715
5,0.5649,0.577379,0.806348,0.799309,0.806348,0.80041
6,0.4683,0.585844,0.803082,0.797998,0.803082,0.796692
7,0.4693,0.579764,0.80483,0.803974,0.80483,0.80314


Eval results for roberta: {'eval_loss': 0.5773788094520569, 'eval_accuracy': 0.8063477460901564, 'eval_precision': 0.7993087919329801, 'eval_recall': 0.8063477460901564, 'eval_f1': 0.8004100734877582, 'eval_runtime': 87.3598, 'eval_samples_per_second': 248.856, 'eval_steps_per_second': 15.556, 'epoch': 7.0}
LoRA model saved to: ./hf_finetune_results_lora\roberta\lora_model


In [31]:
models_to_train_fine_tune = {
    "distilbert": "distilbert-base-uncased",
}

MAX_LEN = 200
BATCH_SIZE = 16
TRAIN_EPOCHS = 10
OUTPUT_BASE = "./hf_finetune_results"

In [32]:
results = {}

for shortname, model_name in models_to_train_fine_tune.items():
    print("\n" + "="*60)
    print(f"Fine-tuning {shortname} ({model_name})")
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=len(labels))
    ds_train = prepare_hf_dataset(train_texts, train_labels, tokenizer, max_length=MAX_LEN)
    ds_val = prepare_hf_dataset(val_texts, val_labels, tokenizer, max_length=MAX_LEN)

    training_args = TrainingArguments(
        output_dir=os.path.join(OUTPUT_BASE, shortname),
        num_train_epochs=TRAIN_EPOCHS,
        per_device_train_batch_size=BATCH_SIZE,
        per_device_eval_batch_size=BATCH_SIZE,
        eval_strategy="epoch",
        logging_strategy="steps",
        logging_steps=50,
        save_strategy="no",
        learning_rate=2e-5,
        seed=42,
        load_best_model_at_end=False,
        metric_for_best_model="accuracy",
        greater_is_better=True,
        report_to=[],
        fp16=torch.cuda.is_available()
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=ds_train,
        eval_dataset=ds_val,
        compute_metrics=compute_metrics,
        callbacks=[EarlyStoppingCallback(early_stopping_patience=2)]
    )

    trainer.train()
    eval_res = trainer.evaluate()
    print(f"Eval results for {shortname}:", eval_res)

    save_dir = os.path.join(OUTPUT_BASE, shortname, "saved_model")
    model.save_pretrained(save_dir)
    tokenizer.save_pretrained(save_dir)

    print(f"Model saved to: {save_dir}")
    results[shortname] = {"model_name": model_name, "eval": eval_res, "tokenizer": tokenizer, "trainer": trainer}


Fine-tuning distilbert (distilbert-base-uncased)


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Map: 100%|██████████| 86956/86956 [00:23<00:00, 3689.61 examples/s]
Map: 100%|██████████| 21740/21740 [00:05<00:00, 3842.59 examples/s]
Using EarlyStoppingCallback without load_best_model_at_end=True. Once training is finished, the best model will not be loaded automatically.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.6938,0.623146,0.789604,0.782832,0.789604,0.78306
2,0.521,0.582351,0.803358,0.804867,0.803358,0.801852
3,0.3881,0.602608,0.807728,0.809451,0.807728,0.807187
4,0.3163,0.696542,0.806394,0.804799,0.806394,0.804333
5,0.2231,0.836809,0.80598,0.803361,0.80598,0.803787


Eval results for distilbert: {'eval_loss': 0.8368091583251953, 'eval_accuracy': 0.8059797608095676, 'eval_precision': 0.8033607066517179, 'eval_recall': 0.8059797608095676, 'eval_f1': 0.8037871826010916, 'eval_runtime': 32.1745, 'eval_samples_per_second': 675.691, 'eval_steps_per_second': 42.238, 'epoch': 5.0}
Model saved to: ./hf_finetune_results\distilbert\saved_model


In [35]:
import os
import torch
import pandas as pd
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from peft import PeftModel
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, classification_report

MAX_LEN = 200
FINETUNE_BASE = "./hf_finetune_results"
LORA_BASE = "./hf_finetune_results_lora"
BATCH_SIZE = 16

device = "cuda" if torch.cuda.is_available() else "cpu"
print("Using device:", device)

def load_model_auto(shortname, base_model_name):
    finetune_path = os.path.join(FINETUNE_BASE, shortname, "saved_model")
    lora_path = os.path.join(LORA_BASE, shortname, "lora_model")

    if os.path.exists(lora_path):
        print(f"[PEFT] Loading LoRA model for {shortname}")
        base = AutoModelForSequenceClassification.from_pretrained(
            base_model_name,
            num_labels=len(labels)
        )
        model = PeftModel.from_pretrained(base, lora_path)
        tokenizer = AutoTokenizer.from_pretrained(lora_path)

    elif os.path.exists(finetune_path):
        print(f"[FINETUNE] Loading fine-tuned model for {shortname}")
        tokenizer = AutoTokenizer.from_pretrained(finetune_path)
        model = AutoModelForSequenceClassification.from_pretrained(finetune_path)

    else:
        raise FileNotFoundError(
            f"Tidak menemukan model {shortname}\nDicari di:\n{finetune_path}\n{lora_path}"
        )

    model.to(device)
    model.eval()
    return tokenizer, model

def predict_batch(tokenizer, model, texts):
    all_preds = []
    for i in range(0, len(texts), BATCH_SIZE):
        batch_texts = texts[i:i+BATCH_SIZE]
        inputs = tokenizer(
            batch_texts,
            return_tensors="pt",
            truncation=True,
            padding=True,
            max_length=MAX_LEN
        )
        inputs = {k:v.to(device) for k,v in inputs.items()}
        with torch.no_grad():
            outputs = model(**inputs)
        batch_preds = torch.argmax(outputs.logits, dim=1).tolist()
        all_preds.extend(batch_preds)
    return all_preds

def evaluate_model(tokenizer, model, df, text_col="abstract", label_col="field"):
    true_labels = df[label_col].tolist()
    pred_ids = predict_batch(tokenizer, model, df[text_col].tolist())
    pred_labels = [id2label[i] for i in pred_ids]

    df["predicted"] = pred_labels
    true_ids = [label2id[l] for l in true_labels]

    accuracy = accuracy_score(true_ids, pred_ids)
    precision, recall, f1, _ = precision_recall_fscore_support(
        true_ids, pred_ids, average="weighted"
    )

    report = classification_report(true_ids, pred_ids, target_names=labels)
    return df, accuracy, precision, recall, f1, report

def evaluate_models(df, models_to_train, text_col="abstract", label_col="field", save_folder="predictions"):
    os.makedirs(save_folder, exist_ok=True)
    metrics_summary = []

    for shortname, base_model_name in models_to_train.items():
        print(f"\n=== Evaluating model: {shortname} ({base_model_name}) ===")
        tokenizer, model = load_model_auto(shortname, base_model_name)
        pred_df, acc, prec, rec, f1, report = evaluate_model(
            tokenizer, model, df.copy(), text_col=text_col, label_col=label_col
        )

        save_path = os.path.join(save_folder, f"{shortname}_pred.csv")
        pred_df.to_csv(save_path, index=False)

        print(f"\n>>> Evaluation for {shortname}")
        print(f"Accuracy:  {acc:.4f}")
        print(f"Precision: {prec:.4f}")
        print(f"Recall:    {rec:.4f}")
        print(f"F1-score:  {f1:.4f}")
        print("\nClassification Report:")
        print(report)

        metrics_summary.append({
            "model": shortname,
            "accuracy": acc,
            "precision": prec,
            "recall": rec,
            "f1": f1
        })

    return pd.DataFrame(metrics_summary)

models_to_train = {
    "bert": "bert-base-uncased",
    "roberta": "roberta-base",
    "distilbert": "distilbert-base-uncased",
}

summary_df = evaluate_models(
    test_df,
    models_to_train,
    text_col="abstract",
    label_col="field"
)

summary_df


Using device: cuda

=== Evaluating model: bert (bert-base-uncased) ===
[PEFT] Loading LoRA model for bert


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



>>> Evaluation for bert
Accuracy:  0.7959
Precision: 0.7926
Recall:    0.7959
F1-score:  0.7920

Classification Report:
              precision    recall  f1-score   support

    astro-ph       0.92      0.86      0.89      1397
    cond-mat       0.77      0.78      0.78      1800
          cs       0.82      0.88      0.85      6973
        econ       0.53      0.08      0.15       107
        eess       0.51      0.39      0.44       747
       gr-qc       0.57      0.67      0.61       200
      hep-ex       0.90      0.57      0.70       200
     hep-lat       0.79      0.86      0.83       200
      hep-ph       0.58      0.62      0.60       200
      hep-th       0.80      0.53      0.64       200
        math       0.89      0.88      0.89      5800
     math-ph       0.24      0.15      0.19       200
        nlin       0.66      0.70      0.68       876
     nucl-ex       0.72      0.40      0.51       200
     nucl-th       0.56      0.62      0.59       200
     physics  

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



>>> Evaluation for roberta
Accuracy:  0.7903
Precision: 0.7847
Recall:    0.7903
F1-score:  0.7847

Classification Report:
              precision    recall  f1-score   support

    astro-ph       0.89      0.88      0.89      1397
    cond-mat       0.77      0.71      0.74      1800
          cs       0.83      0.87      0.85      6973
        econ       0.69      0.10      0.18       107
        eess       0.55      0.38      0.45       747
       gr-qc       0.57      0.64      0.60       200
      hep-ex       0.77      0.65      0.71       200
     hep-lat       0.89      0.73      0.80       200
      hep-ph       0.69      0.47      0.56       200
      hep-th       0.66      0.62      0.64       200
        math       0.86      0.89      0.88      5800
     math-ph       0.33      0.07      0.11       200
        nlin       0.70      0.60      0.64       876
     nucl-ex       0.66      0.55      0.60       200
     nucl-th       0.57      0.70      0.63       200
     physic

Unnamed: 0,model,accuracy,precision,recall,f1
0,bert,0.795938,0.792639,0.795938,0.792033
1,roberta,0.790345,0.78468,0.790345,0.784674
2,distilbert,0.787843,0.787461,0.787843,0.784635
