In [1]:
from datasets import load_dataset

# Login using e.g. `huggingface-cli login` to access this dataset
ds = load_dataset("kilian-group/arxiv-classifier", "default")

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Convert to pandas
train_df = ds["train"].to_pandas()
test_df  = ds["test"].to_pandas()

# Stratify by column "field"
train_small_df, _ = train_test_split(
    train_df,
    train_size=5000,
    stratify=train_df["field"],
    random_state=42
)

test_small_df, _ = train_test_split(
    test_df,
    train_size=1000,
    stratify=test_df["field"],
    random_state=42
)

In [3]:
train_small_df = train_small_df[["field", "fulltext"]]
test_small_df  = test_small_df[["field", "fulltext"]]

In [4]:
train_small_df.head()

Unnamed: 0,field,fulltext
35286,math,\nA new fusion procedure for the Brauer algebr...
88609,cond-mat,Neural-network quantum state tomography for ma...
42384,q-bio,A little walk from physical to biological comp...
23420,cs,\nFrom individual to population:\nChallenges i...
100500,physics,Acoustic flat lensing using an indefinite medi...


In [5]:
labels = sorted(train_small_df['field'].unique())
label2id = {lab: i for i, lab in enumerate(labels)}
id2label = {i: lab for lab, i in label2id.items()}
train_small_df['label_id'] = train_small_df['field'].map(label2id)

In [6]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.stem import WordNetLemmatizer

In [7]:
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words("english"))

In [8]:
def preprocess(text):
  text = str(text).lower()

  tokens = word_tokenize(text)

  tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]

  return " ".join(tokens)

In [9]:
train_small_df["new_full_text"] = train_small_df["fulltext"].apply(preprocess)

In [10]:
import transformers, torch
print(transformers.__version__)
print(torch.__version__)

4.57.1
2.5.1+cu121


In [12]:
import transformers
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from peft import LoraConfig, get_peft_model, TaskType
import torch, os




In [None]:
tokenizer_map = {
    "bert": "bert-base-uncased",
    "roberta": "roberta-base",
    "distilbert": "distilbert-base-uncased"
}

tokenizers = {k: AutoTokenizer.from_pretrained(v) for k, v in tokenizer_map.items()}

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


In [14]:
from sklearn.model_selection import train_test_split

train_texts, val_texts, train_labels, val_labels = train_test_split(
    train_small_df["new_full_text"].astype(str).values,
    train_small_df["label_id"].values,
    test_size=0.2,
    random_state=42,
    stratify=train_small_df["label_id"].values
)

In [15]:
import numpy as np

def tokenizer_stats(sample_texts, tokenizers, max_length=40, n_samples=1000):
    sample = list(sample_texts)[:n_samples]
    stats = {}
    for name, tok in tokenizers.items():
        counts = []
        n_trunc = 0
        for t in sample:
            enc = tok(t, add_special_tokens=True)
            length = len(enc["input_ids"])
            counts.append(length)
            if length > max_length:
                n_trunc += 1
        stats[name] = {
            "avg_tokens": float(np.mean(counts)),
            "median_tokens": float(np.median(counts)),
            "max_tokens": int(np.max(counts)),
            "min_tokens": int(np.min(counts)),
            "percent_truncated_at_{}": 100.0 * n_trunc / len(sample)
        }
    return stats

tok_stats = tokenizer_stats(train_texts, tokenizers, max_length=200000, n_samples=min(3000, len(train_texts)))
pd.set_option('display.width', 120)
print(pd.DataFrame(tok_stats).T)

Token indices sequence length is longer than the specified maximum sequence length for this model (15472 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (15523 > 512). Running this sequence through the model will result in indexing errors


           avg_tokens  median_tokens  max_tokens  min_tokens  percent_truncated_at_{}
bert     12098.378667         9421.5    308821.0         2.0                 0.066667
roberta  12753.700667         9856.0    321580.0       436.0                 0.100000
deberta  12753.700667         9856.0    321580.0       436.0                 0.100000


In [16]:
import datasets
from datasets import Dataset

def prepare_hf_dataset(texts, labels, tokenizer, max_length=40):
    ds = Dataset.from_dict({"text": list(texts), "label": list(labels)})
    def tokenize_fn(batch):
        return tokenizer(batch["text"], truncation=True, padding="max_length", max_length=max_length)
    ds = ds.map(tokenize_fn, batched=True)
    ds = ds.remove_columns([c for c in ds.column_names if c not in ("input_ids", "attention_mask", "label")])
    ds.set_format(type="torch")
    return ds

In [32]:
models_to_train = {
    "bert": "bert-base-uncased",
    "roberta": "roberta-base",
}

MAX_LEN = 200
BATCH_SIZE = 16
TRAIN_EPOCHS = 1
OUTPUT_BASE = "./hf_finetune_results"

In [22]:
import os
import evaluate

os.makedirs(OUTPUT_BASE, exist_ok=True)

metric_accuracy = evaluate.load("accuracy")
metric_f1 = evaluate.load("f1")
metric_precision = evaluate.load("precision")
metric_recall = evaluate.load("recall")

def compute_metrics(pred):
    labels = pred.label_ids
    preds = np.argmax(pred.predictions, axis=1)

    acc = metric_accuracy.compute(predictions=preds, references=labels)["accuracy"]
    f1w = metric_f1.compute(predictions=preds, references=labels, average="weighted")["f1"]
    precw = metric_precision.compute(predictions=preds, references=labels, average="weighted")["precision"]
    recw = metric_recall.compute(predictions=preds, references=labels, average="weighted")["recall"]

    return {"accuracy": acc, "precision": precw, "recall": recw, "f1": f1w}

In [23]:
import torch
print(torch.cuda.is_available())
print(torch.cuda.device_count())
print(torch.cuda.get_device_name(0) if torch.cuda.is_available() else "No GPU")

True
1
NVIDIA GeForce RTX 3060 Laptop GPU


In [24]:
results = {}

for shortname, model_name in models_to_train.items():
    print("\n" + "="*60)
    print(f"Fine-tuning {shortname} ({model_name})")
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=len(labels))
    ds_train = prepare_hf_dataset(train_texts, train_labels, tokenizer, max_length=MAX_LEN)
    ds_val = prepare_hf_dataset(val_texts, val_labels, tokenizer, max_length=MAX_LEN)

    training_args = TrainingArguments(
        output_dir=os.path.join(OUTPUT_BASE, shortname),
        num_train_epochs=TRAIN_EPOCHS,
        per_device_train_batch_size=BATCH_SIZE,
        per_device_eval_batch_size=BATCH_SIZE,
        eval_strategy="epoch",
        logging_strategy="steps",
        logging_steps=50,
        save_strategy="no",
        learning_rate=2e-5,
        seed=42,
        load_best_model_at_end=False,
        report_to=[],
        fp16=torch.cuda.is_available()
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=ds_train,
        eval_dataset=ds_val,
        compute_metrics=compute_metrics
    )

    trainer.train()
    eval_res = trainer.evaluate()
    print(f"Eval results for {shortname}:", eval_res)

    save_dir = os.path.join(OUTPUT_BASE, shortname, "saved_model")
    model.save_pretrained(save_dir)
    tokenizer.save_pretrained(save_dir)

    print(f"Model saved to: {save_dir}")
    results[shortname] = {"model_name": model_name, "eval": eval_res, "tokenizer": tokenizer, "trainer": trainer}


Fine-tuning bert (bert-base-uncased)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Map: 100%|██████████| 4000/4000 [01:33<00:00, 42.93 examples/s]
Map: 100%|██████████| 1000/1000 [00:25<00:00, 39.00 examples/s]


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.2651,1.184704,0.721,0.622345,0.721,0.665635


  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


Eval results for bert: {'eval_loss': 1.1847039461135864, 'eval_accuracy': 0.721, 'eval_precision': 0.62234500975747, 'eval_recall': 0.721, 'eval_f1': 0.6656353230281403, 'eval_runtime': 2.7989, 'eval_samples_per_second': 357.277, 'eval_steps_per_second': 22.508, 'epoch': 1.0}
Model saved to: ./hf_finetune_results\bert\saved_model

Fine-tuning roberta (roberta-base)


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Map: 100%|██████████| 4000/4000 [00:39<00:00, 101.96 examples/s]
Map: 100%|██████████| 1000/1000 [00:12<00:00, 81.36 examples/s]


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.4003,1.346017,0.627,0.554996,0.627,0.565865


  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


Eval results for roberta: {'eval_loss': 1.3460173606872559, 'eval_accuracy': 0.627, 'eval_precision': 0.554995664531772, 'eval_recall': 0.627, 'eval_f1': 0.5658652632064733, 'eval_runtime': 3.2183, 'eval_samples_per_second': 310.724, 'eval_steps_per_second': 19.576, 'epoch': 1.0}
Model saved to: ./hf_finetune_results\roberta\saved_model

Fine-tuning distilbert (distilbert-base-uncased)


To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Map: 100%|██████████| 4000/4000 [01:25<00:00, 46.84 examples/s]
Map: 100%|██████████| 1000/1000 [00:27<00:00, 36.99 examples/s]


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.3888,1.322352,0.674,0.581264,0.674,0.60825


  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


Eval results for distilbert: {'eval_loss': 1.3223522901535034, 'eval_accuracy': 0.674, 'eval_precision': 0.5812637288651479, 'eval_recall': 0.674, 'eval_f1': 0.6082499699791217, 'eval_runtime': 1.9651, 'eval_samples_per_second': 508.885, 'eval_steps_per_second': 32.06, 'epoch': 1.0}
Model saved to: ./hf_finetune_results\distilbert\saved_model


In [25]:
rows = []
for k, v in results.items():
    ev = v["eval"]
    rows.append({
        "model": k,
        "accuracy": ev.get("eval_accuracy", ev.get("accuracy")),
        "precision": ev.get("eval_precision", ev.get("precision")),
        "recall": ev.get("eval_recall", ev.get("recall")),
        "f1": ev.get("eval_f1", ev.get("f1"))
    })
df_comp = pd.DataFrame(rows).sort_values("f1", ascending=False).reset_index(drop=True)
print("\nModel comparison on validation set:")
print(df_comp)


Model comparison on validation set:
        model  accuracy  precision  recall        f1
0        bert     0.721   0.622345   0.721  0.665635
1  distilbert     0.674   0.581264   0.674  0.608250
2     roberta     0.627   0.554996   0.627  0.565865


In [26]:
def load_trained_model(path):
    tokenizer = AutoTokenizer.from_pretrained(path)
    model = AutoModelForSequenceClassification.from_pretrained(path)
    return tokenizer, model

In [27]:
save_dir = os.path.join(OUTPUT_BASE, "bert", "saved_model")

tokenizer, model = load_trained_model(save_dir)
model.eval()

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

In [28]:
def predict_text(text):
    inputs = tokenizer(
        text,
        return_tensors="pt",
        truncation=True,
        padding="max_length",
        max_length=MAX_LEN
    )
    with torch.no_grad():
        outputs = model(**inputs)
    pred_class = torch.argmax(outputs.logits, dim=1).item()
    return id2label[pred_class]

example_text = "AUTOMORPHISMS AND IDEALS OF NONCOMMUTATIVE DEF..."
print("Predicted label:", predict_text(example_text))

Predicted label: math


In [29]:
test_small_df.head()

Unnamed: 0,field,fulltext
3401,math,AUTOMORPHISMS AND IDEALS OF NONCOMMUTATIVE DEF...
4490,math,\nSOME WEAK VERSIONS OF THE M1-SPACES\n\nFUCAI...
23226,math,\nNoname manuscript No.\n(will be inserted by ...
16858,quant-ph,\nA complicated Duffing oscillator in the surf...
18797,cs,1\nInternational Journal of Scient ific & E...


In [30]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments, EarlyStoppingCallback
from peft import LoraConfig, get_peft_model, TaskType
import torch, os

# ============================================================
# Config
# ============================================================
MAX_LEN = 200
BATCH_SIZE = 16
TRAIN_EPOCHS = 3
OUTPUT_BASE = "./hf_finetune_results_lora"
os.makedirs(OUTPUT_BASE, exist_ok=True)

# ============================================================
# LoRA Config
# ============================================================
lora_config = LoraConfig(
    task_type=TaskType.SEQ_CLS,
    r=16,
    lora_alpha=32,
    lora_dropout=0.1,
    bias="none",
    target_modules=["query", "value"]
)

# ============================================================
# Loop training
# ============================================================
results = {}

for shortname, model_name in models_to_train.items():
    print(f"\n{'='*60}")
    print(f"Fine-tuning {shortname} with LoRA ({model_name})")

    # Tokenizer & base model
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForSequenceClassification.from_pretrained(
        model_name,
        num_labels=len(labels)
    )

    # Wrap model dengan LoRA
    model = get_peft_model(model, lora_config)
    model.print_trainable_parameters()

    # Prepare dataset
    ds_train = prepare_hf_dataset(train_texts, train_labels, tokenizer, max_length=MAX_LEN)
    ds_val   = prepare_hf_dataset(val_texts, val_labels, tokenizer, max_length=MAX_LEN)

    # TrainingArguments
    training_args = TrainingArguments(
        output_dir=os.path.join(OUTPUT_BASE, shortname),
        num_train_epochs=TRAIN_EPOCHS,
        per_device_train_batch_size=BATCH_SIZE,
        per_device_eval_batch_size=BATCH_SIZE,
        eval_strategy="epoch",
        logging_strategy="steps",
        logging_steps=50,
        save_strategy="epoch",
        learning_rate=2e-4,
        fp16=torch.cuda.is_available(),
        save_total_limit=2,
        metric_for_best_model="accuracy",
        load_best_model_at_end=True,
        greater_is_better=True,
        report_to=[]
    )

    # Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=ds_train,
        eval_dataset=ds_val,
        tokenizer=tokenizer,  # tambahkan tokenizer
        compute_metrics=compute_metrics,
        callbacks=[EarlyStoppingCallback(early_stopping_patience=2)]
    )

    # Train & evaluate
    trainer.train()
    eval_res = trainer.evaluate()
    print(f"Eval results for {shortname}:", eval_res)

    # Save LoRA model
    save_dir = os.path.join(OUTPUT_BASE, shortname, "lora_model")
    model.save_pretrained(save_dir)
    tokenizer.save_pretrained(save_dir)
    print(f"LoRA model saved to: {save_dir}")

    results[shortname] = {
        "model_name": model_name,
        "eval": eval_res,
        "save_dir": save_dir,
        "tokenizer": tokenizer
    }



Fine-tuning bert with LoRA (bert-base-uncased)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


trainable params: 605,204 || all params: 110,102,824 || trainable%: 0.5497


Map: 100%|██████████| 4000/4000 [01:32<00:00, 43.39 examples/s]
Map: 100%|██████████| 1000/1000 [00:24<00:00, 41.24 examples/s]
  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.3689,1.243051,0.64,0.52087,0.64,0.568359
2,1.1351,1.0042,0.703,0.604814,0.703,0.648634
3,1.0154,0.957339,0.71,0.618754,0.71,0.658708


  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


Eval results for bert: {'eval_loss': 0.9573387503623962, 'eval_accuracy': 0.71, 'eval_precision': 0.6187541557291949, 'eval_recall': 0.71, 'eval_f1': 0.6587084656251998, 'eval_runtime': 3.2426, 'eval_samples_per_second': 308.395, 'eval_steps_per_second': 19.429, 'epoch': 3.0}
LoRA model saved to: ./hf_finetune_results_lora\bert\lora_model

Fine-tuning roberta with LoRA (roberta-base)


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


trainable params: 1,195,796 || all params: 125,856,808 || trainable%: 0.9501


Map: 100%|██████████| 4000/4000 [00:39<00:00, 101.30 examples/s]
Map: 100%|██████████| 1000/1000 [00:11<00:00, 88.42 examples/s]
  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.0617,0.919921,0.715,0.674144,0.715,0.687791
2,0.977,0.836819,0.732,0.696458,0.732,0.708573
3,0.8205,0.795536,0.744,0.725588,0.744,0.724323


  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


Eval results for roberta: {'eval_loss': 0.7955358624458313, 'eval_accuracy': 0.744, 'eval_precision': 0.7255876907374668, 'eval_recall': 0.744, 'eval_f1': 0.7243232750636168, 'eval_runtime': 3.2169, 'eval_samples_per_second': 310.855, 'eval_steps_per_second': 19.584, 'epoch': 3.0}
LoRA model saved to: ./hf_finetune_results_lora\roberta\lora_model

Fine-tuning distilbert with LoRA (distilbert-base-uncased)


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


ValueError: No modules were targeted for adaptation. This might be caused by a combination of mismatched target modules and excluded modules. Please check your `target_modules` and `exclude_modules` configuration. You may also have only targeted modules that are marked to be saved (`modules_to_save`).

In [33]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from peft import PeftModel

save_dir = "./hf_finetune_results_lora/roberta/lora_model"
base_model_name = "roberta-base"

# Load base model
base_model = AutoModelForSequenceClassification.from_pretrained(
    base_model_name,
    num_labels=len(labels)
)
# Wrap dengan LoRA
model = PeftModel.from_pretrained(base_model, save_dir)
tokenizer = AutoTokenizer.from_pretrained(save_dir)
model.eval()


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


PeftModelForSequenceClassification(
  (base_model): LoraModel(
    (model): RobertaForSequenceClassification(
      (roberta): RobertaModel(
        (embeddings): RobertaEmbeddings(
          (word_embeddings): Embedding(50265, 768, padding_idx=1)
          (position_embeddings): Embedding(514, 768, padding_idx=1)
          (token_type_embeddings): Embedding(1, 768)
          (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (encoder): RobertaEncoder(
          (layer): ModuleList(
            (0-11): 12 x RobertaLayer(
              (attention): RobertaAttention(
                (self): RobertaSdpaSelfAttention(
                  (query): lora.Linear(
                    (base_layer): Linear(in_features=768, out_features=768, bias=True)
                    (lora_dropout): ModuleDict(
                      (default): Dropout(p=0.1, inplace=False)
                    )
                    (lora_A): Mod

In [36]:

# Prediksi
def predict_text(text):
    inputs = tokenizer(
        text,
        return_tensors="pt",
        truncation=True,
        padding="max_length",
        max_length=MAX_LEN
    )
    with torch.no_grad():
        outputs = model(**inputs)
    pred_class = torch.argmax(outputs.logits, dim=1).item()
    return id2label[pred_class]

print(predict_text("AUTOMORPHISMS AND IDEALS OF NONCOMMUTATIVE DEF..."))

cs


In [35]:
test_small_df.head()

Unnamed: 0,field,fulltext
3401,math,AUTOMORPHISMS AND IDEALS OF NONCOMMUTATIVE DEF...
4490,math,\nSOME WEAK VERSIONS OF THE M1-SPACES\n\nFUCAI...
23226,math,\nNoname manuscript No.\n(will be inserted by ...
16858,quant-ph,\nA complicated Duffing oscillator in the surf...
18797,cs,1\nInternational Journal of Scient ific & E...
