In [2]:
import os
import zipfile

from google.colab import drive

# The dataset archive lives on Google Drive, so Drive is mounted before anything else.
drive.mount('/content/drive')

zip_path = "/content/drive/MyDrive/forcI-dataset.zip"
extract_path = "/content/forcI-dataset"

# Ensure the destination directory exists, then unpack the whole archive into it.
os.makedirs(extract_path, exist_ok=True)
with zipfile.ZipFile(zip_path, mode='r') as archive:
    archive.extractall(path=extract_path)

print("Extraction completed!")
extracted_entries = os.listdir(extract_path)
print("Files in extracted folder:", extracted_entries)


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Extraction completed!
Files in extracted folder: ['__MACOSX', 'forcI-dataset']


In [3]:
import os

# Inner folder created by the archive (the extraction root holds a directory of the same name).
dataset_path = "/content/forcI-dataset/forcI-dataset"
files = os.listdir(dataset_path)

print("Files inside forcI-dataset:")
if files:
    # One "- <name>" line per directory entry.
    print("\n".join(f"- {entry}" for entry in files))


Files inside forcI-dataset:
- val.csv
- train.csv
- test.csv


In [4]:
import pandas as pd

dataset_path = "/content/forcI-dataset/forcI-dataset"

# Read the three CSV splits in a fixed order: train, val, test.
train_df, val_df, test_df = (
    pd.read_csv(f"{dataset_path}/{split}.csv") for split in ("train", "val", "test")
)

# Report the (rows, columns) shape of each split.
for split_label, frame in (("Train", train_df), ("Val", val_df), ("Test", test_df)):
    print(f"{split_label} shape:", frame.shape)

print("\nTrain columns:", train_df.columns.tolist())

print("\nExample row from training set:")
print(train_df.iloc[0])


Train shape: (41540, 10)
Val shape: (8901, 10)
Test shape: (8903, 10)

Train columns: ['abstract', 'author', 'doi', 'url', 'publication month', 'publication year', 'title', 'publisher', 'label', 'data_index']

Example row from training set:
abstract             the production of b jets in association with a...
author                                               CMS Collaboration
doi                                            10.1007/JHEP06(2012)126
url                                                                NaN
publication month                                                  NaN
publication year                                                   NaN
title                Measurement of the Z/gamma* + b-jet cross sect...
publisher                                           JHEP 06 (2012) 126
label                                                          Physics
data_index                                                       44436
Name: 0, dtype: object


In [5]:
import pandas as pd

def build_input_text(row):
    """
    Convert one dataset row into a structured, metadata-enhanced text string.

    The result is six tagged sections, in this order and separated by blank
    lines: ``[TITLE]``, ``[ABSTRACT]``, ``[AUTHORS]``, ``[YEAR]``, ``[DOI]``
    (prefix only) and ``[PUBLISHER]``. Missing values are rendered as
    ``"unknown"``.

    Parameters
    ----------
    row : mapping-like
        Any object exposing ``.get(key, default)``, e.g. the ``pandas.Series``
        passed by ``DataFrame.apply(..., axis=1)`` or a plain ``dict``.
        Keys read: "title", "abstract", "author", "publication year",
        "doi", "publisher".

    Returns
    -------
    str
        The assembled text.
    """
    doi = row.get("doi", "")

    # Keep only the DOI prefix (the part before the first slash).
    if isinstance(doi, str) and "/" in doi:
        doi_prefix = doi.split("/", 1)[0]
    else:
        doi_prefix = doi

    sections = (
        ("TITLE", row.get("title", "")),
        ("ABSTRACT", row.get("abstract", "")),
        ("AUTHORS", row.get("author", "")),
        # A year column that contains NaN is loaded as float, so 2012 arrives
        # as 2012.0; normalise it before it is turned into text.
        ("YEAR", _format_year(row.get("publication year", ""))),
        ("DOI", doi_prefix),
        ("PUBLISHER", row.get("publisher", "")),
    )

    return "\n\n".join(f"[{tag}]\n{_safe_text(value)}" for tag, value in sections)


def _format_year(value):
    """Return integer-valued float years as ``int`` (2012.0 -> 2012); pass anything else through."""
    # `value == value` is False only for NaN, which must stay missing.
    if isinstance(value, float) and value == value and value.is_integer():
        return int(value)
    return value


def _safe_text(value):
    """Return ``str(value)``, or "unknown" for None, NaN, blank strings and the literal "nan"."""
    if value is None or (not isinstance(value, str) and pd.isna(value)):
        return "unknown"
    text = str(value)
    stripped = text.strip()
    if not stripped or stripped.lower() == "nan":
        return "unknown"
    return text


# Add an "input_text" column to every split by running build_input_text row by row.
for split_df in (train_df, val_df, test_df):
    split_df["input_text"] = split_df.apply(build_input_text, axis=1)

print("Example processed text:\n")
preview = train_df["input_text"].iloc[0][:600]  # only the first 600 characters
print(preview, "...")


Example processed text:

[TITLE]
Measurement of the Z/gamma* + b-jet cross section in pp collisions at 7
  TeV

[ABSTRACT]
the production of b jets in association with a z/gamma* boson is studied using proton-proton collisions delivered by the lhc at a centre-of-mass energy of 7 tev and recorded by the cms detector. the inclusive cross section for z/gamma* + b-jet production is measured in a sample corresponding to an integrated luminosity of 2.2 inverse femtobarns. the z/gamma* + b-jet cross section with z/gamma* to ll (where ll = ee or mu mu) for events with the invariant mass 60 < m(ll) < 120 gev, at least one b je ...


In [6]:
# !pip install -q transformers==4.43.3
# !pip install -q peft
# !pip install -q accelerate
# !pip install -q bitsandbytes


In [7]:
#!pip install -U transformers accelerate


In [8]:
from sklearn.preprocessing import LabelEncoder

# Fit the encoder on the training labels, then reuse the same mapping for validation.
label_encoder = LabelEncoder()
train_df["label_id"] = label_encoder.fit_transform(train_df["label"])
val_df["label_id"] = label_encoder.transform(val_df["label"])

num_labels = len(label_encoder.classes_)
print("Number of FoRC classes:", num_labels)
print("\nClass names:", list(label_encoder.classes_))

# Two-way lookup between class names and integer ids (ids follow the encoder's class order).
id2label = dict(enumerate(label_encoder.classes_))
label2id = {class_name: class_id for class_id, class_name in id2label.items()}

print("\nExample mapping:", list(label2id.items())[:10])

# Plain-dict datasets: texts for every split, label ids only where labels were encoded.
train_dataset = {
    "text": train_df["input_text"].tolist(),
    "labels": train_df["label_id"].tolist(),
}
val_dataset = {
    "text": val_df["input_text"].tolist(),
    "labels": val_df["label_id"].tolist(),
}
test_dataset = {"text": test_df["input_text"].tolist()}

for caption, split_dict in (
    ("\nTrain samples:", train_dataset),
    ("Val samples:", val_dataset),
    ("Test samples:", test_dataset),
):
    print(caption, len(split_dict["text"]))


Number of FoRC classes: 123

Class names: ['Algebra', 'Algebraic Geometry', 'Analysis', 'Animal Sciences', 'Applied Mathematics', 'Applied Statistics', 'Artificial Intelligence', 'Arts and Humanities', 'Astrophysics and Astronomy', 'Atmospheric Sciences', 'Atomic, Molecular and Optical Physics', 'Audio and Speech Processing', 'Bioinformatics', 'Biological and Chemical Physics', 'Biomedical Engineering and Bioengineering', 'Category Theory', 'Cell Behavior', 'Chemistry', 'Civil and Environmental Engineering', 'Communication Technology and New Media', 'Complex Variables', 'Computational Engineering', 'Computational Geometry', 'Computational Linguistics', 'Computational Physics', 'Computer Engineering', 'Computer Science and Game Theory', 'Computer Sciences', 'Computer Vision and Pattern Recognition', 'Computer and Systems Architecture', 'Computers and Society', 'Condensed Matter Physics', 'Controls and Control Theory', 'Cosmology', 'Cosmology, Relativity, and Gravity', 'Cryptography and 

In [9]:
from transformers import AutoTokenizer

# Hugging Face Hub id of the checkpoint whose tokenizer is loaded on the next line.
model_name = "answerdotai/ModernBERT-base"

tokenizer = AutoTokenizer.from_pretrained(model_name)

# Token length passed as `max_length` to the tokenizer calls in this notebook.
MAX_LEN = 1024

def tokenize_batch(text_list):
    """Tokenize a list of texts with the notebook-level ``tokenizer``.

    Sequences are truncated and padded to ``MAX_LEN`` tokens and the result
    is returned as PyTorch tensors.
    """
    encode_options = {
        "padding": "max_length",
        "truncation": True,
        "max_length": MAX_LEN,
        "return_tensors": "pt",
    }
    return tokenizer(text_list, **encode_options)

# Sanity check: encode the first two training texts and show the tensor shapes.
sample = tokenize_batch(train_dataset["text"][:2])

for caption, key in (
    ("Input IDs shape:", "input_ids"),
    ("Attention mask shape:", "attention_mask"),
):
    print(caption, sample[key].shape)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Input IDs shape: torch.Size([2, 1024])
Attention mask shape: torch.Size([2, 1024])


In [10]:
import torch
from torch.utils.data import Dataset, DataLoader

class FoRCDataset(Dataset):
    """Map-style dataset that tokenizes one text per item, on access.

    Parameters
    ----------
    texts : sequence of str
        Input texts, indexed by position.
    labels : sequence of int, optional
        Integer class ids aligned with ``texts``. When omitted, items carry
        no ``"labels"`` entry.
    tokenizer : callable, optional
        Tokenizer to use. Defaults to the notebook-level ``tokenizer``,
        looked up when an item is accessed.
    max_length : int, optional
        Length every item is truncated/padded to. Defaults to the
        notebook-level ``MAX_LEN``, looked up when an item is accessed.

    Raises
    ------
    ValueError
        If ``labels`` is given and its length differs from ``texts``.
    """

    def __init__(self, texts, labels=None, tokenizer=None, max_length=None):
        # Fail fast instead of surfacing an IndexError (or silently ignoring
        # surplus labels) somewhere in the middle of an epoch.
        if labels is not None and len(labels) != len(texts):
            raise ValueError(
                f"texts and labels must have the same length "
                f"(got {len(texts)} and {len(labels)})"
            )
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        # Prefer the injected tokenizer / length; otherwise use the notebook globals.
        encode = self.tokenizer if self.tokenizer is not None else tokenizer
        max_length = self.max_length if self.max_length is not None else MAX_LEN

        item = encode(
            self.texts[idx],
            padding="max_length",
            truncation=True,
            max_length=max_length,
            return_tensors="pt",
        )
        # squeeze removes the extra batch dimension added by return_tensors="pt"
        features = {
            "input_ids": item["input_ids"].squeeze(0),
            "attention_mask": item["attention_mask"].squeeze(0),
        }
        if self.labels is not None:
            features["labels"] = torch.tensor(self.labels[idx], dtype=torch.long)

        return features

# Wrap each split in a FoRCDataset; the test split is built without labels.
train_data = FoRCDataset(train_dataset["text"], train_dataset["labels"])
val_data = FoRCDataset(val_dataset["text"], val_dataset["labels"])
test_data = FoRCDataset(test_dataset["text"], labels=None)

# Small-batch loaders used only for the smoke test below.
train_loader = DataLoader(train_data, shuffle=True, batch_size=2)
val_loader = DataLoader(val_data, shuffle=False, batch_size=2)

# Pull a single training batch and show the shape of each tensor in it.
batch = next(iter(train_loader))

for caption, key in (
    ("Batch input_ids:", "input_ids"),
    ("Batch attention_mask:", "attention_mask"),
    ("Batch labels:", "labels"),
):
    print(caption, batch[key].shape)


Batch input_ids: torch.Size([2, 1024])
Batch attention_mask: torch.Size([2, 1024])
Batch labels: torch.Size([2])


In [13]:
# Print the name of every submodule whose name mentions attention.
# `model` is created by the model-loading cell; when this cell runs first
# (e.g. Restart & Run All in document order) the name does not exist yet, so
# report that instead of stopping the whole run with a NameError.
if "model" not in globals():
    print("`model` is not defined yet: run the model-loading cell, then re-run this cell.")
else:
    for name, _ in model.named_modules():
        lowered = name.lower()
        if "attn" in lowered or "attention" in lowered:
            print(name)


model.layers.0.attn_norm
model.layers.0.attn
model.layers.0.attn.Wqkv
model.layers.0.attn.rotary_emb
model.layers.0.attn.Wo
model.layers.0.attn.out_drop
model.layers.1.attn_norm
model.layers.1.attn
model.layers.1.attn.Wqkv
model.layers.1.attn.rotary_emb
model.layers.1.attn.Wo
model.layers.1.attn.out_drop
model.layers.2.attn_norm
model.layers.2.attn
model.layers.2.attn.Wqkv
model.layers.2.attn.rotary_emb
model.layers.2.attn.Wo
model.layers.2.attn.out_drop
model.layers.3.attn_norm
model.layers.3.attn
model.layers.3.attn.Wqkv
model.layers.3.attn.rotary_emb
model.layers.3.attn.Wo
model.layers.3.attn.out_drop
model.layers.4.attn_norm
model.layers.4.attn
model.layers.4.attn.Wqkv
model.layers.4.attn.rotary_emb
model.layers.4.attn.Wo
model.layers.4.attn.out_drop
model.layers.5.attn_norm
model.layers.5.attn
model.layers.5.attn.Wqkv
model.layers.5.attn.rotary_emb
model.layers.5.attn.Wo
model.layers.5.attn.out_drop
model.layers.6.attn_norm
model.layers.6.attn
model.layers.6.attn.Wqkv
model.layers

In [14]:
from transformers import AutoModelForSequenceClassification
from peft import LoraConfig, get_peft_model

model_name = "answerdotai/ModernBERT-base"

# LoRA adapter settings for sequence classification.
# NOTE(review): "Wqkv" and "Wo" are given as bare names; if PEFT matches them
# by suffix, "Wo" may select more modules than attn.Wo. Confirm against the
# trainable-parameter count printed below.
lora_config = LoraConfig(
    task_type="SEQ_CLS",
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
    target_modules=["Wqkv", "Wo"],
)

# Classification model sized for the FoRC label set, with the label maps passed along.
label_maps = {"id2label": id2label, "label2id": label2id}
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=num_labels,
    **label_maps,
)

print("Base model loaded.")

# Wrap the base model with the LoRA adapters and report how much of it is trainable.
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()




Some weights of ModernBertForSequenceClassification were not initialized from the model checkpoint at answerdotai/ModernBERT-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Base model loaded.
trainable params: 2,392,443 || all params: 152,091,894 || trainable%: 1.5730


In [None]:
#!pip install -U transformers==4.43.3 peft==0.11.1 accelerate bitsandbytes


In [22]:
from transformers import TrainingArguments, Trainer
import evaluate
import numpy as np
import torch

# Metric objects are loaded once here and reused by compute_metrics.
accuracy = evaluate.load("accuracy")
f1 = evaluate.load("f1")

def compute_metrics(eval_pred):
    """Return accuracy and macro-averaged F1 for one evaluation pass.

    ``eval_pred`` unpacks into ``(logits, labels)``; the predicted class is
    the arg-max over the last axis of ``logits``.
    """
    logits, labels = eval_pred
    predicted_ids = np.argmax(logits, axis=-1)

    accuracy_result = accuracy.compute(predictions=predicted_ids, references=labels)
    f1_result = f1.compute(predictions=predicted_ids, references=labels, average="macro")

    return {
        "accuracy": accuracy_result["accuracy"],
        "macro_f1": f1_result["f1"],
    }

# Every Trainer hyper-parameter in one mapping, grouped by purpose.
training_config = {
    # where checkpoints are written
    "output_dir": "./modernbert_forci_lora",
    # optimisation
    "learning_rate": 2e-4,
    "weight_decay": 0.01,
    "warmup_ratio": 0.1,
    "num_train_epochs": 3,
    # batching: 4 sequences x 4 accumulation steps = 16 per optimizer step, per device
    "per_device_train_batch_size": 4,
    "per_device_eval_batch_size": 4,
    "gradient_accumulation_steps": 4,
    # evaluate and checkpoint once per epoch, then reload the best checkpoint
    "eval_strategy": "epoch",
    "save_strategy": "epoch",
    "load_best_model_at_end": True,
    # runtime / logging
    "logging_steps": 50,
    "fp16": True,
    "report_to": "none",
}

training_args = TrainingArguments(**training_config)

import inspect

# Newer transformers releases take the tokenizer as `processing_class` and
# deprecate the `tokenizer` keyword. Pick whichever name the installed
# Trainer accepts so this cell works with both older and newer releases.
_trainer_params = inspect.signature(Trainer.__init__).parameters
_tokenizer_arg = "processing_class" if "processing_class" in _trainer_params else "tokenizer"

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_data,   # FoRCDataset objects, not the raw text/label dicts
    eval_dataset=val_data,
    compute_metrics=compute_metrics,
    **{_tokenizer_arg: tokenizer},
)

trainer


  trainer = Trainer(


<transformers.trainer.Trainer at 0x7a1face97560>

In [23]:
# Run fine-tuning with the `trainer` object built in the preceding cell.
# NOTE(review): the recorded run of this cell ended in a KeyboardInterrupt after
# the second epoch's evaluation, so it did not complete all configured epochs.
trainer.train()


W1129 16:27:40.758000 3004 torch/_inductor/utils.py:1558] [1/0_1] Not enough SMs to use max_autotune_gemm mode


Epoch,Training Loss,Validation Loss,Accuracy,Macro F1
1,4.4928,1.083428,0.671722,0.465731


Epoch,Training Loss,Validation Loss,Accuracy,Macro F1
1,4.4928,1.083428,0.671722,0.465731
2,3.3128,0.910761,0.723514,0.546128


KeyboardInterrupt: 

In [24]:
# Write the current in-memory model and tokenizer to a local folder.
# NOTE(review): training was interrupted after the epoch-2 evaluation, so these
# weights may already include some epoch-3 updates; "epoch2" is approximate - confirm.
model.save_pretrained("./checkpoint-epoch2")
tokenizer.save_pretrained("./checkpoint-epoch2")


('./checkpoint-epoch2/tokenizer_config.json',
 './checkpoint-epoch2/special_tokens_map.json',
 './checkpoint-epoch2/tokenizer.json')

In [25]:
import os
# Destination folder on the mounted Google Drive.
save_path = "/content/drive/MyDrive/modernbert_epoch2"

# Create the folder if needed, then write the same model and tokenizer there
# (presumably so the files outlive the Colab session - confirm).
os.makedirs(save_path, exist_ok=True)
print("Saving to:", save_path)
model.save_pretrained(save_path)
tokenizer.save_pretrained(save_path)


Saving to: /content/drive/MyDrive/modernbert_epoch2


('/content/drive/MyDrive/modernbert_epoch2/tokenizer_config.json',
 '/content/drive/MyDrive/modernbert_epoch2/special_tokens_map.json',
 '/content/drive/MyDrive/modernbert_epoch2/tokenizer.json')

In [16]:
!pip install evaluate


Collecting evaluate
  Downloading evaluate-0.4.6-py3-none-any.whl.metadata (9.5 kB)
Downloading evaluate-0.4.6-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m9.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.6


In [18]:
!pip install -U transformers


