In [1]:
import os
import zipfile

from google.colab import drive

# Mount Google Drive so the dataset archive stored there is reachable under /content/drive.
drive.mount('/content/drive')

# Archive location on Drive and the local folder it is unpacked into.
zip_path = "/content/drive/MyDrive/forcI-dataset.zip"
extract_path = "/content/forcI-dataset"

# Create the destination up front; exist_ok keeps a re-run of this cell from failing.
os.makedirs(extract_path, exist_ok=True)

# Unpack every member of the archive; the context manager closes the file afterwards.
with zipfile.ZipFile(zip_path, mode='r') as archive:
    archive.extractall(path=extract_path)

print("Extraction completed!")
print("Extracted to:", extract_path)


Mounted at /content/drive
Extraction completed!
Extracted to: /content/forcI-dataset


In [2]:
import pandas as pd

# Folder holding the three split CSV files.
base_path = "/content/forcI-dataset/forcI-dataset"

train_path = f"{base_path}/train.csv"
test_path  = f"{base_path}/test.csv"
val_path   = f"{base_path}/val.csv"

# Read the splits in train, test, val order.
train_df, test_df, val_df = map(pd.read_csv, (train_path, test_path, val_path))

print("=== Loaded Successfully ===")
for shape_caption, split_frame in (
    ("Train shape:", train_df),
    ("Test shape :", test_df),
    ("Val shape  :", val_df),
):
    print(shape_caption, split_frame.shape)

print("\nTrain Columns:", list(train_df.columns))

# Keep the preview as the cell's final expression so the notebook renders it.
train_df.head()


=== Loaded Successfully ===
Train shape: (41540, 10)
Test shape : (8903, 10)
Val shape  : (8901, 10)

Train Columns: ['abstract', 'author', 'doi', 'url', 'publication month', 'publication year', 'title', 'publisher', 'label', 'data_index']


Unnamed: 0,abstract,author,doi,url,publication month,publication year,title,publisher,label,data_index
0,the production of b jets in association with a...,CMS Collaboration,10.1007/JHEP06(2012)126,,,,Measurement of the Z/gamma* + b-jet cross sect...,JHEP 06 (2012) 126,Physics,44436
1,instabilities in the price dynamics of a large...,"Giacomo Bormetti, Lucio Maria Calcagnile, Mich...",,,,,Modelling systemic price cojumps with Hawkes f...,,Quantitative Finance,46308
2,large information sizes in samples and feature...,"David Banh, Alan Huang",,,,,Encoding large information structures in linea...,,Machine Learning,52468
3,we consider polygonal billiards with collision...,"Gianluigi Del Magno, Jo\~ao Lopes Dias, Pedro ...",,,,,Hyperbolic polygonal billiards close to 1-dime...,,Dynamics/Dynamical Systems,12449
4,Bauxite deposits of Jharkhand in India are res...,"['E.N. Dhanamjaya Rao', 'A.T. Jeyaseelan', 'K....",10.1016/j.jag.2012.08.003,https://doi.org/10.1016/j.jag.2012.08.003,4.0,2013.0,analysis of aster data for mapping bauxite ric...,International Journal of Applied Earth Observa...,Sociology,43248


In [3]:
# Colab Cell: install libs, create HF Datasets, load tokenizers, tokenize (max_len=512)
!pip install -q transformers datasets accelerate evaluate

import os
import numpy as np
import pandas as pd
from datasets import Dataset, DatasetDict
from transformers import AutoTokenizer

# train_df, val_df and test_df are expected to be in memory already from an earlier cell.
# As a fallback, when train_df is not defined in the notebook namespace, all three splits
# are read again from the CSV files (only train_df is probed).
# NOTE(review): a label_encoder from an earlier cell is also expected by the code below;
# it is not reloaded here.
base_path = "/content/forcI-dataset/forcI-dataset"
if 'train_df' not in globals():
    train_df, val_df, test_df = (
        pd.read_csv(os.path.join(base_path, csv_name))
        for csv_name in ("train.csv", "val.csv", "test.csv")
    )

# Build the model input text for one record: title, a single space, then abstract.
def combine_text_row(row):
    """Return ``"<title> <abstract>"`` for one record.

    A field that ``pd.notna`` reports as missing contributes an empty string. The
    separating space is always inserted, so a missing title yields a leading space and a
    missing abstract yields a trailing one.

    Args:
        row: Mapping-like record indexable by ``"title"`` and ``"abstract"``.

    Returns:
        The concatenated text as a ``str``.
    """
    pieces = []
    for field in ("title", "abstract"):
        value = row[field]
        pieces.append(str(value) if pd.notna(value) else "")
    return " ".join(pieces)

# Add the combined "text" column to each split in place; a split that already has the
# column (e.g. when this cell is re-run) is left untouched.
for df in (train_df, val_df, test_df):
    if "text" in df.columns:
        continue
    df["text"] = df.apply(combine_text_row, axis=1)

# Reuse a label_encoder already present in the notebook namespace; otherwise fit a new one
# on the training labels (cast to str). Validation/test labels are not used for fitting.
if "label_encoder" not in globals():
    from sklearn.preprocessing import LabelEncoder
    label_encoder = LabelEncoder().fit(train_df["label"].astype(str))

num_labels = len(label_encoder.classes_)
print(f"Number of classes detected: {num_labels}")

# Build one Hugging Face Dataset per split from the "text" and "label" columns, exposing
# the class column under the name "labels".
def _frame_to_hf(frame):
    """Convert one split DataFrame into a Dataset with columns "text" and "labels"."""
    return Dataset.from_pandas(frame[["text", "label"]].rename(columns={"label":"labels"}))

hf_train = _frame_to_hf(train_df)
hf_val   = _frame_to_hf(val_df)
hf_test  = _frame_to_hf(test_df)

# Map one example's class name to its integer id using the notebook-level label_encoder.
def encode_label(example):
    """Replace ``example["labels"]`` with its integer class id and return the example.

    The current value is cast to ``str`` before being looked up with
    ``label_encoder.transform``; the example dict is modified in place.
    """
    class_name = str(example["labels"])
    encoded = label_encoder.transform([class_name])
    example["labels"] = int(encoded[0])
    return example

# Convert the class names of every split to integer ids (train, validation, test order).
hf_train, hf_val, hf_test = (
    split.map(encode_label) for split in (hf_train, hf_val, hf_test)
)

# Bundle the splits under the keys used by the rest of the notebook.
dataset_dict = DatasetDict({"train": hf_train, "validation": hf_val, "test": hf_test})
print(dataset_dict)

# Maximum sequence length (in tokens) used for truncation and padding.
MAX_LEN = 512

# Short name -> Hugging Face Hub id of each model whose tokenizer is prepared.
# Insertion order is the order in which the datasets are tokenized below.
tokenizer_names = {
    "scibert_uncased": "allenai/scibert_scivocab_uncased",
    "scibert_cased": "allenai/scibert_scivocab_cased",
    "roberta_base": "roberta-base",
    "deberta_v3_small": "microsoft/deberta-v3-small",
}

# Filled with the loaded tokenizer objects, keyed by the same short names.
tokenizers = dict()

def make_tokenizer(name, model_id):
    """Load the tokenizer for ``model_id`` and pin its padding side to the right.

    Args:
        name: Short key, used only in the progress message.
        model_id: Identifier passed to ``AutoTokenizer.from_pretrained`` (with
            ``use_fast=True``).

    Returns:
        The loaded tokenizer with ``padding_side`` set to ``"right"``.
    """
    print(f"Loading tokenizer for {name} -> {model_id}")
    tokenizer = AutoTokenizer.from_pretrained(model_id, use_fast=True)
    # Set explicitly so every tokenizer in the registry pads on the same side.
    tokenizer.padding_side = "right"
    return tokenizer

# Load one tokenizer per configured model, keyed by its short name.
tokenizers.update(
    (short_name, make_tokenizer(short_name, hub_id))
    for short_name, hub_id in tokenizer_names.items()
)

# Tokenization function (truncate and pad every example to exactly MAX_LEN tokens)
def tokenize_fn(examples, tokenizer):
    """Tokenize the ``"text"`` field of ``examples`` into fixed-length sequences.

    Args:
        examples: Mapping with a ``"text"`` entry (a batch when used with a batched map).
        tokenizer: Callable tokenizer accepting ``truncation``, ``padding`` and
            ``max_length`` keyword arguments.

    Returns:
        Whatever ``tokenizer`` returns for the text, truncated and padded to ``MAX_LEN``.
    """
    encode_options = dict(truncation=True, padding="max_length", max_length=MAX_LEN)
    return tokenizer(examples["text"], **encode_options)

# Tokenize all splits once per tokenizer and keep the results keyed by tokenizer name.
# Only a size summary is printed to keep the cell output small.
tokenized_datasets = {}
for name, tok in tokenizers.items():
    print(f"\nTokenizing with {name} (this may take a minute)...")
    tokenized = dataset_dict.map(
        lambda batch: tokenize_fn(batch, tok),
        batched=True,
        remove_columns=["text"],
    )
    # Have the datasets return PyTorch tensors when indexed.
    tokenized.set_format(type="torch")
    tokenized_datasets[name] = tokenized
    split_sizes = {split: len(rows) for split, rows in tokenized.items()}
    print(f"  -> {name} tokenized splits sizes:", split_sizes)

# Sanity check: inspect the first training example produced by the scibert_uncased tokenizer.
sample = tokenized_datasets["scibert_uncased"]["train"][0]
print("\nSample tokenized example (scibert_uncased, train[0]):")
for caption, shown in (
    (" input_ids length:", len(sample["input_ids"])),
    (" attention_mask length:", len(sample["attention_mask"])),
    (" label:", sample["labels"]),
    (" first 30 input_ids:", sample["input_ids"][:30]),
):
    print(caption, shown)

print("\nAll tokenizers prepared and datasets tokenized with max_length=512 and padding='max_length'.")
print("Next step: build models + Trainer with Focal Loss, training args (bs=16, epochs=5, AdamW + linear warmup).")


[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/84.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m4.1 MB/s[0m eta [36m0:00:00[0m
[?25hNumber of classes detected: 123


Map:   0%|          | 0/41540 [00:00<?, ? examples/s]

Map:   0%|          | 0/8901 [00:00<?, ? examples/s]

Map:   0%|          | 0/8903 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['text', 'labels'],
        num_rows: 41540
    })
    validation: Dataset({
        features: ['text', 'labels'],
        num_rows: 8901
    })
    test: Dataset({
        features: ['text', 'labels'],
        num_rows: 8903
    })
})
Loading tokenizer for scibert_uncased -> allenai/scibert_scivocab_uncased


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/385 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

Loading tokenizer for scibert_cased -> allenai/scibert_scivocab_cased


config.json:   0%|          | 0.00/385 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

Loading tokenizer for roberta_base -> roberta-base


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Loading tokenizer for deberta_v3_small -> microsoft/deberta-v3-small


tokenizer_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/578 [00:00<?, ?B/s]

spm.model:   0%|          | 0.00/2.46M [00:00<?, ?B/s]




Tokenizing with scibert_uncased (this may take a minute)...


Map:   0%|          | 0/41540 [00:00<?, ? examples/s]

Map:   0%|          | 0/8901 [00:00<?, ? examples/s]

Map:   0%|          | 0/8903 [00:00<?, ? examples/s]

  -> scibert_uncased tokenized splits sizes: {'train': 41540, 'validation': 8901, 'test': 8903}

Tokenizing with scibert_cased (this may take a minute)...


Map:   0%|          | 0/41540 [00:00<?, ? examples/s]

Map:   0%|          | 0/8901 [00:00<?, ? examples/s]

Map:   0%|          | 0/8903 [00:00<?, ? examples/s]

  -> scibert_cased tokenized splits sizes: {'train': 41540, 'validation': 8901, 'test': 8903}

Tokenizing with roberta_base (this may take a minute)...


Map:   0%|          | 0/41540 [00:00<?, ? examples/s]

Map:   0%|          | 0/8901 [00:00<?, ? examples/s]

Map:   0%|          | 0/8903 [00:00<?, ? examples/s]

  -> roberta_base tokenized splits sizes: {'train': 41540, 'validation': 8901, 'test': 8903}

Tokenizing with deberta_v3_small (this may take a minute)...


Map:   0%|          | 0/41540 [00:00<?, ? examples/s]

Map:   0%|          | 0/8901 [00:00<?, ? examples/s]

Map:   0%|          | 0/8903 [00:00<?, ? examples/s]

  -> deberta_v3_small tokenized splits sizes: {'train': 41540, 'validation': 8901, 'test': 8903}

Sample tokenized example (scibert_uncased, train[0]):
 input_ids length: 512
 attention_mask length: 512
 label: tensor(95)
 first 30 input_ids: tensor([  102,  2560,   131,   111,   447,  1352,  8831,  1375,   473,   132,
          579, 10459,  2057,  1155,   121,  1679, 14328,   235,   450, 25583,
          111,  1865,   131,   132, 22821,   121,  2274,   190,   106,   447])

All tokenizers prepared and datasets tokenized with max_length=512 and padding='max_length'.
Next step: build models + Trainer with Focal Loss, training args (bs=16, epochs=5, AdamW + linear warmup).


In [9]:
# Cell: Build SciBERT-uncased model + Trainer with Focal Loss and train
import math, os
from transformers import (
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    DataCollatorWithPadding,
    AutoTokenizer,
    get_linear_schedule_with_warmup,
)
import torch
import torch.nn as nn
import numpy as np
from evaluate import load as load_metric
from datasets import DatasetDict

# Hyperparameters for the SciBERT-uncased run (described by the author as paper-aligned).
MODEL_NAME = "allenai/scibert_scivocab_uncased"
NUM_LABELS = len(label_encoder.classes_)  # 123 in the recorded run
BATCH_SIZE = 16
EPOCHS = 5
LR = 2e-5
WEIGHT_DECAY = 0.01
WARMUP_RATIO = 0.1
FP16 = True  # use mixed precision on T4
SEED = 42

# Seed torch and numpy BEFORE the model is built: the checkpoint ships without a
# classification head, so the classifier weights are freshly initialised at load time.
torch.manual_seed(SEED)
np.random.seed(SEED)

# Load the pretrained encoder with a NUM_LABELS-way sequence-classification head.
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=NUM_LABELS,
    ignore_mismatched_sizes=True,  # safe if head size mismatch
)

# Data: the splits tokenized earlier with the matching (uncased) tokenizer.
hf_data = tokenized_datasets["scibert_uncased"]  # DatasetDict with train/validation/test (torch format)

# Data collator. Every example was already padded to max_length during tokenization, so
# there should be nothing left to pad here; the collator is kept as a safeguard.
data_collator = DataCollatorWithPadding(tokenizer=tokenizers["scibert_uncased"], return_tensors="pt")

# Focal Loss implementation
class FocalLoss(nn.Module):
    """Multi-class focal loss: ``alpha * (1 - p_t) ** gamma * CE(logits, target)``.

    ``p_t`` is the softmax probability the model assigns to the target class. With
    ``gamma=0`` and ``alpha=1`` the result is plain cross-entropy.

    Args:
        gamma: Focusing exponent applied to ``(1 - p_t)``.
        alpha: Scalar multiplier applied to every per-sample loss.
        reduction: ``"mean"``, ``"sum"`` or ``"none"`` (return per-sample losses).

    Raises:
        ValueError: If ``reduction`` is not one of the three supported values. An
            unrecognised string used to fall through silently to the unreduced tensor.
    """

    _REDUCTIONS = ("mean", "sum", "none")

    def __init__(self, gamma: float = 2.0, alpha: float = 1.0, reduction: str = "mean"):
        super().__init__()
        if reduction not in self._REDUCTIONS:
            raise ValueError(
                f"reduction must be one of {self._REDUCTIONS}, got {reduction!r}"
            )
        self.gamma = gamma
        self.alpha = alpha
        self.reduction = reduction
        # Per-sample cross-entropy; the reduction is applied after the focal weighting.
        self.ce = nn.CrossEntropyLoss(reduction="none")

    def forward(self, logits, targets):
        """Return the focal loss for a batch.

        Args:
            logits: Unnormalised class scores, ``(batch, num_classes)``.
            targets: Integer class ids, ``(batch,)``.

        Returns:
            A scalar tensor for ``"mean"``/``"sum"``, otherwise a ``(batch,)`` tensor.
        """
        ce_loss = self.ce(logits, targets)  # per-sample CE = -log(p_t)
        # Recover p_t from the CE value instead of running a second softmax over all classes.
        pt = torch.exp(-ce_loss)
        focal_factor = (1.0 - pt) ** self.gamma
        loss = self.alpha * focal_factor * ce_loss
        if self.reduction == "mean":
            return loss.mean()
        if self.reduction == "sum":
            return loss.sum()
        return loss

focal_loss_fn = FocalLoss(gamma=2.0, alpha=1.0, reduction="mean")

# Custom Trainer to use Focal Loss (override compute_loss)
class FocalTrainer(Trainer):
    """Trainer whose loss is the notebook-level ``focal_loss_fn`` applied to the logits."""

    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        """Run the model without labels and score its logits with ``focal_loss_fn``.

        Args:
            model: The model to call; its output must expose ``"logits"``.
            inputs: Batch mapping. Must contain ``"labels"``; every other entry is
                forwarded to the model as a keyword argument.
            return_outputs: When true, return ``(loss, outputs)`` instead of the loss.
            **kwargs: Accepted and ignored (presumably extra arguments that some Trainer
                versions pass to this hook).

        Returns:
            The loss tensor, or ``(loss, outputs)`` when ``return_outputs`` is true.

        Raises:
            ValueError: If the batch has no labels. Labels are stripped before the
                forward pass, so the model can never supply a loss of its own, and a
                constant zero loss would carry no gradient.
        """
        labels = inputs.get("labels")
        if labels is None:
            raise ValueError(
                "FocalTrainer.compute_loss needs a 'labels' entry in the batch to compute the focal loss."
            )
        # Labels are withheld so the model does not compute its built-in loss as well.
        model_inputs = {k: v for k, v in inputs.items() if k != "labels"}
        outputs = model(**model_inputs)
        loss = focal_loss_fn(outputs.get("logits"), labels)
        return (loss, outputs) if return_outputs else loss


# Metrics (accuracy + weighted precision/recall/f1)
accuracy_metric = load_metric("accuracy")
precision_metric = load_metric("precision")
recall_metric = load_metric("recall")
f1_metric = load_metric("f1")

def compute_metrics(pred):
    """Compute accuracy and weighted precision/recall/F1 for one evaluation pass.

    Args:
        pred: Object exposing ``predictions`` (class scores; the argmax over axis 1 is
            taken as the predicted class) and ``label_ids`` (reference class ids).

    Returns:
        Dict with keys ``accuracy``, ``precision_weighted``, ``recall_weighted`` and
        ``f1_weighted``.
    """
    labels = pred.label_ids
    preds = np.argmax(pred.predictions, axis=1)
    weighted = {"predictions": preds, "references": labels, "average": "weighted"}
    return {
        "accuracy": accuracy_metric.compute(predictions=preds, references=labels)["accuracy"],
        # zero_division=0: classes that are never predicted (precision) or never present
        # (recall) score 0, exactly as with the default "warn", but without emitting an
        # undefined-metric warning on every evaluation.
        "precision_weighted": precision_metric.compute(**weighted, zero_division=0)["precision"],
        "recall_weighted": recall_metric.compute(**weighted, zero_division=0)["recall"],
        "f1_weighted": f1_metric.compute(**weighted)["f1"],
    }

# TrainingArgs
output_dir = "/content/forc_scibert_uncased_out"
training_args = TrainingArguments(
    output_dir=output_dir,
    seed=SEED,
    report_to="none",
    # Optimisation
    num_train_epochs=EPOCHS,
    learning_rate=LR,
    weight_decay=WEIGHT_DECAY,
    warmup_ratio=WARMUP_RATIO,
    fp16=FP16,
    # Batching and data loading
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    dataloader_num_workers=4,
    # Logging
    logging_strategy="steps",
    logging_steps=200,
    # Evaluate and checkpoint once per epoch, then reload the checkpoint with the best
    # validation f1_weighted when training ends.
    eval_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=2,
    load_best_model_at_end=True,
    metric_for_best_model="f1_weighted",
    greater_is_better=True,
)

# NOTE(review): the recorded output shows a warning pointing at this constructor call,
# presumably about the `tokenizer` argument; confirm against the installed transformers
# version before renaming it.
trainer = FocalTrainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=hf_data["train"],
    eval_dataset=hf_data["validation"],
    compute_metrics=compute_metrics,
    tokenizer=tokenizers["scibert_uncased"],
)

# Print a short summary before training
for summary_fields in (
    ("=== Trainer and model prepared ===",),
    ("Model:", MODEL_NAME),
    ("Num labels:", NUM_LABELS),
    ("Train samples:", len(hf_data["train"])),
    ("Val samples:", len(hf_data["validation"])),
    ("Epochs:", EPOCHS, "Batch size:", BATCH_SIZE),
    ("FocalLoss gamma=2.0, alpha=1.0",),
    ("Output dir:", output_dir),
):
    print(*summary_fields)




Some weights of BertForSequenceClassification were not initialized from the model checkpoint at allenai/scibert_scivocab_uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


=== Trainer and model prepared ===
Model: allenai/scibert_scivocab_uncased
Num labels: 123
Train samples: 41540
Val samples: 8901
Epochs: 5 Batch size: 16
FocalLoss gamma=2.0, alpha=1.0
Output dir: /content/forc_scibert_uncased_out


  trainer = FocalTrainer(


In [10]:
import json

# Fine-tune, then write the model currently held by the trainer to the local output dir.
train_result = trainer.train()
trainer.save_model(output_dir)

# Score the validation split first, then the test split.
print("\n=== Evaluation on validation set ===")
val_metrics = trainer.evaluate(eval_dataset=hf_data["validation"])
print(val_metrics)

print("\n=== Evaluation on test set ===")
test_metrics = trainer.evaluate(eval_dataset=hf_data["test"])
print(test_metrics)

# Store the class-id -> class-name mapping next to the model for decoding predictions later.
label_map = dict(enumerate(label_encoder.classes_))
label_map_file = os.path.join(output_dir, "label_map.json")
with open(label_map_file, "w") as f:
    json.dump(label_map, f, indent=2)

print("\nTraining complete. Artifacts saved to", output_dir)



Epoch,Training Loss,Validation Loss,Accuracy,Precision Weighted,Recall Weighted,F1 Weighted
1,1.0079,0.919472,0.660263,0.635372,0.660263,0.630433
2,0.6828,0.731682,0.707112,0.695505,0.707112,0.691798


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Epoch,Training Loss,Validation Loss,Accuracy,Precision Weighted,Recall Weighted,F1 Weighted
1,1.0079,0.919472,0.660263,0.635372,0.660263,0.630433
2,0.6828,0.731682,0.707112,0.695505,0.707112,0.691798
3,0.4079,0.678803,0.726098,0.719695,0.726098,0.71657
4,0.2874,0.667996,0.734749,0.731541,0.734749,0.729394
5,0.1895,0.684925,0.735086,0.733367,0.735086,0.730736


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))



=== Evaluation on validation set ===




{'eval_loss': 0.6849253177642822, 'eval_accuracy': 0.7350859453993933, 'eval_precision_weighted': 0.7333668299875331, 'eval_recall_weighted': 0.7350859453993933, 'eval_f1_weighted': 0.7307362790028343, 'eval_runtime': 68.1639, 'eval_samples_per_second': 130.582, 'eval_steps_per_second': 8.171, 'epoch': 5.0}

=== Evaluation on test set ===


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 0.6932342648506165, 'eval_accuracy': 0.7309895540828935, 'eval_precision_weighted': 0.7244381547711796, 'eval_recall_weighted': 0.7309895540828935, 'eval_f1_weighted': 0.7257046791821179, 'eval_runtime': 68.0491, 'eval_samples_per_second': 130.832, 'eval_steps_per_second': 8.185, 'epoch': 5.0}

Training complete. Artifacts saved to /content/forc_scibert_uncased_out


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [11]:
# Destination folder on the mounted Google Drive.
drive_model_path = '/content/drive/MyDrive/deep_learning_research/forc_scibert_model'
os.makedirs(drive_model_path, exist_ok=True)

# Write the model (through the trainer) and then its tokenizer into the same folder.
for save_artifact in (trainer.save_model, tokenizers["scibert_uncased"].save_pretrained):
    save_artifact(drive_model_path)

print("Model and tokenizer saved to Drive at:", drive_model_path)


Model and tokenizer saved to Drive at: /content/drive/MyDrive/deep_learning_research/forc_scibert_model


In [15]:
# Cell: Build SciBERT-cased model + Trainer with Focal Loss and train
import os
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer, DataCollatorWithPadding
import torch
import torch.nn as nn
import numpy as np
from evaluate import load as load_metric

# Hyperparameters for the SciBERT-cased run. These assignments overwrite the notebook-level
# constants of the same name (NUM_LABELS, BATCH_SIZE, ...) with identical values.
MODEL_NAME_CASED = "allenai/scibert_scivocab_cased"
NUM_LABELS = len(label_encoder.classes_)  # 123 in the recorded run
BATCH_SIZE = 16
EPOCHS = 5
LR = 2e-5
WEIGHT_DECAY = 0.01
WARMUP_RATIO = 0.1
FP16 = True
SEED = 42

# Seed torch and numpy BEFORE the model is built: the checkpoint ships without a
# classification head, so the classifier weights are freshly initialised at load time.
torch.manual_seed(SEED)
np.random.seed(SEED)

# Load the pretrained cased encoder with a NUM_LABELS-way sequence-classification head.
model_cased = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME_CASED,
    num_labels=NUM_LABELS,
    ignore_mismatched_sizes=True,
)

# Use the splits tokenized earlier with the matching (cased) tokenizer.
hf_data_cased = tokenized_datasets["scibert_cased"]

# Data collator built on the cased tokenizer. This rebinds the notebook-level
# `data_collator` name used by the uncased run.
data_collator = DataCollatorWithPadding(tokenizer=tokenizers["scibert_cased"], return_tensors="pt")

# Focal Loss: a fresh instance of the FocalLoss class, which must already be defined in the
# notebook namespace (it is not defined in this cell).
focal_loss_fn = FocalLoss(gamma=2.0, alpha=1.0, reduction="mean")

# Custom Trainer for Focal Loss
class FocalTrainer(Trainer):
    """Trainer whose loss is the notebook-level ``focal_loss_fn`` applied to the logits."""

    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        """Run the model without labels and score its logits with ``focal_loss_fn``.

        Args:
            model: The model to call; its output must expose ``"logits"``.
            inputs: Batch mapping. Must contain ``"labels"``; every other entry is
                forwarded to the model as a keyword argument.
            return_outputs: When true, return ``(loss, outputs)`` instead of the loss.
            **kwargs: Accepted and ignored.

        Returns:
            The loss tensor, or ``(loss, outputs)`` when ``return_outputs`` is true.

        Raises:
            ValueError: If the batch has no labels. Without this check the failure only
                surfaces inside the loss function, after a wasted forward pass.
        """
        labels = inputs.get("labels")
        if labels is None:
            raise ValueError(
                "FocalTrainer.compute_loss needs a 'labels' entry in the batch to compute the focal loss."
            )
        # Labels are withheld so the model does not compute its built-in loss as well.
        model_inputs = {k: v for k, v in inputs.items() if k != "labels"}
        outputs = model(**model_inputs)
        loss = focal_loss_fn(outputs.get("logits"), labels)
        return (loss, outputs) if return_outputs else loss


# Metrics: uses the accuracy/precision/recall/f1 metric objects already loaded in the
# notebook namespace (they are not loaded in this cell).
def compute_metrics(pred):
    """Compute accuracy and weighted precision/recall/F1 for one evaluation pass.

    Args:
        pred: Object exposing ``predictions`` (class scores; the argmax over axis 1 is
            taken as the predicted class) and ``label_ids`` (reference class ids).

    Returns:
        Dict with keys ``accuracy``, ``precision_weighted``, ``recall_weighted`` and
        ``f1_weighted``.
    """
    labels = pred.label_ids
    preds = np.argmax(pred.predictions, axis=1)
    weighted = {"predictions": preds, "references": labels, "average": "weighted"}
    return {
        "accuracy": accuracy_metric.compute(predictions=preds, references=labels)["accuracy"],
        # zero_division=0: classes that are never predicted (precision) or never present
        # (recall) score 0, exactly as with the default "warn", but without emitting an
        # undefined-metric warning on every evaluation.
        "precision_weighted": precision_metric.compute(**weighted, zero_division=0)["precision"],
        "recall_weighted": recall_metric.compute(**weighted, zero_division=0)["recall"],
        "f1_weighted": f1_metric.compute(**weighted)["f1"],
    }

# Training arguments. Note that this rebinds the notebook-level `output_dir` and
# `training_args` names to the cased run.
output_dir = "/content/forc_scibert_cased_out"
training_args = TrainingArguments(
    output_dir=output_dir,
    seed=SEED,
    report_to="none",
    # Optimisation
    num_train_epochs=EPOCHS,
    learning_rate=LR,
    weight_decay=WEIGHT_DECAY,
    warmup_ratio=WARMUP_RATIO,
    fp16=FP16,
    # Batching and data loading
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    dataloader_num_workers=4,
    # Logging
    logging_strategy="steps",
    logging_steps=200,
    # Evaluate and checkpoint once per epoch, then reload the checkpoint with the best
    # validation f1_weighted when training ends.
    eval_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=2,
    load_best_model_at_end=True,
    metric_for_best_model="f1_weighted",
    greater_is_better=True,
)

# Initialize trainer
trainer_cased = FocalTrainer(
    model=model_cased,
    args=training_args,
    data_collator=data_collator,
    train_dataset=hf_data_cased["train"],
    eval_dataset=hf_data_cased["validation"],
    compute_metrics=compute_metrics,
    tokenizer=tokenizers["scibert_cased"],
)

# Short summary of the prepared run.
for summary_fields in (
    ("=== SciBERT-cased Trainer prepared ===",),
    ("Model:", MODEL_NAME_CASED),
    ("Num labels:", NUM_LABELS),
    ("Train samples:", len(hf_data_cased["train"])),
    ("Val samples:", len(hf_data_cased["validation"])),
    ("Epochs:", EPOCHS, "Batch size:", BATCH_SIZE),
    ("Output dir:", output_dir),
):
    print(*summary_fields)




Some weights of BertForSequenceClassification were not initialized from the model checkpoint at allenai/scibert_scivocab_cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer_cased = FocalTrainer(


=== SciBERT-cased Trainer prepared ===
Model: allenai/scibert_scivocab_cased
Num labels: 123
Train samples: 41540
Val samples: 8901
Epochs: 5 Batch size: 16
Output dir: /content/forc_scibert_cased_out


In [16]:
# Fine-tune the cased model, then write the model currently held by the trainer to the
# directory named by the notebook-level `output_dir`.
train_result_cased = trainer_cased.train()
trainer_cased.save_model(output_dir)

# Score the validation split first, then the test split.
print("\n=== Evaluation on validation set ===")
val_metrics_cased = trainer_cased.evaluate(eval_dataset=hf_data_cased["validation"])
print(val_metrics_cased)

print("\n=== Evaluation on test set ===")
test_metrics_cased = trainer_cased.evaluate(eval_dataset=hf_data_cased["test"])
print(test_metrics_cased)



Epoch,Training Loss,Validation Loss,Accuracy,Precision Weighted,Recall Weighted,F1 Weighted
1,1.0285,0.953702,0.640153,0.618164,0.640153,0.610204
2,0.7046,0.748266,0.694079,0.678718,0.694079,0.676755
3,0.4482,0.693345,0.714077,0.708845,0.714077,0.705553
4,0.3155,0.670765,0.728907,0.724228,0.728907,0.722964
5,0.2103,0.689124,0.723739,0.720809,0.723739,0.719178


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))



=== Evaluation on validation set ===




{'eval_loss': 0.6707652807235718, 'eval_accuracy': 0.7289068643972587, 'eval_precision_weighted': 0.7242284684720645, 'eval_recall_weighted': 0.7289068643972587, 'eval_f1_weighted': 0.7229636436302133, 'eval_runtime': 67.9585, 'eval_samples_per_second': 130.977, 'eval_steps_per_second': 8.196, 'epoch': 5.0}

=== Evaluation on test set ===


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 0.6956053972244263, 'eval_accuracy': 0.7229023924519825, 'eval_precision_weighted': 0.7171220297113212, 'eval_recall_weighted': 0.7229023924519825, 'eval_f1_weighted': 0.7162690828855328, 'eval_runtime': 67.6786, 'eval_samples_per_second': 131.548, 'eval_steps_per_second': 8.23, 'epoch': 5.0}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [17]:
from pathlib import Path

# Destination folder on the mounted Google Drive; parents are created as needed.
output_dir_cased = "/content/drive/MyDrive/deep_learning_research/forc_scibert_cased_model"
Path(output_dir_cased).mkdir(parents=True, exist_ok=True)

# Write the model (through the trainer) and then its tokenizer into the same folder.
for save_artifact in (trainer_cased.save_model, tokenizers["scibert_cased"].save_pretrained):
    save_artifact(output_dir_cased)

print("Model and tokenizer saved to Drive at:", output_dir_cased)


Model and tokenizer saved to Drive at: /content/drive/MyDrive/deep_learning_research/forc_scibert_cased_model
