# 🔬 Fine-tune SciBERT for Resource Mention Classification
This notebook loads your training data, formats it for SciBERT, fine-tunes the model, and saves it.

In [None]:
!pip install "transformers>=4.30"
!pip install --quiet datasets scikit-learn



In [None]:
from google.colab import data_table
data_table.enable_dataframe_formatter()

## 📤 Upload Training CSV

In [None]:

from google.colab import files
uploaded = files.upload()

infile = list(uploaded.keys())[0]
print(f"input file name: {infile}")

Saving training_set_sentences.full.csv to training_set_sentences.full (1).csv
input file name: training_set_sentences.full (1).csv


In [None]:
# from google.colab import drive
# drive.mount('/content/drive')

## 🧹 Prepare the Dataset

In [18]:
import pandas as pd

infile = "training_set_sentences.full.csv"
# infile = 'train_split.csv'
df = pd.read_csv(infile)
df = df[["paragraph_text", "label", "matched_term"]]
df


Unnamed: 0,paragraph_text,label,matched_term
0,The entire test is carried out in the CI engin...,0,BAR
1,The test used three injection pressures (210 b...,0,BAR
2,Each experimental trial was conducted with a c...,0,BAR
3,A mechanical stirrer is a laboratory magnetic ...,0,BAR
4,BAR + FTI therapy synergistically extended the...,0,BAR
...,...,...,...
1432,"Subsequently, we constructed a protein-protein...",1,STRING
1433,Figure S1: Melt curve analysis of the RT-qPCR ...,1,STRING
1434,Protein-protein interactions (physical and fun...,1,STRING
1435,The proteins visualized via String database 12...,1,STRING


## 🧠 Tokenize and Split

In [None]:
from datasets import Dataset, DatasetDict
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("allenai/scibert_scivocab_uncased")

# 1. Convert to Hugging Face dataset
dataset = Dataset.from_pandas(df)

# 2. Define tokenizer (with chunking logic)
def tokenize(batch):
    # input_texts = [text + " [SEP] " + alias for text, alias in zip(batch["paragraph_text"], batch["matched_term"])]

    encodings = tokenizer(
        # input_texts,
        batch["matched_term"],
        batch["paragraph_text"],
        return_overflowing_tokens=True,
        truncation=True,
        stride=128,
        padding="max_length",
        max_length=512
    )

    # Expand the labels to match the number of chunks per example
    new_labels = []
    for i in range(len(encodings["input_ids"])):
        example_idx = encodings["overflow_to_sample_mapping"][i]
        new_labels.append(batch["label"][example_idx])

    return {
        "input_ids": encodings["input_ids"],
        "attention_mask": encodings["attention_mask"],
        "label": new_labels,
    }

# 3. Tokenize and remove non-input columns
tokenized_dataset = dataset.map(tokenize, batched=True, remove_columns=dataset.column_names)
print(tokenized_dataset)

# 4. Split
encoded_dataset = tokenized_dataset.train_test_split(test_size=0.2)

Map:   0%|          | 0/1437 [00:00<?, ? examples/s]

Dataset({
    features: ['label', 'input_ids', 'attention_mask'],
    num_rows: 1452
})


In [None]:
print(encoded_dataset)

DatasetDict({
    train: Dataset({
        features: ['label', 'input_ids', 'attention_mask'],
        num_rows: 1161
    })
    test: Dataset({
        features: ['label', 'input_ids', 'attention_mask'],
        num_rows: 291
    })
})


## 🚀 Fine-Tune SciBERT

In [17]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

model = AutoModelForSequenceClassification.from_pretrained("allenai/scibert_scivocab_uncased", num_labels=2)

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = logits.argmax(axis=-1)

    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average="binary")
    acc = accuracy_score(labels, preds)

    return {
        "accuracy": acc,
        "precision": precision,
        "recall": recall,
        "f1": f1,
    }

training_args = TrainingArguments(
    output_dir="./scibert_resource_classifier",
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=4,
    weight_decay=0.01,
    report_to="none",
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=encoded_dataset["train"],
    eval_dataset=encoded_dataset["test"],
    compute_metrics=compute_metrics
)




Some weights of BertForSequenceClassification were not initialized from the model checkpoint at allenai/scibert_scivocab_uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
trainer.train()

In [None]:
metrics = trainer.evaluate()
print(metrics)

{'eval_loss': 0.07801998406648636, 'eval_accuracy': 0.9862542955326461, 'eval_precision': 0.9845360824742269, 'eval_recall': 0.9947916666666666, 'eval_f1': 0.9896373056994818, 'eval_runtime': 8.2603, 'eval_samples_per_second': 35.229, 'eval_steps_per_second': 4.479, 'epoch': 4.0}


## 💾 Save Trained Model

In [None]:
model.save_pretrained("scibert_resource_classifier")
tokenizer.save_pretrained("scibert_resource_classifier")

!zip -r scibert_resource_classifier.zip scibert_resource_classifier
from google.colab import files
files.download("scibert_resource_classifier.zip")


  adding: scibert_resource_classifier/ (stored 0%)
  adding: scibert_resource_classifier/checkpoint-584/ (stored 0%)
  adding: scibert_resource_classifier/checkpoint-584/training_args.bin (deflated 52%)
  adding: scibert_resource_classifier/checkpoint-584/config.json (deflated 49%)
  adding: scibert_resource_classifier/checkpoint-584/model.safetensors (deflated 7%)
  adding: scibert_resource_classifier/checkpoint-584/trainer_state.json (deflated 70%)
  adding: scibert_resource_classifier/checkpoint-584/optimizer.pt (deflated 24%)
  adding: scibert_resource_classifier/checkpoint-584/scheduler.pt (deflated 56%)
  adding: scibert_resource_classifier/checkpoint-584/rng_state.pth (deflated 25%)
  adding: scibert_resource_classifier/tokenizer_config.json (deflated 74%)
  adding: scibert_resource_classifier/config.json (deflated 49%)
  adding: scibert_resource_classifier/model.safetensors (deflated 7%)
  adding: scibert_resource_classifier/special_tokens_map.json (deflated 42%)
  adding: scib

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
!ls -lh

total 4.4G
drwxr-xr-x 1 root root 4.0K Jun 17 13:37 sample_data
drwxr-xr-x 6 root root 4.0K Jun 19 13:14 scibert_resource_classifier
-rw-r--r-- 1 root root 4.4G Jun 19 13:34 scibert_resource_classifier.zip
-rw-r--r-- 1 root root 366K Jun 19 12:48 training_set_sentences.full.csv
