# 🔬 Fine-tune SciBERT for Resource Mention Classification
This notebook loads your training data, formats it for SciBERT, fine-tunes the model, and saves it.

In [None]:
!pip install "transformers>=4.30"
!pip install --quiet datasets scikit-learn

In [None]:
# Switch on Colab's data-table formatter for pandas DataFrames shown as cell output.
import google.colab.data_table as data_table

data_table.enable_dataframe_formatter()

## 📤 Upload Training CSV

In [None]:

from google.colab import files

# Open Colab's upload dialog; the returned mapping is keyed by uploaded file name.
uploaded = files.upload()

# Only the first uploaded file is taken as the input file.
infile = [*uploaded][0]
print(f"input file name: {infile}")

In [None]:
# from google.colab import drive
# drive.mount('/content/drive')

## 🧹 Prepare the Dataset

In [None]:
import pandas as pd

# NOTE(review): this assignment replaces whatever `infile` the upload cell set, so the
# uploaded file is only used if it has exactly this name — confirm that is intended.
infile = "training_set_sentences.full.csv"
# infile = 'train_split.csv'

# Only these three columns are consumed downstream (text pair + target).
required_columns = ["paragraph_text", "label", "matched_term"]
df = pd.read_csv(infile)[required_columns]

# Rows with a missing paragraph/term would reach the tokenizer as None/NaN and make it
# fail, and a missing label turns the whole label column into floats. Drop such rows,
# report how many were removed, and restore a clean 0..n-1 index.
n_rows_before = len(df)
df = df.dropna(subset=required_columns).reset_index(drop=True)
if len(df) != n_rows_before:
    print(f"Dropped {n_rows_before - len(df)} rows with missing values")

# The classifier is trained with integer class ids (num_labels=2).
df["label"] = df["label"].astype(int)
df


## 🧠 Tokenize and Split

In [None]:
from datasets import Dataset, DatasetDict
from transformers import AutoTokenizer
# Tokenizer that matches the SciBERT checkpoint fine-tuned below.
tokenizer = AutoTokenizer.from_pretrained(
    "allenai/scibert_scivocab_uncased",
)

# 1. Wrap the pandas frame in a Hugging Face Dataset
dataset = Dataset.from_pandas(df)

# 2. Define tokenizer (with chunking logic)
def tokenize(batch):
    """Tokenize (matched_term, paragraph_text) pairs, chunking long paragraphs.

    Each example is encoded as a sentence pair: the matched term is the first
    segment and the paragraph is the second. Paragraphs that do not fit into
    512 tokens are split into overlapping windows (128-token stride), so one
    input example can yield several output rows.

    Args:
        batch: Mapping of column name to list, with the keys "matched_term",
            "paragraph_text" and "label" (the shape `Dataset.map(batched=True)`
            passes in).

    Returns:
        Dict with "input_ids", "attention_mask" and "label" lists, one entry per
        produced window, plus "token_type_ids" when the tokenizer emits them.
        Every window carries the label of the example it was cut from.
    """
    encodings = tokenizer(
        batch["matched_term"],
        batch["paragraph_text"],
        return_overflowing_tokens=True,
        # Only the paragraph may be truncated/windowed; the matched term must be
        # present, intact, in every chunk. The default "longest_first" strategy
        # gives no such guarantee for pair inputs.
        truncation="only_second",
        stride=128,
        padding="max_length",
        max_length=512,
    )

    # overflow_to_sample_mapping[i] is the index (within this batch) of the
    # example that window i came from; use it to replicate labels per window.
    sample_map = encodings["overflow_to_sample_mapping"]
    labels = batch["label"]

    features = {
        "input_ids": encodings["input_ids"],
        "attention_mask": encodings["attention_mask"],
        "label": [labels[sample_idx] for sample_idx in sample_map],
    }

    # Keep the segment ids so the model can tell the term segment from the
    # paragraph segment; without them both segments are treated as segment 0.
    if "token_type_ids" in encodings:
        features["token_type_ids"] = encodings["token_type_ids"]

    return features

# 3. Split at the example level BEFORE chunking.
# `tokenize` cuts long paragraphs into overlapping windows; splitting after that step
# would scatter windows of the same paragraph (sharing up to 128 tokens and the same
# label) across train and test, leaking test content into training and inflating the
# evaluation metrics.
# NOTE(review): rows that share the same paragraph_text with different matched_term
# values can still land on both sides; group by paragraph if that matters.
from datasets import concatenate_datasets

raw_splits = dataset.train_test_split(test_size=0.2, seed=42)

# 4. Tokenize each split and remove non-input columns
encoded_dataset = raw_splits.map(tokenize, batched=True, remove_columns=dataset.column_names)

# Flat view of every tokenized window (train followed by test), kept under the
# original name for any later cell that refers to `tokenized_dataset`.
tokenized_dataset = concatenate_datasets([encoded_dataset["train"], encoded_dataset["test"]])
print(tokenized_dataset)

In [None]:
# Print the train/test splits produced in the previous cell.
print(encoded_dataset)

## 🚀 Fine-Tune SciBERT

In [None]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

# Load the SciBERT checkpoint with a sequence-classification head for two classes.
model = AutoModelForSequenceClassification.from_pretrained(
    "allenai/scibert_scivocab_uncased",
    num_labels=2,
)

def compute_metrics(eval_pred):
    """Score predictions with accuracy and binary precision/recall/F1.

    Args:
        eval_pred: A pair ``(logits, labels)``; the predicted class of each row
            is the arg-max of ``logits`` over its last axis.

    Returns:
        Dict with the keys "accuracy", "precision", "recall" and "f1".
    """
    scores, gold = eval_pred
    predicted = scores.argmax(axis=-1)

    prec, rec, f_score, _support = precision_recall_fscore_support(
        gold, predicted, average="binary"
    )

    return dict(
        accuracy=accuracy_score(gold, predicted),
        precision=prec,
        recall=rec,
        f1=f_score,
    )

training_args = TrainingArguments(
    output_dir="./scibert_resource_classifier",
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=4,
    weight_decay=0.01,
    report_to="none",
    # Reload the checkpoint with the best validation F1 when training finishes, so
    # the model that is evaluated and exported afterwards is the best epoch rather
    # than simply the last one. Requires eval_strategy == save_strategy (both "epoch").
    load_best_model_at_end=True,
    metric_for_best_model="f1",  # the "f1" key returned by compute_metrics
    # Keep only a couple of checkpoints on disk instead of one per epoch.
    save_total_limit=2,
)

# Train on the "train" split and evaluate on the held-out "test" split.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=encoded_dataset["train"],
    eval_dataset=encoded_dataset["test"],
    compute_metrics=compute_metrics,
)




In [None]:
# Run fine-tuning with the Trainer configured in the previous cell.
trainer.train()

In [None]:
# Evaluate on the Trainer's eval_dataset (the "test" split) and print the resulting metrics.
metrics = trainer.evaluate()
print(metrics)

## 💾 Save Trained Model

In [None]:
model.save_pretrained("scibert_resource_classifier")
tokenizer.save_pretrained("scibert_resource_classifier")

!zip -r scibert_resource_classifier.zip scibert_resource_classifier
from google.colab import files
files.download("scibert_resource_classifier.zip")


In [None]:
# List the current working directory with human-readable file sizes.
!ls -lh