## Google Drive Mounting

In [1]:
# Mount Google Drive at /content/drive so the project files stored there are reachable.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
%cd /content/drive/MyDrive/

/content/drive/MyDrive


In [3]:
%cd cis5300_project

/content/drive/MyDrive/cis5300_project


In [20]:
# Absolute path of the project folder on the mounted Drive.
PROJECT_ROOT = '/content/drive/MyDrive/cis5300_project'

# IPython expands $PROJECT_ROOT from the Python variable defined above.
%cd $PROJECT_ROOT

print(f"Changed directory to: {PROJECT_ROOT}")
print("\nContents of this folder (you should see 'data', 'src', 'notebooks'):")
# List the folder so the expected layout can be checked by eye.
!ls -l

/content/drive/MyDrive/cis5300_project
Changed directory to: /content/drive/MyDrive/cis5300_project

Contents of this folder (you should see 'data', 'src', 'notebooks'):
total 15
drwx------ 5 root root 4096 Nov 17 18:15 data
-rw------- 1 root root 1305 Nov 17 18:15 download_scifact.sh
drwx------ 2 root root 4096 Nov 17 18:15 notebooks
-rw------- 1 root root    0 Nov 17 18:15 README.md
-rw------- 1 root root  400 Nov 17 18:15 requirements.txt
-rw------- 1 root root  751 Nov 17 18:15 setup.sh
drwx------ 2 root root 4096 Nov 17 18:15 src


## Installing Dependencies

In [4]:

!pip install -r requirements.txt

Collecting rank-bm25>=0.2.2 (from -r requirements.txt (line 13))
  Downloading rank_bm25-0.2.2-py3-none-any.whl.metadata (3.2 kB)
Collecting rouge-score>=0.1.2 (from -r requirements.txt (line 16))
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting bert-score>=0.3.13 (from -r requirements.txt (line 17))
  Downloading bert_score-0.3.13-py3-none-any.whl.metadata (15 kB)
Collecting jsonlines>=4.0.0 (from -r requirements.txt (line 19))
  Downloading jsonlines-4.0.0-py3-none-any.whl.metadata (1.6 kB)
Collecting black>=23.11.0 (from -r requirements.txt (line 26))
  Downloading black-25.11.0-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl.metadata (85 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m85.2/85.2 kB[0m [31m7.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting flake8>=6.1.0 (from -r requirements.txt (line 27))
  Downloading flake8-7.3.0-py2.py3-none-any.whl.metadata (3.8 k

In [1]:
!pip install transformers



In [67]:
import os
import sys
import pandas as pd
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    Trainer,
    TrainingArguments
)
from sklearn.metrics import (
    accuracy_score,
    f1_score,
    classification_report,
    precision_recall_fscore_support
)

In [68]:
# Report the runtime environment before picking a compute device.
cuda_ok = torch.cuda.is_available()
print(f"PyTorch version: {torch.__version__}")
print(f"CUDA available: {cuda_ok}")


# Prefer the GPU when one is visible; otherwise fall back to the CPU.
device = torch.device("cuda" if cuda_ok else "cpu")
if cuda_ok:
    print(f"Using GPU: {torch.cuda.get_device_name(0)}")
else:
    print("Using CPU. (Training will be much slower!)")

PyTorch version: 2.8.0+cu126
CUDA available: True
Using GPU: Tesla T4


In [3]:
# Compare only (major, minor); anything after the second dot (e.g. "0+cu126") is ignored.
torch_version = tuple(map(int, torch.__version__.split('.')[:2]))

# The flag is True exactly when the installed PyTorch is older than 2.6.
USE_SAFETENSORS = torch_version < (2, 6)
if USE_SAFETENSORS:
    print("\nPyTorch version is < 2.6")
    print("   Loading models with safetensors instead...")
else:
    print("PyTorch version is compatible")

PyTorch version is compatible


## Data Paths and Label Constants

In [15]:
# Make modules under src/ importable. The membership guard keeps the cell
# idempotent: re-running it does not pile up duplicate sys.path entries.
if 'src' not in sys.path:
    sys.path.append('src')

In [16]:
DATA_DIR = 'data/pubmed_rct'
# Every split lives at <DATA_DIR>/<split>.txt, so build the three paths the same way.
TRAIN_PATH, DEV_PATH, TEST_PATH = (
    os.path.join(DATA_DIR, f'{split}.txt') for split in ('train', 'dev', 'test')
)

for split_path in (TRAIN_PATH, DEV_PATH, TEST_PATH):
    print(split_path)

data/pubmed_rct/train.txt
data/pubmed_rct/dev.txt
data/pubmed_rct/test.txt


In [17]:
LABELS = ['BACKGROUND', 'OBJECTIVE', 'METHODS', 'RESULTS', 'CONCLUSIONS']


# A label's integer id is its position in LABELS; build both lookup directions.
id2label = dict(enumerate(LABELS))
label2id = {name: idx for idx, name in id2label.items()}
print("Constants defined.")

Constants defined.


## Data Loading Function and Initial Loading of Data

In [18]:
def load_data(filepath, label_map=None):
    """Load sentences and integer label ids from a PubMed RCT-format text file.

    Each data line is expected to look like ``LABEL<TAB>sentence``. Blank lines
    and lines starting with ``###`` (abstract separators such as
    ``###24293578``) are skipped, as are lines without a tab and lines whose
    label is not a key of the label map.

    Args:
        filepath: Path of the UTF-8 text file to read.
        label_map: Optional mapping from label string to integer id. When
            omitted, the notebook-level ``label2id`` mapping is used.

    Returns:
        A ``(sentences, labels)`` tuple of two equally long lists: the
        sentence strings and their integer label ids, in file order.
    """
    if label_map is None:
        label_map = label2id

    sentences = []
    labels = []
    with open(filepath, 'r', encoding='utf-8') as f:
        for raw_line in f:
            line = raw_line.strip()
            # Skip abstract separators (like '###24293578') and empty lines
            if not line or line.startswith('###'):
                continue

            # Split on the FIRST tab only, so a sentence that itself contains
            # a tab is kept intact instead of being silently dropped.
            label, tab, sentence = line.partition('\t')
            if not tab:
                continue

            # Keep only the labels we are tracking, stored as integer ids.
            if label in label_map:
                sentences.append(sentence)
                labels.append(label_map[label])

    return sentences, labels

print("Data loading function defined.")

Data loading function defined.


In [21]:
# Parse the train and dev splits into parallel (sentence, label-id) lists.
train_texts, train_labels = load_data(TRAIN_PATH)
val_texts, val_labels = load_data(DEV_PATH)

n_train, n_val = len(train_texts), len(val_texts)
print(f"Training data loaded: {n_train} sentences.")
print(f"Validation data loaded: {n_val} sentences.")

# Show the first training example as a quick sanity check.
print("\n--- Sample ---")
first_label = train_labels[0]
print(f"Label: {id2label[first_label]} ({first_label})")
first_text = train_texts[0]
print(f"Text: {first_text}")

Training data loaded: 180040 sentences.
Validation data loaded: 30212 sentences.

--- Sample ---
Label: OBJECTIVE (1)
Text: To investigate the efficacy of @ weeks of daily low-dose oral prednisolone in improving pain , mobility , and systemic low-grade inflammation in the short term and whether the effect would be sustained at @ weeks in older adults with moderate to severe knee osteoarthritis ( OA ) .


## Model and Tokenizer Initialization

In [23]:
MODEL_CHECKPOINT = "allenai/scibert_scivocab_uncased"

# Load the tokenizer that belongs to the SciBERT checkpoint.
try:
    tokenizer = AutoTokenizer.from_pretrained(MODEL_CHECKPOINT)
except Exception as e:
    print(f"Error loading tokenizer: {e}")
    print("Please ensure you have an internet connection and 'transformers' is installed.")
    # Re-raise instead of swallowing: continuing would leave `tokenizer`
    # undefined and only fail later with a confusing NameError.
    raise
else:
    print("SciBERT tokenizer loaded successfully.")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/385 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

SciBERT tokenizer loaded successfully.


## Custom PyTorch Dataset Class Definition

In [24]:
class PubMedDataset(Dataset):
    """Dataset of pre-tokenized sentences paired with integer class labels.

    All texts are tokenized once in ``__init__`` (in chunks of ``batch_size``)
    and padded/truncated to ``max_len``, so ``__getitem__`` is a plain tensor
    lookup.
    """

    def __init__(self, texts, labels, tokenizer, max_len=128, batch_size=1000):
        """Tokenize ``texts`` up front and store the resulting tensors.

        Args:
            texts: Sequence of sentence strings.
            labels: Sequence of integer label ids, one per text.
            tokenizer: Callable tokenizer invoked on each chunk of texts; it
                must return a mapping with ``'input_ids'`` and
                ``'attention_mask'`` tensors.
            max_len: Length every sequence is padded or truncated to.
            batch_size: Number of texts tokenized per tokenizer call.

        Raises:
            ValueError: If ``texts`` and ``labels`` differ in length.
        """
        # A length mismatch would silently pair texts with the wrong labels
        # (or fail deep inside training), so reject it immediately.
        if len(texts) != len(labels):
            raise ValueError(
                f"texts and labels must have the same length "
                f"(got {len(texts)} and {len(labels)})"
            )

        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

        print(f"Tokenizing {len(self.texts)} texts in batches...")

        if len(self.texts) == 0:
            # torch.cat() rejects an empty list of tensors, so build empty
            # (0, max_len) tensors directly for an empty dataset.
            self.encodings = {
                'input_ids': torch.empty((0, self.max_len), dtype=torch.long),
                'attention_mask': torch.empty((0, self.max_len), dtype=torch.long)
            }
        else:
            # Per-chunk tokenizer outputs, concatenated after the loop.
            all_input_ids = []
            all_attention_masks = []

            for i in tqdm(range(0, len(self.texts), batch_size)):
                batch_texts = self.texts[i : i + batch_size]

                encodings = self.tokenizer(
                    batch_texts,
                    add_special_tokens=True,
                    max_length=self.max_len,
                    padding='max_length',
                    truncation=True,
                    return_token_type_ids=False,
                    return_attention_mask=True,
                    return_tensors='pt'
                )

                all_input_ids.append(encodings['input_ids'])
                all_attention_masks.append(encodings['attention_mask'])

            # Concatenate all chunks into single tensors
            self.encodings = {
                'input_ids': torch.cat(all_input_ids, dim=0),
                'attention_mask': torch.cat(all_attention_masks, dim=0)
            }

        print("Tokenization complete.")

    def __len__(self):
        """Return the number of examples (one per label)."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return the example at ``idx`` as a dict of tensors.

        The dict has the keys ``'input_ids'``, ``'attention_mask'`` and
        ``'labels'`` (the label as a ``torch.long`` scalar tensor).
        """
        return {
            'input_ids': self.encodings['input_ids'][idx],
            'attention_mask': self.encodings['attention_mask'][idx],
            'labels': torch.tensor(self.labels[idx], dtype=torch.long)
        }

print("PubMedDataset class defined.")

PubMedDataset class defined.


### Instantiate Dataset Objects

In [29]:
# Both splits are padded/truncated to the same sequence length.
MAX_LEN = 128

train_dataset = PubMedDataset(train_texts, train_labels, tokenizer, max_len=MAX_LEN)
val_dataset = PubMedDataset(val_texts, val_labels, tokenizer, max_len=MAX_LEN)

print("\n--- Dataset objects created ---")

Tokenizing 180040 texts...
Tokenization complete.
Tokenizing 30212 texts...
Tokenization complete.

--- Dataset objects created ---


In [30]:
# Pull one encoded example and check its tensor shapes and label.
sample = train_dataset[0]
print("\nSample from train_dataset:")
for field_title, field_key in (("Input IDs", 'input_ids'), ("Attention Mask", 'attention_mask')):
    print(f"{field_title} shape: {sample[field_key].shape}")
print(f"Label: {sample['labels']}")

# Decode the leading token ids back to text to confirm the tokenization looks right.
print("\nDecoded Input IDs (first 20 tokens):")
leading_ids = sample['input_ids'][:20]
print(tokenizer.decode(leading_ids))


Sample from train_dataset:
Input IDs shape: torch.Size([128])
Attention Mask shape: torch.Size([128])
Label: 1

Decoded Input IDs (first 20 tokens):
[CLS] to investigate the efficacy of @ weeks of daily low - dose oral prednisolone in improving pain,


## Load Pre-trained SciBERT Model for Classification

In [33]:
print("\nLoading model")
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_CHECKPOINT,      # the base model "allenai/scibert_scivocab_uncased"
    num_labels=len(LABELS),
    # Pass the label dictionaries so they are stored in the model's config.
    id2label=id2label,  # The {0: 'BACKGROUND', ...} map
    label2id=label2id,  # The {'BACKGROUND': 0, ...} map
    # Honor the flag set by the PyTorch version check: force safetensors
    # weights on PyTorch < 2.6, otherwise pass None to keep the library default.
    use_safetensors=True if USE_SAFETENSORS else None,
)


Loading model


pytorch_model.bin:   0%|          | 0.00/442M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/442M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at allenai/scibert_scivocab_uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [34]:
# Move the model's parameters onto the selected device.
model = model.to(device)

print(f"\nModel '{MODEL_CHECKPOINT}' loaded successfully.")
print(f"Model is on device: {model.device}")


Model 'allenai/scibert_scivocab_uncased' loaded successfully.
Model is on device: cuda:0


## Defining Training Arguments

In [37]:
training_args = TrainingArguments(
    # Output and Logging
    output_dir='./models/scibert_section_classifier',
    logging_steps=1000,

    # Evaluation
    eval_strategy="epoch",        # Run evaluation at the end of each epoch
    save_strategy="epoch",        # Save the model at the end of each epoch
    load_best_model_at_end=True,  # Load the best model (based on loss) when training is done

    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,

    # Optimizer and Scheduler
    learning_rate=2e-5,
    weight_decay=0.01,
    warmup_steps=500,

    report_to="none",
    # Mixed precision needs a GPU; enabling it only when CUDA is present keeps
    # the CPU fallback chosen earlier in the notebook usable.
    fp16=torch.cuda.is_available()
)

print("TrainingArguments defined.")

TrainingArguments defined.


## Define Evaluation Metrics Function

In [39]:
def compute_metrics(pred):
    """Compute accuracy and macro-averaged precision/recall/F1 for one evaluation pass.

    Args:
        pred: Object exposing ``label_ids`` (gold label ids) and
            ``predictions`` (per-class scores; the argmax over the last axis
            is taken as the predicted class).

    Returns:
        Dict with the keys ``'accuracy'``, ``'f1'``, ``'precision'`` and
        ``'recall'``.
    """
    y_true = pred.label_ids
    # The predicted class is the index of the largest score along the last axis.
    y_pred = pred.predictions.argmax(-1)

    # Macro averaging weights every class equally, regardless of its frequency.
    macro_precision, macro_recall, macro_f1, _support = precision_recall_fscore_support(
        y_true, y_pred, average='macro'
    )

    return {
        'accuracy': accuracy_score(y_true, y_pred),
        'f1': macro_f1,
        'precision': macro_precision,
        'recall': macro_recall
    }

print("compute_metrics function defined.")

compute_metrics function defined.


## Instantiate the Trainer

In [69]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
    # `tokenizer=` is deprecated in recent transformers releases in favor of
    # `processing_class=`; passing it keeps the tokenizer saved with the model.
    processing_class=tokenizer
)

print("\n--- Trainer is initialized and ready. ---")


--- Trainer is initialized and ready. ---


  trainer = Trainer(


In [43]:
print("--- Starting training ---")

# Fine-tune with the settings in `training_args`; keep the returned result for later inspection.
train_output = trainer.train()

print("--- Training complete ---")

--- Starting training ---


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.3571,0.330231,0.881471,0.823963,0.845541,0.817157
2,0.2817,0.34662,0.884847,0.832179,0.844701,0.8278
3,0.2125,0.389597,0.882067,0.828301,0.834475,0.825121


--- Training complete ---


In [45]:
# With no argument, save_model() writes to the Trainer's configured output directory.
save_dir = training_args.output_dir
print("Saving the best model...")
trainer.save_model()
print(f"Model saved to {save_dir}")

Saving the best model...
Model saved to ./models/scibert_section_classifier


In [47]:
print("\n--- Making predictions on the validation set ---")
# predict() returns the raw model outputs together with the computed metrics.
predictions_output = trainer.predict(val_dataset)
val_metrics = predictions_output.metrics
print(f"Prediction metrics: {val_metrics}")


--- Making predictions on the validation set ---


Prediction metrics: {'test_loss': 0.33023136854171753, 'test_accuracy': 0.8814709386998544, 'test_f1': 0.8239633991754761, 'test_precision': 0.8455411427051717, 'test_recall': 0.8171571603555432, 'test_runtime': 51.6495, 'test_samples_per_second': 584.943, 'test_steps_per_second': 9.158}


In [48]:
# The predicted class for each row is the column index with the largest score.
predicted_class_ids = predictions_output.predictions.argmax(axis=1)

In [49]:
# Translate the integer class ids back into their label strings.
predicted_labels = [id2label[label_id] for label_id in predicted_class_ids]

In [51]:
# Make sure the target directory exists even if this cell runs before a model was saved.
os.makedirs(training_args.output_dir, exist_ok=True)
output_pred_file = os.path.join(training_args.output_dir, "val_predictions.txt")
# Explicit UTF-8 so the file does not depend on the platform's default encoding.
with open(output_pred_file, 'w', encoding='utf-8') as f:
    f.writelines(f"{label}\n" for label in predicted_labels)

print(f"\nPredictions saved to: {output_pred_file}")
print("Few sample predictions:")
# Zip over slices so fewer than 10 predictions does not raise an IndexError.
for predicted, actual_id in zip(predicted_labels[:10], val_labels[:10]):
    print(f"Prediction: {predicted:<12} | Actual: {id2label[actual_id]:<12}")


Predictions saved to: ./models/scibert_section_classifier/val_predictions.txt
Few sample predictions:
Prediction: BACKGROUND   | Actual: BACKGROUND  
Prediction: BACKGROUND   | Actual: BACKGROUND  
Prediction: BACKGROUND   | Actual: OBJECTIVE   
Prediction: METHODS      | Actual: METHODS     
Prediction: METHODS      | Actual: METHODS     
Prediction: RESULTS      | Actual: RESULTS     
Prediction: RESULTS      | Actual: RESULTS     
Prediction: RESULTS      | Actual: RESULTS     
Prediction: RESULTS      | Actual: RESULTS     
Prediction: CONCLUSIONS  | Actual: CONCLUSIONS 


In [65]:
print("--- Calculating metrics manually (bypassing script) ---")

gold_labels_list = val_labels
pred_labels_list = predicted_class_ids

n_gold, n_pred = len(gold_labels_list), len(pred_labels_list)
print(f"Total gold labels:     {n_gold}")
print(f"Total predicted labels: {n_pred}")

# Scores are only meaningful when there is exactly one prediction per gold label.
if n_gold != n_pred:
    print("\nError: Mismatch in label counts even in the notebook. This is unexpected.")
else:
    print("\nLabel counts match. Calculating scores...")

    # Headline numbers: overall accuracy and macro-averaged F1.
    acc = accuracy_score(gold_labels_list, pred_labels_list)
    macro_f1 = f1_score(gold_labels_list, pred_labels_list, average="macro")

    print("\n--- Final Metrics for Milestone 2 Report ---")
    print(f"Accuracy: {acc:.4f}")
    print(f"Macro-F1: {macro_f1:.4f}")

    print('\n\n')
    print("\n--- Detailed Classification Report ---")

    # Per-class breakdown, with class names listed in label-id order.
    target_names = [id2label[label_id] for label_id in range(len(LABELS))]
    report = classification_report(
        gold_labels_list, pred_labels_list, target_names=target_names, digits=4
    )
    print(report)

--- Calculating metrics manually (bypassing script) ---
Total gold labels:     30212
Total predicted labels: 30212

Label counts match. Calculating scores...

--- Final Metrics for Milestone 2 Report ---
Accuracy: 0.8815
Macro-F1: 0.8240




--- Detailed Classification Report ---
              precision    recall  f1-score   support

  BACKGROUND     0.6919    0.8202    0.7506      3449
   OBJECTIVE     0.8241    0.5324    0.6469      2376
     METHODS     0.9357    0.9556    0.9456      9964
     RESULTS     0.9298    0.9261    0.9280      9841
 CONCLUSIONS     0.8462    0.8514    0.8488      4582

    accuracy                         0.8815     30212
   macro avg     0.8455    0.8172    0.8240     30212
weighted avg     0.8836    0.8815    0.8794     30212



In [57]:
!git status

Refresh index: 100% (50/50), done.
On branch main
Your branch is up to date with 'origin/main'.

Untracked files:
  (use "git add <file>..." to include in what will be committed)
	[31mmodels/[m
	[31mnotebooks/train_scibert_section_classification.ipynb[m

nothing added to commit but untracked files present (use "git add" to track)
