<a href="https://colab.research.google.com/github/asxd-10/cis5300_project/blob/main/cis5300_project/notebooks/section_classification_simple_baseline.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
# Mount Google Drive at /content/drive. This relies on the `google.colab`
# package, so the cell only runs inside a Colab runtime.
# NOTE(review): the cells shown here load data from the cloned git repo, not
# from this mount — confirm the Drive mount is still needed.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
import torch
import os

# Ask torch about CUDA once and reuse the answer for both report lines.
cuda_available = torch.cuda.is_available()
# Device 0's name when CUDA is usable, otherwise the literal string 'None'.
gpu_name = torch.cuda.get_device_name(0) if cuda_available else 'None'

print(f"CUDA available: {cuda_available}")
print(f"GPU: {gpu_name}")

CUDA available: True
GPU: Tesla T4


In [4]:
!pip install -q transformers datasets jsonlines scikit-learn

In [5]:
!git clone https://github.com/asxd-10/cis5300_project.git

Cloning into 'cis5300_project'...
remote: Enumerating objects: 96, done.[K
remote: Counting objects: 100% (96/96), done.[K
remote: Compressing objects: 100% (73/73), done.[K
remote: Total 96 (delta 30), reused 73 (delta 17), pack-reused 0 (from 0)[K
Receiving objects: 100% (96/96), 14.13 MiB | 11.70 MiB/s, done.
Resolving deltas: 100% (30/30), done.


In [6]:
import sys
sys.path.append('cis5300_project')

print('Contents of cis5300_project directory:')
!ls -F cis5300_project/

def load_pubmed_rct(path):
    """Read a PubMed RCT split file into a list of ``(label, sentence)`` pairs.

    Every line is stripped of surrounding whitespace. Blank lines and lines
    starting with ``###`` (presumably per-abstract ID headers — confirm against
    the dataset docs) are skipped. Each remaining line must have the form
    ``LABEL<TAB>sentence``; only the first tab separates the two fields, so any
    further tabs stay inside the sentence.

    Args:
        path: Location of the split file; it is opened as UTF-8 text.

    Returns:
        A list of ``(label, sentence)`` string tuples in file order.

    Raises:
        ValueError: If a data line contains no tab separator. The message
            names the file and the 1-based line number so the bad record can
            be found quickly.
    """
    data = []
    with open(path, "r", encoding="utf-8") as f:
        for line_no, raw_line in enumerate(f, start=1):
            line = raw_line.strip()
            if not line or line.startswith("###"):
                continue
            # partition() splits on the first tab only, like split("\t", 1),
            # but lets us detect a missing separator without an unpack error.
            label, sep, sentence = line.partition("\t")
            if not sep:
                raise ValueError(
                    f"{path}:{line_no}: expected 'LABEL<TAB>sentence', got {line!r}"
                )
            data.append((label, sentence))
    return data

print("Loading PubMed RCT data")

# Read the three splits with the same loader; the path literals are unchanged.
train_data, dev_data, test_data = map(
    load_pubmed_rct,
    (
        'cis5300_project/data/pubmed_rct/train.txt',
        'cis5300_project/data/pubmed_rct/dev.txt',
        'cis5300_project/data/pubmed_rct/test.txt',
    ),
)

# Report the number of (label, sentence) pairs per split.
for split_name, split_rows in (
    ("training", train_data),
    ("dev", dev_data),
    ("test", test_data),
):
    print(f"{len(split_rows)} {split_name} sentences")

Contents of cis5300_project directory:
data/		     notebooks/  requirements.txt  src/
download_scifact.sh  README.md	 setup.sh
Loading PubMed RCT data
180040 training sentences
30212 dev sentences
30135 test sentences


In [7]:
# Preprocessing: the simple baseline only needs a label-to-id mapping and lowercasing.

In [8]:
# Section label -> integer id. A label's position in the tuple is its id.
label2id = {
    name: idx
    for idx, name in enumerate(
        ("BACKGROUND", "OBJECTIVE", "METHODS", "RESULTS", "CONCLUSIONS")
    )
}
# Inverse lookup (id -> label). Enumerating the keys in insertion order
# reproduces the ids because they were assigned by position above.
id2label = dict(enumerate(label2id))


In [9]:
def preprocess(text):
    """Normalize a sentence for the simple baseline.

    Args:
        text: The raw sentence string.

    Returns:
        The sentence with leading/trailing whitespace removed and every
        character lowercased.
    """
    trimmed = text.strip()
    return trimmed.lower()

In [10]:
from collections import Counter

# Tally how often each section label occurs in the training split.
label_counts = Counter(label for label, _sentence in train_data)
print(label_counts)

# most_common(1) yields a single (label, count) pair; keep only the label.
top_entry = label_counts.most_common(1)[0]
majority_label = top_entry[0]
print("Majority Label:", majority_label)

Counter({'METHODS': 59353, 'RESULTS': 57953, 'CONCLUSIONS': 27168, 'BACKGROUND': 21727, 'OBJECTIVE': 13839})
Majority Label: METHODS


In [11]:
def majority_baseline_predict(data, majority_label):
    """Predict the same label for every example.

    Args:
        data: Iterable of examples; only the number of items matters, their
            contents are never inspected.
        majority_label: The label to emit for each example.

    Returns:
        A list containing ``majority_label`` once per item in ``data``.
    """
    predictions = []
    for _example in data:
        predictions.append(majority_label)
    return predictions

In [12]:
def accuracy(gold, pred):
    """Return the fraction of positions where ``pred`` equals ``gold``.

    Args:
        gold: Iterable of reference labels.
        pred: Iterable of predicted labels, aligned with ``gold``.

    Returns:
        ``matches / len(gold)`` as a float, or ``0.0`` when both inputs are
        empty (there is nothing to get right, and this avoids a
        ZeroDivisionError).

    Raises:
        ValueError: If ``gold`` and ``pred`` differ in length. Without this
            check ``zip`` would silently drop the unmatched tail and the
            result would be wrong rather than an error.
    """
    # Materialize once so generators work and lengths can be compared.
    gold = list(gold)
    pred = list(pred)
    if len(gold) != len(pred):
        raise ValueError(
            f"gold and pred must have the same length "
            f"(got {len(gold)} and {len(pred)})"
        )
    if not gold:
        return 0.0
    correct = sum(g == p for g, p in zip(gold, pred))
    return correct / len(gold)

In [13]:
from sklearn.metrics import f1_score

# Gold labels come straight from the dev tuples; the predictions are the
# constant majority label, one per dev example.
gold_dev = [gold_label for gold_label, _sentence in dev_data]
pred_dev = majority_baseline_predict(dev_data, majority_label)

# Score the baseline with both metrics, then report them.
acc = accuracy(gold_dev, pred_dev)
macro_f1 = f1_score(gold_dev, pred_dev, average="macro")

print("Dev Accuracy:", acc)
print("Dev Macro-F1:", macro_f1)

Dev Accuracy: 0.32980272739308886
Dev Macro-F1: 0.09920350457984867
