<a href="https://colab.research.google.com/github/asxd-10/cis5300_project/blob/main/notebooks/simple_baseline_section_classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Mount Google Drive into the Colab runtime at /content/drive.
# NOTE(review): no later cell visible here reads from or writes to
# /content/drive (data comes from the cloned repo) — confirm the mount is needed.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:

import torch
import os

# Query CUDA availability once and report the device name when one is visible.
has_cuda = torch.cuda.is_available()
gpu_name = torch.cuda.get_device_name(0) if has_cuda else 'None'
print(f"CUDA available: {has_cuda}")
print(f"GPU: {gpu_name}")

CUDA available: True
GPU: Tesla T4


In [3]:
!pip install -q transformers datasets jsonlines scikit-learn

In [4]:
!git clone https://github.com/asxd-10/cis5300_project.git

Cloning into 'cis5300_project'...
remote: Enumerating objects: 155, done.[K
remote: Counting objects: 100% (155/155), done.[K
remote: Compressing objects: 100% (131/131), done.[K
remote: Total 155 (delta 70), reused 71 (delta 17), pack-reused 0 (from 0)[K
Receiving objects: 100% (155/155), 14.14 MiB | 7.11 MiB/s, done.
Resolving deltas: 100% (70/70), done.


In [5]:
import sys
sys.path.append('cis5300_project')

print('Contents of cis5300_project directory:')
!ls -F cis5300_project/

def load_pubmed_rct(path):
    """Read a PubMed RCT split file into a list of ``(label, sentence)`` tuples.

    Blank lines and lines starting with ``###`` are skipped. Every other line
    must have the form ``LABEL<TAB>sentence``; only the first tab separates the
    label from the sentence, so later tabs stay inside the sentence.

    Args:
        path: Path of the text file to read (decoded as UTF-8).

    Returns:
        A list of ``(label, sentence)`` string tuples in file order.

    Raises:
        ValueError: If a data line contains no tab separator. The message
            names the file, the 1-based line number and the offending text.
    """
    data = []
    with open(path, "r", encoding="utf-8") as f:
        for line_number, raw_line in enumerate(f, start=1):
            line = raw_line.strip()
            if not line or line.startswith("###"):
                continue
            # partition() never raises, so a missing tab can be reported with
            # context instead of an opaque tuple-unpacking error.
            label, separator, sentence = line.partition("\t")
            if not separator:
                raise ValueError(
                    f"{path}:{line_number}: expected 'LABEL<TAB>sentence', got {line!r}"
                )
            data.append((label, sentence))
    return data

print("Loading PubMed RCT data")

# Parse the train / dev / test files from the cloned repository.
train_data = load_pubmed_rct('cis5300_project/data/pubmed_rct/train.txt')
dev_data   = load_pubmed_rct('cis5300_project/data/pubmed_rct/dev.txt')
test_data  = load_pubmed_rct('cis5300_project/data/pubmed_rct/test.txt')

# Report how many labelled sentences each split contains.
for split_name, split_rows in (
    ("training", train_data),
    ("dev", dev_data),
    ("test", test_data),
):
    print(f"{len(split_rows)} {split_name} sentences")

Contents of cis5300_project directory:
data/		     notebooks/  requirements.txt  src/
download_scifact.sh  README.md	 setup.sh
Loading PubMed RCT data
180040 training sentences
30212 dev sentences
30135 test sentences


In [9]:
# Peek at the first parsed (label, sentence) pair as a sanity check of the loader.
print(train_data[0])

('OBJECTIVE', 'To investigate the efficacy of @ weeks of daily low-dose oral prednisolone in improving pain , mobility , and systemic low-grade inflammation in the short term and whether the effect would be sustained at @ weeks in older adults with moderate to severe knee osteoarthritis ( OA ) .')


In [10]:
# Distinct section labels found in the training split, in alphabetical order.
labels = sorted({tag for tag, _sentence in train_data})
labels

['BACKGROUND', 'CONCLUSIONS', 'METHODS', 'OBJECTIVE', 'RESULTS']

In [11]:
# Preprocessing: only label-to-id mapping and lowercasing for the simple baseline.

In [12]:
# Fixed mapping from section label to integer class id, plus its inverse.
label2id = dict(
    BACKGROUND=0,
    OBJECTIVE=1,
    METHODS=2,
    RESULTS=3,
    CONCLUSIONS=4,
)
id2label = {idx: name for name, idx in label2id.items()}


In [13]:
def preprocess(text: str) -> str:
    """Return ``text`` with surrounding whitespace removed and lowercased."""
    trimmed = text.strip()
    return trimmed.lower()

In [14]:
# For every split: normalise each sentence with preprocess() and encode its
# section label as an integer id via label2id.
train_texts = [preprocess(sentence) for _tag, sentence in train_data]
train_labels = [label2id[tag] for tag, _sentence in train_data]

dev_texts = [preprocess(sentence) for _tag, sentence in dev_data]
dev_labels = [label2id[tag] for tag, _sentence in dev_data]

test_texts = [preprocess(sentence) for _tag, sentence in test_data]
test_labels = [label2id[tag] for tag, _sentence in test_data]

In [20]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF configuration: lowercase input, unigrams and bigrams, vocabulary
# capped at 50000 features.
tfidf_settings = {
    "lowercase": True,
    "ngram_range": (1, 2),
    "max_features": 50000,
}
vectorizer = TfidfVectorizer(**tfidf_settings)

# Fit on the training sentences only.
# NOTE(review): no active cell visible here calls vectorizer.transform(...);
# only the commented-out logistic-regression cells refer to TF-IDF features.
vectorizer.fit(train_texts)

In [23]:
from collections import Counter

# Majority-class baseline: pick the label id that occurs most often in training.
label_counts = Counter(train_labels)
majority_label, majority_count = label_counts.most_common(1)[0]

# Predict that single label for every dev and test sentence.
pred_dev = [majority_label for _ in dev_labels]
pred_test = [majority_label for _ in test_labels]

In [None]:
# def majority_baseline_predict(data, majority_label):
#     return [majority_label for _ in data]

In [None]:
# def accuracy(gold, pred):
#     correct = sum(g == p for g, p in zip(gold, pred))
#     return correct / len(gold)

In [24]:
from sklearn.metrics import accuracy_score, f1_score

# Score the dev-set predictions: overall accuracy and unweighted (macro) F1.
acc_dev = accuracy_score(y_true=dev_labels, y_pred=pred_dev)
macro_f1_dev = f1_score(y_true=dev_labels, y_pred=pred_dev, average='macro')

print("Dev Accuracy:", acc_dev)
print("Dev Macro-F1:", macro_f1_dev)

Dev Accuracy: 0.32980272739308886
Dev Macro-F1: 0.09920350457984867


In [25]:
# Save the dev predictions, one label id per line.
with open("simple_baseline_dev_predictions.txt", "w") as out_file:
    out_file.writelines(f"{label_id}\n" for label_id in pred_dev)

In [26]:
# Save the test predictions, one label id per line.
with open("simple_baseline_test_predictions.txt", "w") as out_file:
    out_file.writelines(f"{label_id}\n" for label_id in pred_test)

In [17]:
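# NOTE: disabled logistic-regression baseline. Re-enabling it also requires
# building X_train / X_dev / X_test (e.g. with vectorizer.transform(...)),
# which no active cell in this notebook currently does.
# from sklearn.linear_model import LogisticRegression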

# # Define classifier
# clf = LogisticRegression(
#     max_iter=2000,
#     multi_class='multinomial',
#     solver='lbfgs',
#     class_weight='balanced'
# )

# clf.fit(X_train, train_labels)



In [18]:
# pred_dev = clf.predict(X_dev)

# pred_test = clf.predict(X_test)

In [19]:
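# NOTE: disabled evaluation of the logistic-regression baseline. The output
# kept below this cell appears to come from an earlier run of this code and
# does not reflect the majority-class baseline that is currently active.
# from sklearn.metrics import accuracy_score, f1_score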

# # Dev set metrics
# acc_dev = accuracy_score(dev_labels, pred_dev)
# macro_f1_dev = f1_score(dev_labels, pred_dev, average='macro')

# print("Dev Accuracy:", acc_dev)
# print("Dev Macro-F1:", macro_f1_dev)

# # Test set metrics
# acc_test = accuracy_score(test_labels, pred_test)
# macro_f1_test = f1_score(test_labels, pred_test, average='macro')

# print("Test Accuracy:", acc_test)
# print("Test Macro-F1:", macro_f1_test)

Dev Accuracy: 0.8176552363299351
Dev Macro-F1: 0.7598782842808343
Test Accuracy: 0.8098556495769039
Test Macro-F1: 0.7513176986166511
