In [13]:
import pandas as pd

def parse_pubmed_rct(file_path):
    abstracts = []
    with open(file_path, 'r') as f:
        current_abstract = []
        for line in f:
            line = line.strip()
            if line.startswith('###'):
                if current_abstract:
                    abstracts.append(current_abstract)
                current_abstract = []
            elif line and '\t' in line:
                label, sentence = line.split('\t', 1)
                current_abstract.append((sentence, label))
        if current_abstract:
            abstracts.append(current_abstract)
    return abstracts

# Parse the file (replace with your path)
abstracts = parse_pubmed_rct("train.txt")

# Flatten into a DataFrame
data = []
for abstract in abstracts:
    data.extend(abstract)
df = pd.DataFrame(data, columns=['sentence', 'label'])

print(df.head())

print(df.isna().sum())

                                            sentence       label
0  The emergence of HIV as a chronic condition me...  BACKGROUND
1  This paper describes the design and evaluation...  BACKGROUND
2  This study is designed as a randomised control...     METHODS
3  The intervention group will participate in the...     METHODS
4  The program is based on self-efficacy theory a...     METHODS
sentence    0
label       0
dtype: int64


In [14]:
df['sentence'] = df['sentence'].str.lower().str.strip()

print(df.head())


                                            sentence       label
0  the emergence of hiv as a chronic condition me...  BACKGROUND
1  this paper describes the design and evaluation...  BACKGROUND
2  this study is designed as a randomised control...     METHODS
3  the intervention group will participate in the...     METHODS
4  the program is based on self-efficacy theory a...     METHODS


In [15]:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
df['label_encoded'] = le.fit_transform(df['label'])
label2id = dict(zip(le.classes_, le.transform(le.classes_)))

print(df['label_encoded'].unique())
print(df.head())

[0 2 1 4 3]
                                            sentence       label  \
0  the emergence of hiv as a chronic condition me...  BACKGROUND   
1  this paper describes the design and evaluation...  BACKGROUND   
2  this study is designed as a randomised control...     METHODS   
3  the intervention group will participate in the...     METHODS   
4  the program is based on self-efficacy theory a...     METHODS   

   label_encoded  
0              0  
1              0  
2              2  
3              2  
4              2  


In [21]:
from sklearn.model_selection import train_test_split

labeled_df, unlabeled_df = train_test_split(df, test_size=0.9, stratify=df['label'], random_state=42)

# Drop the labels for the unlabeled part
unlabeled_df = unlabeled_df.copy()
unlabeled_df['label_encoded'] = -1  # Use -1 to indicate unknown

print(labeled_df.head())
print(unlabeled_df.head())

                                                  sentence       label  \
613887   exclusion criteria were arteritis , age > @ ye...     METHODS   
1636024  of the @ dcr procedures , @ were performed wit...     METHODS   
2105682  tamoxifen preserves bone in postmenopausal wom...  BACKGROUND   
1228437  studies by the veterans administration coopera...   OBJECTIVE   
1817455  group @ ( control ) without renal dysfunction ...     METHODS   

         label_encoded  
613887               2  
1636024              2  
2105682              0  
1228437              3  
1817455              2  
                                                  sentence    label  \
11610    the validity of the subgrouping rule was suppo...  RESULTS   
410177   productivity costs were calculated according t...  METHODS   
1646457  all evaluation and training sessions were perf...  METHODS   
1875259  recovery of corneal sensation and epithelial i...  RESULTS   
545515   tolerability/comfort was evaluated using 

In [11]:
!pip install transformers

Collecting transformers
  Downloading transformers-4.51.3-py3-none-any.whl.metadata (38 kB)
Collecting huggingface-hub<1.0,>=0.30.0 (from transformers)
  Downloading huggingface_hub-0.30.2-py3-none-any.whl.metadata (13 kB)
Collecting regex!=2019.12.17 (from transformers)
  Downloading regex-2024.11.6-cp310-cp310-win_amd64.whl.metadata (41 kB)
Collecting tokenizers<0.22,>=0.21 (from transformers)
  Downloading tokenizers-0.21.1-cp39-abi3-win_amd64.whl.metadata (6.9 kB)
Collecting safetensors>=0.4.3 (from transformers)
  Downloading safetensors-0.5.3-cp38-abi3-win_amd64.whl.metadata (3.9 kB)
Collecting tqdm>=4.27 (from transformers)
  Downloading tqdm-4.67.1-py3-none-any.whl.metadata (57 kB)
Downloading transformers-4.51.3-py3-none-any.whl (10.4 MB)
   ---------------------------------------- 0.0/10.4 MB ? eta -:--:--
   ----------------- ---------------------- 4.5/10.4 MB 20.7 MB/s eta 0:00:01
   -------------------------------- ------- 8.4/10.4 MB 20.0 MB/s eta 0:00:01
   -------------


[notice] A new release of pip is available: 24.2 -> 25.0.1
[notice] To update, run: C:\Users\Arnav\AppData\Local\Microsoft\WindowsApps\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\python.exe -m pip install --upgrade pip


In [20]:
combined_df = pd.concat([labeled_df, unlabeled_df]).reset_index(drop=True)
train_df, val_df = train_test_split(labeled_df, test_size=0.2, stratify=labeled_df['label'], random_state=42)
df['label'].value_counts()

print(combined_df.head())

combined_df.to_csv('combined_data.csv', index=False)


                                            sentence       label  \
0  exclusion criteria were arteritis , age > @ ye...     METHODS   
1  of the @ dcr procedures , @ were performed wit...     METHODS   
2  tamoxifen preserves bone in postmenopausal wom...  BACKGROUND   
3  studies by the veterans administration coopera...   OBJECTIVE   
4  group @ ( control ) without renal dysfunction ...     METHODS   

   label_encoded  
0              2  
1              2  
2              0  
3              3  
4              2  


In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Load combined data
combined_df = pd.read_csv("combined_data.csv")

# Split labeled portion again for validation
labeled_df = combined_df[combined_df['label_encoded'] != -1]
train_df, val_df = train_test_split(labeled_df, test_size=0.2, stratify=labeled_df['label_encoded'], random_state=42)

# Separate the unlabeled data
unlabeled_df = combined_df[combined_df['label_encoded'] == -1]

# Merge it with the labeled training set
semi_supervised_train_df = pd.concat([train_df, unlabeled_df]).reset_index(drop=True)





In [2]:
from transformers import AutoTokenizer

MODEL_NAME = "huawei-noah/TinyBERT_General_4L_312D"  # Or use: "prajjwal1/bert-tiny"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)


In [3]:
import torch
from torch.utils.data import Dataset

class PubMedRCTSSL(Dataset):
    def __init__(self, df, tokenizer, max_length=128):
        self.df = df
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        row = self.df.iloc[idx]
        sentence = row['sentence']
        label = row['label_encoded']

        inputs = self.tokenizer(sentence, padding="max_length", truncation=True,
                                max_length=self.max_length, return_tensors="pt")

        return {
            'input_ids': inputs['input_ids'].squeeze(0),
            'attention_mask': inputs['attention_mask'].squeeze(0),
            'label': label
        }


In [4]:
from torch.utils.data import DataLoader

train_dataset = PubMedRCTSSL(semi_supervised_train_df, tokenizer)
val_dataset = PubMedRCTSSL(val_df, tokenizer)

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=32)


In [5]:
import torch.nn as nn
from transformers import AutoModel

class EPASSModel(nn.Module):
    def __init__(self, base_model, hidden_dim=312, num_classes=5, num_projectors=3):
        super().__init__()
        self.encoder = base_model
        self.projectors = nn.ModuleList([
            nn.Sequential(
                nn.Linear(hidden_dim, hidden_dim),
                nn.ReLU(),
                nn.Linear(hidden_dim, hidden_dim)
            ) for _ in range(num_projectors)
        ])
        self.classifier = nn.Linear(hidden_dim, num_classes)

    def forward(self, input_ids, attention_mask):
        outputs = self.encoder(input_ids=input_ids, attention_mask=attention_mask)
        pooled = outputs.last_hidden_state[:, 0]  # CLS token

        projector_outputs = torch.stack([proj(pooled) for proj in self.projectors], dim=0)
        z_avg = projector_outputs.mean(dim=0)

        logits = self.classifier(z_avg)
        return logits, z_avg, projector_outputs


In [6]:
def evaluate(model, val_loader):
    model.eval()
    total_loss = 0
    all_labels = []
    all_preds = []
    all_probs = []

    with torch.no_grad():
        for batch in val_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)

            logits, _, _ = model(input_ids, attention_mask)
            probs = F.softmax(logits, dim=1)
            preds = probs.argmax(dim=1)

            loss = F.cross_entropy(logits, labels)
            total_loss += loss.item()

            all_labels.extend(labels.cpu().numpy())
            all_preds.extend(preds.cpu().numpy())
            all_probs.extend(probs.cpu().numpy())

    # Metrics
    acc = accuracy_score(all_labels, all_preds)
    precision, recall, f1, _ = precision_recall_fscore_support(
        all_labels, all_preds, average='macro', zero_division=0
    )

    # AUC metrics (if binary classification)
    auc_prc = average_precision_score(all_labels, [p[1] if len(p) > 1 else 0 for p in all_probs])
    try:
        if len(set(all_labels)) == 2:
            auc_roc = roc_auc_score(all_labels, [p[1] for p in all_probs])
        else:
            auc_roc = None
    except:
        auc_roc = None

    metrics = {
        'accuracy': acc,
        'precision': precision,
        'recall': recall,
        'f1': f1,
        'auc_prc': auc_prc,
        'auc_roc': auc_roc
    }

    return total_loss / len(val_loader), metrics



In [8]:
import torch.nn.functional as F
from tqdm import tqdm
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, roc_auc_score,average_precision_score, classification_report


device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load base model
base_model = AutoModel.from_pretrained(MODEL_NAME)
model = EPASSModel(base_model, hidden_dim=312, num_classes=len(combined_df['label_encoded'].unique()) - 1).to(device)

optimizer = torch.optim.Adam(model.parameters(), lr=2e-5)
epochs = 5

train_losses = []
val_losses = []

for epoch in range(epochs):
    model.train()
    total_loss = 0
    for batch in tqdm(train_loader):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)

        optimizer.zero_grad()
        logits, z_avg, _ = model(input_ids, attention_mask)

        labeled_mask = labels != -1
        labeled_logits = logits[labeled_mask]
        labeled_labels = labels[labeled_mask]

        if labeled_logits.shape[0] > 0:
            loss = F.cross_entropy(labeled_logits, labeled_labels)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()

    avg_train_loss = total_loss / len(train_loader)
    train_losses.append(avg_train_loss)
    print(f"Epoch {epoch+1}: Train Loss = {avg_train_loss:.4f}")

    # Validation
    val_loss, val_metrics = evaluate(model, val_loader)
    val_losses.append(val_loss)

    print(f"Val Loss = {val_loss:.4f} | Acc = {val_metrics['accuracy']:.4f} | "
          f"F1 = {val_metrics['f1']:.4f} | AUC-PRC = {val_metrics['auc_prc']:.4f}")



  0%|                                                                           | 106/67739 [01:18<13:56:01,  1.35it/s]


KeyboardInterrupt: 