In [1]:
# Fetch the shared-task repository; the train/dev JSON files loaded further down are read from this checkout.
! git clone https://github.com/CSCI5832-team-30/CSCI5832-Shared-Task.git

Cloning into 'CSCI5832-Shared-Task'...
remote: Enumerating objects: 1060, done.
remote: Counting objects: 100% (1060/1060), done.
remote: Compressing objects: 100% (1052/1052), done.
remote: Total 1060 (delta 15), reused 1037 (delta 5), pack-reused 0
Receiving objects: 100% (1060/1060), 3.29 MiB | 12.08 MiB/s, done.
Resolving deltas: 100% (15/15), done.


In [2]:
!pip install transformers torch



In [7]:
import os
import json
import torch
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, precision_score, recall_score
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from torch.utils.data import DataLoader, Dataset

# Read the training file and keep only its records (the JSON keys are dropped)
train_path = "/content/CSCI5832-Shared-Task/training_data/train.json"
with open(train_path) as train_file:
    train_records = list(json.load(train_file).values())

# Read the dev file the same way; its records are used as the validation set
dev_path = "/content/CSCI5832-Shared-Task/training_data/dev.json"
with open(dev_path) as dev_file:
    val_data = list(json.load(dev_file).values())

# Hold out 20% of the training records as a test set (fixed seed keeps the split repeatable)
train_data, test_data = train_test_split(train_records, test_size=0.2, random_state=42)

# Define a custom dataset class
class ClinicalTrialsDataset(Dataset):
    """Dataset that tokenizes the "Statement" of each record for binary classification.

    Args:
        data: Indexable collection of records; every record must provide the
            keys "Statement" and "Label".
        tokenizer: Callable invoked as ``tokenizer(text, padding="max_length",
            truncation=True, max_length=..., return_tensors="pt")`` whose
            result exposes "input_ids" and "attention_mask".
        max_length: Length passed to the tokenizer for padding/truncation.
    """

    def __init__(self, data, tokenizer, max_length=128):
        self.data = data
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of records."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return the tokenized statement and integer label of record ``idx``.

        Returns:
            dict with "input_ids", "attention_mask" (batch axis removed) and
            "label" (0-d ``torch.long`` tensor: 1 for "Entailment", else 0).

        Raises:
            KeyError: if the record lacks "Statement" or "Label".
        """
        item = self.data[idx]
        statement = item["Statement"]
        # Binary target: "Entailment" -> 1, every other label value -> 0
        label = 1 if item["Label"] == "Entailment" else 0

        inputs = self.tokenizer(
            statement,
            padding="max_length",
            truncation=True,
            max_length=self.max_length,
            return_tensors="pt",
        )

        # Remove only the leading batch axis (assumed to be of size 1 because a
        # single statement is tokenized). A bare squeeze() would also collapse
        # the sequence axis whenever max_length == 1.
        return {
            "input_ids": inputs["input_ids"].squeeze(0),
            "attention_mask": inputs["attention_mask"].squeeze(0),
            "label": torch.tensor(label, dtype=torch.long),
        }

# Seed PyTorch's global RNG so the freshly initialised classifier head, dropout
# and the shuffling of the training loader are repeatable between runs.
torch.manual_seed(42)

# Train on the GPU when one is available, otherwise stay on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Initialize BERT tokenizer and model
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)
# Move the model before the optimizer is created so the optimizer is built
# over the parameters that are actually trained.
model.to(device)

# Define training parameters
batch_size = 8
num_epochs = 8
# NOTE(review): 1e-6 is far below the 2e-5..5e-5 range commonly used to
# fine-tune BERT -- confirm this value is intentional.
learning_rate = 0.000001

# Create datasets and data loaders
train_dataset = ClinicalTrialsDataset(train_data, tokenizer)
test_dataset = ClinicalTrialsDataset(test_data, tokenizer)
val_dataset = ClinicalTrialsDataset(val_data, tokenizer)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size)
val_loader = DataLoader(val_dataset, batch_size=batch_size)

# Define optimizer and loss function
# torch.optim.AdamW is used instead of the deprecated AdamW shipped with
# transformers; eps and weight_decay are set explicitly to keep the values the
# transformers implementation used by default.
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate, eps=1e-6, weight_decay=0.0)
# Not used by the loops below (training takes the loss returned by the model
# when labels are passed); kept so the name stays available.
loss_fn = torch.nn.CrossEntropyLoss()


def _collect_predictions(model, loader, device):
    """Run ``model`` over ``loader`` in eval mode without tracking gradients.

    Args:
        model: Model whose output exposes ``.logits``.
        loader: Iterable of batches with "input_ids", "attention_mask", "label".
        device: Device the batches are moved to before the forward pass.

    Returns:
        (labels, preds): two lists of ints in loader order -- the gold label of
        every example and the argmax class predicted for it.
    """
    model.eval()
    gold, predicted = [], []
    with torch.no_grad():
        for batch in loader:
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            logits = model(input_ids, attention_mask=attention_mask).logits
            predicted.extend(torch.argmax(logits, dim=1).cpu().tolist())
            gold.extend(batch["label"].tolist())
    return gold, predicted


# Training loop
for epoch in range(num_epochs):
    model.train()
    all_train_preds = []
    all_train_labels = []
    for batch in train_loader:
        # Batches are created on the CPU; move them next to the model.
        inputs = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["label"].to(device)

        optimizer.zero_grad()
        outputs = model(inputs, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

        # Record predictions and labels for training evaluation
        preds = torch.argmax(outputs.logits, dim=1)
        all_train_preds.extend(preds.cpu().tolist())
        all_train_labels.extend(labels.cpu().tolist())

    # Calculate and print training F1 score
    train_f_score = f1_score(all_train_labels, all_train_preds)
    print(f"Epoch {epoch + 1} - Training F1 Score: {train_f_score:.4f}")

# Evaluation on the test set (the 20% held out from the training file)
all_test_labels, all_test_preds = _collect_predictions(model, test_loader, device)

# Calculate and print test set F1 score
test_f_score = f1_score(all_test_labels, all_test_preds)
print(f"Test Set F1 Score: {test_f_score:.4f}")

# Evaluation on the dev set
all_labels, all_preds = _collect_predictions(model, val_loader, device)

# Calculate and print evaluation metrics
f_score = f1_score(all_labels, all_preds)
p_score = precision_score(all_labels, all_preds)
r_score = recall_score(all_labels, all_preds)

print('F1:{:f}'.format(f_score))
print('precision_score:{:f}'.format(p_score))
print('recall_score:{:f}'.format(r_score))


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1 - Training F1 Score: 0.5459
Epoch 2 - Training F1 Score: 0.4942
Epoch 3 - Training F1 Score: 0.5122
Epoch 4 - Training F1 Score: 0.5405
Epoch 5 - Training F1 Score: 0.5131
Epoch 6 - Training F1 Score: 0.5361
Epoch 7 - Training F1 Score: 0.5904
Epoch 8 - Training F1 Score: 0.5695
Test Set F1 Score: 0.5504
F1:0.598131
precision_score:0.561404
recall_score:0.640000
