In [1]:
# Clone the SleepQA repo
!git clone https://github.com/IvaBojic/SleepQA.git
%cd SleepQA

# Install key dependencies
!pip install transformers faiss-cpu datasets scikit-learn pandas tqdm


Cloning into 'SleepQA'...
remote: Enumerating objects: 400, done.[K
remote: Counting objects: 100% (87/87), done.[K
remote: Compressing objects: 100% (66/66), done.[K
remote: Total 400 (delta 46), reused 43 (delta 19), pack-reused 313 (from 1)[K
Receiving objects: 100% (400/400), 31.13 MiB | 5.56 MiB/s, done.
Resolving deltas: 100% (176/176), done.
Updating files: 100% (134/134), done.
Filtering content: 100% (3/3), 1.21 GiB | 25.78 MiB/s, done.
/content/SleepQA
Collecting faiss-cpu
  Downloading faiss_cpu-1.11.0-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (4.8 kB)
Collecting datasets
  Downloading datasets-3.5.1-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-an

In [2]:
# Install key dependencies, including rank_bm25
!pip install transformers faiss-cpu datasets scikit-learn pandas tqdm rank_bm25 # Added rank_bm25 here

Collecting rank_bm25
  Downloading rank_bm25-0.2.2-py3-none-any.whl.metadata (3.2 kB)
Downloading rank_bm25-0.2.2-py3-none-any.whl (8.6 kB)
Installing collected packages: rank_bm25
Successfully installed rank_bm25-0.2.2


In [None]:
!pip install transformers[torch] # Installs with additional requirements
!pip install biobert-embedding

Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=2.0->transformers[torch])
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=2.0->transformers[torch])
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=2.0->transformers[torch])
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=2.0->transformers[torch])
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch>=2.0->transformers[torch])
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch>=2.0->transformers[torch])
  Downloading nvidia_cufft_cu12

In [None]:
import nltk

# Fetch the 'punkt_tab' resource through the NLTK downloader.
# NOTE(review): no visible cell uses nltk afterwards — confirm this download is needed.
nltk.download("punkt_tab")

In [None]:
import os
import pandas as pd
from sentence_transformers import SentenceTransformer, InputExample, losses
from torch.utils.data import DataLoader
import faiss
import numpy as np
import time
import torch
from sklearn.metrics import f1_score
import matplotlib.pyplot as plt  # Add at the top if not already

# 1. Load data
# Both files are parsed as tab-separated text without a header row, so the
# column labels are attached right after reading.
corpus = pd.read_csv(
    "data/training/sleep-corpus.tsv", sep="\t", header=None
).set_axis(["index", "passage", "title"], axis="columns")

test_qs = pd.read_csv(
    "data/training/sleep-test.csv", sep="\t", header=None
).set_axis(["question", "answer"], axis="columns")

# Bare last expression: renders the question/answer table as the cell output.
test_qs

In [None]:
from rank_bm25 import BM25Okapi
# Tokenize for BM25: split every passage on single spaces.
tokenized_corpus = [
    passage_text.split(" ") for passage_text in corpus["passage"].tolist()
]
print(tokenized_corpus[:1])  # sanity check: token list of the first passage

# Create BM25 model over the tokenized passages
bm25 = BM25Okapi(tokenized_corpus)

# Define query_bm25 function
def query_bm25(query, bm25_model, top_k=5):
    """Return the `top_k` best-scoring corpus rows for `query` plus their scores.

    Args:
        query: Question text; it is split on single spaces before scoring.
        bm25_model: Object exposing ``get_scores(tokens)`` that yields one
            score per corpus row.
        top_k: Number of rows to keep.

    Returns:
        A ``(rows, scores)`` pair: ``rows`` is the matching slice of the
        module-level ``corpus`` DataFrame in descending score order, and
        ``scores`` is the list of the corresponding scores.
    """
    passage_scores = bm25_model.get_scores(query.split(" "))

    # sorted() is stable, so rows with equal scores keep their corpus order.
    ranked = sorted(
        enumerate(passage_scores), key=lambda pair: pair[1], reverse=True
    )[:top_k]

    best_rows = [row_position for row_position, _ in ranked]
    best_scores = [score for _, score in ranked]
    return corpus.iloc[best_rows], best_scores
# Smoke-test the retriever with a single sample question.
query = "who has an enviable life?"
passages, top_scores = query_bm25(query, bm25, top_k=3)

# Output the results
print("Top Passages:", passages, "\nBM25 Scores:", top_scores, sep="\n")


In [None]:
import torch
from sklearn.metrics import precision_score, recall_score, f1_score
from tqdm import tqdm
from transformers import BertTokenizer, BertForQuestionAnswering

# Hyperparameters
# NOTE(review): no training loop is visible in this notebook; LEARNING_RATE and
# EPOCHS are only echoed by the summary printout — confirm they are still meaningful.
BATCH_SIZE, LEARNING_RATE, EPOCHS = 16, 2e-5, 30

# Prefer the GPU whenever one is available.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Load the SQuAD-fine-tuned BioBERT question-answering checkpoint and its tokenizer.
bert_tokenizer = BertTokenizer.from_pretrained('dmis-lab/biobert-base-cased-v1.1-squad')
bert_model = BertForQuestionAnswering.from_pretrained('dmis-lab/biobert-base-cased-v1.1-squad')
bert_model = bert_model.to(device)

# Function to handle BERT predictions with batching
def answer_bert_batched(question, passages, tokenizer, model, batch_size=16,
                        max_length=512, verbose=False):
    """Extract the best answer span for `question` from the retrieved passages.

    Every passage is paired with the question, run through the extractive QA
    model in batches, and the highest-scoring valid span across ALL passages
    is returned (score = start logit + end logit).

    Args:
        question: Question text.
        passages: Either an iterable of passage strings, or a DataFrame-like
            object with a "passage" column (what ``query_bm25`` returns).
        tokenizer: Hugging Face tokenizer matching `model`.
        model: Extractive QA model whose output exposes ``start_logits`` and
            ``end_logits`` and which has a ``device`` attribute.
        batch_size: Number of (question, passage) pairs per forward pass.
        max_length: Maximum sequence length; only the passage is truncated.
        verbose: When True, print the per-passage debugging information.

    Returns:
        The answer text as a string, or "" when there are no passages or no
        passage yields a valid, non-empty span.
    """
    # Iterating a DataFrame yields its column labels, not its rows, so the
    # passage text has to be pulled out of the "passage" column explicitly.
    if hasattr(passages, "columns"):
        passages = passages["passage"]
    passage_texts = [str(text) for text in passages]

    best_answer = ""
    best_score = float("-inf")
    target_device = model.device

    for offset in range(0, len(passage_texts), batch_size):
        chunk = passage_texts[offset:offset + batch_size]

        # Let the tokenizer pad and truncate: this keeps token_type_ids (question
        # vs. passage segments) and keeps long passages within the model's limit.
        encoded = tokenizer(
            [question] * len(chunk),
            chunk,
            padding=True,
            truncation="only_second",
            max_length=max_length,
            return_tensors="pt",
        ).to(target_device)

        with torch.no_grad():
            outputs = model(**encoded)

        # Padding positions must never be selected as a span boundary.
        is_padding = encoded["attention_mask"] == 0
        start_scores = outputs.start_logits.masked_fill(is_padding, float("-inf"))
        end_scores = outputs.end_logits.masked_fill(is_padding, float("-inf"))

        for row in range(start_scores.size(0)):
            start_idx = torch.argmax(start_scores[row]).item()
            end_idx = torch.argmax(end_scores[row]).item()
            row_ids = encoded["input_ids"][row]

            if verbose:
                print(f"\nFull tokenized passage: {tokenizer.convert_ids_to_tokens(row_ids.tolist())}")
                print(f"Predicted start index: {start_idx}")
                print(f"Predicted end index: {end_idx}")

            # A valid span needs start_idx <= end_idx.
            if start_idx <= end_idx:
                span_tokens = tokenizer.convert_ids_to_tokens(row_ids[start_idx:end_idx + 1].tolist())
                answer = tokenizer.convert_tokens_to_string(span_tokens).strip()
            else:
                answer = ""

            # A lone [CLS] or an empty string means "no answer in this passage".
            if not answer or answer == "[CLS]":
                if verbose:
                    print("Warning: The model predicted an invalid answer (either [CLS] or empty).")
                continue

            span_score = (start_scores[row, start_idx] + end_scores[row, end_idx]).item()
            if span_score > best_score:
                best_score = span_score
                best_answer = answer

    return best_answer

# Function to evaluate pipeline with different top_k values
def evaluate_pipeline_with_k_values(pipeline_id, ks=(1, 20, 40, 60, 80, 100)):
    """Evaluate the retrieve-then-read pipeline for several retrieval depths.

    For every `k` in `ks`, each question in the module-level ``test_qs`` frame
    is answered by retrieving the top-`k` BM25 passages and running the BERT
    reader over them. Predictions are scored against the reference answers.

    Precision, recall and F1 are token-overlap scores averaged over questions.
    Scoring free-text answers with sklearn's classification metrics would
    treat every distinct answer string as a class label, which reduces all
    three numbers to exact-match accuracy.

    Args:
        pipeline_id: Pipeline selector; only 1 (BM25 + BERT reader) exists.
        ks: Iterable of top-k values to evaluate.

    Returns:
        Dict mapping each k to a dict with the keys "Exact Match",
        "Precision", "Recall" and "F1 Score" (all floats in [0, 1]).

    Raises:
        ValueError: If `pipeline_id` is not a supported pipeline.
    """
    if pipeline_id != 1:
        raise ValueError(
            f"Unsupported pipeline_id {pipeline_id!r}; only pipeline 1 is implemented."
        )

    results = {}

    for k in ks:
        print(f"\nEvaluating Pipeline {pipeline_id} with top_k={k}...")

        exact_hits = 0
        precisions, recalls, f1_values = [], [], []

        for _, row in tqdm(test_qs.iterrows(), total=len(test_qs), desc=f"top_k={k}"):
            question = row["question"]
            # str() guards against non-string cells (e.g. NaN) in the answer column.
            true_answer = str(row["answer"])

            passages, _ = query_bm25(question, bm25, top_k=k)
            # Hand the reader the passage texts themselves: iterating the
            # DataFrame directly would yield its column labels.
            passage_texts = passages["passage"].tolist()

            answer = answer_bert_batched(
                question, passage_texts, bert_tokenizer, bert_model, batch_size=BATCH_SIZE
            )

            if answer.strip().lower() == true_answer.strip().lower():
                exact_hits += 1

            precision, recall, f1 = _token_overlap_prf(answer, true_answer)
            precisions.append(precision)
            recalls.append(recall)
            f1_values.append(f1)

        # Guard against an empty test set instead of dividing by zero.
        n_questions = len(f1_values)
        results[k] = {
            "Exact Match": exact_hits / n_questions if n_questions else 0.0,
            "Precision": sum(precisions) / n_questions if n_questions else 0.0,
            "Recall": sum(recalls) / n_questions if n_questions else 0.0,
            "F1 Score": sum(f1_values) / n_questions if n_questions else 0.0,
        }

    return results


def _normalize_answer(text):
    """Lower-case, drop punctuation and English articles, and collapse whitespace."""
    import re
    import string

    lowered = str(text).lower()
    without_punctuation = "".join(ch for ch in lowered if ch not in string.punctuation)
    without_articles = re.sub(r"\b(a|an|the)\b", " ", without_punctuation)
    return " ".join(without_articles.split())


def _token_overlap_prf(prediction, truth):
    """Return (precision, recall, f1) of the token overlap between two answers."""
    from collections import Counter

    predicted_tokens = _normalize_answer(prediction).split()
    truth_tokens = _normalize_answer(truth).split()

    # With an empty side, the score is 1 only when both sides are empty.
    if not predicted_tokens or not truth_tokens:
        score = float(predicted_tokens == truth_tokens)
        return score, score, score

    shared = sum((Counter(predicted_tokens) & Counter(truth_tokens)).values())
    if shared == 0:
        return 0.0, 0.0, 0.0

    precision = shared / len(predicted_tokens)
    recall = shared / len(truth_tokens)
    return precision, recall, 2 * precision * recall / (precision + recall)

# Run the experiment
pipeline_1_metrics_by_k = evaluate_pipeline_with_k_values(1, ks=[1, 20, 40, 60, 80, 100])

# Print results
print("\n=================== Evaluation by Top-K ===================")
for k, metrics in pipeline_1_metrics_by_k.items():
    print(f"\nk = {k}")
    for metric_name, value in metrics.items():
        print(f"- {metric_name}: {value:.4f}")

# Final report
# NOTE(review): no training loop is visible in this notebook; the hyperparameters
# below are echoed as configured — confirm the "Training Summary" wording is intended.
print("\n=================== Training Summary ===================")
print(f"Hyperparameters Used:")
print(f"- Learning Rate: {LEARNING_RATE}")
print(f"- Batch Size: {BATCH_SIZE}")
print(f"- Epochs: {EPOCHS}")
print(f"\nComputational Requirements:")
# `device` is a torch.device, so compare its .type; comparing the object itself
# to the string 'cuda' does not match and would always report CPU.
print(f"- Hardware: {'GPU' if device.type == 'cuda' else 'CPU'}")
print(f"- Models Trained: 1 (BERT-based model)")

print("\n=================== Average Evaluation Metrics ===================")
for k, metrics in pipeline_1_metrics_by_k.items():
    print(f"\nTop-K = {k}")
    for metric_name, value in metrics.items():
        print(f"- {metric_name}: {value:.4f}")
