In [1]:
# Clone the SleepQA repo
!git clone https://github.com/IvaBojic/SleepQA.git
%cd SleepQA

# Install key dependencies
!pip install transformers faiss-cpu datasets scikit-learn pandas tqdm


Cloning into 'SleepQA'...
remote: Enumerating objects: 400, done.[K
remote: Counting objects: 100% (87/87), done.[K
remote: Compressing objects: 100% (66/66), done.[K
remote: Total 400 (delta 46), reused 43 (delta 19), pack-reused 313 (from 1)[K
Receiving objects: 100% (400/400), 31.13 MiB | 7.20 MiB/s, done.
Resolving deltas: 100% (176/176), done.
Updating files: 100% (134/134), done.
Filtering content: 100% (3/3), 1.21 GiB | 4.80 MiB/s, done.
/content/SleepQA
Collecting faiss-cpu
  Downloading faiss_cpu-1.11.0-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (4.8 kB)
Collecting datasets
  Downloading datasets-3.5.1-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any

In [6]:
import os
import pandas as pd
from sentence_transformers import SentenceTransformer, InputExample, losses
from torch.utils.data import DataLoader
import faiss
import numpy as np
import time
import torch
from sklearn.metrics import f1_score
import matplotlib.pyplot as plt  # Add at the top if not already
from transformers import BertTokenizer, BertForQuestionAnswering

# 1. Load data
# Both files are read as tab-separated with no header row; column names are
# attached right after reading.
corpus = pd.read_csv("data/training/sleep-corpus.tsv", sep="\t", header=None).set_axis(
    ['index', 'passage', 'title'], axis=1
)
test_qs = pd.read_csv("data/training/sleep-test.csv", sep="\t", header=None).set_axis(
    ['question', 'answer'], axis=1
)

# 2. Setup models
# Display name -> Hugging Face model id. Insertion order is the training order.
models = {
    "BERT-SQuAD2": "deepset/bert-base-cased-squad2",
    "BioBERT": "dmis-lab/biobert-base-cased-v1.1",
    "BioBERT-BioASQ": "ktrapeznikov/biobert_v1.1_pubmed_squad_v2",
    "PubMedBERT": "microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract",
    "SciBERT": "allenai/scibert_scivocab_uncased",
    "ClinicalBERT": "emilyalsentzer/Bio_ClinicalBERT"
}

# 3. Prepare training examples
# One (question, answer) pair per row, each labelled 1.0.
# NOTE(review): the pairs are built from test_qs, the same frame that is later
# used for evaluation — confirm whether a separate training split was intended.
# NOTE(review): every pair carries label 1.0 and no negative pairs are created
# here — confirm this is intended for a similarity-regression loss.
train_examples = [
    InputExample(texts=[question_text, answer_text], label=1.0)
    for question_text, answer_text in zip(test_qs["question"], test_qs["answer"])
]

# 4. Set hyperparameters
EPOCHS = 30
BATCH_SIZE = 16
# Intended optimizer learning rate (2e-5 is also the documented default of
# SentenceTransformer.fit's optimizer_params).
LEARNING_RATE = 2e-5

# Track compute
# Accumulated wall-clock training time in seconds, summed over all models.
total_gpu_time = 0
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(f"Hardware: Using {'GPU' if device == 'cuda' else 'CPU'}")

# 5. Define FAISS search
def query_faiss(question, model, top_k=3):
    """Return the nearest indexed answers for a single question.

    The question is embedded with ``model.encode`` and searched against the
    module-level ``index``; hits are mapped back to text through the
    module-level ``answer_texts`` list. Both globals are (re)assigned by the
    training loop for each model, so this function must be called after them.

    Args:
        question: Question string to embed.
        model: Object exposing ``encode(list_of_str, convert_to_numpy=True)``.
        top_k: Maximum number of neighbours to request from the index.

    Returns:
        A ``(texts, distances)`` pair: a list of answer strings and a 1-D
        array of the matching distances, both ordered as returned by the index.
        Fewer than ``top_k`` entries are returned when the index reports fewer
        hits.
    """
    q_emb = model.encode([question], convert_to_numpy=True)
    distances, indices = index.search(q_emb, top_k)

    # FAISS fills missing neighbours with label -1 (e.g. top_k larger than the
    # number of indexed vectors). Indexing a Python list with -1 would silently
    # return the LAST answer, so those placeholder hits are dropped here.
    found = indices[0] >= 0
    return [answer_texts[i] for i in indices[0][found]], distances[0][found]

# 6. Define evaluation metrics
def _token_set_f1(true_tokens, passage):
    """Return the F1 overlap between reference tokens and a passage's unique tokens.

    Both sides are compared as sets of lower-cased, whitespace-split tokens,
    so repeated tokens count once. Returns 0.0 when nothing overlaps.
    """
    pred_tokens = set(passage.lower().split())
    common = true_tokens & pred_tokens
    if not common:
        return 0.0
    precision = len(common) / len(pred_tokens)
    recall = len(common) / len(true_tokens)
    return 2 * precision * recall / (precision + recall)


def evaluate_model(model, test_df, ks=(1,)):
    """Score retrieval quality of ``model`` on ``test_df`` for each cut-off in ``ks``.

    Every row's question is sent through ``query_faiss`` and the retrieved
    texts are compared with the row's reference answer.

    Args:
        model: Encoder forwarded unchanged to ``query_faiss``.
        test_df: DataFrame with string columns ``"question"`` and ``"answer"``.
        ks: Iterable of ``top_k`` cut-offs to evaluate. The default is an
            immutable tuple; lists passed by callers are accepted as before.

    Returns:
        ``{k: {"Recall@k": ..., "Exact Match": ..., "F1 Score": ...}}`` where
        each value is the mean over all rows of ``test_df``:

        * Recall@k - 1 if the stripped, lower-cased answer is a substring of
          any retrieved text.
        * Exact Match - 1 if the stripped, lower-cased answer equals any
          stripped, lower-cased retrieved text.
        * F1 Score - best token-set F1 between the answer and any retrieved
          text (0 when no retrieved text shares a token).
    """
    results = {}

    for k in ks:
        recall_at_k = []
        exact_match = []
        f1_scores = []

        for _, row in test_df.iterrows():
            question = row["question"]
            true_answer = row["answer"]

            retrieved_passages, _ = query_faiss(question, model, top_k=k)

            # Normalise the reference once per row instead of once per
            # retrieved passage; it does not depend on the passage.
            target = true_answer.strip().lower()
            true_tokens = set(true_answer.lower().split())

            # Recall@k: if true answer appears in any retrieved passage
            match_found = any(target in passage.lower() for passage in retrieved_passages)
            recall_at_k.append(1 if match_found else 0)

            # Exact Match
            em = any(target == passage.strip().lower() for passage in retrieved_passages)
            exact_match.append(1 if em else 0)

            # F1 score (token level) with best candidate
            best_f1 = max(
                (_token_set_f1(true_tokens, passage) for passage in retrieved_passages),
                default=0,
            )
            f1_scores.append(best_f1)

        # Store results for each k
        results[k] = {
            "Recall@k": np.mean(recall_at_k),
            "Exact Match": np.mean(exact_match),
            "F1 Score": np.mean(f1_scores)
        }

    return results

# 7. Train and evaluate each model
# Start from an empty dict on every execution of this cell so that re-running
# it cannot leave metrics from an earlier run behind.
all_metrics = {}

for model_name, model_id in models.items():
    print(f"\n### Training Model: {model_name} ###")

    model = SentenceTransformer(model_id, device=device)
    train_dataloader = DataLoader(train_examples, shuffle=True, batch_size=BATCH_SIZE)
    train_loss = losses.CosineSimilarityLoss(model)

    start_time = time.time()
    model.fit(
        train_objectives=[(train_dataloader, train_loss)],
        epochs=EPOCHS,
        warmup_steps=10,
        # Pass the configured learning rate explicitly so the value printed in
        # the final report is the value the optimizer actually uses.
        optimizer_params={"lr": LEARNING_RATE},
        show_progress_bar=True
    )
    end_time = time.time()

    # Wall-clock seconds for this model; epoch_time is read by the final report.
    elapsed = end_time - start_time
    epoch_time = elapsed / EPOCHS
    total_gpu_time += elapsed

    print(f"Training time: {end_time - start_time:.2f}s total | {epoch_time:.2f}s per epoch")

    # Compute embeddings
    # answer_texts and index are module-level on purpose: query_faiss reads them.
    # NOTE(review): the loss optimises cosine similarity, but the index ranks by
    # L2 distance on embeddings that are not normalised here — confirm intended.
    answer_texts = test_qs["answer"].tolist()
    answer_embeddings = model.encode(answer_texts, convert_to_numpy=True, show_progress_bar=True)
    index = faiss.IndexFlatL2(answer_embeddings.shape[1])
    index.add(answer_embeddings)

    # Show top-3 results for first question
    sample_question = test_qs.iloc[0]["question"]
    results, distances = query_faiss(sample_question, model, top_k=3)

    print("\nTop-3 Similar Passages:")
    for i, (text, dist) in enumerate(zip(results, distances)):
        print(f"Rank {i+1}: {text[:100]}... (Distance: {dist:.4f})")

    # Save model
    os.makedirs("saved_models", exist_ok=True)
    model_save_path = f"saved_models/{model_name.replace(' ', '_')}"
    model.save(model_save_path)
    print(f"Model saved to: {model_save_path}")

    # Evaluate model
    # NOTE(review): test_qs is also the source of train_examples, so these
    # numbers are measured on data the model was trained on — confirm intended.
    metrics = evaluate_model(model, test_qs, ks=[1,20,40,60,80,100])
    all_metrics[model_name] = metrics

    print(f"\nEvaluation Metrics for {model_name}:")
    for k, metrics_at_k in metrics.items():
        print(f"- k={k}:")
        for metric_name, value in metrics_at_k.items():
            print(f"    {metric_name}: {value:.4f}")  # Format the value to 4 decimal places


# 8. Final report
# One print call per section; sep="\n" puts each argument on its own line,
# so the emitted text matches printing every line individually.
print("\n=================== Training Summary ===================")

# Hyperparameters as configured earlier in the notebook.
print(
    f"Hyperparameters Used:",
    f"- Learning Rate: {LEARNING_RATE} (default for transformers)",
    f"- Batch Size: {BATCH_SIZE}",
    f"- Epochs: {EPOCHS}",
    sep="\n",
)

# epoch_time holds the value from the last model trained; total_gpu_time is
# accumulated in seconds and reported in hours.
print(
    f"\nComputational Requirements:",
    f"- Hardware: {'GPU' if device == 'cuda' else 'CPU'}",
    f"- Models Trained: {len(models)}",
    f"- Average Epoch Time (last model): {epoch_time:.2f}s",
    f"- Total GPU Time: {total_gpu_time/3600:.2f} GPU hours",
    sep="\n",
)

print(
    f"\nTraining Details:",
    f"- Loss Function: CosineSimilarityLoss",
    sep="\n",
)


Hardware: Using GPU

### Training Model: BERT-SQuAD2 ###




config.json:   0%|          | 0.00/508 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/433M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/152 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

Step,Training Loss
500,0.0005


Training time: 185.52s total | 6.18s per epoch


Batches:   0%|          | 0/16 [00:00<?, ?it/s]


Top-3 Similar Passages:
Rank 1: ["by affecting levels of adenosine, a substance that helps regulate sleep"]... (Distance: 0.3934)
Rank 2: ["taking steps to improve their sleep hygiene"]... (Distance: 0.3986)
Rank 3: ["the use of an electronic device to help patients learn to control functions of the body"]... (Distance: 0.3998)
Model saved to: saved_models/BERT-SQuAD2





Evaluation Metrics for BERT-SQuAD2:
- k=1:
    Recall@k: 0.0580
    Exact Match: 0.0580
    F1 Score: 0.0976
- k=20:
    Recall@k: 0.2340
    Exact Match: 0.2340
    F1 Score: 0.3612
- k=40:
    Recall@k: 0.3280
    Exact Match: 0.3280
    F1 Score: 0.4615
- k=60:
    Recall@k: 0.3980
    Exact Match: 0.3980
    F1 Score: 0.5269
- k=80:
    Recall@k: 0.4560
    Exact Match: 0.4560
    F1 Score: 0.5762
- k=100:
    Recall@k: 0.4960
    Exact Match: 0.4960
    F1 Score: 0.6117

### Training Model: BioBERT ###


config.json:   0%|          | 0.00/313 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/436M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/436M [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

Step,Training Loss
500,0.0004


Training time: 184.55s total | 6.15s per epoch


Batches:   0%|          | 0/16 [00:00<?, ?it/s]


Top-3 Similar Passages:
Rank 1: ["it can make you more vulnerable to respiratory illness"]... (Distance: 0.1013)
Rank 2: ["regular listening may be more effective at improving sleep quality"]... (Distance: 0.1093)
Rank 3: ["someone who feels foggy and tired"]... (Distance: 0.1127)
Model saved to: saved_models/BioBERT





Evaluation Metrics for BioBERT:
- k=1:
    Recall@k: 0.0420
    Exact Match: 0.0420
    F1 Score: 0.0806
- k=20:
    Recall@k: 0.1460
    Exact Match: 0.1460
    F1 Score: 0.2851
- k=40:
    Recall@k: 0.2120
    Exact Match: 0.2120
    F1 Score: 0.3621
- k=60:
    Recall@k: 0.2600
    Exact Match: 0.2600
    F1 Score: 0.4086
- k=80:
    Recall@k: 0.3160
    Exact Match: 0.3160
    F1 Score: 0.4582
- k=100:
    Recall@k: 0.3440
    Exact Match: 0.3440
    F1 Score: 0.4846

### Training Model: BioBERT-BioASQ ###


config.json:   0%|          | 0.00/465 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/433M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/136 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/433M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

Step,Training Loss
500,0.001


Training time: 184.26s total | 6.14s per epoch


Batches:   0%|          | 0/16 [00:00<?, ?it/s]


Top-3 Similar Passages:
Rank 1: ["to keep a journal with notes about how well and how long you sleep each night"]... (Distance: 0.4161)
Rank 2: ["takes a long time to change shape under pressure"]... (Distance: 0.4217)
Rank 3: ["create a new bedtime song for your kids"]... (Distance: 0.4248)
Model saved to: saved_models/BioBERT-BioASQ

Evaluation Metrics for BioBERT-BioASQ:
- k=1:
    Recall@k: 0.0540
    Exact Match: 0.0540
    F1 Score: 0.0981
- k=20:
    Recall@k: 0.2120
    Exact Match: 0.2120
    F1 Score: 0.3434
- k=40:
    Recall@k: 0.2780
    Exact Match: 0.2780
    F1 Score: 0.4148
- k=60:
    Recall@k: 0.3740
    Exact Match: 0.3740
    F1 Score: 0.4988
- k=80:
    Recall@k: 0.4280
    Exact Match: 0.4280
    F1 Score: 0.5464
- k=100:
    Recall@k: 0.4620
    Exact Match: 0.4620
    F1 Score: 0.5750

### Training Model: PubMedBERT ###




Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

Step,Training Loss
500,0.0001


Training time: 179.88s total | 6.00s per epoch


Batches:   0%|          | 0/16 [00:00<?, ?it/s]


Top-3 Similar Passages:
Rank 1: ["situations that could trigger heightened worry"]... (Distance: 2.4295)
Rank 2: ["progress for someone who already has a diagnosis of night eating syndrome"]... (Distance: 2.5958)
Rank 3: ["healthy sleep habits, as children learn through parent modeling"]... (Distance: 2.7451)
Model saved to: saved_models/PubMedBERT





Evaluation Metrics for PubMedBERT:
- k=1:
    Recall@k: 0.0440
    Exact Match: 0.0440
    F1 Score: 0.0789
- k=20:
    Recall@k: 0.1840
    Exact Match: 0.1840
    F1 Score: 0.3075
- k=40:
    Recall@k: 0.2580
    Exact Match: 0.2580
    F1 Score: 0.3920
- k=60:
    Recall@k: 0.3220
    Exact Match: 0.3220
    F1 Score: 0.4553
- k=80:
    Recall@k: 0.3780
    Exact Match: 0.3780
    F1 Score: 0.5090
- k=100:
    Recall@k: 0.4220
    Exact Match: 0.4220
    F1 Score: 0.5477

### Training Model: SciBERT ###


Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

Step,Training Loss
500,0.0011


Training time: 180.80s total | 6.03s per epoch


Batches:   0%|          | 0/16 [00:00<?, ?it/s]


Top-3 Similar Passages:
Rank 1: ["worsen the symptoms and severity of dementia"]... (Distance: 0.6256)
Rank 2: ["to reduce the risk of complications"]... (Distance: 0.6283)
Rank 3: ["by affecting levels of adenosine, a substance that helps regulate sleep"]... (Distance: 0.6531)
Model saved to: saved_models/SciBERT





Evaluation Metrics for SciBERT:
- k=1:
    Recall@k: 0.0360
    Exact Match: 0.0360
    F1 Score: 0.0774
- k=20:
    Recall@k: 0.1480
    Exact Match: 0.1480
    F1 Score: 0.2788
- k=40:
    Recall@k: 0.2260
    Exact Match: 0.2260
    F1 Score: 0.3698
- k=60:
    Recall@k: 0.2800
    Exact Match: 0.2800
    F1 Score: 0.4212
- k=80:
    Recall@k: 0.3180
    Exact Match: 0.3180
    F1 Score: 0.4583
- k=100:
    Recall@k: 0.3620
    Exact Match: 0.3620
    F1 Score: 0.4990

### Training Model: ClinicalBERT ###


config.json:   0%|          | 0.00/385 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/436M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/436M [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

Step,Training Loss
500,0.0004


Training time: 185.16s total | 6.17s per epoch


Batches:   0%|          | 0/16 [00:00<?, ?it/s]


Top-3 Similar Passages:
Rank 1: ["the amount of time adults spent in different sleep stages"]... (Distance: 0.1247)
Rank 2: ["hormones that help control appetite and hunger"]... (Distance: 0.1307)
Rank 3: ["anyone who wants extra space for storage"]... (Distance: 0.1415)
Model saved to: saved_models/ClinicalBERT

Evaluation Metrics for ClinicalBERT:
- k=1:
    Recall@k: 0.0300
    Exact Match: 0.0300
    F1 Score: 0.0649
- k=20:
    Recall@k: 0.1460
    Exact Match: 0.1460
    F1 Score: 0.2813
- k=40:
    Recall@k: 0.2040
    Exact Match: 0.2040
    F1 Score: 0.3525
- k=60:
    Recall@k: 0.2420
    Exact Match: 0.2420
    F1 Score: 0.3892
- k=80:
    Recall@k: 0.2920
    Exact Match: 0.2920
    F1 Score: 0.4342
- k=100:
    Recall@k: 0.3400
    Exact Match: 0.3400
    F1 Score: 0.4820

Hyperparameters Used:
- Learning Rate: 2e-05 (default for transformers)
- Batch Size: 16
- Epochs: 30

Computational Requirements:
- Hardware: GPU
- Models Trained: 6
- Average Epoch Time (last model): 

In [7]:
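# Evaluate model (disabled scratch cell): everything below is commented out.
# If re-enabled, it reloads a saved model and re-runs evaluate_model without retraining.
# NOTE: model_name is not reassigned in this cell, so set it to match the loaded
# checkpoint before storing results in all_metrics.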
#from sentence_transformers import SentenceTransformer

# Load the saved model from the specified path
#model = SentenceTransformer("saved_models/PubMedBERT")

# Now you can use the model for encoding, querying, etc.

#metrics = evaluate_model(model, test_qs, ks=[1,20,40,60,80,100])
#all_metrics[model_name] = metrics

#print(f"\nEvaluation Metrics for {model_name}:")
#for k, metrics_at_k in metrics.items():
    #print(f"- k={k}:")
    #for metric_name, value in metrics_at_k.items():
        #print(f"    {metric_name}: {value:.4f}")  # Format the value to 4 decimal places
# 8. Final report
#print("\n=================== Training Summary ===================")
#print(f"Hyperparameters Used:")
#print(f"- Learning Rate: {LEARNING_RATE} (default for transformers)")
#print(f"- Batch Size: {BATCH_SIZE}")
#print(f"- Epochs: {EPOCHS}")
#print(f"\nComputational Requirements:")
#print(f"- Hardware: {'GPU' if device == 'cuda' else 'CPU'}")
#print(f"- Models Trained: {len(models)}")
#print(f"- Average Epoch Time (last model): {epoch_time:.2f}s")
#print(f"- Total GPU Time: {total_gpu_time/3600:.2f} GPU hours")
#print(f"\nTraining Details:")
#print(f"- Loss Function: CosineSimilarityLoss")

