In [1]:
# Clone the SleepQA repo
!git clone https://github.com/IvaBojic/SleepQA.git
%cd SleepQA

# Install key dependencies
!pip install transformers faiss-cpu datasets scikit-learn pandas tqdm


Cloning into 'SleepQA'...
remote: Enumerating objects: 400, done.[K
remote: Counting objects: 100% (87/87), done.[K
remote: Compressing objects: 100% (66/66), done.[K
remote: Total 400 (delta 46), reused 43 (delta 19), pack-reused 313 (from 1)[K
Receiving objects: 100% (400/400), 31.13 MiB | 6.17 MiB/s, done.
Resolving deltas: 100% (176/176), done.
Updating files: 100% (134/134), done.
Filtering content: 100% (3/3), 1.21 GiB | 36.20 MiB/s, done.
/content/SleepQA
Collecting faiss-cpu
  Downloading faiss_cpu-1.11.0-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (4.8 kB)
Collecting datasets
  Downloading datasets-3.5.1-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-an

In [3]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import time

# 1. Load data
corpus = pd.read_csv("data/training/sleep-corpus.tsv", sep="\t", header=None)
corpus.columns = ['index', 'passage', 'title']
test_qs = pd.read_csv("data/training/sleep-test.csv", sep="\t", header=None)
test_qs.columns = ['question', 'answer']

# 2. Define TF-IDF search function
def query_tfidf(question, vectorizer, tfidf_corpus, top_k=3):
    question_vec = vectorizer.transform([question])
    sims = cosine_similarity(question_vec, tfidf_corpus).flatten()
    top_indices = sims.argsort()[-top_k:][::-1]
    return [corpus.iloc[i]["passage"] for i in top_indices], sims[top_indices]

# 3. Define evaluation metrics
def evaluate_model(vectorizer, tfidf_corpus, test_df, ks=[1]):
    results = {}

    for k in ks:
        recall_at_k = []
        exact_match = []
        f1_scores = []

        for _, row in test_df.iterrows():
            question = row["question"]
            true_answer = row["answer"]

            retrieved_passages, _ = query_tfidf(question, vectorizer, tfidf_corpus, top_k=k)

            # Recall@k: if true answer appears in any retrieved passage
            match_found = any(true_answer.strip().lower() in passage.lower() for passage in retrieved_passages)
            recall_at_k.append(1 if match_found else 0)

            # Exact Match
            em = any(true_answer.strip().lower() == passage.strip().lower() for passage in retrieved_passages)
            exact_match.append(1 if em else 0)

            # F1 score (token level) with best candidate
            best_f1 = 0
            for passage in retrieved_passages:
                true_tokens = set(true_answer.lower().split())
                pred_tokens = set(passage.lower().split())
                common = true_tokens & pred_tokens
                if not common:
                    continue
                precision = len(common) / len(pred_tokens)
                recall = len(common) / len(true_tokens)
                f1 = 2 * precision * recall / (precision + recall)
                best_f1 = max(best_f1, f1)
            f1_scores.append(best_f1)

        # Store results for each k
        results[k] = {
            "Recall@k": np.mean(recall_at_k),
            "Exact Match": np.mean(exact_match),
            "F1 Score": np.mean(f1_scores)
        }

    return results

# 4. Evaluate TF-IDF Baseline Model
print("\n### Evaluating TF-IDF Baseline ###")
vectorizer = TfidfVectorizer()
tfidf_corpus = vectorizer.fit_transform(corpus["passage"].tolist())

# Evaluate the TF-IDF model
tfidf_metrics = evaluate_model(vectorizer, tfidf_corpus, test_qs, ks=[1, 20, 40, 60, 80, 100])

# 5. Final report
print("\n=================== Training Summary ===================")
print(f"Hyperparameters Used:")
print(f"- Batch Size: N/A (using simple TF-IDF retrieval)")
print(f"- Evaluation Metrics: Recall@k, Exact Match, F1 Score")

# Adjusted printing for nested dictionary
print(f"\nEvaluation Metrics for TF-IDF:")
for k, metrics in tfidf_metrics.items():
    print(f"\nFor k = {k}:")
    for metric_name, value in metrics.items():
        print(f"- {metric_name}: {value:.4f}")




### Evaluating TF-IDF Baseline ###

Hyperparameters Used:
- Batch Size: N/A (using simple TF-IDF retrieval)
- Evaluation Metrics: Recall@k, Exact Match, F1 Score

Evaluation Metrics for TF-IDF:

For k = 1:
- Recall@k: 0.0000
- Exact Match: 0.0000
- F1 Score: 0.0917

For k = 20:
- Recall@k: 0.0000
- Exact Match: 0.0000
- F1 Score: 0.1308

For k = 40:
- Recall@k: 0.0000
- Exact Match: 0.0000
- F1 Score: 0.1347

For k = 60:
- Recall@k: 0.0000
- Exact Match: 0.0000
- F1 Score: 0.1356

For k = 80:
- Recall@k: 0.0000
- Exact Match: 0.0000
- F1 Score: 0.1368

For k = 100:
- Recall@k: 0.0000
- Exact Match: 0.0000
- F1 Score: 0.1370
