# Experiment Notebook - Data
These Jupyter notebooks demonstrate how to use our data models to run experiments effectively.

This notebook illustrates how to set up Datasources, Documents, QA Sets, and Questions with their correct answers for running experiments.


In [1]:
import sqlite3

# Path of the SQLite database file used throughout this notebook
DB_PATH = 'experiment.db'

# Open the connection that the later cells pass to each data model
db_connection = sqlite3.connect(DB_PATH)


## 1. Initialization of Datasources
Here we register an example datasource (BioASQ) in the database.

In [None]:
# Import necessary libraries
from eval_data.models.datasource import DatasourceModel, DatasourceType

# Bind the datasource model to the shared database connection
datasource_model = DatasourceModel(db_connection)

# Example data for datasource initialization, adapted from experiment.ipynb
bioasq_datasource = DatasourceType(
    name="BioASQ",
    description="Manually curated set of biomedical Documents, Questions, and Answers",
)

# NOTE(review): going by its name, add_or_get_datasource presumably returns the
# already-stored datasource instead of inserting a duplicate — confirm in the model.
datasource = datasource_model.add_or_get_datasource(bioasq_datasource)

# Later cells reference this datasource through `datasource.id`
print(f"Datasource ID: {datasource.id}")



## 2. Document Handling
Adding documents to the datasources using Hugging Face paths.

In [None]:
# Import necessary libraries
from eval_data.models.document import DocumentModel, DocumentType

# Create an instance of DocumentModel
document_model = DocumentModel(db_connection)

# Example documents to be added, using Hugging Face paths.
# The document is linked to the datasource registered in the previous cell via
# `datasource.id`; a literal ID of 1 is only correct when that datasource
# happened to receive ID 1, and would otherwise attach the document to a
# different (or missing) datasource.
document = document_model.add_or_get_document(
    DocumentType(
        name="BioASQ Document 1",
        location="rag-datasets/rag-mini-bioasq;text-corpus",
        datasource_id=datasource.id,
    )
)

print(f"Document ID: {document.id}")


## 3. QASet Creation
Establishing QA sets that contain questions and their correct answers linked to specific documents.

In [None]:
# Import necessary libraries
from eval_data.models.qaset import QASetModel, QASetType

# Bind the QA-set model to the shared database connection
qaset_model = QASetModel(db_connection)

# Describe the QA set, tying it to the datasource and document from the earlier cells
bioasq_qaset = QASetType(
    datasource_id=datasource.id,
    document_id=document.id,
    name="BioASQ QA Set",
    # "<dataset path>;<config name>": the question-loading cell splits this on ";"
    location="rag-datasets/rag-mini-bioasq;question-answer-passages",
)

# Create a QA Set
qaset = qaset_model.add_or_get_qaset(bioasq_qaset)
print(f"QA Set ID: {qaset.id}")


## 4. Question Generation
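Loading questions and their correct answers from the QA set's Hugging Face dataset and saving them to the database, skipping empty questions and questions already stored for this QA set.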

In [None]:
from datasets import load_dataset
from eval_data.models.qaset import QASetModel, QASetType
from eval_data.models.question import QuestionModel, QuestionType

# Load the QA dataset. `qaset.location` holds "<dataset path>;<config name>",
# which is split into the two arguments passed to `load_dataset`.
path, name = qaset.location.split(";")
dataset = load_dataset(path, name)['test']

# Answers are read from the "ground_truth" column when the dataset has one,
# otherwise from the "answer" column.
test_questions = dataset["question"]
if "ground_truth" in dataset.column_names:
    test_answers = dataset["ground_truth"]
else:
    test_answers = dataset["answer"]

print(f"Loaded {len(test_questions)} questions")
print(f"Loaded {len(test_answers)} answers")


# Save the questions and answers to the database
question_model = QuestionModel(db_connection)

# A set rather than a list: membership is tested once per dataset row, so a
# list would make the loop below quadratic in the number of questions.
existing_questions = {q.question for q in question_model.get_questions_by_qaset_id(qaset.id)}

count_skipped = 0
count_added = 0
for question_text, answer in zip(test_questions, test_answers):
    # Skip rows without question text and questions that were already stored
    # for this QA set when the cell started.
    # NOTE(review): a question repeated inside the dataset itself is still
    # inserted once per occurrence on a first run — confirm that is intended.
    if not question_text or question_text in existing_questions:
        count_skipped += 1
        continue

    question_model.add_question(
        QuestionType(
            qaset_id=qaset.id,
            document_id=document.id,
            question=question_text,
            answer=answer,
        )
    )
    count_added += 1

print(f"Added {count_added} questions")
# The skip counter covers empty rows as well as already-stored questions
print(f"Skipped {count_skipped} empty or existing questions")
