# Experiment Notebook - Data from files

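This notebook illustrates how to use files on disk as a datasource: it registers the datasource, loads the files as documents, and generates a question-answer set from them.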

In [1]:
import sqlite3

# Open the SQLite database file for this experiment; the resulting connection
# object is the one handed to the models and helpers in the cells below.
DB_PATH = 'experiment.db'
db_connection = sqlite3.connect(DB_PATH)


## 1. Initialization of Datasources
Here we initialize the datasource: it is looked up by name and created if it does not exist yet.

In [2]:
# Datasource model (takes the DB connection) and the datasource record type
from packages.data.src.eval_data.models.datasource import DatasourceModel, DatasourceType

# Get-or-create: look the datasource up by name and add it only when the lookup
# returns nothing, so an existing datasource is reused rather than added again.
datasource_name = "sample-financial-data"

# A single model instance serves both the lookup and the insert.
datasource_model = DatasourceModel(db_connection)

if datasource := datasource_model.get_datasource_by_name(datasource_name):
    print(f"Datasource {datasource_name} loaded - ID: {datasource.id}")
else:
    datasource = DatasourceType(name=datasource_name, description="Sample financial data in PDF format")
    # add_datasource's return value is stored as the new record's ID
    datasource.id = datasource_model.add_datasource(datasource)
    print(f"Datasource {datasource_name} created - ID: {datasource.id}")


Datasource sample-financial-data created - ID: 2


## 2. Read files into documents



In [3]:
from typing import List

from llama_index.core.schema import Document

# NOTE(review): the saved output of this cell records
# "ModuleNotFoundError: No module named 'documents'" - confirm that this import
# resolves on a fresh kernel started from the project root.
from eval_scripts.documents import load_documents_from_path, save_documents_to_db

# Directory whose PDF documents are loaded
DATA_PATH = "datasets/fin"

# Report progress before and after the load; `documents` feeds the later cells.
print(f"Loading documents from {DATA_PATH}...")
documents: List[Document] = load_documents_from_path(DATA_PATH)
print(f"Number of documents loaded from {DATA_PATH}: {len(documents)}")


ModuleNotFoundError: No module named 'documents'

## 3. Make Question-Answer set
Generate questions and correct answers from the loaded documents.

### Add Questions and Answers


In [None]:
# Optional setup: uncomment the line below to install nest_asyncio, which the
# next cell imports, if it is not available in this environment.
# NOTE(review): `%pip install` targets the running kernel's environment and is
# generally preferable to `!pip install` inside a notebook.
#!pip install nest_asyncio

In [None]:
# nest_asyncio.apply() patches asyncio so an already-running event loop can be
# re-entered. Presumably required because generate_testset runs async code
# inside the notebook's own event loop - TODO confirm.
import nest_asyncio
nest_asyncio.apply()

from datasets import Dataset

from eval_scripts.generator import generate_testset
from eval_scripts.database import add_or_get_document, add_or_get_qaset

from eval_data.models.question import QuestionModel, QuestionType
from eval_data.models.document import DocumentModel, DocumentType



# Value passed as `test_size` to generate_testset for every document
TESTSET_SIZE_PER_DOCUMENT = 5

# For each loaded document: make sure its document and QA-set records exist,
# generate a test set from that single document, and store every generated
# question/answer pair through the question model.
question_model = QuestionModel(db_connection)
print(f"Processing {len(documents)} documents")
for doc in documents:
    print(f"Processing Document ID: {doc.id_}")

    # Get or create the records the generated questions are attached to
    document = add_or_get_document(db_connection, doc, datasource.id)
    qaset = add_or_get_qaset(db_connection, doc, datasource.id, document.id)

    # Generate the actual QA set data from this one document
    qa_dataset: Dataset = generate_testset([doc], test_size=TESTSET_SIZE_PER_DOCUMENT)

    # Only the column lookups are expected to raise KeyError, so the try block
    # is limited to them; a missing column skips this document.
    try:
        questions = qa_dataset["question"]
        answers = qa_dataset["ground_truth"]
    except KeyError:
        print("No questions and answers generated")
        print(qa_dataset.column_names)
        continue

    print(f"Number of questions: {len(questions)}")
    print(f"Number of answers: {len(answers)}")

    # Save the questions and answers to the database
    for question_text, answer in zip(questions, answers):
        # The record gets its own binding; the loop variable holding the
        # question text is no longer overwritten inside its own loop.
        question = QuestionType(
            qaset_id=qaset.id,
            document_id=document.id,
            question=question_text,
            answer=answer,
        )
        question_id = question_model.add_question(question)
        print(f"Question added with ID: {question_id}")