In [3]:
import json 
# Read the JSON Lines benchmark file: one JSON object per line.
# The encoding is pinned to UTF-8 so the non-ASCII characters in the question
# text decode the same way on every platform (the default codec on Windows is
# not UTF-8), and blank lines are dropped so a stray empty or trailing line
# cannot make json.loads raise.
with open('metadata.jsonl', 'r', encoding='utf-8') as f:
    json_list = [line for line in f if line.strip()]

# One dict per benchmark record, in file order.
json_QA = [json.loads(json_str) for json_str in json_list]

In [4]:
import random
# Pretty-print one randomly drawn record as a small tree so the shape of the
# annotator metadata is easy to eyeball.
def _print_branches(multiline_text):
    """Print each newline-separated entry of ``multiline_text`` as a tree branch."""
    for entry in multiline_text.split('\n'):
        print(f"  │      ├── {entry}")


separator = "=" * 50
random_samples = random.sample(json_QA, 1)
for sample in random_samples:
    print(separator)
    print(f"Task ID: {sample['task_id']}")
    print(f"Question: {sample['Question']}")
    print(f"Level: {sample['Level']}")
    print(f"Final Answer: {sample['Final answer']}")
    print("Annotator Metadata: ")
    print("  ├── Steps: ")
    annotator = sample['Annotator Metadata']
    _print_branches(annotator['Steps'])
    print(f"  ├── Number of steps: {annotator['Number of steps']}")
    print(f"  ├── How long did this take?: {annotator['How long did this take?']}")
    print("  ├── Tools:")
    _print_branches(annotator['Tools'])
    print(f"  └── Number of tools: {annotator['Number of tools']}")
# Closing rule is printed once, after the last sample.
print(separator)

Task ID: 99c9cc74-fdc8-46c6-8f8d-3ce2d3bfeea3
Question: Hi, I'm making a pie but I could use some help with my shopping list. I have everything I need for the crust, but I'm not sure about the filling. I got the recipe from my friend Aditi, but she left it as a voice memo and the speaker on my phone is buzzing so I can't quite make out what she's saying. Could you please listen to the recipe and list all of the ingredients that my friend described? I only want the ingredients for the filling, as I have everything I need to make my favorite pie crust. I've attached the recipe as Strawberry pie.mp3.

In your response, please only list the ingredients, not any measurements. So if the recipe calls for "a pinch of salt" or "two cups of ripe strawberries" the ingredients on the list would be "salt" and "ripe strawberries".

Please format your response as a comma separated list of ingredients. Also, please alphabetize the ingredients.
Level: 1
Final Answer: cornstarch, freshly squeezed lemon 

In [5]:
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from dotenv import load_dotenv 
import os 


# Load variables from a local .env file (if any) into the process environment.
load_dotenv()

# The key is read here only to validate it: failing now gives an actionable
# message instead of an opaque client error when the embeddings object is
# built or first used.
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
if not OPENAI_API_KEY:
    raise RuntimeError(
        "OPENAI_API_KEY is not set; export it or add it to a .env file "
        "before running this notebook."
    )

# Embedding client shared by every vector-store cell below.
embeddings = OpenAIEmbeddings()

In [6]:
# NOTE(review): this install is unpinned and runs after the langchain imports
# in the previous cell; consider pinning a version and moving it to the first
# cell so a fresh "Restart & Run All" resolves dependencies before importing.
%pip install langchain-core

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 24.3.1 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [7]:
import faiss 
from langchain_community.docstore.in_memory import InMemoryDocstore 
from langchain_community.vectorstores import FAISS 

# Embed a throwaway string once to discover how long the embedding vectors
# are, then size the flat L2 index to match.
_probe_vector = embeddings.embed_query("hello world")
embedding_dim = len(_probe_vector)
index = faiss.IndexFlatL2(embedding_dim)

# Start from an empty store: no documents and no index-position -> id mapping.
vector_store = FAISS(
    index=index,
    embedding_function=embeddings,
    index_to_docstore_id={},
    docstore=InMemoryDocstore({}),
)

In [13]:
# Add items to vector store 

from uuid import uuid4 
from langchain_core.documents import Document 

# Collect the task ids that are already in the store. Every run of this cell
# mints fresh uuid4 ids, so without this guard re-running the cell would embed
# and insert all records a second time under new ids.
indexed_sources = set()
for stored_id in vector_store.index_to_docstore_id.values():
    stored_doc = vector_store.docstore.search(stored_id)
    # The docstore returns a plain "not found" string on a miss; skip those.
    if isinstance(stored_doc, Document):
        indexed_sources.add(stored_doc.metadata.get("source"))

# Build one Document per record that is not indexed yet. The question and its
# final answer form the searchable text; task id and level go into metadata.
docs = []
for sample in json_QA:
    if sample["task_id"] in indexed_sources:
        continue
    content = f"Question: {sample['Question']}\n\nFinal answer: {sample['Final answer']}"
    docs.append(
        Document(
            page_content=content,
            metadata={
                "source": sample["task_id"],
                "level": sample["Level"],
            },
        )
    )

uuids = [str(uuid4()) for _ in docs]

# Skip the embedding call entirely when there is nothing new to add.
if docs:
    vector_store.add_documents(documents=docs, ids=uuids)
print(f"Added {len(docs)} documents to the vector store.")

Added 165 documents to the vector store.


In [14]:
# Persist the populated store to the local "vector_store" folder so it can be
# reloaded with FAISS.load_local instead of re-embedding every record.
vector_store.save_local("vector_store")

In [9]:
# Sanity check: search with a question copied verbatim from the dataset and
# show the two nearest documents.
query = "On June 6, 2023, an article by Carolyn Collins Petersen was published in Universe Today. This article mentions a team that produced a paper about their observations, linked at the bottom of the article. Find this paper. Under what NASA award number was the work performed by R. G. Arendt supported by?"

vector_store.similarity_search(query, k=2)

[Document(id='eafe1a17-a3f9-444b-9119-6c96f40915b8', metadata={'source': '840bfca7-4f7b-481a-8794-c560c340185d', 'level': 1}, page_content='Question: On June 6, 2023, an article by Carolyn Collins Petersen was published in Universe Today. This article mentions a team that produced a paper about their observations, linked at the bottom of the article. Find this paper. Under what NASA award number was the work performed by R. G. Arendt supported by?\n\nFinal answer: 80GSFC21M0002'),
 Document(id='a17c8559-b10c-4e7b-8f5f-472d24b0fe3c', metadata={'source': '0bdb7c40-671d-4ad1-9ce3-986b159c0ddc', 'level': 3}, page_content="Question: In NASA's Astronomy Picture of the Day on 2006 January 21, two astronauts are visible, with one appearing much smaller than the other. As of August 2023, out of the astronauts in the NASA Astronaut Group that the smaller astronaut was a member of, which one spent the least time in space, and how many minutes did he spend in space, rounded to the nearest minute? 

In [10]:
# Same lookup through the retriever interface, using MMR search and asking
# for a single document.
retriever = vector_store.as_retriever(
    search_type="mmr",
    search_kwargs={"k": 1},
)
retriever.invoke(query)

[Document(id='eafe1a17-a3f9-444b-9119-6c96f40915b8', metadata={'source': '840bfca7-4f7b-481a-8794-c560c340185d', 'level': 1}, page_content='Question: On June 6, 2023, an article by Carolyn Collins Petersen was published in Universe Today. This article mentions a team that produced a paper about their observations, linked at the bottom of the article. Find this paper. Under what NASA award number was the work performed by R. G. Arendt supported by?\n\nFinal answer: 80GSFC21M0002')]

In [16]:
# Reload the store saved to the "vector_store" folder by this notebook.
# SECURITY: allow_dangerous_deserialization=True opts in to pickle-based
# loading, which can execute arbitrary code. Only keep this flag for files
# this notebook wrote itself; never point it at an untrusted folder.
vector_store_loaded = FAISS.load_local("vector_store", embeddings, allow_dangerous_deserialization=True)

In [17]:
# NOTE(review): `search` is not referenced anywhere in the visible cells;
# this looks like an accidental auto-import. Confirm and remove.
from re import search


# Round-trip check: run the same MMR query (k=1) against the reloaded store.
vector_store_loaded.as_retriever(search_type="mmr", search_kwargs={"k": 1}).invoke(query)

[Document(id='66eabedd-7236-4f0f-8996-5ca2c88f2779', metadata={'source': '840bfca7-4f7b-481a-8794-c560c340185d', 'level': 1}, page_content='Question: On June 6, 2023, an article by Carolyn Collins Petersen was published in Universe Today. This article mentions a team that produced a paper about their observations, linked at the bottom of the article. Find this paper. Under what NASA award number was the work performed by R. G. Arendt supported by?\n\nFinal answer: 80GSFC21M0002')]