In [15]:
import json 
# Load the benchmark metadata: a JSON Lines file with one JSON object per line.
# The encoding is given explicitly because the records contain non-ASCII text
# (e.g. typographic quotes) and the platform default encoding is not UTF-8
# everywhere, notably on Windows.
with open('metadata.jsonl', 'r', encoding='utf-8') as f:
    json_list = list(f)

# Parse every non-blank line; blank lines (such as a trailing empty line) are
# skipped instead of raising json.JSONDecodeError.
json_QA = [json.loads(json_str) for json_str in json_list if json_str.strip()]

In [20]:
import random
def _render_sample(record):
    """Yield, one line at a time, the tree-style text report for one record.

    `record` is a parsed metadata entry; the keys read here are 'task_id',
    'Question', 'Level', 'Final answer' and the nested 'Annotator Metadata'
    mapping. Lines are produced lazily, in display order.
    """
    yield "=" * 50
    yield f"Task ID: {record['task_id']}"
    yield f"Question: {record['Question']}"
    yield f"Level: {record['Level']}"
    yield f"Final Answer: {record['Final answer']}"
    yield "Annotator Metadata: "
    yield "  ├── Steps: "
    annotation = record['Annotator Metadata']
    # 'Steps' and 'Tools' are newline-separated strings; show one entry per row.
    for step in annotation['Steps'].split('\n'):
        yield f"  │      ├── {step}"
    yield f"  ├── Number of steps: {annotation['Number of steps']}"
    yield f"  ├── How long did this take?: {annotation['How long did this take?']}"
    yield "  ├── Tools:"
    for tool in annotation['Tools'].split('\n'):
        yield f"  │      ├── {tool}"
    yield f"  └── Number of tools: {annotation['Number of tools']}"


# Draw a single record at random and print its report, then a closing rule.
random_samples = random.sample(json_QA, 1)
for sample in random_samples:
    for line in _render_sample(sample):
        print(line)
print("=" * 50)

Task ID: 23dd907f-1261-4488-b21c-e9185af91d5e
Question: In Audre Lorde’s poem “Father Son and Holy Ghost”, what is the number of the stanza in which some lines are indented?
Level: 1
Final Answer: 2
Annotator Metadata: 
  ├── Steps: 
  │      ├── 1. Search the web for “Audre Lorde Father Son and Holy Ghost”.
  │      ├── 2. Click on Poetry Foundation result.
  │      ├── 3. Note the stanza that appears to have lines indented, the second one.
  │      ├── 4. Return to search results to confirm.
  │      ├── 5. Click on second result.
  │      ├── 6. Confirm that the indentation appears in the second stanza here as well.
  ├── Number of steps: 6
  ├── How long did this take?: 5 minutes
  ├── Tools:
  │      ├── 1. Search engine
  │      ├── 2. Web browser
  └── Number of tools: 2


In [29]:
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from dotenv import load_dotenv 
import os 


# Pull variables from a local .env file (if any) into the process environment.
load_dotenv()
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")

# Fail early, with an actionable message, rather than letting the embeddings
# client fail later with a less obvious error when the key is absent or empty.
if not OPENAI_API_KEY:
    raise RuntimeError(
        "OPENAI_API_KEY is not set; define it in the environment or in a .env file."
    )

# The client is built with no explicit arguments, as before.
embeddings = OpenAIEmbeddings()

In [30]:
# Install langchain-core into the kernel's environment via the %pip magic.
# NOTE(review): the package is not version-pinned, and this cell sits after a
# cell that already imports a langchain package — consider pinning the version
# and moving the install to the top of the notebook.
%pip install langchain-core

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 24.3.1 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [35]:
import faiss 
from langchain_community.docstore.in_memory import InMemoryDocstore 
from langchain_community.vectorstores import FAISS 

# Embed a throwaway string once, only to learn the length of the vectors the
# embedding model returns.
probe_vector = embeddings.embed_query("hello world")
embedding_dim = len(probe_vector)

# FAISS flat L2 index sized to that vector length.
index = faiss.IndexFlatL2(embedding_dim)

# Start with an empty store: no documents and no index-position -> id mapping.
empty_docstore = InMemoryDocstore({})
vector_store = FAISS(
    embedding_function=embeddings,
    index=index,
    docstore=empty_docstore,
    index_to_docstore_id={},
)

In [40]:
# Add items to vector store 

from uuid import uuid4 
from langchain_core.documents import Document 

# Build one Document per benchmark record: the text is the question followed by
# its final answer, and the metadata keeps the task id and difficulty level.
#
# No embedding is computed here. Document does not declare an `embedding` field,
# so a per-record `embeddings.embed_query(...)` call passed to it would only add
# one unused API request per record; `add_documents` below embeds the texts
# itself through the store's embedding function.
docs = [
    Document(
        page_content=f"Question: {sample['Question']}\n\nFinal answer: {sample['Final answer']}",
        metadata={
            "source": sample["task_id"],
            "level": sample["Level"],
        },
    )
    for sample in json_QA
]

# One fresh random id per document.
# NOTE(review): because the ids are new on every run, re-running this cell
# against the same `vector_store` appears to add the same documents again.
uuids = [str(uuid4()) for _ in docs]

vector_store.add_documents(documents=docs, ids=uuids)
print(f"Added {len(docs)} documents to the vector store.")

Added 165 documents to the vector store.


In [42]:
# Probe question, written as adjacent string literals that concatenate into a
# single string.
query = (
    "On June 6, 2023, an article by Carolyn Collins Petersen was published in Universe Today. "
    "This article mentions a team that produced a paper about their observations, linked at the bottom of the article. "
    "Find this paper. "
    "Under what NASA award number was the work performed by R. G. Arendt supported by?"
)

# Ask the store for the two closest documents; the bare expression is the cell's output.
vector_store.similarity_search(query, k=2)

[Document(id='54b6dffb-9c44-4c7e-ac23-95bd8be023c0', metadata={'source': '840bfca7-4f7b-481a-8794-c560c340185d', 'level': 1}, page_content='Question: On June 6, 2023, an article by Carolyn Collins Petersen was published in Universe Today. This article mentions a team that produced a paper about their observations, linked at the bottom of the article. Find this paper. Under what NASA award number was the work performed by R. G. Arendt supported by?\n\nFinal answer: 80GSFC21M0002'),
 Document(id='bf282293-caef-4e39-b157-6df5c8bb072f', metadata={'source': '0bdb7c40-671d-4ad1-9ce3-986b159c0ddc', 'level': 3}, page_content="Question: In NASA's Astronomy Picture of the Day on 2006 January 21, two astronauts are visible, with one appearing much smaller than the other. As of August 2023, out of the astronauts in the NASA Astronaut Group that the smaller astronaut was a member of, which one spent the least time in space, and how many minutes did he spend in space, rounded to the nearest minute? 

In [43]:
# Wrap the store as a retriever that uses the "mmr" search type and returns a
# single document per query.
retriever = vector_store.as_retriever(
    search_type="mmr",
    search_kwargs={"k": 1},
)

# Run the retriever on the probe question; the result is the cell's output.
retriever.invoke(query)

[Document(id='54b6dffb-9c44-4c7e-ac23-95bd8be023c0', metadata={'source': '840bfca7-4f7b-481a-8794-c560c340185d', 'level': 1}, page_content='Question: On June 6, 2023, an article by Carolyn Collins Petersen was published in Universe Today. This article mentions a team that produced a paper about their observations, linked at the bottom of the article. Find this paper. Under what NASA award number was the work performed by R. G. Arendt supported by?\n\nFinal answer: 80GSFC21M0002')]