## Indexing Documents into Pinecone

In [83]:
from pinecone import Pinecone, ServerlessSpec
from openai import OpenAI
import os
from typing import List
from dotenv import load_dotenv
# Load variables from a .env file (if present) so the API keys below can be
# read from the process environment instead of being hard-coded.
load_dotenv()

# OpenAI client; used by get_embedding for the embedding requests.
# os.getenv returns None when the variable is not set.
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
# Pinecone client; used to list, create and open the vector index.
pc = Pinecone(os.getenv("PINECONE_API_KEY"))

#### Create Index

In [87]:
index_name = "nlp-project"

# Create the serverless index only when no existing index carries this name,
# so re-running the cell reuses the index instead of failing.
if not any(existing['name'] == index_name for existing in pc.list_indexes()):
    pc.create_index(
        name=index_name,
        # 1536 is the vector size produced by text-embedding-3-small
        dimension=1536,
        metric="cosine",
        spec=ServerlessSpec(cloud="aws", region="us-east-1"),
    )

# Handle used by the indexing and retrieval cells.
index = pc.Index(index_name)

#### Generate Embeddings

In [88]:
def get_embedding(text: str):
    """Return the embedding vector for ``text``.

    The text is sent through the module-level OpenAI ``client`` using the
    ``text-embedding-3-small`` model, and the ``embedding`` of the first item
    in the response is returned.
    """
    request_args = {"input": text, "model": "text-embedding-3-small"}
    first_item = client.embeddings.create(**request_args).data[0]
    return first_item.embedding

#### Index Documents into the Vector Store

In [97]:
def index_documents(documents: List[dict]):
    """Embed each document and upsert it into namespace ``"ns1"``.

    Every item of ``documents`` must provide the keys ``'id'``, ``'text'``,
    ``'title'``, ``'domain'`` and ``'link'``. The text is embedded with
    ``get_embedding``; title, domain and link are stored as vector metadata.
    One progress line is printed and one upsert is issued per document.
    """
    total = len(documents)

    for position, record in enumerate(documents, start=1):
        print(f"Indexing document {position}/{total}")

        # Embed first, then assemble the payload from the record's fields.
        values = get_embedding(record['text'])
        record_id = record['id']
        metadata = {field: record[field] for field in ('title', 'domain', 'link')}
        payload = {'id': record_id, 'values': values, 'metadata': metadata}

        # Upsert to Pinecone
        index.upsert(vectors=[payload], namespace="ns1")

def index_halu_documents(documents: List[dict]):
    """Embed each document and upsert it into namespace ``"ns2"``.

    Every item of ``documents`` must provide the keys ``'id'`` and ``'text'``.
    No metadata is attached to these vectors. One progress line is printed
    and one upsert is issued per document.
    """
    total = len(documents)

    for position, record in enumerate(documents, start=1):
        print(f"Indexing document {position}/{total}")

        values = get_embedding(record['text'])
        payload = {'id': record['id'], 'values': values}

        # Upsert to Pinecone
        index.upsert(vectors=[payload], namespace="ns2")

def index_fever_documents(documents: List[dict]):
    """Embed each document and upsert it into namespace ``"ns3"``.

    Every item of ``documents`` must provide the keys ``'id'``, ``'text'``,
    ``'title'`` and ``'link'``. Title and link are stored as vector metadata.
    One progress line is printed and one upsert is issued per document.
    """
    total = len(documents)

    for position, record in enumerate(documents, start=1):
        print(f"Indexing document {position}/{total}")

        # Embed first, then assemble the payload from the record's fields.
        values = get_embedding(record['text'])
        record_id = record['id']
        metadata = {field: record[field] for field in ('title', 'link')}
        payload = {'id': record_id, 'values': values, 'metadata': metadata}

        # Upsert to Pinecone
        index.upsert(vectors=[payload], namespace="ns3")

#### Load Documents

In [43]:
import json

def load_dict_from_json(filename):
    """Load and return the parsed JSON content of ``filename``.

    The file is read as UTF-8. If it cannot be opened or read, or its content
    is not valid UTF-8 / valid JSON, the error is printed and ``None`` is
    returned, so callers must check for ``None`` before using the result.

    Other exceptions (for example a ``TypeError`` caused by passing something
    that is not a path) are programming errors and are allowed to propagate
    instead of being hidden behind a ``None`` result.
    """
    try:
        with open(filename, 'r', encoding='utf-8') as f:
            return json.load(f)
    # OSError: missing or unreadable file.
    # ValueError: parent of both json.JSONDecodeError and UnicodeDecodeError.
    except (OSError, ValueError) as e:
        print(f"Error loading dictionary: {e}")
        return None

# TruthfulQA source documents. Later cells iterate this as a mapping from
# domain to a list of documents carrying 'title', 'doc' and 'link'.
# The value is None if loading failed (the loader prints the error).
domain_docs = load_dict_from_json('truthfulqa_domain_docs.json')

In [90]:
# HaluEval knowledge entries. Later cells iterate this as a sequence of
# entries carrying 'id' and 'knowledge'. None if loading failed.
halueval_knowledge = load_dict_from_json('halueval_knowledge.json')

In [96]:
# FEVER wiki pages. Later cells iterate this as a mapping from a key to a
# list of pages carrying 'title', 'doc' and 'link'. None if loading failed.
fever_wiki_pages = load_dict_from_json('fever_wiki_pages.json')

#### Divide document text into chunks to fit into vector database

In [44]:
def chunk_text_for_embedding(text, max_tokens=6000):
    """Split ``text`` into whitespace-delimited chunks of bounded size.

    Words are packed greedily, in order, and joined with single spaces.

    Args:
        text: The text to split. Runs of whitespace are collapsed because the
            text is tokenised with ``str.split()``.
        max_tokens: Size budget per chunk. Despite the name, the budget is
            measured in characters: each word costs ``len(word) + 1``.

    Returns:
        A list of non-empty chunk strings; an empty list when ``text``
        contains no words. A single word that exceeds the budget on its own
        is not split and becomes a chunk by itself.
    """
    chunks = []
    current_chunk = []
    current_length = 0

    for word in text.split():
        word_length = len(word) + 1  # +1 for the space
        # Flush only when something has been collected. Without this guard an
        # oversized first word would emit an empty-string chunk.
        if current_chunk and current_length + word_length > max_tokens:
            chunks.append(' '.join(current_chunk))
            current_chunk = []
            current_length = 0
        current_chunk.append(word)
        current_length += word_length

    if current_chunk:
        chunks.append(' '.join(current_chunk))

    return chunks

In [45]:
docs_to_index = []
document_store = {}

# Chunk IDs have the form "<domain #>#<document #>#<chunk #>", all 1-based.
# document_store keeps the chunk text under the same ID so it can be looked
# up again after retrieval.
for domain_no, (domain_name, domain_documents) in enumerate(domain_docs.items(), 1):
    for doc_no, source_doc in enumerate(domain_documents, 1):
        pieces = chunk_text_for_embedding(source_doc['doc'])
        for piece_no, piece in enumerate(pieces, 1):
            piece_id = f"{domain_no}#{doc_no}#{piece_no}"
            docs_to_index.append({
                'id': piece_id,
                'title': source_doc['title'],
                'domain': domain_name,
                'text': piece,
                'link': source_doc['link'],
            })
            document_store[piece_id] = piece

In [91]:
halu_docs_to_index = []
halu_document_store = {}

# Chunk IDs have the form "<entry id>#<chunk #>", with 1-based chunk numbers.
for knowledge_entry in halueval_knowledge:
    pieces = chunk_text_for_embedding(knowledge_entry['knowledge'])
    for piece_no, piece in enumerate(pieces, 1):
        piece_id = f"{knowledge_entry['id']}#{piece_no}"
        halu_docs_to_index.append({'id': piece_id, 'text': piece})
        halu_document_store[piece_id] = piece

In [101]:
fever_docs_to_index = []
fever_document_store = {}

# Chunk IDs have the form "<key #>#<chunk #>", both 1-based.
for key_no, page_key in enumerate(fever_wiki_pages, 1):
    pages = fever_wiki_pages[page_key]
    # Only keys that map to exactly one page are indexed. Skipped keys still
    # consume a number, so IDs of indexed pages may have gaps.
    if len(pages) != 1:
        continue
    page = pages[0]

    pieces = chunk_text_for_embedding(page['doc'])
    for piece_no, piece in enumerate(pieces, 1):
        piece_id = f"{key_no}#{piece_no}"
        fever_docs_to_index.append({
            'id': piece_id,
            'title': page['title'],
            'text': piece,
            'link': page['link'],
        })
        fever_document_store[piece_id] = piece

### Index documents into vector database

In [80]:
# Embed and upsert every TruthfulQA chunk into namespace "ns1".
# This issues one embedding request and one upsert per chunk.
index_documents(docs_to_index)

Indexing document 1/19624
Indexing document 2/19624
Indexing document 3/19624
Indexing document 4/19624
Indexing document 5/19624
Indexing document 6/19624
Indexing document 7/19624
Indexing document 8/19624
Indexing document 9/19624
Indexing document 10/19624
Indexing document 11/19624
Indexing document 12/19624
Indexing document 13/19624
Indexing document 14/19624
Indexing document 15/19624
Indexing document 16/19624
Indexing document 17/19624
Indexing document 18/19624
Indexing document 19/19624
Indexing document 20/19624
Indexing document 21/19624
Indexing document 22/19624
Indexing document 23/19624
Indexing document 24/19624
Indexing document 25/19624
Indexing document 26/19624
Indexing document 27/19624
Indexing document 28/19624
Indexing document 29/19624
Indexing document 30/19624
Indexing document 31/19624
Indexing document 32/19624
Indexing document 33/19624
Indexing document 34/19624
Indexing document 35/19624
Indexing document 36/19624
Indexing document 37/19624
Indexing d

In [93]:
# Embed and upsert every HaluEval chunk into namespace "ns2".
# This issues one embedding request and one upsert per chunk.
index_halu_documents(halu_docs_to_index)

Indexing document 1/10000
Indexing document 2/10000
Indexing document 3/10000
Indexing document 4/10000
Indexing document 5/10000
Indexing document 6/10000
Indexing document 7/10000
Indexing document 8/10000
Indexing document 9/10000
Indexing document 10/10000
Indexing document 11/10000
Indexing document 12/10000
Indexing document 13/10000
Indexing document 14/10000
Indexing document 15/10000
Indexing document 16/10000
Indexing document 17/10000
Indexing document 18/10000
Indexing document 19/10000
Indexing document 20/10000
Indexing document 21/10000
Indexing document 22/10000
Indexing document 23/10000
Indexing document 24/10000
Indexing document 25/10000
Indexing document 26/10000
Indexing document 27/10000
Indexing document 28/10000
Indexing document 29/10000
Indexing document 30/10000
Indexing document 31/10000
Indexing document 32/10000
Indexing document 33/10000
Indexing document 34/10000
Indexing document 35/10000
Indexing document 36/10000
Indexing document 37/10000
Indexing d

In [103]:
# Embed and upsert every FEVER chunk into namespace "ns3".
# This issues one embedding request and one upsert per chunk.
index_fever_documents(fever_docs_to_index)

Indexing document 1/50167
Indexing document 2/50167
Indexing document 3/50167
Indexing document 4/50167
Indexing document 5/50167
Indexing document 6/50167
Indexing document 7/50167
Indexing document 8/50167
Indexing document 9/50167
Indexing document 10/50167
Indexing document 11/50167
Indexing document 12/50167
Indexing document 13/50167
Indexing document 14/50167
Indexing document 15/50167
Indexing document 16/50167
Indexing document 17/50167
Indexing document 18/50167
Indexing document 19/50167
Indexing document 20/50167
Indexing document 21/50167
Indexing document 22/50167
Indexing document 23/50167
Indexing document 24/50167
Indexing document 25/50167
Indexing document 26/50167
Indexing document 27/50167
Indexing document 28/50167
Indexing document 29/50167
Indexing document 30/50167
Indexing document 31/50167
Indexing document 32/50167
Indexing document 33/50167
Indexing document 34/50167
Indexing document 35/50167
Indexing document 36/50167
Indexing document 37/50167
Indexing d

## Test Retriever for a sample query

In [76]:
def retrieve_documents(query_text: str, namespace: str = "ns1", top_k: int = 15, score_threshold: float = 0.3, text_store: dict = None):
    """Return the indexed chunks most similar to ``query_text``.

    Args:
        query_text: Query to embed with ``get_embedding``.
        namespace: Pinecone namespace to search.
        top_k: Maximum number of matches requested from Pinecone.
        score_threshold: Only matches with a score strictly greater than this
            value are returned.
        text_store: Mapping from vector ID to chunk text used to fill the
            ``'text'`` field. Defaults to the module-level ``document_store``,
            which only holds the chunks indexed into ``"ns1"``; pass the
            matching store when querying another namespace.

    Returns:
        A list of dicts with the keys ``'id'``, ``'score'``, ``'title'``,
        ``'domain'`` and ``'text'``, in the order Pinecone returned the
        matches. ``'title'`` / ``'domain'`` are ``None`` when the vector has
        no such metadata, and ``'text'`` is ``"Text not found"`` when the ID
        is missing from the store.
    """
    query_embedding = get_embedding(query_text)

    # Query Pinecone
    results = index.query(
        vector=query_embedding,
        top_k=top_k,
        namespace=namespace,
        include_metadata=True
    )

    store = document_store if text_store is None else text_store

    # Filter results by score threshold
    relevant_documents = []
    for match in results.matches:
        if not match.score > score_threshold:
            continue
        # Not every namespace stores the same metadata ("ns2" vectors carry
        # none, "ns3" vectors have no 'domain'), so read it tolerantly
        # instead of indexing into it directly.
        metadata = getattr(match, 'metadata', None) or {}
        relevant_documents.append({
            'id': match.id,
            'score': match.score,
            'title': metadata.get('title'),
            'domain': metadata.get('domain'),
            'text': store.get(match.id, "Text not found")
        })

    return relevant_documents

In [None]:
query = "Who is strong?"
score_threshold = 0.3
relevant_docs = retrieve_documents(query, score_threshold=score_threshold)

print("\nRelevant documents:")
for hit in relevant_docs:
    # A blank separator line, then one labelled line per field.
    print()
    print(f"ID: {hit['id']}")
    print(f"Title: {hit['title']}")
    print(f"Domain: {hit['domain']}")
    print(f"Score: {hit['score']:.4f}")
    print(f"Text: {hit['text']}")


Relevant documents:

ID: 1#2#3
Title: Neil Armstrong
Domain: Neil Armstrong biography
Score: 0.1347
Text: Angeles. After one semester, they moved into a house in Antelope Valley, near Edwards AFB. Janet did not finish her degree, a fact she regretted later in life. The couple had three children.[39] In June 1961, their daughter Karen was diagnosed with diffuse intrinsic pontine glioma, a malignant tumor of the middle part of her brain stem.[40] X-ray treatment slowed its growth, but her health deteriorated to the point where she could no longer walk or talk. She died of pneumonia, related to her weakened health, on January 28, 1962, aged two.[41] ## Test pilot Following his graduation from Purdue, Armstrong became an experimental research test pilot. He applied at the National Advisory Committee for Aeronautics (NACA) High-Speed Flight Station at Edwards Air Force Base.[42] NACA had no open positions, and forwarded his application to the Lewis Flight Propulsion Laboratory in Cleveland

### Save the document stores to JSON files for future reference

In [82]:
def save_dict_to_json(dictionary, filename):
    """Write ``dictionary`` to ``filename`` as indented UTF-8 JSON.

    Non-ASCII characters are written as-is rather than escaped. A
    confirmation line is printed on success. Any exception raised while
    opening or writing the file is caught and reported with a printed
    message; nothing is returned in either case.
    """
    try:
        with open(filename, mode='w', encoding='utf-8') as out_file:
            json.dump(dictionary, out_file, ensure_ascii=False, indent=4)
        print(f"Dictionary saved to {filename}")
    except Exception as error:
        print(f"Error saving dictionary: {error}")

# Persist the ID -> chunk text mapping for the TruthfulQA chunks, so the text
# can be looked up by vector ID without re-chunking the source documents.
save_dict_to_json(document_store, 'truthfulQA_documentStore.json')

Dictionary saved to truthfulQA_documentStore.json


In [94]:
# Persist the ID -> chunk text mapping for the HaluEval chunks.
save_dict_to_json(halu_document_store, 'haluEval_documentStore.json')

Dictionary saved to haluEval_documentStore.json


In [104]:
# Persist the ID -> chunk text mapping for the FEVER chunks.
save_dict_to_json(fever_document_store, 'fever_documentStore.json')

Dictionary saved to fever_documentStore.json
