In [None]:

import os
import time
import numpy as np
from typing import List
from dotenv import load_dotenv
from langchain_community.document_loaders import PyPDFDirectoryLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_core.embeddings import Embeddings
from langchain_pinecone import PineconeVectorStore
from huggingface_hub import InferenceClient

import chromadb


# Default-configured Chroma client; nothing else in the visible cells uses it.
chroma_client = chromadb.Client()

# Read every PDF found under the local "pdfs" folder into `documents`.
pdf_dir = "./pdfs"
loader = PyPDFDirectoryLoader(pdf_dir)
documents = loader.load()

# Next step is embedding 

# Creating database


# storing in a directory and performing search operations



In [12]:
import os
import numpy as np
from typing import List
from dotenv import load_dotenv

# LangChain Imports
from langchain_community.document_loaders import PyPDFDirectoryLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_core.embeddings import Embeddings
from langchain_community.vectorstores import Chroma
from huggingface_hub import InferenceClient

# 1. LOAD KEYS
# Load environment variables through python-dotenv before reading the token.
load_dotenv()

# The Hugging Face token is mandatory: stop immediately when it is unset or empty.
hf_api_key = os.environ.get("HUGGINGFACEHUB_API_TOKEN")
if hf_api_key is None or hf_api_key == "":
    raise ValueError("Error: HUGGINGFACEHUB_API_TOKEN is missing from .env file.")

# 2. DEFINE EMBEDDING CLASS (Same Robust Class as before)
class RobustHuggingFaceEmbeddings(Embeddings):
    """LangChain ``Embeddings`` implementation backed by ``InferenceClient``.

    Each text is sent to ``InferenceClient.feature_extraction`` and the response
    is reduced to one flat vector: a 1-D response is used as-is, a 2-D response
    is averaged over its first axis.
    """

    def __init__(self, api_key, model_name="sentence-transformers/all-MiniLM-L6-v2"):
        """Create the inference client.

        Args:
            api_key: Token handed to ``InferenceClient``.
            model_name: Model identifier used for every feature-extraction call.
        """
        self.model_name = model_name
        self.client = InferenceClient(token=api_key)

    @staticmethod
    def _to_vector(response) -> List[float]:
        """Collapse one feature-extraction response into a flat list of numbers.

        Args:
            response: An ndarray or (nested) list returned by the client.

        Returns:
            The response itself when 1-D, or its mean over axis 0 when 2-D.

        Raises:
            ValueError: If the response is empty, non-numeric, or has a rank
                other than 1 or 2. Failing loudly keeps the output aligned
                one-to-one with the input texts.
        """
        array = np.asarray(response)
        if array.size == 0 or not np.issubdtype(array.dtype, np.number):
            raise ValueError(
                f"Unusable embedding response of type {type(response).__name__}"
            )
        if array.ndim == 1:
            return array.tolist()
        if array.ndim == 2:
            return np.mean(array, axis=0).tolist()
        raise ValueError(
            f"Unexpected embedding response shape {array.shape}; expected 1-D or 2-D"
        )

    def embed_documents(self, texts: List[str]) -> List[List[float]]:
        """Embed every text, returning exactly one vector per input text.

        Args:
            texts: Strings to embed, processed one request at a time.

        Returns:
            A list with the same length and order as ``texts``.

        Raises:
            Exception: Any error from the client or from response validation is
                printed and re-raised unchanged.
        """
        embeddings_list = []
        for text in texts:
            try:
                response = self.client.feature_extraction(text, model=self.model_name)
                # Never skip or reuse a vector: a malformed response raises here.
                embeddings_list.append(self._to_vector(response))
            except Exception as e:
                print(f"Error: {e}")
                raise
        return embeddings_list

    def embed_query(self, text: str) -> List[float]:
        """Embed a single query string using the same path as documents."""
        result = self.embed_documents([text])
        return result[0]

print("Connecting to Embeddings...")
embeddings = RobustHuggingFaceEmbeddings(api_key=hf_api_key)

# 3. LOAD & SPLIT DOCUMENTS
# Keep the folder in one variable so the log line always names the path that is
# actually read (the message previously said ./pdfs while the loader used .././pdfs).
pdf_directory = ".././pdfs"
print(f"Loading PDFs from {pdf_directory} ...")
loader = PyPDFDirectoryLoader(pdf_directory)
documents = loader.load()

if not documents:
    print("Warning: No documents found.")
else:
    print(f"Loaded {len(documents)} pages.")

    # Split text into overlapping chunks
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
    final_documents = text_splitter.split_documents(documents)
    print(f"Split into {len(final_documents)} chunks.")

    # 4. CREATE CHROMADB (LOCAL VECTOR STORE)
    # Data is persisted to the './chroma_db' folder so it survives kernel restarts.
    persist_directory = "./chroma_db"

    print("Creating/Updating ChromaDB...")

    # Deterministic ids (source, page, chunk position) make re-running this cell
    # overwrite the same entries instead of appending a duplicate copy of every
    # chunk under fresh random ids. The position suffix keeps ids unique even
    # when two chunks share a source and page.
    chunk_ids = [
        f"{doc.metadata.get('source', 'unknown')}:{doc.metadata.get('page', 'na')}:{index}"
        for index, doc in enumerate(final_documents)
    ]

    # This call:
    # 1. Embeds the chunks using our custom embedding class
    # 2. Stores them in the './chroma_db' folder
    vectordb = Chroma.from_documents(
        documents=final_documents,
        embedding=embeddings,
        ids=chunk_ids,
        persist_directory=persist_directory,
    )

    print("✅ Database created and saved locally.")

    # 5. PERFORM SIMILARITY SEARCH
    query = "What is the summary of the document?"
    print(f"\n🔍 Searching for: '{query}'")

    # Get top 3 matches
    results = vectordb.similarity_search(query, k=3)

    print("\n--- Search Results ---")
    for i, doc in enumerate(results):
        print(f"\n[Result {i+1}]")
        # Show only a short preview of each matching chunk.
        print(doc.page_content[:200] + "...")

Connecting to Embeddings...
Loading PDFs from ./pdfs ...
Loaded 15 pages.
Split into 52 chunks.
Creating/Updating ChromaDB...
✅ Database created and saved locally.

🔍 Searching for: 'What is the summary of the document?'

--- Search Results ---

[Result 1]
[25] Mitchell P Marcus, Mary Ann Marcinkiewicz, and Beatrice Santorini. Building a large annotated
corpus of english: The penn treebank. Computational linguistics, 19(2):313–330, 1993.
[26] David McCl...

[Result 2]
3.2 Attention
An attention function can be described as mapping a query and a set of key-value pairs to an output,
where the query, keys, values, and output are all vectors. The output is computed as ...

[Result 3]
Table 3: Variations on the Transformer architecture. Unlisted values are identical to those of the base
model. All metrics are on the English-to-German translation development set, newstest2013. Liste...


In [None]:

# retrieval QA system with ChromaDB and HuggingFace Embeddings complete.
# call grok api client and create chain, retrieval qa from chain types
