In [6]:
from langchain_community.document_loaders import JSONLoader
import json 
from langchain_community.docstore.document import Document
from langchain_community.document_loaders import PyMuPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
import uuid
from glob import glob
from langchain_groq import ChatGroq
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import Chroma
import time
from dotenv import load_dotenv
import os 

# Read key/value pairs from a local .env file (if one exists) into the process
# environment, so the later os.getenv("GROQ_API_KEY") lookup can find the key.
load_dotenv()

ModuleNotFoundError: No module named 'langchain_groq'

In [None]:
# Location of the JSON-lines Wikipedia sample used by the loader cell below.
# The original machine-specific path remains the default; set WIKI_JSONL_PATH
# (for example in the .env file loaded by load_dotenv()) to run the notebook on
# another machine without editing this cell. An unset or empty variable falls
# back to the default.
file_path = os.getenv("WIKI_JSONL_PATH") or r"C:\Users\bhavi\OneDrive\Desktop\langhcain_learning\RAG\rag_docs\wikidata_rag_demo.jsonl"

In [None]:
# Load the JSON-lines file one record per line. jq_schema="." selects each
# whole record; a later cell parses page_content back with json.loads.
loader = JSONLoader(
    file_path=file_path,
    jq_schema=".",
    text_content=False,
    json_lines=True,
)
wiki_docs = loader.load()

# Sanity check: number of loaded records plus one sample (index 3).
print("LENGTH OF DOCS ----", len(wiki_docs))
print(wiki_docs[3])

In [None]:
# Turn every loaded JSON record into a Document with a uniform metadata schema
# (title, id, source, page).
wikipedia_documents = []

for raw_doc in wiki_docs:
    # page_content holds the record serialised as JSON text; decode it first.
    record = json.loads(raw_doc.page_content)
    record_metadata = dict(
        title=record["title"],
        id=record["id"],
        source="wikipedia",
        page=1,
    )
    # The record's paragraphs are merged into a single space-separated text.
    body_text = " ".join(record["paragraphs"])
    wikipedia_documents.append(
        Document(page_content=body_text, metadata=record_metadata)
    )

In [None]:
# Display one converted document (index 1) to eyeball its text and metadata.
wikipedia_documents[1]

In [None]:
# Chat model setup. The Groq API key is read from the environment rather than
# being hard-coded in the notebook.
api_key = os.environ.get("GROQ_API_KEY")

model = ChatGroq(
    model="qwen/qwen3-32b",
    temperature=0,
    max_tokens=None,
    timeout=None,
    max_retries=2,
    api_key=api_key,
)


In [None]:
def create_standard_chunks(file_path, chunk_size=1500, chunk_overlap=150):
    """Load a PDF, split it into chunks and give each chunk uniform metadata.

    Args:
        file_path: Path of the PDF passed to ``PyMuPDFLoader``.
        chunk_size: ``chunk_size`` forwarded to ``RecursiveCharacterTextSplitter``.
        chunk_overlap: ``chunk_overlap`` forwarded to the splitter.

    Returns:
        A list with one ``Document`` per chunk. Each document keeps the chunk
        text and carries only these metadata keys: ``id`` (a fresh UUID4
        string), ``page`` (the chunk's ``page`` metadata, ``None`` if absent),
        ``source`` (``file_path``) and ``title`` (basename of ``file_path``).
    """
    print("Loading Pages:", file_path)
    pages = PyMuPDFLoader(file_path).load()

    print("Chunking pages...", file_path)
    pieces = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    ).split_documents(pages)

    # Rebuild every chunk so that only the four standard metadata keys remain.
    standard_chunks = [
        Document(
            page_content=piece.page_content,
            metadata={
                "id": str(uuid.uuid4()),
                "page": piece.metadata.get("page"),
                "source": file_path,
                "title": os.path.basename(file_path),
            },
        )
        for piece in pieces
    ]

    print("Finished processing --------", file_path)
    return standard_chunks

In [None]:
# Collect every PDF in the docs folder. glob's result order depends on the file
# system, so the list is sorted to keep the chunking and indexing order the
# same on every run.
pdf_files = sorted(glob("C:/Users/bhavi/OneDrive/Desktop/langhcain_learning/RAG/rag_docs/*.pdf"))
print(pdf_files)

In [None]:
# Chunk each discovered PDF and flatten all chunks into a single list,
# preserving the order of pdf_files.
paper_docs = [
    chunk
    for fp in pdf_files
    for chunk in create_standard_chunks(file_path=fp, chunk_size=1500)
]

In [None]:
# Full corpus for indexing: Wikipedia documents first, followed by PDF chunks.
total_chunks = [*wikipedia_documents, *paper_docs]
print("------ Lenght of Documents ---------", len(total_chunks))

In [None]:
# Embed every chunk and index the vectors in a Chroma collection persisted
# under ./wikipedia_db.
# NOTE: the import below rebinds `Chroma`, replacing the
# langchain_community.vectorstores class imported in the first cell.
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_chroma import Chroma

embedding_model = HuggingFaceEmbeddings(model_name='BAAI/bge-small-en')

chroma_db = Chroma.from_documents(
    documents=total_chunks,
    embedding=embedding_model,
    persist_directory="./wikipedia_db",
    # Ask the HNSW index to compare vectors by cosine distance.
    collection_metadata={"hnsw:space": "cosine"},
)

print("[----EMBEDDINGS CREATED ---------]")

In [None]:
# Two retrievers over the same corpus, each configured with k=5:
# a similarity retriever backed by the Chroma store and a BM25 retriever
# built directly from the documents.
from langchain_community.retrievers import BM25Retriever

similarity_retriever = chroma_db.as_retriever(
    search_type="similarity",
    search_kwargs={"k": 5},
)
bm25_retrievers = BM25Retriever.from_documents(documents=total_chunks, k=5)

print("--- Similarity and bm25 Retriever initalizes ----")


In [None]:
# build ensemble Retriever
# NOTE(review): no ensemble retriever is built in this cell yet. It only prints
# the path of the Python interpreter running the kernel (an environment check).
import sys
print(sys.executable)



In [None]:
import pkgutil

# Environment check: prints True when a top-level module named
# "langchain_community" is discoverable on the current sys.path.
module_names = (module.name for module in pkgutil.iter_modules())
print("langchain_community" in module_names)
