# Initialize Google Gemini LLM

In [63]:
# !pip install langchain-community langchain-text-splitters langchain-chroma
# !pip install langchain-core
# !pip install langchain-google-genai
# !pip install pypdf

In [64]:
import os
import time
import random
import uuid

In [65]:
from langchain_google_genai import ChatGoogleGenerativeAI, GoogleGenerativeAIEmbeddings
from langchain_core.output_parsers import StrOutputParser
from langchain_community.document_loaders import PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_chroma import Chroma
from langchain.prompts import ChatPromptTemplate
from langchain.schema.runnable import RunnablePassthrough, RunnableLambda
from langchain.schema import format_document
from langchain.schema import Document


In [66]:
# Read the Gemini API key from the environment; the value is None when
# GOOGLE_API_KEY is not set.
google_api_key = os.environ.get("GOOGLE_API_KEY")

In [67]:
# Chat model client used to generate the answers.
llm = ChatGoogleGenerativeAI(
    google_api_key=google_api_key,
    model="gemini-1.5-flash-latest",
    max_output_tokens=512,  # upper bound requested for each reply
    temperature=0.1,  # low sampling temperature
)

In [68]:
# Embedding client; it shares the API key with the chat model.
emb = GoogleGenerativeAIEmbeddings(
    google_api_key=google_api_key,
    model="text-embedding-004",
)

In [69]:
# Location of the PDF to index. Set the RAG_PDF_PATH environment variable to
# point the notebook at a different file without editing this cell; when it is
# unset, the original local path is used.
pdf_path = os.getenv(
    "RAG_PDF_PATH",
    r"C:\Users\VICTUS\Desktop\ \RAG\RAG_QnA_Application\TechBloom_Matha_1.0.pdf",
)

In [70]:

# Parse the PDF into LangChain documents and stop early if nothing was read.
loader = PyPDFLoader(file_path=pdf_path)
docs = loader.load()
assert docs, "No documents found"

In [71]:
# Break the loaded documents into chunks of size 400 with an overlap of 50,
# then verify that the splitter produced at least one chunk.
text_splitter = RecursiveCharacterTextSplitter(chunk_overlap=50, chunk_size=400)
splits = text_splitter.split_documents(docs)
assert splits, "No text splits found"

In [72]:
# Chroma collection stored under ./chroma_db; texts are embedded with `emb`.
vector_store = Chroma(
    embedding_function=emb,
    persist_directory="./chroma_db",
    collection_name="My_collection",
)

In [73]:
def batched(iterable, n):
    """Yield consecutive groups of at most ``n`` items from ``iterable``.

    Objects that support ``len()`` are sliced, so each group has the same
    type a slice of the input has (a list for a list, a tuple for a tuple).
    Objects without ``len()`` (generators, iterators) are consumed lazily and
    each group is returned as a list.

    Args:
        iterable: Sequence or arbitrary iterable to split into groups.
        n: Maximum number of items per group; must be at least 1.

    Yields:
        Groups of up to ``n`` items; only the final group may be shorter.

    Raises:
        ValueError: If ``n`` is smaller than 1. Because this is a generator,
            the error surfaces when iteration starts.
    """
    from itertools import islice

    if n < 1:
        raise ValueError("n must be at least 1")

    try:
        total = len(iterable)
    except TypeError:
        # No len(): treat the input as a one-shot iterable and pull n items
        # at a time until it is exhausted.
        iterator = iter(iterable)
        while chunk := list(islice(iterator, n)):
            yield chunk
        return

    for start in range(0, total, n):
        yield iterable[start:start + n]

def back_off(retry):
    """Sleep before the next retry attempt.

    The delay grows linearly: ``2 * retry`` seconds plus a random jitter in
    [0, 1), and it never exceeds 60 seconds.

    Args:
        retry: Zero-based count of retries already performed.
    """
    jitter = random.random()
    delay = 2 * retry + jitter
    if delay > 60:
        delay = 60
    time.sleep(delay)

# Embed and store the chunks in small batches so that a rate-limit error only
# forces a retry of the current batch rather than the whole document.
batch_size = 32
max_retries = 8  # give up on a batch after this many rate-limit retries

for batch_no, batch in enumerate(batched(splits, batch_size)):
    first_index = batch_no * batch_size
    ids = []
    for offset, doc in enumerate(batch):
        # Build a stable id from the chunk's source, page, position and text.
        # Random ids would append a fresh copy of every chunk to the persisted
        # collection each time this cell is re-run; a deterministic id lets
        # the store overwrite the existing entry instead. The position keeps
        # ids unique even when two chunks have identical text.
        fingerprint = "|".join(
            (
                str(doc.metadata.get("source", "")),
                str(doc.metadata.get("page", "")),
                str(first_index + offset),
                doc.page_content,
            )
        )
        ids.append(
            getattr(doc, "id", None)
            or str(uuid.uuid5(uuid.NAMESPACE_URL, fingerprint))
        )
    texts = [doc.page_content for doc in batch]
    metas = [doc.metadata for doc in batch]

    retry = 0
    while True:
        try:
            vector_store.add_texts(texts=texts, metadatas=metas, ids=ids)
            break
        except Exception as e:
            msg = str(e)
            rate_limited = "429" in msg or "quota" in msg.lower()
            # Anything that is not a rate-limit error is re-raised at once.
            # Rate-limit errors are retried, but only max_retries times, so an
            # exhausted quota cannot keep this cell spinning forever.
            if not rate_limited or retry >= max_retries:
                raise
            back_off(retry)
            retry += 1

# No explicit vector_store.persist() call: the store was created with a
# persist_directory, and current Chroma releases write to it automatically.

In [74]:
# Retriever over the vector store that asks for k=4 results per query.
retriever = vector_store.as_retriever(search_kwargs=dict(k=4))

In [75]:
def format_docs(docs: list[Document]) -> str:
    """Render retrieved documents as one context string for the prompt.

    Each document becomes ``[p<page>] <text>``, where ``<page>`` is the
    ``page`` entry of the document's metadata, or ``?`` when that entry is
    missing. Documents are separated by a ``---`` line surrounded by blank
    lines.

    Args:
        docs: Documents exposing ``metadata`` and ``page_content``.

    Returns:
        The joined text, or an empty string when ``docs`` is empty.
    """
    separator = "\n\n---\n\n"
    rendered = []
    for doc in docs:
        page = doc.metadata.get("page", "?")
        rendered.append(f"[p{page}] {doc.page_content}")
    return separator.join(rendered)

def format_chat_history(history: list[str]) -> str:
    """Turn a flat, alternating message list into a readable transcript.

    ``history`` is expected to look like ``[user, ai, user, ai, ...]``.
    Messages are paired in order; a trailing message without a partner is
    left out.

    Args:
        history: Alternating user and AI messages, oldest first.

    Returns:
        One ``Human: ...`` / ``AI: ...`` pair per exchange, joined with
        newlines; an empty string when there is no complete pair.
    """
    complete_pairs = len(history) // 2
    return "\n".join(
        f"Human: {history[2 * turn]}\nAI: {history[2 * turn + 1]}"
        for turn in range(complete_pairs)
    )

In [76]:
# Prompt text sent to the model. It has three placeholders: {context},
# {history} and {question}, which are filled in when the prompt is rendered.
template = """You are a helpful AI assistant. Answer the question based on the context below and the conversation history. If you don't know the answer, say you don't know.

Context:
{context}

Conversation History:
{history}

Question: {question}

Answer:"""
# Build a ChatPromptTemplate from the raw template string.
prompt = ChatPromptTemplate.from_template(template)

In [77]:
# RAG chain. The input is a dict with a "question" string and an optional
# "history" list of alternating user/AI messages. The three entries below each
# receive that same input dict and produce one prompt variable.
chain = (
    {
        # Retrieve chunks for the question and render them as context text.
        "context": RunnableLambda(lambda x: format_docs(retriever.invoke(x["question"]))),
        # Pass on only the question text. RunnablePassthrough() would forward
        # the whole input dict, so the prompt's {question} slot would render
        # that dict (history included) instead of the user's question.
        "question": RunnableLambda(lambda x: x["question"]),
        # Missing history is treated as an empty conversation.
        "history": RunnableLambda(lambda x: format_chat_history(x.get("history", []))),
    }
    | prompt
    | llm
    | StrOutputParser()
)

In [None]:
# Interactive chat loop. Type "exit" or "quit" (any casing), send EOF, or
# interrupt the kernel to stop. The history is a flat list of alternating
# user and AI messages, which is the layout the chain's history entry expects.
conversation_history = []
while True:
    try:
        # Strip surrounding whitespace so inputs like " exit " still match.
        user_input = input("Yuraj Isurinda : ").strip()
    except (EOFError, KeyboardInterrupt):
        # Closed input stream or an interrupt ends the chat without a traceback.
        break

    if not user_input:
        # Skip blank lines rather than spending a retrieval and an LLM call.
        continue
    if user_input.lower() in {"exit", "quit"}:
        break

    response = chain.invoke({"question": user_input, "history": conversation_history})
    print(f"AI: {response}")
    conversation_history.extend([user_input, response])

AI: Hi
AI: Based on the provided text, the document discusses the planning stages of a startup.  The startup has not yet registered a company.  There's also a mention that "YGC" will be a launching pad, suggesting it's an incubator or accelerator program.
