In [1]:
# Setup
import os
from dotenv import load_dotenv

load_dotenv()
openai_api_key = os.getenv("openai_key")

if not openai_api_key:
    print("⚠️ Warning: OPENAI_API_KEY not found. Set it in .env file.")
else:
    print("✅ API key loaded successfully")

✅ API key loaded successfully


In [2]:
#now i want to chunk the pdf file 
# to chunk i first need to know what the content of the pdf is , so i need something to read the pdf 
import PyPDF2
import os

from langchain_core.documents import Document

documents=[]

file_path=r'C:\Users\User\Desktop\Working_with_LLMS\RAG\files\Personal and Professional Summary.pdf'
file_name= os.path.basename(file_path)
chunks=[]
with open(file_path,'rb') as f:
    pdf_reader=PyPDF2.PdfReader(f)
    num_pages= len(pdf_reader.pages)

    for page_num in range(num_pages):
        page=pdf_reader.pages[page_num]
        text= page.extract_text()
        documents.append(Document(page_content=text))


In [3]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Create splitter
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=100,
    chunk_overlap=20,
    length_function=len
)

# Split documents
chunks = text_splitter.split_documents(documents)

print(f"Split {len(documents)} documents into {len(chunks)} chunks")
for i, chunk in enumerate(chunks[:3]):
    print(f"\nChunk {i+1}: {chunk.page_content}")

  from .autonotebook import tqdm as notebook_tqdm


Split 3 documents into 96 chunks

Chunk 1: Personal and Professional Summary

Chunk 2: My academic and professional journey h as been shaped by a deep curiosity about how

Chunk 3: systems work, how they fail, and how they can be optimized to serve people better. I am an


In [4]:
from langchain_openai import OpenAIEmbeddings

embeddings = OpenAIEmbeddings(
    model="text-embedding-3-small",
    openai_api_key=openai_api_key
)


In [5]:
from langchain_community.vectorstores import Chroma

# persist_directory allows saving DB locally; optional
vectorstore = Chroma.from_documents(
    chunks,
    embeddings,
    collection_name="my_rag_collection",
    persist_directory="./chroma_db"  # optional
)

print(f"✅ Vector store created")

# Test similarity search
query = "Where is he serving as a youth corper?"
results = vectorstore.similarity_search(query, k=2)

print(f"\nQuery: {query}")
for i, doc in enumerate(results):
    print(f"\nResult {i+1}: {doc.page_content}")

✅ Vector store created

Query: Where is he serving as a youth corper?

Result 1: I am currently serving as a youth corps member in Nigeria, where I als o hold leadership

Result 2: Corpers Association (Ogun State branch) and within the lodge, I was responsible for ensuring


In [8]:
from langchain_openai import ChatOpenAI
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnableParallel, RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser

# Create LLM
llm = ChatOpenAI(
    model="gpt-3.5-turbo",
    temperature=0,
    openai_api_key=openai_api_key
)

# Create retriever
retriever = vectorstore.as_retriever(search_kwargs={"k": 2})

from langchain_core.runnables.history import RunnableWithMessageHistory
from langchain_core.chat_history import InMemoryChatMessageHistory
from langchain_core.prompts import MessagesPlaceholder

# Store for chat histories
chat_store = {}

def get_session_history(session_id: str):
    if session_id not in chat_store:
        chat_store[session_id] = InMemoryChatMessageHistory()
    return chat_store[session_id]

# Helper function to format documents
def format_docs(docs):
    return "\n\n".join(doc.page_content for doc in docs)

# Create conversational prompt
conv_prompt = ChatPromptTemplate.from_messages([
    ("system", f"""
You are a helpful AI assistant. Answer the user's question based on the context provided below.

Important guidelines:
- Use ONLY information from the context
- If the answer is not in the context, say "I don't have enough information to answer this question."
- Be concise and accurate"""),
MessagesPlaceholder(variable_name="chat_history"),
("human", "Context: {context}\n\nQuestion: {question}")
])

# Build base chain
conv_chain_base = (
    RunnableParallel(
        context=lambda x: format_docs(retriever.invoke(x["question"])),
        question=lambda x: x["question"],
        chat_history=lambda x: x.get("chat_history", [])
    )
    | conv_prompt
    | llm
    | StrOutputParser()
)

# Wrap with message history
conv_chain = RunnableWithMessageHistory(
    conv_chain_base,
    get_session_history,
    input_messages_key="question",
    history_messages_key="chat_history"
)

print("✅ Conversational chain created")

✅ Conversational chain created


In [9]:
# First question
response1 = conv_chain.invoke(
    {"question": "What is FAISS?"},
    config={"configurable": {"session_id": "session1"}}
)
print(f"Q1: What is FAISS?")
print(f"A1: {response1}\n")


Q1: What is FAISS?
A1: I don't have enough information to answer this question.



In [10]:
# Follow-up question (remembers context)
response2 = conv_chain.invoke(
    {"question": "Where is he serving as a youth corper?"},
    config={"configurable": {"session_id": "session1"}}
)
print(f"Q2: Who developed it?")
print(f"A2: {response2}")

Q2: Who developed it?
A2: The user is serving as a youth corps member in Nigeria.


In [11]:
# Follow-up question (remembers context)
response2 = conv_chain.invoke(
    {"question": "What is his name?"},
    config={"configurable": {"session_id": "session1"}}
)
print(f"Q2: what is is name?")
print(f"A2: {response2}")

Q2: what is is name?
A2: I don't have enough information to answer this question.


In [12]:
# View chat history
session = get_session_history("session1")
print("Chat History:")
for msg in session.messages:
    print(f"\n{msg.type}: {msg.content}")

Chat History:

human: What is FAISS?

ai: I don't have enough information to answer this question.

human: Where is he serving as a youth corper?

ai: The user is serving as a youth corps member in Nigeria.

human: What is his name?

ai: I don't have enough information to answer this question.
