In [8]:
%pwd

'd:\\BWU_Chatbot\\research'

In [11]:
import os
os.chdir('../')

In [12]:
%pwd

'd:\\BWU_Chatbot'

In [13]:
from langchain.document_loaders import PyPDFLoader, DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [14]:
def load_pdf_files(data_path):
    loader = DirectoryLoader(
        data_path,
        glob="*.pdf",
        loader_cls=PyPDFLoader
    )
    
    documents = loader.load()
    return documents

In [15]:
extracted_data = load_pdf_files('data')

In [16]:
extracted_data

[Document(metadata={'producer': 'Microsoft® Word LTSC', 'creator': 'Microsoft® Word LTSC', 'creationdate': '2025-11-03T16:54:47+05:30', 'author': 'BWU', 'moddate': '2025-11-03T16:54:47+05:30', 'source': 'data\\BWU_COURSES.pdf', 'total_pages': 14, 'page': 0, 'page_label': '1'}, page_content='Undergraduate Programmes \nSchool of Engineering \n1. Computer Science & Engineering \nThe Bachelor of Technology in Computer Science & Engineering requires 50% marks or equivalent \ngrade in the 10+2 examination with a minimum of 45% marks or equivalent grade in any three \nsubjects taken together including Physics and Mathematics (mandatory) and either Chemistry, \nComputer Science, Electronics, Information Technology, Biology, Informatics Practice, Biotechnology, \nTechnical Vocational Subjects, Agriculture, Engineering Graphics, Business Studies, or \nEntrepreneurship, with individual pass marks in both theory and practical, along with pass marks in \nEnglish. This programme spans 4 years and in

In [17]:
print(len(extracted_data))

14


In [18]:
from typing import List
from langchain.schema import Document

def filter_to_minimul_docs(docs: List[Document]) -> List[Document]:
    """
    Given a list of document objects, return a new list of document
    objects containing only 'source' in metadata and the original page content.
    """
    minimul_docs: List[Document] = []
    for doc in docs:
        src = doc.metadata.get("source")
        minimul_docs.append(
            Document(
                metadata={"source": src},
                page_content=doc.page_content
            )
        )
    return minimul_docs

In [19]:
minimul_docs = filter_to_minimul_docs(extracted_data)
minimul_docs

[Document(metadata={'source': 'data\\BWU_COURSES.pdf'}, page_content='Undergraduate Programmes \nSchool of Engineering \n1. Computer Science & Engineering \nThe Bachelor of Technology in Computer Science & Engineering requires 50% marks or equivalent \ngrade in the 10+2 examination with a minimum of 45% marks or equivalent grade in any three \nsubjects taken together including Physics and Mathematics (mandatory) and either Chemistry, \nComputer Science, Electronics, Information Technology, Biology, Informatics Practice, Biotechnology, \nTechnical Vocational Subjects, Agriculture, Engineering Graphics, Business Studies, or \nEntrepreneurship, with individual pass marks in both theory and practical, along with pass marks in \nEnglish. This programme spans 4 years and incurs fees of 5,49,600. \nThe Bachelor of Technology in Computer Science & Engineering (Lateral Entry) necessitates 45% \nmarks or equivalent grade in a relevant Diploma, BSc, or BCA. It extends over 3 years with fees \namo

In [20]:
def text_split(minimul_docs):
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=600,
        chunk_overlap=120,
        length_function=len,
        separators=["\n\n", "\n", ". ", " "],
        keep_separator=True,
    )
    texts_chunk = text_splitter.split_documents(minimul_docs)
    return texts_chunk

In [21]:
texts_chunk = text_split(minimul_docs)
print(f"number of chunks: {len(texts_chunk)}")

number of chunks: 93


In [22]:
texts_chunk

[Document(metadata={'source': 'data\\BWU_COURSES.pdf'}, page_content='Undergraduate Programmes \nSchool of Engineering \n1. Computer Science & Engineering \nThe Bachelor of Technology in Computer Science & Engineering requires 50% marks or equivalent \ngrade in the 10+2 examination with a minimum of 45% marks or equivalent grade in any three \nsubjects taken together including Physics and Mathematics (mandatory) and either Chemistry, \nComputer Science, Electronics, Information Technology, Biology, Informatics Practice, Biotechnology, \nTechnical Vocational Subjects, Agriculture, Engineering Graphics, Business Studies, or'),
 Document(metadata={'source': 'data\\BWU_COURSES.pdf'}, page_content='Technical Vocational Subjects, Agriculture, Engineering Graphics, Business Studies, or \nEntrepreneurship, with individual pass marks in both theory and practical, along with pass marks in \nEnglish. This programme spans 4 years and incurs fees of 5,49,600. \nThe Bachelor of Technology in Compute

In [23]:
from langchain.embeddings import HuggingFaceEmbeddings

def download_embeddings():
    model_name = "BAAI/bge-m3"
    embeddings = HuggingFaceEmbeddings(
        model_name=model_name
    )
    return embeddings

In [24]:
embedding = download_embeddings()

  embeddings = HuggingFaceEmbeddings(


In [25]:
embedding

HuggingFaceEmbeddings(client=SentenceTransformer(
  (0): Transformer({'max_seq_length': 8192, 'do_lower_case': False}) with Transformer model: XLMRobertaModel 
  (1): Pooling({'word_embedding_dimension': 1024, 'pooling_mode_cls_token': True, 'pooling_mode_mean_tokens': False, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
  (2): Normalize()
), model_name='BAAI/bge-m3', cache_folder=None, model_kwargs={}, encode_kwargs={}, multi_process=False, show_progress=False)

In [26]:
vector = embedding.embed_query("Hello world")
vector

[-0.040139514952898026,
 0.032212261110544205,
 -0.02821594849228859,
 0.02462964877486229,
 -0.033103734254837036,
 -0.04166659712791443,
 -0.056860607117414474,
 -0.0427829883992672,
 0.008332453668117523,
 -0.0011965357698500156,
 -0.008856850676238537,
 0.013744605705142021,
 0.028526725247502327,
 -0.013167517259716988,
 0.022318441420793533,
 2.5554571038810536e-05,
 0.02634189836680889,
 -0.027663704007864,
 -0.03086930140852928,
 -0.034283410757780075,
 -0.044395167380571365,
 -0.010097202844917774,
 0.018569113686680794,
 -0.0258745476603508,
 0.012294434942305088,
 0.05799945071339607,
 -0.03533744439482689,
 0.00010739014396676794,
 -0.0002423890691716224,
 -0.05514145269989967,
 0.04237139970064163,
 0.0777888223528862,
 -0.010337808169424534,
 -0.044965848326683044,
 -0.023880477994680405,
 -0.03335679695010185,
 -0.0037363225128501654,
 -0.00680772727355361,
 -0.06041029840707779,
 0.01858074590563774,
 0.006723856553435326,
 0.01908908039331436,
 0.01814052276313305,
 -0

In [27]:
print(len(vector))

1024


In [28]:
from dotenv import load_dotenv
import os
load_dotenv()

True

In [29]:
PINECONE_API_KEY = os.getenv("PINECONE_API_KEY")
GOOGLE_API_KEY = os.getenv("GOOGLE_API_KEY")

os.environ["PINECONE_API_KEY"] = PINECONE_API_KEY
os.environ["GOOGLE_API_KEY"] = GOOGLE_API_KEY

In [30]:
from pinecone import Pinecone
pinecone_api_key = PINECONE_API_KEY

pc = Pinecone(api_key=pinecone_api_key)

In [31]:
pc

<pinecone.pinecone.Pinecone at 0x2bc06a0b7a0>

In [32]:
from pinecone import ServerlessSpec

index_name = "bwu-chatbot"

if not pc.has_index(index_name):
    pc.create_index(
        name=index_name,
        dimension=1024,
        metric="cosine",
        spec=ServerlessSpec(
            cloud='aws',
            region='us-east-1',
        )
    )

index = pc.Index(index_name)

In [33]:
from langchain_pinecone import PineconeVectorStore

docsearch = PineconeVectorStore.from_documents(
    documents=texts_chunk,
    embedding=embedding,
    index_name=index_name
)

In [34]:
from langchain_pinecone import PineconeVectorStore

docsearch = PineconeVectorStore.from_existing_index(
    index_name=index_name,
    embedding=embedding
)

In [35]:
retriever = docsearch.as_retriever(search_type="similarity", search_kwargs={"k": 3})

In [36]:
retrieved_docs = retriever.invoke("What is the course fees for btech cse?")
retrieved_docs

[Document(id='687a64e3-d3ba-462c-bda8-e3d4fdab52af', metadata={'source': 'data\\BWU_COURSES.pdf'}, page_content='either Chemistry, Computer Science, Electronics, Information Technology, Biology, Informatics \nPractice, Biotechnology, Technical Vocational Subjects, Agriculture, Engineering Graphics, Business \nStudies, or Entrepreneurship, with individual pass marks in both theory and practical, along with pass \nmarks in English. This 4-year course has fees of 5,49,600. \nThe Bachelor of Technology in Computer Science & Engineering – Cyber Security (Lateral Entry) \nrequires 45% marks or equivalent grade in a relevant Diploma, BSc, or BCA. It spans 3 years with fees \nof 2,41,200.'),
 Document(id='77ebe930-574e-4026-87d8-17ab092b71f2', metadata={'source': 'data\\BWU_COURSES.pdf'}, page_content='equivalent grade in B.E., BTech, AMIE, or AMIETE in CSE or IT, or MCA, or MSc in CS or IT; a 5% \nrelaxation in marks applies for all reserved category candidates. This 2-year course has fees of

In [37]:
from langchain_google_genai import ChatGoogleGenerativeAI

chatModel = ChatGoogleGenerativeAI(
    model="gemini-2.5-flash"
)

In [38]:
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate

In [39]:
system_prompt = (
    "You are a helpful assistant for question answering tasks."
    "Use the following pieces of retrieved context to answer the " \
    "question." \
    "If you don't know the answer, just say that you don't know." \
    "Use three to four sentences maximum and keep the answer concise." \
    "\n\n" \
    "{context}"
)

In [40]:
prompt = ChatPromptTemplate.from_messages([
    ("system", system_prompt),
    ("human", "{input}")
])

In [41]:
question_answer_chain = create_stuff_documents_chain(chatModel, prompt)
rag_chain = create_retrieval_chain(retriever, question_answer_chain)

In [46]:
response = rag_chain.invoke({"input": "What are the total fees for"
" BTech CSE CS/DS course?"})

print(response['answer'])

I am sorry, but the provided text does not specify the total fees for a Bachelor of Technology (BTech) in Computer Science & Engineering with a specialization in CS/DS. The document mentions fees for a general 4-year BTech course and Master of Technology (MTech) courses in Data Science, but not the specific BTech CSE CS/DS program.
