In [1]:
print(44)

44


In [2]:
from pypdf import PdfReader
import os

print(os.getenv("GEMINI_API_KEY"))


AIzaSyDZ1EVZ3PlwqWLmwQX-6WwyzgiOgifRZ0A


In [3]:
DATA_DIR = "data"

def load_pdf_text(file_path):
    reader = PdfReader(file_path)
    text = ""
    for page in reader.pages:
        extracted = page.extract_text()
        if extracted:
            text += extracted + "\n"
    return text

documents = []

for file in os.listdir(DATA_DIR):
    if file.endswith(".pdf"):
        text = load_pdf_text(os.path.join(DATA_DIR, file))
        documents.append({
            "source": file,
            "text": text
        })

print("Total PDFs loaded:", len(documents))
for doc in documents:
    print(doc["source"], "-> characters:", len(doc["text"]))


Total PDFs loaded: 16
Academic_Regulations.pdf -> characters: 1573
BTech_Academic_Regulations.pdf -> characters: 1131
BTech_Branch_Wise_Syllabus.pdf -> characters: 692
BTech_Placement_Policy.pdf -> characters: 433
BTech_Project_and_Internship_Guidelines.pdf -> characters: 404
BTech_Student_Handbook.pdf -> characters: 365
Examination_Guidelines.pdf -> characters: 1058
gprec1.pdf -> characters: 59903
Internal_and_External_Evaluation.pdf -> characters: 1446
jnyuh.pdf -> characters: 57918
Placement_Policy.pdf -> characters: 1038
pune1.pdf -> characters: 43680
syllabus.pdf -> characters: 452032
TCS NQT IMP CONCEPTS (1).pdf -> characters: 601
ugc.pdf -> characters: 175959
vels.pdf -> characters: 80555


In [4]:
def chunk_text(text, chunk_size=500, overlap=100):
    chunks = []
    start = 0
    while start < len(text):
        end = start + chunk_size
        chunks.append(text[start:end])
        start = end - overlap
    return chunks

chunks = []

for doc in documents:
    small_chunks = chunk_text(doc["text"])
    for ch in small_chunks:
        chunks.append({
            "source": doc["source"],
            "text": ch
        })

print("Total chunks created:", len(chunks))
print("\nSample chunk:\n")
print(chunks[0]["text"])


Total chunks created: 2204

Sample chunk:

Academic Regulations â€“ Undergraduate Programs 
1. Introduction 
These academic regulations govern all undergraduate students enrolled in the institution. The 
objective is to maintain academic discipline, ensure fair evaluation, and promote holistic learning. 
 
2. Academic Year & Semester System 
â€¢ The academic year is divided into two semesters: 
o Odd Semester (Julyâ€“November) 
o Even Semester (Januaryâ€“May) 
â€¢ Each semester consists of: 
o Minimum 90 instructional days 
o Internal assessments 


In [5]:
from sentence_transformers import SentenceTransformer
import numpy as np


In [6]:
# Load embedding model
embedder = SentenceTransformer("all-MiniLM-L6-v2")

# Take only text from chunks
texts = [chunk["text"] for chunk in chunks]

# Convert text to vectors
embeddings = embedder.encode(texts)

print("Embeddings created for chunks:", len(embeddings))
print("Vector size:", embeddings.shape[1])


Embeddings created for chunks: 2204
Vector size: 384


In [7]:
import faiss

# Create FAISS index
dimension = embeddings.shape[1]
index = faiss.IndexFlatL2(dimension)

# Add embeddings to FAISS
index.add(np.array(embeddings))

print("Total chunks stored in FAISS:", index.ntotal)


Total chunks stored in FAISS: 2204


In [8]:
# Save FAISS index and chunks for Streamlit
faiss.write_index(index, "faiss.index")
np.save("chunks.npy", chunks)

print("FAISS index and chunks saved successfully")


FAISS index and chunks saved successfully


In [9]:
# Student question
query = "What is the minimum attendance required?"

# Convert question to vector
query_embedding = embedder.encode([query])

# Search FAISS for top 2 relevant chunks
D, I = index.search(np.array(query_embedding), k=2)

print("Retrieved chunks:\n")

for idx in I[0]:
    print("Source:", chunks[idx]["source"])
    print(chunks[idx]["text"])
    print("-" * 50)


Retrieved chunks:

Source: gprec1.pdf
on of subjects for Honors program offered in offline at the 
institution.  
15. Attendance Requirements:  
  
i) A student shall be eligible to appear for the end examinations if he/she acquires a minimum of 
40% attendance in each subject and 75% of attendance in aggregate of all the subjects.  
ii) Condonation of shortage of attendance in aggregate up to 10% (65% and above and below 75%) 
in each semester may be granted by the Principal.  
iii) Shortage of Attendance below 65% in aggregate sha
--------------------------------------------------
Source: vels.pdf
ear, for any Programme 
(Degree or Diploma), shall ordinarily be required to have a minimum cumulative 
attendance of 75% of the total lectures and practicals engaged during that Semester / 
Term / Year. A shorta ge of attendance of up to 10% alone can be condoned on an 
application filed by a candidate on medical grounds. 
 In the case of Nursing degree programme, a candidate must have min

In [10]:
import os
from google import genai

In [11]:
# Create Gemini client using API key from environment variable
client = genai.Client(api_key=os.getenv("GEMINI_API_KEY"))

# Combine retrieved chunks into context
context = "\n\n".join([chunks[idx]["text"] for idx in I[0]])

prompt = f"""
You are a college assistant chatbot.

Answer the question ONLY using the context below.
If the answer is not present, say "Information not found in college documents."

Context:
{context}

Question:
{query}

Answer:
"""


In [12]:
from google import genai
import os

client = genai.Client(api_key=os.getenv("GEMINI_API_KEY"))

for m in client.models.list():
    print(m.name)


models/embedding-gecko-001
models/gemini-2.5-flash
models/gemini-2.5-pro
models/gemini-2.0-flash-exp
models/gemini-2.0-flash
models/gemini-2.0-flash-001
models/gemini-2.0-flash-exp-image-generation
models/gemini-2.0-flash-lite-001
models/gemini-2.0-flash-lite
models/gemini-2.0-flash-lite-preview-02-05
models/gemini-2.0-flash-lite-preview
models/gemini-exp-1206
models/gemini-2.5-flash-preview-tts
models/gemini-2.5-pro-preview-tts
models/gemma-3-1b-it
models/gemma-3-4b-it
models/gemma-3-12b-it
models/gemma-3-27b-it
models/gemma-3n-e4b-it
models/gemma-3n-e2b-it
models/gemini-flash-latest
models/gemini-flash-lite-latest
models/gemini-pro-latest
models/gemini-2.5-flash-lite
models/gemini-2.5-flash-image-preview
models/gemini-2.5-flash-image
models/gemini-2.5-flash-preview-09-2025
models/gemini-2.5-flash-lite-preview-09-2025
models/gemini-3-pro-preview
models/gemini-3-flash-preview
models/gemini-3-pro-image-preview
models/nano-banana-pro-preview
models/gemini-robotics-er-1.5-preview
models/g

In [13]:
response = client.models.generate_content(
    model="gemini-2.5-flash",
    contents=prompt
)

print(response.text)


The minimum attendance required for a student to be eligible to appear for end examinations is 40% in each subject and 75% in aggregate of all subjects. Ordinarily, a minimum cumulative attendance of 75% of the total lectures and practicals engaged during that Semester / Term / Year is required. For the Nursing degree programme, a candidate must have a minimum of 80% attendance in theory and practical in each course/subject for appearing in the examination.


In [14]:
print("ðŸŽ“ College Assistant Chatbot")
print("Type 'exit' to stop\n")

while True:
    query = input("You: ")
    
    if query.lower() == "exit":
        print("Chatbot: Goodbye ðŸ‘‹")
        break

    # Convert query to vector
    query_embedding = embedder.encode([query])

    # Retrieve top 2 chunks
    D, I = index.search(np.array(query_embedding), k=2)

    # Build context
    context = "\n\n".join([chunks[idx]["text"] for idx in I[0]])

    prompt = f"""
    You are a college assistant chatbot.

    Answer the question ONLY using the context below.
    If the answer is not present, say "Information not found in college documents."

    Context:
    {context}

    Question:
    {query}

    Answer:
    """

    response = client.models.generate_content(
        model="gemini-2.5-flash",
        contents=prompt
    )

    print("\nChatbot:", response.text)
    print("-" * 60)


ðŸŽ“ College Assistant Chatbot
Type 'exit' to stop



You:  exit


Chatbot: Goodbye ðŸ‘‹
