In [1]:
import os
os.listdir(r"C:\Users\ASUS\OneDrive\Desktop\RAG_Chatbot\data\sample document")

['Doc1.txt', 'Doc2.txt']

In [2]:
#Import Libraries
import os
import numpy as np
import faiss
import json
from sentence_transformers import SentenceTransformer
from transformers import AutoTokenizer, AutoModelForCausalLM
from pypdf import PdfReader


In [3]:
#Load Documents
DATA_PATH = r"C:\Users\ASUS\OneDrive\Desktop\RAG_Chatbot\data\sample document"

def load_documents(folder_path):
    docs = []
    for file in os.listdir(folder_path):
        full_path = os.path.join(folder_path, file)

        if file.lower().endswith(".txt"):
            with open(full_path, "r", encoding="utf-8") as f:
                text = f.read()
            docs.append({"text": text, "source": file})

        elif file.lower().endswith(".pdf"):
            reader = PdfReader(full_path)
            text = ""
            for page in reader.pages:
                text += page.extract_text() + "\n"
            docs.append({"text": text, "source": file})

    return docs

docs = load_documents(DATA_PATH)
print("Total documents loaded:", len(docs))
docs

Total documents loaded: 2


[{'text': 'Company Refund & Return Policy\n\nOur company allows customers to return products within 30 days of purchase. To be eligible for a return, the product must be unused, in its original packaging, and accompanied by the original receipt.\n\nRefunds are processed within 7–10 business days after the returned product is received and inspected. The refund will be credited back to the original payment method.\n\nDigital products, downloadable software, and perishable goods are non-refundable.\n\nIf a customer receives a damaged product, they must report it within 48 hours of delivery. The company will arrange a replacement or full refund after verification.\n\nFor any disputes related to refunds, customers can contact the support team via email or phone during business hours.\n',
  'source': 'Doc1.txt'},
 {'text': 'Employee Leave Policy\n\nEmployees are entitled to 24 paid leaves per year. This includes casual leave, sick leave, and earned leave. Unused earned leave can be carried f

In [4]:
#Chunk the Documents
def chunk_text(text, chunk_size=250, overlap=30):
    words = text.split()
    chunks = []
    i = 0
    while i < len(words):
        chunk = words[i:i + chunk_size]
        chunks.append(" ".join(chunk))
        i += chunk_size - overlap
    return chunks


all_chunks = []
for d in docs:
    chunks = chunk_text(d["text"])
    for c in chunks:
        all_chunks.append({
            "chunk": c,
            "source": d["source"]
        })

print("Total chunks created:", len(all_chunks))


Total chunks created: 2


In [5]:
#Create Embeddings
embed_model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

texts = [c["chunk"] for c in all_chunks]
embeddings = embed_model.encode(texts, convert_to_numpy=True, show_progress_bar=True)

print("Embedding shape:", embeddings.shape)

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Embedding shape: (2, 384)


In [6]:
#Create FAISS Index
dimension = embeddings.shape[1]
index = faiss.IndexFlatL2(dimension)
index.add(embeddings)

print("FAISS index size:", index.ntotal)

FAISS index size: 2


In [7]:
#Retrieval Function
def retrieve(query, k=2):
    q_emb = embed_model.encode([query], convert_to_numpy=True)
    distances, indices = index.search(q_emb, k)

    results = []
    for idx in indices[0]:
        results.append(all_chunks[idx])

    return results

In [8]:
#Test Retrieval
retrieve("How many leaves do employees get?")

[{'chunk': 'Employee Leave Policy Employees are entitled to 24 paid leaves per year. This includes casual leave, sick leave, and earned leave. Unused earned leave can be carried forward to the next financial year up to a maximum of 30 days. Sick leave must be supported with a medical certificate if it exceeds three consecutive days. Employees must apply for leave at least two days in advance through the HR portal. Emergency leave must be informed to the reporting manager as soon as possible. Maternity leave of 26 weeks is provided to eligible female employees as per government regulations. Paternity leave of 10 working days is available. Leave without pay may be granted at management discretion in special circumstances.',
  'source': 'Doc2.txt'},
 {'chunk': 'Company Refund & Return Policy Our company allows customers to return products within 30 days of purchase. To be eligible for a return, the product must be unused, in its original packaging, and accompanied by the original receipt.

In [9]:
#Load LLM (GPT-2 Demo)
llm_model = "gpt2"
tokenizer = AutoTokenizer.from_pretrained(llm_model)
model = AutoModelForCausalLM.from_pretrained(llm_model)

tokenizer.pad_token = tokenizer.eos_token

In [10]:
#FINAL RAG Answer Function
def answer_question(query):
    retrieved_chunks = retrieve(query, k=2)
    context = "\n\n".join([c["chunk"] for c in retrieved_chunks])

    prompt = f"""
You are a helpful assistant. Use ONLY the context below to answer the question.
If the answer is not in the context, say "I don't know".

Context:
{context}

Question: {query}
Answer:
"""

    inputs = tokenizer(
        prompt,
        return_tensors="pt",
        padding=True,
        truncation=True
    )

    outputs = model.generate(
        input_ids=inputs["input_ids"],
        attention_mask=inputs["attention_mask"],
        max_new_tokens=80,
        num_beams=2,
        no_repeat_ngram_size=2,
        early_stopping=True,
        pad_token_id=tokenizer.eos_token_id
    )

    answer = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return answer, retrieved_chunks


In [11]:
#Final RAG Test
query = "What is the refund processing time?"
answer, ctx = answer_question(query)

print("ANSWER:\n", answer)
print("\nCONTEXT USED:")
for c in ctx:
    print("-", c["source"])


ANSWER:
 
You are a helpful assistant. Use ONLY the context below to answer the question.
If the answer is not in the context, say "I don't know".

Context:
Company Refund & Return Policy Our company allows customers to return products within 30 days of purchase. To be eligible for a return, the product must be unused, in its original packaging, and accompanied by the original receipt. Refunds are processed within 7–10 business days after the returned product is received and inspected. The refund will be credited back to the original payment method. Digital products, downloadable software, and perishable goods are non-refundable. If a customer receives a damaged product, they must report it within 48 hours of delivery. The company will arrange a replacement or full refund after verification. For any disputes related to refunds, customers can contact the support team via email or phone during business hours.

Employee Leave Policy Employees are entitled to 24 paid leaves per year. This 

In [12]:
import faiss
import json

# Save FAISS index
faiss.write_index(index, "faiss_index.bin")

# Save chunks + metadata
with open("chunks.json", "w", encoding="utf-8") as f:
    json.dump(all_chunks, f, ensure_ascii=False, indent=2)

print("Index and chunks saved!")


Index and chunks saved!


In [13]:
import os
os.getcwd()

'C:\\Users\\ASUS'

In [14]:
os.listdir()

['.anaconda',
 '.cache',
 '.conda',
 '.condarc',
 '.continuum',
 '.ipynb_checkpoints',
 '.ipython',
 '.jupyter',
 '.matplotlib',
 '.streamlit',
 '.vscode',
 '2316529_MA336_Machine_Learning_Project.ipynb',
 '3D Objects',
 'anaconda_projects',
 'AppData',
 'Application Data',
 'bank-additional-full.csv',
 'Bharath_NN.ipynb',
 'cardekho_data.csv',
 'ce889_dataCollection.csv',
 'ce889_dataCollection_1.csv',
 'ce889_dataCollection_2.csv',
 'ce889_dataCollection_3.csv',
 'chunks.json',
 'Cleaned_Tagged_Dataset.csv',
 'cleaned_warranty_data.csv',
 'Company Refund & Return Policy.txt',
 'Contacts',
 'Cookies',
 'credit card example 2.ipynb',
 'Credit Card ML project 1.ipynb',
 'Credit_Card.csv',
 'DA -Task 2.csv',
 'DA_Task_2_Cleaned_Tagged.csv',
 'Disseration Project.ipynb',
 'Documents',
 'Downloads',
 'Employee Leave Policy.txt',
 'faiss_index.bin',
 'Favorites',
 'Finial_Neural_Network_Assignmnet_Bharath.ipynb',
 'IMDB.csv',
 'IntelGraphicsProfiles',
 'Investment.csv',
 'Links',
 'Local Se