In [None]:
!pip install langchain_community pypdf
!pip install sentence_transformers

**Data loading**

Convert raw data (PDF, text, etc) into structured format

In [None]:
import os
from langchain_community.document_loaders import PyPDFLoader

In [None]:
# Folder that holds the source PDFs. Defaults to the original Google Drive
# location; set the RAG_DOCS_PATH environment variable to use another folder.
# NOTE(review): no Drive-mount step is visible in this notebook - confirm the
# default path exists before running.
DOCS_PATH = os.environ.get('RAG_DOCS_PATH', '/content/drive/MyDrive/RAG/docs')

In [None]:
def load_pdfs(folder_path):
  """Load every PDF found directly inside ``folder_path``.

  Args:
    folder_path: Directory to scan (sub-directories are not traversed).

  Returns:
    A flat list with the items returned by ``PyPDFLoader(path).load()`` for
    each PDF, in file-name order. Each item's ``metadata['source']`` is
    overwritten with the bare file name (no directory part).

  Raises:
    OSError: propagated from ``os.listdir`` if the folder cannot be read.
  """
  documents = []

  # os.listdir() yields entries in arbitrary order; sorting keeps the document
  # (and therefore chunk / index) order reproducible between runs.
  for file in sorted(os.listdir(folder_path)):
    # Case-insensitive check so files such as 'paper.PDF' are not skipped.
    if file.lower().endswith('.pdf'):
      loader = PyPDFLoader(os.path.join(folder_path, file))
      docs = loader.load()

      # Store only the file name; secure_filter() later matches on this value.
      for d in docs:
        d.metadata['source'] = file

      documents.extend(docs)
  return documents

In [None]:
# Read every PDF under DOCS_PATH and report how many items were loaded.
documents = load_pdfs(DOCS_PATH)
print("Loaded {} pages".format(len(documents)))

**Docs Cleaning**

Remove noise + normalize text for better retrieval

In [None]:
import re
def clean_documents(text):
  """Normalise whitespace in a page of extracted text.

  Every run of whitespace (spaces, tabs, newlines, ...) is collapsed into a
  single space and leading/trailing whitespace is removed.

  Args:
    text: Raw page text; ``None`` or an empty string is accepted.

  Returns:
    The normalised string ('' for empty/None input).
  """
  if not text:
    return ''
  # '\s+' already matches newlines, so a separate '\n+' pass is unnecessary.
  return re.sub(r'\s+', ' ', text).strip()

**PII (Personally Identifiable Information) Masking**

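Mask sensitive info before indexing. The regex-based masking below handles simple patterns such as phone numbers; names are not detected and would need an NER-based tool.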

In [None]:
def mask_pii(text):
  """Replace simple PII patterns in ``text`` with placeholder tags.

  Masked patterns:
    * e-mail addresses                          -> ``[EMAIL]``
    * 10-digit numbers, either contiguous or
      grouped 3-3-4 with '-' or '.' separators  -> ``[PHONE]``

  Space-separated digit groups are deliberately NOT treated as phone numbers,
  because runs like "256 512 1024" are common in tables. Names and other PII
  are not detected.

  Args:
    text: Input string.

  Returns:
    The string with matches replaced.
  """
  # Emails first, so digits inside an address are not partially masked as a phone.
  text = re.sub(r'[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}', '[EMAIL]', text)
  # \b on both sides keeps longer digit runs (11+ digits) untouched.
  text = re.sub(r'\b\d{3}[-.]?\d{3}[-.]?\d{4}\b', '[PHONE]', text)
  return text

In [None]:
# Clean and mask each loaded page, keeping only its text and source file name.
processed_docs = [
    {
        "text": mask_pii(clean_documents(page.page_content)),
        "source": page.metadata['source'],
    }
    for page in documents
]

**Chunking**

Split into smaller pieces -> Improve retrieval

In [None]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

In [None]:
# Chunking parameters handed to the text splitter.
CHUNK_SIZE = 400
CHUNK_OVERLAP = 80

splitter = RecursiveCharacterTextSplitter(
    chunk_size=CHUNK_SIZE,
    chunk_overlap=CHUNK_OVERLAP,
)

In [None]:
# Split every processed page and tag each piece with the file it came from.
chunks = [
    {"text": piece, "source": page['source']}
    for page in processed_docs
    for piece in splitter.split_text(page['text'])
]

In [None]:
# Peek at the first chunk record (a dict with "text" and "source" keys).
chunks[0]

**Embeddings**

Convert text -> vectors for similarity Search

In [None]:
from sentence_transformers import SentenceTransformer

In [None]:
# Sentence-embedding model; hybrid_search() reuses it to encode queries.
embed_model = SentenceTransformer("all-MiniLM-L6-v2")

# Encode all chunk texts in a single call.
texts = [record['text'] for record in chunks]
embeddings = embed_model.encode(texts)

**Dense Retrieval (FAISS)**

Uses embeddings (vectors) to capture semantic meaning, not just exact words.

In [None]:
!pip install faiss-cpu

In [None]:
import faiss
import numpy as np

In [None]:
# FAISS works on contiguous float32 matrices, so convert explicitly instead of
# relying on whatever dtype/layout the encoder happened to return.
embedding_matrix = np.ascontiguousarray(embeddings, dtype=np.float32)
if embedding_matrix.ndim != 2 or embedding_matrix.shape[0] == 0:
  # Fail with a clear message rather than an IndexError on shape[1] below.
  raise ValueError("No embeddings to index - check that the PDFs produced any chunks.")

# Exact (brute-force) L2 index; row i of the index corresponds to chunks[i].
dimension = embedding_matrix.shape[1]
index = faiss.IndexFlatL2(dimension)
index.add(embedding_matrix)

**Sparse Retrieval**

Uses keyword matching (TF-IDF, BM25). Based on exact word overlap

In [None]:
!pip install rank_bm25

In [None]:
from rank_bm25 import BM25Okapi

In [None]:
# Whitespace-tokenise each chunk (hybrid_search() tokenises queries the same way)
# and build the BM25 index over those token lists.
tokenized_chunks = [entry["text"].split() for entry in chunks]
bm25 = BM25Okapi(tokenized_chunks)

**Hybrid Retrieval**

Combine Keyword (Sparse) + Semantic (Dense) Search

In [None]:
def hybrid_search(query, k = 5):
  """Retrieve chunks for ``query`` using dense (FAISS) and sparse (BM25) search.

  Args:
    query: Query string.
    k: Maximum number of hits taken from each retriever.

  Returns:
    A list of chunk dicts (the same objects stored in ``chunks``): dense hits
    first (closest first), then sparse hits (highest BM25 score first) that
    were not already returned by the dense search. At most ``2 * k`` items.
  """
  if k <= 0 or not chunks:
    return []

  query_embedding = np.asarray(embed_model.encode([query]), dtype=np.float32)

  # Dense
  # FAISS pads the result with -1 when fewer than k vectors exist; without the
  # filter, chunks[-1] would silently return the last chunk as a "hit".
  _, dense_idx = index.search(query_embedding, k)
  dense_ranked = [int(i) for i in dense_idx[0] if 0 <= i < len(chunks)]

  # Sparse
  # The query is tokenised with str.split(), matching how tokenized_chunks was built.
  bm25_scores = bm25.get_scores(query.split())
  # argsort is ascending, so reverse it to get best-first ordering.
  sparse_ranked = [int(i) for i in np.argsort(bm25_scores)[::-1][:k]]

  # Combine, dropping chunks that both retrievers returned.
  seen = set()
  combined = []
  for i in dense_ranked + sparse_ranked:
    if i not in seen:
      seen.add(i)
      combined.append(chunks[i])
  return combined

**Query Reformulation**

Improve Bad queries -> Better retrieval

In [None]:
def rewrite_query(query):
  """Return ``query`` prefixed with the fixed instruction 'Explain clearly: '."""
  template = "Explain clearly: {}"
  return template.format(query)

**Reranking**

Re-score retrieved results for better accuracy

In [None]:
from sentence_transformers import CrossEncoder

In [None]:
# Cross-encoder checkpoint used by rerank() to score (query, text) pairs.
RERANKER_MODEL = 'cross-encoder/ms-marco-MiniLM-L-6-v2'
reranker = CrossEncoder(RERANKER_MODEL)

def rerank(query, results):
  """Order ``results`` by cross-encoder relevance to ``query`` (best first).

  Args:
    query: Query string.
    results: List of chunk dicts, each with a 'text' key.

  Returns:
    A new list with the same dicts, sorted by descending score. Items with
    equal scores keep their original relative order. Empty input returns [].
  """
  if not results:
    return []

  pairs = [(query, r['text']) for r in results]
  scores = reranker.predict(pairs)

  # Sort positions by score only. Sorting (score, dict) tuples falls back to
  # comparing the dicts whenever two scores tie (e.g. the same chunk retrieved
  # twice), and dicts do not support '<', which raises TypeError.
  order = sorted(range(len(results)), key=lambda i: scores[i], reverse=True)
  return [results[i] for i in order]

**Caching**

In [None]:
# In-memory cache used by cached_search(): maps a query to its hybrid_search() results.
cache = {}

In [None]:
def cached_search(query):
  """Return hybrid_search(query), reusing the stored result for repeated queries.

  The result list is stored in (and on later calls returned from) the
  module-level ``cache`` dict, keyed by ``query``.
  """
  try:
    return cache[query]
  except KeyError:
    pass

  hits = hybrid_search(query)
  cache[query] = hits
  return hits

**Secure Retrieval**

Restrict access based on user or context.

In [None]:
def secure_filter(results, allowed_sources):
  """Keep only the results whose 'source' value appears in ``allowed_sources``.

  Returns a new list; the relative order of the kept items is unchanged.
  """
  permitted = []
  for item in results:
    if item['source'] in allowed_sources:
      permitted.append(item)
  return permitted

**Multi-Hop Retrieval**

Multiple retrieval steps

In [None]:
def multi_hop(query):
  """Run two retrieval passes for ``query`` and merge their hits.

  The first pass searches with ``query`` as given; the second searches with a
  fixed "explain deeper" rewrite of it.

  Args:
    query: Query string.

  Returns:
    First-pass hits followed by second-pass hits, with chunks that appear more
    than once (same 'source' and 'text') returned only the first time.
  """
  first_hop = hybrid_search(query)

  # Typo fix: the original text read "Based of above".
  refined_query = f"Based on above, explain deeper: {query}"

  second_hop = hybrid_search(refined_query)

  # The two passes usually overlap heavily; drop repeats while keeping order.
  merged = []
  seen = set()
  for hit in first_hop + second_hop:
    key = (hit['source'], hit['text'])
    if key not in seen:
      seen.add(key)
      merged.append(hit)
  return merged

**Prompt + LLM**

Generate answer using retrieved context.

In [None]:
!pip install langchain-openai

In [None]:
import os
from getpass import getpass

# Do not hard-code the key in the notebook. Reuse OPENAI_API_KEY when it is
# already set (the original line overwrote it with a placeholder); otherwise
# prompt for it without echoing the input.
if not os.environ.get('OPENAI_API_KEY'):
  os.environ['OPENAI_API_KEY'] = getpass('Enter your OpenAI API key: ')

In [None]:
from langchain_openai import ChatOpenAI

# Chat model used by generate_answer(); temperature is fixed at 0.
LLM_MODEL_NAME = 'gpt-4o-mini'
llm = ChatOpenAI(model_name=LLM_MODEL_NAME, temperature=0)

In [None]:
def generate_answer(query, context):
  """Ask the module-level ``llm`` to answer ``query`` from ``context`` only.

  Args:
    query: The question text inserted under "Questions:".
    context: The retrieved text inserted under "Context:".

  Returns:
    Whatever ``llm.invoke`` returns for the assembled prompt, unmodified.
  """
  # The template's indentation and blank lines are part of the text sent to the model.
  grounded_prompt = f'''
  Answer only from context.
  If not found, say "I dont know".

  Context:
  {context}

  Questions:
  {query}
  '''
  return llm.invoke(grounded_prompt)

**Hallucination Control**

Force model to say "I don't know"

In [None]:
### Already enforced by the prompt in generate_answer ("Answer only from context." / say "I dont know" if not found)

**Evaluation**

Check whether the generated answer contains the expected ground-truth text (case-insensitive substring match)

In [None]:
def evaluation(answer, ground_truth):
  """Return True if ``ground_truth`` occurs in the answer text, ignoring case.

  Args:
    answer: Either a plain string or an object exposing its text as
      ``.content``. generate_answer() returns the raw ``llm.invoke`` result,
      which is not a string, so both forms are accepted here.
    ground_truth: Expected substring.

  Returns:
    bool - result of the case-insensitive substring test.
  """
  answer_text = str(getattr(answer, "content", answer))
  return ground_truth.lower() in answer_text.lower()

**Latency Tracking**

Measure performance based on retrieval speed

In [None]:
import time

In [None]:
# perf_counter() is a monotonic, high-resolution clock meant for measuring
# elapsed time; time.time() can jump if the system clock is adjusted.
start = time.perf_counter()
results = hybrid_search("What is ResNet?")
print(f"Latency: {time.perf_counter() - start}")

**Feedback Loop**

Improves system over time

In [None]:
# Collected feedback records, appended to by store_feedback().
feedback = []

def store_feedback(query, answer, correct):
  """Append one feedback record (query, answer, correct) to ``feedback``."""
  record = dict(query=query, answer=answer, correct=correct)
  feedback.append(record)

**Bias Check**

Detect unfair outputs

In [None]:
def bias_check(answer):
  """Flag answers that contain both 'only' and 'better' (keyword heuristic).

  Args:
    answer: Either a plain string or an object exposing its text as
      ``.content``. generate_answer() returns the raw ``llm.invoke`` result,
      which is not a string, so both forms are accepted here.

  Returns:
    "Check bias" when both keywords occur (case-insensitive), else None.
  """
  # Lower-case first so "Only"/"Better" at the start of a sentence still match,
  # consistent with the case-insensitive matching in evaluation().
  text = str(getattr(answer, "content", answer)).lower()
  if "only" in text and "better" in text:
    return "Check bias"
  return None

### Final Pipeline

In [None]:
def full_pipeline(query, allowed_sources=None, top_k=3):
  """Answer ``query`` end to end: rewrite -> retrieve -> filter -> rerank -> generate.

  Args:
    query: The user's question.
    allowed_sources: Source file names the caller may read from. Defaults to
      the three PDFs originally hard-coded in this function.
    top_k: Number of top reranked chunks joined into the prompt context.

  Returns:
    The value returned by generate_answer() for the rewritten query.
  """
  if allowed_sources is None:
    allowed_sources = ["ResNet.pdf", "Faster R-CNN.pdf", "Vision Transformer (ViT).pdf"]

  # Note: the rewritten query is used for retrieval, reranking and the final prompt.
  query = rewrite_query(query)

  results = cached_search(query)

  results = secure_filter(results, allowed_sources)

  # Skip the reranker when filtering left nothing to score; the empty context
  # then lets the prompt's "I dont know" instruction apply.
  if results:
    results = rerank(query, results)

  context = "\n\n".join(r["text"] for r in results[:top_k])

  answer = generate_answer(query, context)

  return answer

In [None]:
# Sample run of the full pipeline; the cell output is the value returned by generate_answer().
full_pipeline("What problem does ResNet solve?")

In [None]:
# Sample run of the full pipeline; the cell output is the value returned by generate_answer().
full_pipeline("How does Faster R-CNN improve object detection?")

In [None]:
# Sample run of the full pipeline; the cell output is the value returned by generate_answer().
full_pipeline("Difference between CNN and Vision Transformer?")