In [None]:

!pip install pdfplumber transformers sentence-transformers pytesseract faiss-cpu opencv-python-headless Pillow

import pdfplumber
import pytesseract
from PIL import Image
import cv2
from sentence_transformers import SentenceTransformer
from transformers import BertTokenizer, BertForSequenceClassification
import numpy as np
import faiss
from sentence_transformers import util

# Initialize necessary models.
# Every object below is module-level state shared by the helper functions in
# this notebook; loading the checkpoints makes this cell slow to run.

# The cross-encoder model and its tokenizer must come from the same
# checkpoint, so the name is spelled once.
_CROSS_ENCODER_CHECKPOINT = 'cross-encoder/ms-marco-MiniLM-L-6-v2'

# Bi-encoder used to embed both document chunks and queries.
nlp_model = SentenceTransformer('all-MiniLM-L6-v2')

# Cross-encoder used by rerank() to score (query, document) pairs.
cross_encoder_model = BertForSequenceClassification.from_pretrained(_CROSS_ENCODER_CHECKPOINT)
cross_encoder_tokenizer = BertTokenizer.from_pretrained(_CROSS_ENCODER_CHECKPOINT)

# FAISS index for vector search. The dimension is read from the embedding
# model (384 for all-MiniLM-L6-v2) instead of being hard-coded, so swapping
# the bi-encoder cannot leave the index with a mismatched width.
index = faiss.IndexFlatL2(nlp_model.get_sentence_embedding_dimension())

# Mock chat memory (to replace Redis): maps session_id -> last Q/A pair.
chat_memory = {}

def store_chat_memory(session_id, question, answer):
    """Remember the most recent exchange of a session.

    The entry is written into the module-level ``chat_memory`` dict under
    ``session_id``; whatever was stored there before is replaced, so only
    the latest question/answer pair per session is kept.
    """
    entry = {"question": question, "answer": answer}
    chat_memory[session_id] = entry

def retrieve_chat_memory(session_id):
    """Look up the stored exchange for ``session_id``.

    Returns the ``{"question": ..., "answer": ...}`` dict saved by
    ``store_chat_memory``, or ``None`` when the session has no entry.
    """
    entry = chat_memory.get(session_id, None)
    return entry

# 1. PDF Text Extraction using pdfplumber (with an OCR path for scans)
def extract_text_from_pdf(pdf_path, is_scanned=False, lang='eng'):
    """Return the text content of the document at ``pdf_path``.

    Args:
        pdf_path: Path to the input file.
        is_scanned: ``False`` reads the embedded text layer with pdfplumber;
            ``True`` runs Tesseract OCR instead.
        lang: Tesseract language code, used only on the OCR path.

    Returns:
        The extracted text. Pages are separated by a newline so that the
        last word of one page does not fuse with the first word of the next.

    Raises:
        Whatever pdfplumber raises when ``pdf_path`` cannot be opened as a
        PDF (both on the digital path and on the OCR fallback).
    """
    if not is_scanned:
        # Use pdfplumber for extracting text from digital PDFs.
        with pdfplumber.open(pdf_path) as pdf:
            # extract_text() may yield None/"" for a page without a text
            # layer; `or ""` keeps such pages from raising on concatenation.
            return "\n".join(page.extract_text() or "" for page in pdf.pages)

    # OCR extraction. A plain image file (PNG/JPEG/...) can be decoded by
    # OpenCV directly, which is the behaviour the original code relied on.
    img = cv2.imread(pdf_path)
    if img is not None:
        gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
        return pytesseract.image_to_string(gray, lang=lang)

    # cv2.imread cannot decode PDFs and signals that by returning None, which
    # previously crashed inside cvtColor. Render each page to an image with
    # pdfplumber and OCR the pages one by one instead.
    with pdfplumber.open(pdf_path) as pdf:
        return "\n".join(
            pytesseract.image_to_string(
                page.to_image(resolution=300).original, lang=lang
            )
            for page in pdf.pages
        )

# 2. Query decomposition
def decompose_query(query):
    """Split a compound query into sub-queries on the word "and".

    The split is case-insensitive and tolerant of any whitespace around the
    conjunction ("a AND b", "a  and\\tb"). Each sub-query is stripped and
    empty fragments are dropped. "and" embedded in a longer word (e.g.
    "sand") is never treated as a separator because whitespace is required
    on both sides.

    This is still a simple heuristic and can be enhanced with NLP models.

    Args:
        query: The user query string.

    Returns:
        A list with at least one element; a query without a conjunction is
        returned (stripped) as the single item.
    """
    import re  # local import: `re` is not imported at file level

    parts = re.split(r'\s+and\s+', query, flags=re.IGNORECASE)
    sub_queries = [part.strip() for part in parts if part.strip()]
    # Keep the "never empty" contract of the original str.split version.
    return sub_queries or [query.strip()]

# 3. Optimized chunking for large documents
def chunk_text(text, max_tokens=200):
    """Group the sentences of ``text`` into chunks of bounded word count.

    Sentences are delimited by ``'. '``. Sentences are appended to the
    current chunk while the chunk's whitespace-separated word count stays
    below ``max_tokens``; otherwise the chunk is closed and a new one is
    started. A single sentence that is itself over the budget becomes a
    chunk of its own (it is not split further).

    Compared with a naive accumulate-and-flush loop this version
    * never emits an empty chunk (e.g. when the first sentence is already
      over budget),
    * returns ``[]`` for empty/whitespace-only text instead of ``["."]``,
    * does not double the terminator of a sentence that already ends with
      punctuation ("A. B." stays "A. B.", not "A. B..").

    Args:
        text: The document text.
        max_tokens: Word budget per chunk ("tokens" here are
            whitespace-separated words, not model tokens).

    Returns:
        A list of non-empty chunk strings in document order.
    """
    chunks = []
    current = []        # sentences of the chunk being built
    current_words = 0   # running word count of `current`

    for sentence in text.split('. '):
        sentence = sentence.strip()
        if not sentence:
            continue
        # split('. ') removed the terminator; restore it unless the sentence
        # already carries its own (only possible for the final fragment or
        # for '!' / '?' sentences).
        if not sentence.endswith(('.', '!', '?')):
            sentence += '.'

        n_words = len(sentence.split())
        if current and current_words + n_words >= max_tokens:
            chunks.append(' '.join(current))
            current, current_words = [], 0

        current.append(sentence)
        current_words += n_words

    if current:
        chunks.append(' '.join(current))
    return chunks


# 4. Hybrid search (keyword + semantic)
def hybrid_search(query):
    """Run the placeholder keyword search and a semantic search over its hits.

    Args:
        query: Query string, embedded with the shared ``nlp_model``.

    Returns:
        A ``(keyword_results, semantic_results)`` tuple: the list of keyword
        hit dicts (each with a ``'text'`` key) and the value returned by
        ``util.semantic_search`` for the query against those hit texts.
    """
    # Keyword search is mocked: a single fixed hit stands in for a real backend.
    keyword_results = [{'text': 'This is a mocked keyword search result.'}]

    # Semantic search using SentenceTransformer: embed the query and the hit
    # texts with the same model, then rank the hits against the query.
    query_vec = nlp_model.encode(query)
    hit_texts = []
    for hit in keyword_results:
        hit_texts.append(hit['text'])
    hit_vecs = nlp_model.encode(hit_texts)

    return keyword_results, util.semantic_search(query_vec, hit_vecs)

# 5. FAISS vector search
def add_to_faiss_index(embeddings):
    """Add embedding vectors to the module-level FAISS ``index``.

    The input is normalised before it reaches FAISS:
    * converted to a C-contiguous float32 array, so lists or float64 arrays
      are accepted as well;
    * a single 1-D vector is promoted to a one-row matrix;
    * an empty input is ignored instead of failing on shape unpacking.

    Args:
        embeddings: One vector or a 2-D collection of vectors whose width
            matches the dimension the index was created with.
    """
    vectors = np.ascontiguousarray(embeddings, dtype=np.float32)
    if vectors.size == 0:
        return  # nothing to index
    if vectors.ndim == 1:
        vectors = vectors.reshape(1, -1)
    index.add(vectors)

def search_faiss(query_embedding, top_k=10):
    """Query the module-level FAISS ``index`` with a single embedding.

    Args:
        query_embedding: One query vector; it is wrapped into a one-row
            batch before being handed to ``index.search``.
        top_k: Number of neighbours requested from the index.

    Returns:
        The ``(distances, indices)`` pair produced by ``index.search`` for
        the one-row batch.
        NOTE(review): when the index holds fewer than ``top_k`` vectors FAISS
        presumably pads the ids with -1 — callers should filter those.
    """
    query_batch = np.array([query_embedding])
    return index.search(query_batch, top_k)

# 6. Reranking using Cross-Encoder
def rerank(query, documents):
    """Score each document against ``query`` and sort by relevance.

    Every ``(query, document)`` pair is tokenized with
    ``cross_encoder_tokenizer`` and scored by ``cross_encoder_model``; the
    single logit of the pair is used as its relevance score.

    Args:
        query: The query string.
        documents: Iterable of document strings to score.

    Returns:
        A list of ``(document, score)`` tuples sorted by descending score
        (ties keep their input order). Empty input yields ``[]``.
    """
    import torch  # local import: torch is not imported at file level

    scored = []
    # Pure inference: without no_grad every forward pass would record an
    # autograd graph that is never used, wasting memory and time.
    with torch.no_grad():
        for doc in documents:
            inputs = cross_encoder_tokenizer(
                query, doc, return_tensors="pt", padding=True, truncation=True
            )
            logits = cross_encoder_model(**inputs).logits
            scored.append((doc, logits.item()))

    return sorted(scored, key=lambda pair: pair[1], reverse=True)

# RAG pipeline
def rag_pipeline(pdf_path, is_scanned=False, lang='eng', session_id=None, query=None):
    """Run extraction, indexing, retrieval and reranking for one document.

    Args:
        pdf_path: Path of the document to process.
        is_scanned: Forwarded to ``extract_text_from_pdf`` (OCR vs. text layer).
        lang: OCR language, forwarded to ``extract_text_from_pdf``.
        session_id: When not ``None``, the query and the top reranked chunk
            are saved via ``store_chat_memory`` under this id.
        query: The question to answer. Required despite the ``None`` default.

    Returns:
        A dict with the keys ``"keyword_results"``, ``"semantic_results"``,
        ``"faiss_results"`` (the id array returned by ``search_faiss``) and
        ``"reranked_results"`` (list of ``(chunk, score)`` tuples, best first).

    Raises:
        ValueError: If ``query`` is ``None``.
    """
    if query is None:
        # Fail early with a clear message instead of deep inside the encoder.
        raise ValueError("rag_pipeline() requires a query")

    # 1. Extract text
    text = extract_text_from_pdf(pdf_path, is_scanned, lang)

    # 2. Store extracted text in chunks (empty chunks carry no information)
    chunks = [chunk for chunk in chunk_text(text) if chunk]

    # 4. Perform Hybrid Search (Keyword + Semantic); independent of the chunks
    keyword_results, semantic_results = hybrid_search(query)

    if not chunks:
        # Nothing could be extracted: skip indexing and reranking entirely.
        return {
            "keyword_results": keyword_results,
            "semantic_results": semantic_results,
            "faiss_results": np.empty((1, 0), dtype=np.int64),
            "reranked_results": [],
        }

    # 3. Add chunk embeddings to FAISS. The index is module-level state, so
    # it is cleared first: otherwise every call (or notebook re-run) would
    # append duplicates and the returned ids would no longer be positions
    # in *this* call's `chunks` list.
    index.reset()
    add_to_faiss_index(nlp_model.encode(chunks))

    # 5. Perform FAISS search for the query embedding
    query_embedding = nlp_model.encode(query)
    _distances, indices = search_faiss(query_embedding)

    # 6. Rerank the retrieved candidates with the cross-encoder. Only ids
    # that point into `chunks` are used (this also drops any padding ids);
    # if retrieval produced nothing usable, fall back to all chunks.
    candidates = [chunks[i] for i in indices[0] if 0 <= i < len(chunks)]
    reranked = rerank(query, candidates or chunks)

    # 7. Store the conversation context in chat memory. `is not None` so a
    # falsy-but-valid id such as 0 is not silently ignored.
    if session_id is not None and reranked:
        store_chat_memory(session_id, query, reranked[0][0])

    return {
        "keyword_results": keyword_results,
        "semantic_results": semantic_results,
        "faiss_results": indices,
        "reranked_results": reranked,
    }





In [None]:
# Colab-only cell: `google.colab` is imported here, so this cell is expected
# to run inside a Google Colab runtime.
from google.colab import files
# Prompt for a file upload. The result is kept in the notebook-global
# `uploaded`, whose first key the next cell uses as the PDF path.
uploaded = files.upload()



In [None]:
# Guard against running this cell before (or after cancelling) the upload:
# next() on an empty mapping would otherwise raise a bare StopIteration.
if not uploaded:
    raise RuntimeError("No file was uploaded; run the upload cell and choose a PDF first.")

pdf_path = next(iter(uploaded))  # Get the first uploaded file
session_id = "1"
query = "who is Jerry?"

# Run the RAG pipeline
results = rag_pipeline(pdf_path, is_scanned=False, lang='eng', session_id=session_id, query=query)

# Display the results
print(results)