<a href="https://colab.research.google.com/github/astute-ai-gen-ai/Rag/blob/main/test_roberta_pdfs.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install nltk sentence-transformers faiss-cpu transformers torch pymupdf


Collecting sentence-transformers
  Downloading sentence_transformers-3.0.1-py3-none-any.whl (227 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m227.1/227.1 kB[0m [31m3.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting faiss-cpu
  Downloading faiss_cpu-1.8.0.post1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (27.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m27.0/27.0 MB[0m [31m31.9 MB/s[0m eta [36m0:00:00[0m
Collecting pymupdf
  Downloading PyMuPDF-1.24.7-cp310-none-manylinux2014_x86_64.whl (3.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.5/3.5 MB[0m [31m89.0 MB/s[0m eta [36m0:00:00[0m
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
Collecting nvidia-c

In [None]:

# Download NLTK sentence-tokenizer data used by sent_tokenize().
# Older NLTK releases read the 'punkt' package; newer releases look up
# 'punkt_tab' instead, so fetch both to work with whichever version pip installed.
import nltk
nltk.download('punkt')
nltk.download('punkt_tab')

# Import necessary libraries
import fitz  # PyMuPDF
from nltk.tokenize import sent_tokenize
from sentence_transformers import SentenceTransformer
import faiss
import torch
from transformers import pipeline
from concurrent.futures import ThreadPoolExecutor
from collections import deque

# Step 1: Reading PDF Content (Parallel)
def fetch_pdf_content(pdf_path):
    """Extract the text of every page of a PDF.

    Args:
        pdf_path: Path of the PDF file to open with PyMuPDF.

    Returns:
        A ``(text, pdf_path)`` tuple: the concatenated text of all pages, in
        page order, and the path that was passed in (so results produced by a
        parallel ``map`` can be matched back to their source file).
    """
    # The context manager closes the document (and its file handle) even if
    # text extraction raises; this function runs in worker threads, so leaked
    # handles would otherwise pile up.
    with fitz.open(pdf_path) as doc:
        # Join once instead of growing a string page by page.
        text = "".join(page.get_text() for page in doc)
    return text, pdf_path

# Step 2: Data Processing
def split_into_chunks(text, chunk_size=512):
    """Group the sentences of ``text`` into chunks of about ``chunk_size`` characters.

    Sentences are never split: each chunk is a run of consecutive sentences
    joined by single spaces. A chunk is closed as soon as adding the next
    sentence would push the summed sentence lengths past ``chunk_size``
    (the joining spaces are not counted). A single sentence longer than
    ``chunk_size`` becomes a chunk of its own.

    Args:
        text: Raw text to split.
        chunk_size: Soft upper bound on the summed sentence lengths per chunk.

    Returns:
        List of non-empty chunk strings; empty list when no sentences are found.
    """
    chunks = []
    current_chunk = []
    current_length = 0

    for sentence in sent_tokenize(text):
        # Flush only when something is buffered: otherwise an over-long first
        # sentence would emit an empty '' chunk that later gets embedded and
        # indexed as if it were real content.
        if current_chunk and current_length + len(sentence) > chunk_size:
            chunks.append(' '.join(current_chunk))
            current_chunk = []
            current_length = 0
        current_chunk.append(sentence)
        current_length += len(sentence)

    # Emit whatever is left in the buffer after the last sentence.
    if current_chunk:
        chunks.append(' '.join(current_chunk))

    return chunks

# Step 3: Text Embedding (Batch Processing)
# Sentence-embedding model, loaded once when this cell runs and shared by
# embed_text() (chunk vectors) and retrieve_relevant_chunks() (query vector),
# so chunks and queries are always encoded by the same model.
model = SentenceTransformer('all-MiniLM-L6-v2')

def embed_text(chunks):
    """Encode text chunks with the module-level sentence-embedding ``model``.

    Args:
        chunks: List of chunk strings to embed.

    Returns:
        Whatever ``model.encode`` returns for ``chunks`` (encoded in batches
        of 32, with a progress bar shown).
    """
    return model.encode(chunks, show_progress_bar=True, batch_size=32)

# Step 4: Vector Storage with Faiss
def store_embeddings(embeddings):
    """Build a flat L2 FAISS index holding ``embeddings``.

    Args:
        embeddings: 2-D array-like whose second axis is the vector dimension
            (``embeddings.shape[1]`` is used as the index dimension).

    Returns:
        The ``faiss.IndexFlatL2`` instance after the vectors have been added.
    """
    flat_index = faiss.IndexFlatL2(embeddings.shape[1])
    flat_index.add(embeddings)
    return flat_index

# Step 5: Integration
def retrieve_relevant_chunks(query, index, chunks, urls, top_k=3):
    """Return the chunks nearest to ``query`` together with their sources.

    Args:
        query: Question text; embedded with the module-level ``model``.
        index: Search index exposing ``search(vectors, k)`` (built by
            ``store_embeddings``).
        chunks: Chunk strings, in the same order they were added to ``index``.
        urls: Source identifier for each chunk, parallel to ``chunks``.
        top_k: Maximum number of chunks to return.

    Returns:
        List of ``(chunk, source)`` tuples in the order the index returned
        them, at most ``top_k`` long. Empty when there is nothing to search.
    """
    if not chunks or top_k <= 0:
        return []

    # Never ask for more neighbours than there are chunks: FAISS pads missing
    # results with id -1, and chunks[-1] would silently return the LAST chunk
    # as a bogus "relevant" hit.
    k = min(top_k, len(chunks))
    query_embedding = model.encode([query])
    _, indices = index.search(query_embedding, k)

    # Defensive filter: drop any padding / out-of-range ids that still appear.
    return [
        (chunks[int(idx)], urls[int(idx)])
        for idx in indices[0]
        if 0 <= int(idx) < len(chunks)
    ]

# Step 6: Question-Answering
# Hugging Face 'question-answering' pipeline, built once when this cell runs;
# answer_question() calls it for every query.
qa_pipeline = pipeline('question-answering', model='deepset/roberta-large-squad2')

def answer_question(query, context):
    """Run the module-level ``qa_pipeline`` and return its answer text.

    Args:
        query: The question to ask.
        context: Text the pipeline should look for the answer in.

    Returns:
        The ``'answer'`` field of the pipeline result.
    """
    return qa_pipeline(question=query, context=context)['answer']

# Chat History Management
class ChatSession:
    """Bounded in-memory log of question/answer interactions.

    Only the most recent ``max_history`` interactions are kept; older ones are
    dropped automatically as new ones are appended.
    """

    def __init__(self, max_history=10):
        """Create an empty session.

        Args:
            max_history: Maximum number of interactions to retain. Defaults to
                10, which was the previously hard-coded limit.
        """
        # deque(maxlen=...) discards the oldest entry once the limit is reached.
        self.history = deque(maxlen=max_history)

    def add_interaction(self, question, answer, sources):
        """Record one interaction as a dict with keys question/answer/sources."""
        self.history.append({'question': question, 'answer': answer, 'sources': sources})

    def get_context(self):
        """Return the retained answers, oldest first, joined by single spaces.

        Returns an empty string when no interaction has been recorded yet.
        """
        return ' '.join(interaction['answer'] for interaction in self.history)

# Full Pipeline with Multiple PDFs
def rag_pipeline(pdf_paths, query, chat_session):
    """Answer ``query`` from the given PDFs and record the exchange.

    Steps: read all PDFs in parallel, split their text into chunks, embed and
    index the chunks, retrieve the chunks closest to the query, then run the
    QA step over the previous answers from ``chat_session`` plus the retrieved
    chunks.

    Note: the PDFs are re-read and re-embedded on every call.

    Args:
        pdf_paths: Iterable of PDF file paths.
        query: The question to answer.
        chat_session: Object providing ``get_context()`` and
            ``add_interaction(question, answer, sources)`` (e.g. ChatSession).

    Returns:
        ``(answer, sources)`` where ``sources`` is the set of PDF paths the
        retrieved chunks came from.

    Raises:
        ValueError: If no text chunks could be extracted from ``pdf_paths``
            (empty list, or PDFs without extractable text).
    """
    # Fetch content from all PDFs concurrently; results keep input order.
    with ThreadPoolExecutor() as executor:
        contents = list(executor.map(fetch_pdf_content, pdf_paths))

    # all_urls[i] is the source PDF of all_chunks[i].
    all_chunks = []
    all_urls = []
    for content, pdf_path in contents:
        chunks = split_into_chunks(content)
        all_chunks.extend(chunks)
        all_urls.extend([pdf_path] * len(chunks))

    # Fail early with a clear message instead of an obscure shape/index error
    # from the embedding or indexing step.
    if not all_chunks:
        raise ValueError("No text could be extracted from the provided PDFs.")

    embeddings = embed_text(all_chunks)
    index = store_embeddings(embeddings)
    relevant_chunks = retrieve_relevant_chunks(query, index, all_chunks, all_urls)

    # Combine previous answers with the newly retrieved chunks; skip empty
    # parts so the context has no stray leading/trailing space.
    previous_context = chat_session.get_context()
    new_context = ' '.join(chunk for chunk, _ in relevant_chunks)
    full_context = ' '.join(part for part in (previous_context, new_context) if part)

    answer = answer_question(query, full_context)
    sources = {url for _, url in relevant_chunks}

    # Store the interaction in chat history
    chat_session.add_interaction(query, answer, sources)

    return answer, sources




[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [None]:

# Example usage
# PDFs to index and query. The paths are absolute locations under
# /content/drive/MyDrive, so this cell presumably expects Google Drive to be
# mounted in Colab first -- no mount step is visible in this notebook; confirm.
pdf_paths = [
    "/content/drive/MyDrive/blogs2.0/(1) The ROI of AI - ASTUTE.pdf",
    "/content/drive/MyDrive/blogs2.0/Astute Digital Integration.pdf",
    "/content/drive/MyDrive/blogs2.0/Astute Machine Learning.pdf",
    "/content/drive/MyDrive/blogs2.0/Astute SaaS and AI.pdf"
]

In [None]:
# First question asked against the PDFs listed in pdf_paths.
query = "What is the reported reduction in operational costs when AI is integrated into businesses?"

# Create a chat session
# (the same session is passed to both calls below, so the first answer is
# part of the context for the second call)
chat_session = ChatSession()

# Get the answer and sources
answer, sources = rag_pipeline(pdf_paths, query, chat_session)
print(f"Answer: {answer}")
print(f"Sources: {sources}")

# Follow-up question
# NOTE(review): the follow-up text is identical to `query` above -- confirm
# whether a different follow-up question was intended.
follow_up_query = "What is the reported reduction in operational costs when AI is integrated into businesses?"
# Each rag_pipeline call re-reads and re-embeds all PDFs before answering.
answer, sources = rag_pipeline(pdf_paths, follow_up_query, chat_session)
print(f"Answer: {answer}")
print(f"Sources: {sources}")



Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Answer: reduces the cost and avoides continuous 
human interventions
Sources: {'/content/drive/MyDrive/blogs2.0/(1) The ROI of AI - ASTUTE.pdf'}
