# Import Libraries

In [39]:
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from transformers import pipeline
import PyPDF2
import textwrap
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import logging
import os

# Fetch the NLTK data used by preprocess_text below: the stop-word lists
# (stopwords.words('english')) and WordNet (backing WordNetLemmatizer).
nltk.download('stopwords')
nltk.download('wordnet')


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

# Setup Logging

In [40]:
# Configure logging once for the notebook: INFO level and above, each record
# rendered as "<timestamp> - <level> - <message>".
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")

# Define Text Processing Utilities

In [41]:
def extract_text_from_pdf(pdf_path):
    """Extract the text of every page of a PDF into one string.

    The text of each page is followed by a newline. A page for which
    ``extract_text()`` yields nothing (``None`` or ``''``) contributes only
    that newline, so one unreadable page no longer discards the text of the
    whole document.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        The concatenated page text, or ``''`` if the file could not be opened
        or parsed (the failure is logged at ERROR level, never raised).
    """
    try:
        with open(pdf_path, 'rb') as file:
            reader = PyPDF2.PdfReader(file)
            # Pages must be read while the file is still open. `or ''` guards
            # against a page returning None, which previously raised
            # TypeError on concatenation and fell through to the except below.
            page_texts = [(page.extract_text() or '') + '\n' for page in reader.pages]
        return ''.join(page_texts)
    except Exception as e:
        logging.error(f"Failed to extract text from {pdf_path}: {str(e)}")
        return ''

def preprocess_text(text):
    """Split text into cleaned, lemmatized sentences.

    The text is split on ``'.'``; each fragment is lower-cased, split on
    whitespace, stripped of English stop words and lemmatized word by word.
    Fragments that end up with no words (blank fragments such as the one
    after a trailing period, or fragments made only of stop words) are
    dropped so they do not become empty documents downstream.

    Args:
        text: Raw text to process.

    Returns:
        A list of non-empty strings, one per surviving sentence fragment,
        each being its cleaned words joined by single spaces.
    """
    lemmatizer = WordNetLemmatizer()
    stop_words = set(stopwords.words('english'))
    cleaned_sentences = []
    for sentence in text.split('.'):
        words = [
            lemmatizer.lemmatize(word)
            for word in sentence.lower().split()
            if word not in stop_words
        ]
        # Skip fragments that are empty after cleaning; an empty string is a
        # useless retrieval candidate and hides the "no documents" case.
        if words:
            cleaned_sentences.append(" ".join(words))
    return cleaned_sentences


# Define the RAG Class

In [42]:
class RAG:
    """Small retrieval-augmented generation pipeline.

    Documents are indexed with TF-IDF. For a query, the most similar
    documents (by cosine similarity) are retrieved, condensed by a
    summarization pipeline, and the summary is placed in a prompt that is
    completed by a text-generation pipeline.
    """

    def __init__(self, documents, top_k=2, max_length=100):
        """Build the TF-IDF index and load the two Hugging Face pipelines.

        Args:
            documents: Sequence of document strings to index and retrieve from.
            top_k: Number of highest-scoring documents to retrieve per query.
            max_length: Maximum number of tokens the generator may produce
                for an answer, in addition to the prompt.
        """
        self.documents = documents
        self.vectorizer = TfidfVectorizer()
        self.doc_vectors = self.vectorizer.fit_transform(documents)
        self.generator = pipeline('text-generation', model='gpt2')
        self.summarizer = pipeline('summarization', model='facebook/bart-large-cnn')
        self.top_k = top_k
        self.max_length = max_length
        # Maps query string -> (retrieved_docs, scores) from earlier lookups.
        self.cache = {}

    def retrieve_documents(self, query):
        """Return the ``top_k`` documents most similar to ``query``.

        Results are memoized per exact query string in ``self.cache``.

        Args:
            query: The question text.

        Returns:
            A ``(retrieved_docs, scores)`` pair: the documents ordered from
            most to least similar, and their cosine-similarity scores in the
            same order.
        """
        if query in self.cache:
            logging.info(f"Retrieving results from cache for query: {query}")
            return self.cache[query]

        query_vector = self.vectorizer.transform([query])
        similarities = cosine_similarity(query_vector, self.doc_vectors).flatten()
        # argsort is ascending: take the last top_k indices, then reverse so
        # the best match comes first.
        top_indices = similarities.argsort()[-self.top_k:][::-1]
        retrieved_docs = [self.documents[i] for i in top_indices]
        scores = similarities[top_indices]

        self.cache[query] = (retrieved_docs, scores)
        return retrieved_docs, scores

    def generate_answer(self, query):
        """Answer ``query`` using retrieved and summarized context.

        Args:
            query: The question text.

        Returns:
            The generator's output text, which consists of the prompt
            ("Context: ... Question: ... Answer:") followed by the generated
            continuation.
        """
        relevant_docs, scores = self.retrieve_documents(query)
        logging.info("Retrieved documents:")
        for i, (doc, score) in enumerate(zip(relevant_docs, scores)):
            logging.info(f"Doc {i+1} - Score: {score:.4f} - Text: {doc[:100]}...")

        context = " ".join(relevant_docs)
        if context.strip():
            # truncation=True keeps an over-long context within the
            # summarizer's input limit instead of failing on it.
            summarized_context = self.summarizer(
                context, max_length=150, min_length=30, do_sample=False, truncation=True
            )[0]['summary_text']
        else:
            # Nothing to summarize; do not feed an empty string to the model.
            summarized_context = ""

        logging.info(f"Summarized context: {summarized_context}")

        prompt = f"Context: {summarized_context}\n\nQuestion: {query}\n\nAnswer:"
        # max_new_tokens budgets only the generated continuation. The previous
        # max_length counted the prompt too, and the prompt alone could reach
        # that limit, leaving no room for an answer.
        response = self.generator(prompt, max_new_tokens=self.max_length, num_return_sequences=1)

        return response[0]['generated_text']


# Main Function for PDF Processing and User Interaction

In [None]:
def main():
    """Index the configured PDFs and run an interactive question loop.

    Each path in ``pdf_paths`` that exists is read with
    ``extract_text_from_pdf`` and split into documents with
    ``preprocess_text``. If no documents result, an error is logged and the
    function returns. Otherwise a ``RAG`` instance is built and questions are
    read from standard input until the user types ``quit`` (case-insensitive,
    surrounding whitespace ignored) or input ends (EOF / Ctrl-C). Blank
    input is ignored rather than sent to the model.
    """
    pdf_paths = [
        'data/data.pdf',  # Update this to your PDF file paths
    ]

    all_documents = []
    for pdf_path in pdf_paths:
        if not os.path.exists(pdf_path):
            logging.error(f"File not found: {pdf_path}")
            continue
        logging.info(f"Extracting text from {pdf_path}...")
        pdf_text = extract_text_from_pdf(pdf_path)
        if pdf_text:
            logging.info(f"Preprocessing text from {pdf_path}...")
            documents = preprocess_text(pdf_text)
            all_documents.extend(documents)

    if not all_documents:
        logging.error("No valid documents extracted from the PDFs. Exiting.")
        return

    logging.info("Initializing RAG model...")
    rag = RAG(all_documents)

    print("\nRAG model is ready. You can now ask questions about the PDF content.")
    print("Type 'quit' to exit the application.\n")

    while True:
        try:
            query = input("Enter your question: ").strip()
        except (EOFError, KeyboardInterrupt):
            # End of input or an interrupt at the prompt is a normal way to
            # leave the loop, not an error worth a traceback.
            print()
            break

        if not query:
            # Nothing was asked; prompt again instead of querying the model.
            continue
        if query.lower() == 'quit':
            break

        answer = rag.generate_answer(query)
        print("\nAnswer:")
        wrapped_answer = textwrap.fill(answer, width=80)
        print(wrapped_answer)
        print()

    print("Thank you for using the RAG CLI Application!")

# Start the interactive session only when this code is executed directly
# (as a script or notebook cell), not when it is imported as a module.
if __name__ == "__main__":
    main()



RAG model is ready. You can now ask questions about the PDF content.
Type 'quit' to exit the application.

