# In [None]:
import os
from PyPDF2 import PdfReader
from datetime import datetime
import openai
from pinecone import Pinecone, ServerlessSpec
import json

# Credentials are read from the process environment; either value is None
# when the corresponding variable is not set.
PINECONE_API_KEY = os.environ.get("PINECONE_API_KEY")
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")

# Pinecone client shared by every function in this file.
# NOTE(review): `spec` is normally an argument of `create_index`, not of the
# client constructor -- confirm it is actually needed here.
pc = Pinecone(
    api_key=PINECONE_API_KEY,
    spec=ServerlessSpec(cloud="aws", region="us-east-1"),
)
index_name = "project-management-rag"

# First run only: provision a serverless cosine index.
# The dimension (1536) has to match the vectors produced by the embedding
# model used in generate_embedding (presumably 1536 for
# text-embedding-ada-002 -- verify against the model documentation).
if index_name not in pc.list_indexes().names():
    pc.create_index(
        name=index_name,
        dimension=1536,
        metric="cosine",
        spec=ServerlessSpec(cloud="aws", region="us-east-1"),
    )
index = pc.Index(index_name)

# The legacy (module-level) OpenAI SDK reads its key from this attribute.
openai.api_key = OPENAI_API_KEY

# Identifier of the fine-tuned chat model used for topic extraction and
# question generation.
FINE_TUNED_MODEL = "ft:gpt-4o-mini-2024-07-18:ct-main-test::AZBbHLRk"

def extract_text_from_pdf(pdf_path):
    """Read a PDF and return its text one page at a time.

    Args:
        pdf_path: Location of the PDF file, passed straight to ``PdfReader``.

    Returns:
        A list with one dict per page, in document order. Each dict has
        ``"page_number"`` (1-based) and ``"text"`` (whatever
        ``page.extract_text()`` returned for that page).

    Raises:
        RuntimeError: If opening the file or extracting any page fails.
    """
    try:
        pages = []
        # start=1 so that page numbers are human-style (first page is 1).
        for number, page in enumerate(PdfReader(pdf_path).pages, start=1):
            pages.append({"page_number": number, "text": page.extract_text()})
        return pages
    except Exception as err:
        raise RuntimeError(f"Failed to extract text from PDF: {err}")

def generate_embedding(text):
    """Return the embedding vector for ``text``.

    The vector is requested from OpenAI's ``text-embedding-ada-002`` model
    through the legacy ``openai.Embedding.create`` call.

    Args:
        text: The text to embed. Must be non-empty.

    Returns:
        The ``embedding`` field of the first item in the API response.

    Raises:
        RuntimeError: If ``text`` is empty or the API call fails. For API
            failures the original exception is available as ``__cause__``.
    """
    # The embeddings endpoint rejects empty input, so fail fast with a clear
    # message instead of spending a network round trip on a doomed request.
    if not text:
        raise RuntimeError("Failed to generate embedding: input text is empty")
    try:
        response = openai.Embedding.create(input=text, model="text-embedding-ada-002")
        return response['data'][0]['embedding']
    except Exception as e:
        # Chain the cause so the underlying API error stays in the traceback.
        raise RuntimeError(f"Failed to generate embedding: {e}") from e

def index_text_segments(text_segments):
    """Embed each page's text and upsert it into the Pinecone index.

    Every stored vector uses the id ``page-<page_number>`` and carries the
    page text and page number as metadata.

    Pages whose text is missing or whitespace-only (for example scanned or
    blank pages) are skipped: they have nothing to embed, and sending them
    to the embedding call would abort indexing of the whole document.

    Args:
        text_segments: Iterable of dicts with ``"page_number"`` and
            ``"text"`` keys, as produced by ``extract_text_from_pdf``.

    Raises:
        RuntimeError: If embedding or upserting any page fails; the
            original exception is available as ``__cause__``.
    """
    try:
        skipped = 0
        for segment in text_segments:
            text = segment['text']
            if not text or not text.strip():
                skipped += 1
                continue
            embedding = generate_embedding(text)
            index.upsert([
                (
                    f"page-{segment['page_number']}",
                    embedding,
                    {"text": text, "page_number": segment['page_number']}
                )
            ])
        if skipped:
            print(f"Skipped {skipped} page(s) with no extractable text.")
        print("Text segments indexed successfully!")
    except Exception as e:
        raise RuntimeError(f"Failed to index text segments: {e}") from e

def extract_main_topics(text_segments):
    """Ask the fine-tuned chat model for the main topics of the document.

    Only the text of the first five segments is sent to the model.

    Args:
        text_segments: Sequence of dicts that each have a ``"text"`` key.

    Returns:
        The model's reply split on newlines, with each line stripped and
        empty lines dropped. Lines are otherwise returned as the model
        wrote them.

    Raises:
        RuntimeError: If building the prompt, calling the model, or reading
            the reply fails.
    """
    try:
        joined_text = " ".join(segment['text'] for segment in text_segments[:5])
        prompt = f"Extract the main topics from the following text:\n{joined_text}\n\nProvide the topics as a numbered list."
        chat_messages = [
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": prompt},
        ]
        response = openai.ChatCompletion.create(
            model=FINE_TUNED_MODEL,
            messages=chat_messages,
            max_tokens=500,
        )
        reply = response.choices[0].message['content']

        topics = []
        for line in reply.split("\n"):
            cleaned = line.strip()
            if cleaned:
                topics.append(cleaned)
        return topics
    except Exception as err:
        raise RuntimeError(f"Failed to extract main topics: {err}")

def retrieve_context_for_topic(topic):
    """Look up the indexed pages most similar to ``topic``.

    The topic is embedded with ``generate_embedding`` and the Pinecone index
    is queried for the five closest vectors, including their metadata.

    Args:
        topic: Topic text to search for.

    Returns:
        A list of dicts, one per match, with the keys ``"text"`` and
        ``"page_number"`` (from the match metadata) and
        ``"confidence_score"`` (the match score).

    Raises:
        RuntimeError: If embedding the topic or querying the index fails.
    """
    try:
        query_vector = generate_embedding(topic)
        response = index.query(vector=query_vector, top_k=5, include_metadata=True)
        return [
            {
                "text": hit['metadata']['text'],
                "page_number": hit['metadata']['page_number'],
                "confidence_score": hit['score'],
            }
            for hit in response['matches']
        ]
    except Exception as err:
        raise RuntimeError(f"Failed to retrieve context for topic: {err}")

def _parse_questions_payload(raw_content):
    """Return the ``"questions"`` list from a model reply that contains JSON.

    Chat models frequently wrap JSON in Markdown code fences or add a short
    sentence around it, so only the span from the first ``{`` to the last
    ``}`` is parsed. A reply that is already a bare JSON object parses
    exactly as before.
    """
    text = raw_content or ""
    start = text.find("{")
    end = text.rfind("}")
    if start == -1 or end < start:
        raise ValueError("model response did not contain a JSON object")
    return json.loads(text[start:end + 1])["questions"]


def generate_questions_for_topic(topic, context_chunks):
    """Generate multiple-choice questions for ``topic`` from retrieved context.

    The chunk with the highest ``"confidence_score"`` is used as the context
    in the prompt. Each returned question is annotated with that chunk's
    page number and score.

    Args:
        topic: Topic text the questions should cover.
        context_chunks: Non-empty list of dicts with ``"text"``,
            ``"page_number"`` and ``"confidence_score"`` keys, as returned
            by ``retrieve_context_for_topic``.

    Returns:
        The list of question dicts parsed from the model's JSON reply, each
        extended with ``"source_page"`` and ``"confidence_score"``.

    Raises:
        RuntimeError: If no context chunks were supplied, the model call
            fails, or the reply cannot be parsed. For the latter two the
            original exception is available as ``__cause__``.
    """
    # Without this guard an empty retrieval result surfaces as the cryptic
    # "max() arg is an empty sequence".
    if not context_chunks:
        raise RuntimeError(
            "Failed to generate questions for topic: "
            f"no context chunks were retrieved for topic {topic!r}"
        )
    try:
        best_chunk = max(context_chunks, key=lambda x: x["confidence_score"])
        context_text = best_chunk["text"]
        source_page = best_chunk["page_number"]
        confidence_score = best_chunk["confidence_score"]

        prompt = f"""
        Based on the following topic: "{topic}" and the context: "{context_text}",
        generate 5 multiple-choice questions with:
        - Four options labeled A, B, C, and D
        - The correct answer
        - Detailed explanations for the correct answers

        Provide the output in this format:
        {{
            "questions": [
                {{
                    "question": "<Question text>",
                    "options": ["A) <Option 1>", "B) <Option 2>", "C) <Option 3>", "D) <Option 4>"],
                    "correct_answer": "<Correct Option Label>",
                    "explanation": "<Detailed explanation>"
                }}
            ]
        }}
        """
        response = openai.ChatCompletion.create(
            model=FINE_TUNED_MODEL,
            messages=[{"role": "system", "content": "You are a helpful assistant."},
                      {"role": "user", "content": prompt}],
            max_tokens=1500
        )
        questions = _parse_questions_payload(response.choices[0].message['content'])

        # Record where each question came from so results can be traced back
        # to the source page.
        for question in questions:
            question["source_page"] = source_page
            question["confidence_score"] = confidence_score

        return questions
    except Exception as e:
        raise RuntimeError(f"Failed to generate questions for topic: {e}") from e

def format_output(topics, questions, *, book_title="Project Management Professional Guide"):
    """Build the two JSON-ready payloads for topics and questions.

    Args:
        topics: List of topic strings.
        questions: List of dicts, each with a ``"questions"`` list (one
            entry per topic).
        book_title: Title recorded in both payloads. Defaults to the title
            this pipeline was originally written for.

    Returns:
        A ``(topics_data, questions_data)`` tuple of dicts. Both carry the
        same timestamp so the two files of one run can be matched up.
    """
    # Take the timestamp once; two separate now() calls would differ by a
    # few microseconds and make the two payloads look like different runs.
    timestamp = datetime.now().isoformat()

    topics_data = {
        "book_title": book_title,
        "total_topics": len(topics),
        "extraction_timestamp": timestamp,
        "main_topics": topics
    }

    questions_data = {
        "metadata": {
            "generated_at": timestamp,
            "total_questions": sum(len(q['questions']) for q in questions),
            "book_title": book_title,
            "generation_method": "RAG Pipeline",
            "embedding_model": "text-embedding-ada-002",
            "vector_store": "Pinecone"
        },
        "questions": questions
    }

    return topics_data, questions_data

def save_to_json(data, filename):
    """Write ``data`` to ``filename`` as JSON indented by four spaces.

    Args:
        data: Any JSON-serializable object.
        filename: Path of the file to create or overwrite.

    Raises:
        RuntimeError: If the file cannot be opened or ``data`` cannot be
            serialized.
    """
    try:
        out_file = open(filename, "w")
        # The context manager closes the handle even if serialization fails.
        with out_file:
            json.dump(data, out_file, indent=4)
    except Exception as err:
        raise RuntimeError(f"Failed to save data to JSON file: {err}")

# Script entry point: run the whole pipeline end to end on one PDF.
if __name__ == "__main__":
    try:
        # Step 1: read the PDF page by page.
        # NOTE(review): the path is hard-coded; "/content" looks like a
        # hosted-notebook working directory -- confirm before running elsewhere.
        pdf_path = "/content/Project.pdf"
        text_segments = extract_text_from_pdf(pdf_path)

        # Step 2: embed every page and store it in the Pinecone index.
        index_text_segments(text_segments)

        # Step 3: ask the fine-tuned model for the document's main topics.
        main_topics = extract_main_topics(text_segments)

        # Step 4: for each topic, retrieve matching pages and generate
        # questions from them. One entry per topic is collected.
        questions = []
        for topic in main_topics:
            context_chunks = retrieve_context_for_topic(topic)
            question_data = generate_questions_for_topic(topic, context_chunks)
            questions.append({"topic": topic, "questions": question_data})

        # Step 5: wrap the results with metadata and write both JSON files
        # to the current working directory.
        topics_data, questions_data = format_output(main_topics, questions)
        save_to_json(topics_data, "topics.json")
        save_to_json(questions_data, "questions.json")

        print("RAG-based question generation completed and saved to JSON!")
    except Exception as e:
        # Any failure in any step ends the run; only the message is printed,
        # and nothing is written unless every topic succeeded.
        print(f"An error occurred: {e}")
