# In [None]:
from PyPDF2 import PdfReader
from datetime import datetime
import json
import openai
import os

from dotenv import load_dotenv
# Load settings from the .env file so the key below can be read from the environment.
load_dotenv()

# Fail fast at import time: every OpenAI call in this module needs the key.
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")
if OPENAI_API_KEY is None or OPENAI_API_KEY == "":
    raise EnvironmentError("OpenAI API key is missing. Please set it in the .env file.")
openai.api_key = OPENAI_API_KEY

def extract_text_from_pdf(pdf_path):
    """
    Extracts text from a PDF file.

    Args:
        pdf_path (str): Path to the PDF file.

    Returns:
        list: A list of text segments (str), one for each page. A page with no
            extractable text is represented by an empty string, never None.

    Raises:
        FileNotFoundError: If the PDF file does not exist.
        ValueError: If the PDF file cannot be read, or if no page yields any text.
    """
    if not os.path.exists(pdf_path):
        raise FileNotFoundError(f"PDF file not found at path: {pdf_path}")

    try:
        reader = PdfReader(pdf_path)
        # Defensive: coerce a None/falsy page result to "" so callers can
        # safely " ".join() the segments.
        text_segments = [page.extract_text() or "" for page in reader.pages]
    except Exception as e:
        # Chain the cause so the original traceback is not lost.
        raise ValueError(f"Error extracting text from PDF: {e}") from e

    # Checked outside the try block so this specific error is not re-wrapped
    # by the generic handler above. Also covers PDFs whose pages exist but
    # contain no text at all (not just PDFs with zero pages).
    if not any(segment.strip() for segment in text_segments):
        raise ValueError("PDF text extraction returned empty content.")
    return text_segments

def extract_keywords(text_segments, max_pages=3, model="gpt-4-0613"):
    """
    Extracts main keywords from the provided text using OpenAI.

    Only the first ``max_pages`` segments are sent to the model. When those
    segments contain no text, an empty list is returned without calling the
    API (an empty prompt would only produce made-up keywords).

    Args:
        text_segments (list): List of text segments; None entries are treated
            as empty strings.
        max_pages (int): Number of leading segments to send. Defaults to 3.
        model (str): Chat model name. Defaults to "gpt-4-0613".

    Returns:
        list: A list of keywords (stripped, with empty entries removed).

    Raises:
        RuntimeError: If the OpenAI API call fails.
    """
    try:
        # None-safe join: a page without extractable text must not break the prompt.
        joined_text = " ".join(segment or "" for segment in (text_segments or [])[:max_pages])
        if not joined_text.strip():
            return []

        prompt = f"Extract the main keywords from the following text:\n{joined_text}\n\nProvide the keywords as a comma-separated list."
        # NOTE(review): openai.ChatCompletion is the pre-1.0 client interface;
        # confirm the pinned openai version before upgrading the package.
        response = openai.ChatCompletion.create(
            model=model,
            messages=[
                {"role": "system", "content": "You are a helpful assistant."},
                {"role": "user", "content": prompt}
            ],
            max_tokens=500
        )
        keywords = response.choices[0].message['content'].split(",")
        return [keyword.strip() for keyword in keywords if keyword.strip()]
    except Exception as e:
        # Chain the cause so the underlying failure stays visible in tracebacks.
        raise RuntimeError(f"Error extracting keywords using OpenAI: {e}") from e

def _parse_question_reply(raw_reply):
    """Parses the model reply into a dict, tolerating code fences or prose around the JSON object."""
    text = (raw_reply or "").strip()
    try:
        parsed = json.loads(text)
    except json.JSONDecodeError:
        # Fall back to the outermost {...} span, which discards Markdown
        # fences and any explanatory text surrounding the JSON object.
        start = text.find("{")
        end = text.rfind("}")
        if start == -1 or end < start:
            raise
        parsed = json.loads(text[start:end + 1])
    if not isinstance(parsed, dict):
        raise ValueError(f"expected a JSON object, got {type(parsed).__name__}")
    return parsed


def generate_question_from_keyword(keyword, context_text):
    """
    Generates a multiple-choice question based on a keyword and context.

    The model is asked to reply with a JSON object; the reply is parsed
    leniently so that a fenced or annotated JSON answer is still accepted.

    Args:
        keyword (str): The keyword to base the question on.
        context_text (str): Contextual text for generating the question.

    Returns:
        dict: A dictionary parsed from the model reply. The prompt requests the
            keys "question", "options" and "correct_answer"; their presence is
            not enforced here.

    Raises:
        RuntimeError: If the OpenAI API call fails or the reply cannot be
            parsed into a JSON object.
    """
    try:
        prompt = f"""
        Based on the following keyword: "{keyword}" and the context: "{context_text}",
        generate:
        - One multiple-choice question
        - Four options labeled A, B, C, and D
        - The correct answer with its label (e.g., 'A', 'B', etc.)

        Provide the output in this format:
        {{
            "question": "<Question text>",
            "options": ["A) <Option 1>", "B) <Option 2>", "C) <Option 3>", "D) <Option 4>"],
            "correct_answer": "<Correct Option Label>"
        }}
        """
        response = openai.ChatCompletion.create(
            model="gpt-4-0613",
            messages=[
                {"role": "system", "content": "You are a helpful assistant."},
                {"role": "user", "content": prompt}
            ],
            max_tokens=500
        )
        return _parse_question_reply(response.choices[0].message['content'])
    except Exception as e:
        # Chain the cause so API errors and parse errors remain distinguishable.
        raise RuntimeError(f"Error generating question using OpenAI for keyword '{keyword}': {e}") from e

def format_output(keywords, questions, book_title="Project Management Professional Guide", tool_used="GPT-4"):
    """
    Formats the output data for keywords and questions into JSON-ready dictionaries.

    Args:
        keywords (list): List of extracted keywords.
        questions (list): List of generated questions.
        book_title (str): Title recorded in both outputs. Defaults to
            "Project Management Professional Guide".
        tool_used (str): Tool name recorded in the questions metadata.
            Defaults to "GPT-4".

    Returns:
        tuple: A tuple containing two dictionaries: topics_data and questions_data.
    """
    # Take the timestamp once so both documents of the same run carry the
    # exact same value (naive local time, ISO 8601).
    timestamp = datetime.now().isoformat()

    topics_data = {
        "book_title": book_title,
        "total_keywords": len(keywords),
        "extraction_timestamp": timestamp,
        "keywords": keywords
    }

    questions_data = {
        "metadata": {
            "generated_at": timestamp,
            "total_questions": len(questions),
            "book_title": book_title,
            "tool_used": tool_used
        },
        "questions": questions
    }
    return topics_data, questions_data

def save_to_json(data, filename):
    """
    Saves data to a JSON file.

    The data is serialised in memory before the file is opened, so a payload
    that cannot be serialised never truncates or corrupts an existing file.

    Args:
        data (dict): Data to save.
        filename (str): Path to the output JSON file.

    Raises:
        IOError: If the data cannot be serialised or the file cannot be written.
    """
    try:
        # Serialise first: opening with "w" truncates the target immediately,
        # so any serialisation error must surface before that point.
        serialized = json.dumps(data, indent=4)
        with open(filename, "w", encoding="utf-8") as f:
            f.write(serialized)
    except Exception as e:
        # Chain the cause so the original error type remains inspectable.
        raise IOError(f"Error saving data to JSON file '{filename}': {e}") from e

# Script entry point: extract text from the PDF, derive keywords, generate one
# question per keyword, and write both results to JSON files.
if __name__ == "__main__":
    try:
        pdf_path = "Project.pdf"
        text_segments = extract_text_from_pdf(pdf_path)

        print("Extracting keywords...")
        keywords = extract_keywords(text_segments)
        print(f"Keywords extracted: {keywords}")

        print("Generating questions...")
        questions = []
        # Same first-3-pages window that keyword extraction uses; None-safe so
        # a page without extractable text cannot break the join.
        context = " ".join(segment or "" for segment in text_segments[:3])
        for keyword in keywords:
            try:
                question_data = generate_question_from_keyword(keyword, context)
                question_data["topic"] = keyword
            except (RuntimeError, TypeError) as e:
                # One bad reply must not discard the questions already
                # generated for the other keywords.
                print(f"Skipping keyword '{keyword}': {e}")
                continue
            questions.append(question_data)
        print(f"Generated {len(questions)} questions.")

        print("Formatting output...")
        topics_data, questions_data = format_output(keywords, questions)

        print("Saving to JSON files...")
        save_to_json(topics_data, "keywords.json")
        save_to_json(questions_data, "questions.json")

        print("Keyword-based question generation completed and saved to JSON!")
    except Exception as e:
        # Top-level boundary: report the failure instead of showing a traceback.
        print(f"An error occurred: {e}")