In [91]:
import openai
import numpy as np
from pathlib import Path
from typing_extensions import TypedDict
import fitz  # PyMuPDF
from sklearn.metrics.pairwise import cosine_similarity
import json


# ----------- JSON schema for guard‑railed output ----------------------------
# Typed mapping with four string-valued keys. `total=True` is the TypedDict
# default (every key is required); it is spelled out here for readers.
class SummaryJson(TypedDict, total=True):
    content: str      # required, str
    title: str        # required, str
    description: str  # required, str
    location: str     # required, str


# ----------- Helper function to get embeddings ----------------------------
def get_embeddings(text: str, model="text-embedding-ada-002"):
    """
    Embed `text` with OpenAI's embeddings API.

    Args:
        text: The string to embed. It is sent as a one-element batch.
        model: Name of the embedding model to request.

    Returns:
        A 2-D numpy array with one row per returned embedding (one row for
        the single input text).
    """
    response = openai.embeddings.create(
        model=model,
        input=[text]  # The input should be a list of texts
    )
    # `openai.embeddings.create` (openai>=1.0) returns a typed response object,
    # not a dict: subscripting it (`response['data']`) raises TypeError, so the
    # payload has to be read through attributes.
    embeddings = [item.embedding for item in response.data]
    return np.array(embeddings)


# ----------- main callable --------------------------------------------------
def pdf_to_summary_json(
    pdf_path: str | Path, location_tag: str, model_name: str = "gpt-4"
) -> SummaryJson:
    """
    Generate mock quiz questions from the PDF at `pdf_path`.

    Pipeline: extract the text of every page with PyMuPDF, split it into
    fixed-size character chunks, embed each chunk and a fixed query with
    `get_embeddings`, keep the chunks most similar to the query as context,
    send that context to the chat model, and decode the reply as JSON.

    Args:
        pdf_path: Path of the PDF to read.
        location_tag: Accepted for interface compatibility; this function
            does not currently use it.
        model_name: Chat model used to generate the questions.

    Returns:
        The decoded JSON reply of the model.
        NOTE(review): the prompt asks for a JSON array of question objects,
        so the value is normally a list, not a `SummaryJson` mapping; the
        annotation is kept unchanged for existing callers.

    Raises:
        ValueError: If no text could be extracted from the PDF.
        json.JSONDecodeError: If the model reply is not valid JSON.
    """
    pdf_path = Path(pdf_path)

    # 1) Load PDF and extract text using PyMuPDF (fitz). The context manager
    #    closes the document even if extraction fails part-way.
    with fitz.open(str(pdf_path)) as doc:
        full_text = "".join(
            doc.load_page(page_num).get_text("text")
            for page_num in range(doc.page_count)
        )

    # Without text there is nothing to ground the quiz on; fail before
    # spending any API calls.
    if not full_text.strip():
        raise ValueError(f"No extractable text found in {pdf_path}")

    # 2) Chunk text manually (let's use 1000 character chunks)
    chunk_size = 1000
    chunks = [full_text[i:i + chunk_size] for i in range(0, len(full_text), chunk_size)]

    # 3) Get embeddings for each chunk
    chunk_embeddings = [get_embeddings(chunk) for chunk in chunks]

    # 4) Generate the query embedding
    query = "Make a test for students based on the lecture material"
    query_embedding = get_embeddings(query)

    # 5) Calculate cosine similarity between query embedding and chunk embeddings
    similarities = [cosine_similarity(query_embedding, chunk_embedding)[0][0] for chunk_embedding in chunk_embeddings]

    # 6) Retrieve top‑k context based on similarity score
    top_k = 6
    top_k_indices = np.argsort(similarities)[-top_k:]
    context = "\n\n".join(chunks[i] for i in top_k_indices)

    # 7) Generate the mock test questions using GPT model.
    #    No API key is assigned here: the former hard-coded placeholder was set
    #    only after the embedding calls above (which already need a key) and
    #    overwrote whatever key had been configured. The openai client picks up
    #    the OPENAI_API_KEY environment variable on its own.
    prompt = f"""
    You are a mock quiz generator. Based on the provided lecture notes, create the following types of questions:
    - One answer with 4 options to pick.
    - Multiple choice with 5-6 options to pick.
    - Matching questions.

    The provided lecture content: {context}

    Output the result in the following JSON format:
    [
      {{
        "title": "Who was the first president of the United States?",  // One choice question
        "variants": [
          "George Washington",  // Option 1
          "Abraham Lincoln"  // Option 2
        ],
        "correctVariantIndex": 0  // Correct answer is the first option (index 0)
      }},
      {{
        "title": "Which of the following countries were part of the Allied Powers in World War II?",  // Multiple choice question
        "variants": [
          "United States",  // Option 1
          "Germany",  // Option 2
          "Soviet Union",  // Option 3
          "Italy"  // Option 4
        ],
        "correctVariantIndex": [0, 2]  // Correct answers are the first (United States) and third (Soviet Union)
      }},
      {{
        "title": "Match the famous historical figures to their corresponding country:",  // Matching question
        "variants": [
          "George Washington",  // Option 1
          "Albert Einstein",  // Option 2
          "Winston Churchill",  // Option 3
          "Mahatma Gandhi"  // Option 4
        ],
        "correctVariantIndex": [0, 1, 2, 3],  // Correct matches for each country
        "matchWith": [
          "United States",  // Country for George Washington
          "Germany",  // Country for Albert Einstein
          "United Kingdom",  // Country for Winston Churchill
          "India"  // Country for Mahatma Gandhi
        ]
      }}
    ]

    Ensure that the quiz questions are based only on the provided lecture notes or the test will fail.
    """

    # `openai.ChatCompletion` was removed in openai>=1.0 (the same major
    # version that provides `openai.embeddings.create`); the replacement is
    # `openai.chat.completions.create`, whose result is read via attributes.
    response = openai.chat.completions.create(
        model=model_name,
        messages=[{"role": "user", "content": prompt}],
        temperature=0,
    )
    reply = (response.choices[0].message.content or "").strip()

    # Chat models often wrap JSON in a Markdown code fence (``` or ```json);
    # drop the fence lines so json.loads sees only the payload.
    if reply.startswith("```"):
        reply = reply.split("\n", 1)[1] if "\n" in reply else ""
        reply = reply.rsplit("```", 1)[0]

    return json.loads(reply)


# ------------- Call the function ---------------



RuntimeError: Directory 'static/' does not exist

In [90]:
!pip install frontend

Collecting frontend
  Downloading frontend-0.0.3-py3-none-any.whl.metadata (847 bytes)
Collecting aiofiles (from frontend)
  Downloading aiofiles-24.1.0-py3-none-any.whl.metadata (10 kB)
Downloading frontend-0.0.3-py3-none-any.whl (32 kB)
Downloading aiofiles-24.1.0-py3-none-any.whl (15 kB)
Installing collected packages: aiofiles, frontend
Successfully installed aiofiles-24.1.0 frontend-0.0.3


In [86]:
if __name__ == "__main__":
    # Run the pipeline on the sample lecture and pretty-print the result.
    summary = pdf_to_summary_json(
        "lecture4.pdf",
        location_tag="s3://my-bucket/example.pdf",
    )
    rendered = json.dumps(summary, indent=2, ensure_ascii=False)
    print(rendered)

{
  "content": "I'm sorry, but I cannot access or read PDF files directly. However, if you can provide the text or main points from the PDF, I would be happy to help you summarize or analyze it!",
  "title": "PDF Content Inquiry",
  "description": "Request for information about the contents of a PDF file.",
  "location": "N/A"
}


In [99]:
import json
import tempfile
from io import BytesIO
from pathlib import Path

import numpy as np
import openai
from fastapi import FastAPI, File, UploadFile
from fastapi.responses import JSONResponse
from langchain_community.document_loaders import PyPDFLoader
from langchain_core.vectorstores import InMemoryVectorStore
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain_text_splitters import RecursiveCharacterTextSplitter
from typing_extensions import TypedDict

# ----------- JSON schema for guard‑railed output ----------------------------
# Typed mapping with four string-valued keys; it is the schema handed to
# `with_structured_output` further down. `total=True` is the TypedDict default
# (every key is required) and is only spelled out here for readers.
class SummaryJson(TypedDict, total=True):
    content: str      # required, str
    title: str        # required, str
    description: str  # required, str
    location: str     # required, str

# ----------- Helper function to get embeddings ----------------------------
def get_embeddings(text: str, model="text-embedding-ada-002"):
    """
    Embed `text` with OpenAI's embeddings API.

    Args:
        text: The string to embed. It is sent as a one-element batch.
        model: Name of the embedding model to request.

    Returns:
        A 2-D numpy array with one row per returned embedding (one row for
        the single input text).
    """
    response = openai.embeddings.create(
        model=model,
        input=[text]  # The input should be a list of texts
    )
    # The openai>=1.0 client returns a typed response object rather than a
    # dict, so `response['data']` raises TypeError; use attribute access.
    vectors = [item.embedding for item in response.data]
    return np.array(vectors)

# ----------- main callable --------------------------------------------------
def pdf_to_summary_json(pdf_file: BytesIO, model_name: str = "gpt-4") -> SummaryJson:
    """
    Generate mock quiz questions from an in-memory PDF.

    The bytes are written to a temporary file (PyPDFLoader is given a path),
    split into overlapping chunks, embedded into an in-memory vector store,
    and the chunks most similar to a fixed query are sent to the chat model
    as context.

    Args:
        pdf_file: Binary file-like object positioned at the start of the PDF.
        model_name: Chat model used to generate the questions.

    Returns:
        Whatever the structured-output runnable returns for `SummaryJson`.
        NOTE(review): the prompt asks for a JSON array of quiz questions while
        the enforced schema is `SummaryJson` (content/title/description/
        location); confirm which of the two shapes is actually wanted.

    Raises:
        ValueError: If no text chunks could be extracted from the PDF.
    """
    # 1) Load & chunk the PDF content using PyPDFLoader and RecursiveCharacterTextSplitter.
    #    A per-call temporary directory replaces the former fixed
    #    "uploaded_pdf.pdf" in the working directory, which concurrent requests
    #    overwrote and which was never deleted.
    with tempfile.TemporaryDirectory() as scratch_dir:
        pdf_path = Path(scratch_dir) / "uploaded_pdf.pdf"
        pdf_path.write_bytes(pdf_file.read())
        pages = PyPDFLoader(str(pdf_path)).load()

    # Use RecursiveCharacterTextSplitter to break the document into chunks
    chunks = RecursiveCharacterTextSplitter(
        chunk_size=1_000, chunk_overlap=150
    ).split_documents(pages)

    # Nothing to embed or quiz on (empty or image-only PDF): fail early
    # instead of prompting the model with an empty context.
    if not chunks:
        raise ValueError("No extractable text found in the uploaded PDF")

    # 2) Embed & store chunks in RAM
    vectordb = InMemoryVectorStore(OpenAIEmbeddings())
    vectordb.add_documents(chunks)

    # 3) Retrieve top‑k context based on a query
    question = "Make a test for students based on the lecture material"
    docs = vectordb.similarity_search(question, k=6)
    context = "\n\n".join(d.page_content for d in docs)

    # 4) Generate JSON with structured output using GPT
    llm = ChatOpenAI(model_name=model_name, temperature=0)
    structured_llm = llm.with_structured_output(SummaryJson)  # forces valid schema

    prompt = (
        f"""You are a mock quiz generator. Based on the provided lecture notes, create the following types of questions:
    - One answer with 4 options to pick.
    - Multiple choice with 5-6 options to pick.
    - Matching questions.

    The provided lecture content: {context}

    Output the result in the following JSON format:
    [
      {{
        "title": "Who was the first president of the United States?",  // One choice question
        "variants": [
          "George Washington",  // Option 1
          "Abraham Lincoln"  // Option 2
        ],
        "correctVariantIndex": 0  // Correct answer is the first option (index 0)
      }},
      {{"title": "Which of the following countries were part of the Allied Powers in World War II?", 
        "variants": [
          "United States", "Germany", "Soviet Union", "Italy"
        ],
        "correctVariantIndex": [0, 2]
      }},
      {{
        "title": "Match the famous historical figures to their corresponding country:", 
        "variants": [
          "George Washington", "Albert Einstein", "Winston Churchill", "Mahatma Gandhi"
        ],
        "correctVariantIndex": [0, 1, 2, 3],
        "matchWith": [
          "United States", "Germany", "United Kingdom", "India"
        ]
      }}
    ]

    Ensure that the quiz questions are based only on the provided lecture notes or the test will fail."""
    )

    return structured_llm.invoke(prompt)


# ----------- FastAPI setup --------------------------------------------------
app = FastAPI()

@app.post("/generate-quiz/")
async def generate_quiz(file: UploadFile = File(...)):
    """
    Endpoint to generate mock quiz questions from the uploaded PDF file.

    Returns the object produced by `pdf_to_summary_json` as the JSON response
    body. On failure the body is `{"error": <message>}` with a 4xx/5xx status.
    """
    # Read the PDF file content
    pdf_bytes = await file.read()
    if not pdf_bytes:
        return JSONResponse(status_code=400, content={"error": "Uploaded file is empty"})

    try:
        # Return the object itself: FastAPI serialises the return value, so
        # the former `json.dumps(result)` produced a double-encoded JSON string.
        return pdf_to_summary_json(BytesIO(pdf_bytes))
    except Exception as e:
        # Same {"error": ...} payload as before, but with an error status so
        # clients do not mistake a failure for a successful (200) response.
        return JSONResponse(status_code=500, content={"error": str(e)})


In [98]:
!pip install python-multipart

Collecting python-multipart
  Using cached python_multipart-0.0.20-py3-none-any.whl.metadata (1.8 kB)
Using cached python_multipart-0.0.20-py3-none-any.whl (24 kB)
Installing collected packages: python-multipart
Successfully installed python-multipart-0.0.20


In [95]:
if __name__ == "__main__":
    # pdf_to_summary_json, as defined in the FastAPI cell, takes a binary
    # file-like object and has no `location_tag` parameter, so read the PDF
    # from disk and pass its bytes instead of a path string.
    with open("lecture4.pdf", "rb") as pdf_handle:
        summary = pdf_to_summary_json(BytesIO(pdf_handle.read()))
    print(json.dumps(summary, indent=2, ensure_ascii=False))

{
  "quiz": [
    {
      "title": "What is the primary focus of the lecture titled 'Algorithm II'?",
      "variants": [
        "Data Structures",
        "Dynamic Connectivity",
        "Sorting Algorithms",
        "Graph Theory"
      ],
      "correctVariantIndex": 1
    },
    {
      "title": "Which of the following methods is used to connect two objects in the Dynamic-Connectivity Client?",
      "variants": [
        "connect()",
        "union()",
        "link()",
        "merge()",
        "join()"
      ],
      "correctVariantIndex": [
        1
      ]
    },
    {
      "title": "Match the following terms with their definitions:",
      "variants": [
        "Quick-Union",
        "Quick-Find",
        "Dynamic Connectivity",
        "Union-Find"
      ],
      "correctVariantIndex": [
        0,
        1,
        2,
        3
      ],
      "matchWith": [
        "A method to connect components using tree structures.",
        "A method to find components using an ar

