# In [2]:  (notebook cell marker left over from the export)
#@title 1. Install Necessary Libraries
# Ensure this is run in your environment if these libraries are not already installed.
# In a Colab notebook, you would run:
# !pip install -q -U google-generativeai pdfplumber faiss-cpu numpy

#@title 2. Import Libraries and Set Up Gemini API Key
import json
import os
import time
import re
import numpy as np
from getpass import getpass

import google.generativeai as genai
import pdfplumber
import faiss # For FAISS

# --- Gemini API Key Setup ---
# Module-level state consulted by the rest of the script. API_KEY_CONFIGURED is
# flipped to True only after genai.configure() and model construction succeed.
API_KEY_CONFIGURED = False
GENERATIVE_MODEL_INSTANCE = None # Will be initialized after API key configuration
EMBEDDING_MODEL_NAME = "models/text-embedding-004"
GENERATIVE_MODEL_NAME = 'gemini-1.5-flash-latest'
LLM_CALL_DELAY_SECONDS = 2 # Delay between LLM calls for rate limiting

# Key lookup order: Colab secrets first; outside Colab, the GEMINI_API_KEY
# environment variable; in every failure case, an interactive getpass prompt.
try:
    from google.colab import userdata
    GEMINI_API_KEY = userdata.get('GEMINI_API_KEY')
    if not GEMINI_API_KEY:
        # The secret lookup returned an empty/falsy value.
        print("GEMINI_API_KEY not found in Colab secrets. Please enter it manually.")
        GEMINI_API_KEY = getpass("Enter your Gemini API Key: ")
    else:
        print("Gemini API Key loaded from Colab secrets.")
except ImportError: # Not in Colab
    # Attempt to get from environment variable if not in Colab
    GEMINI_API_KEY = os.environ.get('GEMINI_API_KEY')
    if GEMINI_API_KEY:
        print("Gemini API Key loaded from environment variable.")
    else:
        print("Not running in Colab or 'userdata' not available, and GEMINI_API_KEY environment variable not set. Please enter API key manually.")
        GEMINI_API_KEY = getpass("Enter your Gemini API Key: ")
# NOTE(review): this clause evaluates `userdata.SecretNotFoundError`, so it relies
# on `userdata` being bound. If the import above ever failed with something other
# than ImportError, evaluating this clause would itself raise NameError - confirm
# whether that case needs handling.
except userdata.SecretNotFoundError: # The secret is not defined in Colab
    print("GEMINI_API_KEY secret not found in Colab. Please enter it manually.")
    GEMINI_API_KEY = getpass("Enter your Gemini API Key: ")
except Exception as e: # Other userdata errors
    print(f"Error accessing Colab secrets: {e}. Please enter API key manually.")
    GEMINI_API_KEY = getpass("Enter your Gemini API Key: ")


# Configure the SDK and build the generative model. Any failure is reported and
# leaves API_KEY_CONFIGURED False, which makes the later pipeline stages skip.
if not GEMINI_API_KEY:
    print("ERROR: Gemini API Key is required to run this script.")
else:
    try:
        genai.configure(api_key=GEMINI_API_KEY)
        GENERATIVE_MODEL_INSTANCE = genai.GenerativeModel(GENERATIVE_MODEL_NAME)
        API_KEY_CONFIGURED = True
        print(f"Gemini API Key configured. Generative model '{GENERATIVE_MODEL_NAME}' and Embedding model '{EMBEDDING_MODEL_NAME}' will be used.")
    except Exception as e:
        print(f"Error configuring Gemini API or initializing model: {e}")
        API_KEY_CONFIGURED = False

#@title 3. Combined RAG Pipeline: File Upload, Processing, Validation

# MockFiles is defined unconditionally (even when Colab is available) so that
# later `isinstance(files, MockFiles)` checks never hit an undefined name.
class MockFiles:
    """Stand-in for `google.colab.files` when the script runs outside Colab.

    Neither method moves any data; each only prints guidance for the user.
    """

    def upload(self):
        """Print guidance about local files and return an empty upload mapping."""
        notices = (
            "File upload using 'files.upload()' is primarily for Google Colab.",
            "Please place your files in the same directory as the script or provide full paths.",
            "Simulating file input. Please enter local paths when prompted (if applicable in non-Colab):",
        )
        for notice in notices:
            print(notice)
        # Nothing is actually uploaded, so callers always receive an empty dict.
        return {}

    def download(self, filename):
        """Print where `filename` can be found locally; nothing is transferred."""
        print(f"Download of '{filename}' using 'files.download()' is primarily for Google Colab.")
        print(f"The file is available at ./{filename} in the local filesystem.")

# Bind the name `files` either to the real Colab module or to a MockFiles
# instance. Later code tells the two apart with isinstance(files, MockFiles).
try:
    from google.colab import files # For Colab-specific file operations
    print("Successfully imported 'google.colab.files'. Using Colab file U/D capabilities.")
except ImportError:
    print("Warning: 'google.colab.files' not found. File upload functionality will be limited if not in Colab.")
    files = MockFiles() # Assign an instance of our mock object if import fails


# --- File Upload Section ---
print("--- Uploading Files ---")
# Upload results. They keep these "nothing provided" defaults unless the upload
# section below fills them in.
FILES_UPLOADED_SUCCESSFULLY = False  # set True only when all three inputs are present
syllabus_json_path = None            # local path to the syllabus JSON
q_paper_json_path = None             # local path to the question paper JSON
textbook_pdf_paths = []              # local paths to the textbook PDFs

def upload_file_and_get_path(prompt_message, expected_extension):
    """
    Obtain a single input file and return a local path to it.

    In Colab the upload dialog is shown and the first uploaded file is written
    to the current directory. Elsewhere the user is asked to type a path to an
    existing file. A wrong extension only produces a warning; the path is still
    returned.

    Args:
        prompt_message: Text shown to the user. The part before its first ':'
            is reused as a short label in follow-up messages.
        expected_extension: Lower-case suffix (e.g. ".json") the file name is
            expected to end with.

    Returns:
        The path to the file, or None when nothing was uploaded or the typed
        path does not exist.
    """
    print(prompt_message)
    label = prompt_message.split(':')[0]

    # `files` is a MockFiles instance exactly when the Colab import failed.
    running_in_colab = hasattr(files, 'upload') and not isinstance(files, MockFiles)

    if not running_in_colab:
        # Local execution: take an existing path from the user as-is.
        typed_path = input(f"Enter the local path for {label.lower()}: ")
        if not os.path.exists(typed_path):
            print(f"File not found at path: {typed_path}")
            return None
        chosen_name = os.path.basename(typed_path)
        final_path = typed_path
        print(f"Using file from path: {final_path}")
    else:
        # Colab returns a dict like {'filename.ext': b'content'}.
        uploaded = files.upload()
        if not uploaded:
            print(f"No file uploaded for: {label}")
            return None
        # Only the first uploaded entry is used.
        chosen_name = next(iter(uploaded))
        payload = uploaded[chosen_name]
        # Persist the bytes so every later stage can work from a plain path.
        final_path = f"./{chosen_name}"
        with open(final_path, "wb") as f:
            f.write(payload)
        print(f"Uploaded and saved '{chosen_name}' via Colab. It is available at {final_path}")

    if not chosen_name.lower().endswith(expected_extension):
        print(f"Warning: Expected a '{expected_extension}' file for {label}, but got '{chosen_name}'.")

    return final_path


# Collect the three kinds of input (syllabus JSON, question paper JSON, textbook
# PDFs) and record in FILES_UPLOADED_SUCCESSFULLY whether all were provided.
if API_KEY_CONFIGURED: # Only proceed to upload if API key is set
    syllabus_json_path = upload_file_and_get_path(
        "Please upload your Syllabus JSON file (e.g., operating_systems_syllabus.json):", ".json"
    )
    q_paper_json_path = upload_file_and_get_path(
        "Please upload your Question Paper JSON file (in the flat list format, e.g., [{'question': 'Unit I - 1a', 'text': '...'}, ...]):", ".json"
    )

    # Textbook PDF Upload Handling
    print("\nPlease upload your Textbook PDF file(s).")
    # Same Colab-vs-mock test that upload_file_and_get_path uses.
    is_colab_files_module_for_textbooks = hasattr(files, 'upload') and not isinstance(files, MockFiles)

    if is_colab_files_module_for_textbooks: # Colab upload
        print("You can select multiple PDF files in the Colab file dialog.")
        uploaded_textbooks = files.upload() # This is a blocking call
        if not uploaded_textbooks:
            print("No textbook PDF files were uploaded via Colab dialog.")
        else:
            # Keep only entries whose name ends with .pdf (case-insensitive);
            # each accepted file is written to the current directory.
            for filename_str, content_bytes in uploaded_textbooks.items():
                if filename_str.lower().endswith(".pdf"):
                    filepath = f"./{filename_str}" # Save in current directory
                    with open(filepath, 'wb') as f_out:
                        f_out.write(content_bytes)
                    print(f"Uploaded and saved '{filename_str}'. It is available at {filepath}")
                    textbook_pdf_paths.append(filepath)
                else:
                    print(f"Warning: Expected a '.pdf' file for textbook, but got '{filename_str}'. It will be ignored.")
            if not textbook_pdf_paths:
                print("No valid textbook PDF files (ending with .pdf) were processed from the Colab upload.")
    else: # Non-Colab: Ask for paths one by one
        print("Enter local paths for your textbook PDF files. Enter 'done' when finished.")
        while True:
            pdf_path_input = input("Path to textbook PDF (or 'done'): ")
            if pdf_path_input.lower() == 'done':
                break
            # A path is accepted only if it exists AND ends with .pdf; the two
            # remaining branches report which of the conditions failed.
            if os.path.exists(pdf_path_input) and pdf_path_input.lower().endswith(".pdf"):
                textbook_pdf_paths.append(pdf_path_input)
                print(f"Added textbook: {pdf_path_input}")
            elif not pdf_path_input.lower().endswith(".pdf"):
                print(f"File '{pdf_path_input}' is not a PDF. Please provide a .pdf file.")
            else:
                print(f"File not found at path: {pdf_path_input}")
        if not textbook_pdf_paths:
            print("No textbook PDF paths were provided.")


    # Check for successful uploads
    if syllabus_json_path and q_paper_json_path and textbook_pdf_paths:
        FILES_UPLOADED_SUCCESSFULLY = True
        print(f"\nSyllabus JSON, Question Paper JSON, and {len(textbook_pdf_paths)} Textbook PDF(s) appear to be available.")
    else:
        # Name every missing input in a single error line.
        error_msg_parts = []
        if not syllabus_json_path: error_msg_parts.append("Syllabus JSON")
        if not q_paper_json_path: error_msg_parts.append("Question Paper JSON")
        if not textbook_pdf_paths: error_msg_parts.append("Textbook PDF(s)")
        print(f"\nERROR: The following required files/types were not sufficiently provided: {', '.join(error_msg_parts)}. Cannot proceed with RAG pipeline.")

else:
    print("API Key not configured. File uploads skipped. Cannot proceed with RAG pipeline.")

# --- Document Processing and Vector Store Creation Utilities ---
if FILES_UPLOADED_SUCCESSFULLY and API_KEY_CONFIGURED:
    # Helper class for Documents
    class SimpleDocument:
        """A chunk of text together with a metadata dict describing it."""

        def __init__(self, page_content, metadata=None):
            # Give each instance its own dict when the caller passes none.
            if metadata is None:
                metadata = {}
            self.page_content = page_content
            self.metadata = metadata

        def __repr__(self):
            # Preview: first 50 characters with newlines flattened to spaces.
            preview = self.page_content[:50].replace("\n", " ")
            return f"SimpleDocument(page_content='{preview}...', metadata={self.metadata})"

    # Text Chunking Utilities
    def chunk_text_by_paragraphs(text, min_chunk_len=50, max_chunk_len=700):
        """Split `text` into chunks built from whole paragraphs.

        Paragraphs (separated by blank lines) are greedily merged, joined by a
        blank line, for as long as the merged chunk stays within
        `max_chunk_len` characters. A merged chunk shorter than `min_chunk_len`
        is discarded. A single paragraph longer than `max_chunk_len` is
        hard-split into consecutive slices of at most `max_chunk_len`
        characters; those slices are kept even if the last one is short.

        Args:
            text: Source text; None or empty yields an empty list.
            min_chunk_len: Minimum length for a merged chunk to be kept.
            max_chunk_len: Maximum length of any returned chunk.

        Returns:
            List of chunk strings, in document order.
        """
        if not text:
            return []

        separator = "\n\n"
        chunks = []
        current_chunk = ""
        for para in re.split(r'\n\s*\n+', text.strip()):
            para = para.strip()
            if not para:
                continue
            if not current_chunk:
                current_chunk = para
            # Budget the full separator length. Counting only one character let
            # a merged chunk reach max_chunk_len + 1, which was then hard-split
            # into a full-size piece plus a stray one-character chunk.
            elif len(current_chunk) + len(separator) + len(para) <= max_chunk_len:
                current_chunk += separator + para
            else:
                if len(current_chunk) >= min_chunk_len:
                    chunks.append(current_chunk)
                current_chunk = para
        if current_chunk and len(current_chunk) >= min_chunk_len:
            chunks.append(current_chunk)

        final_chunks = []
        for chunk in chunks:
            if len(chunk) > max_chunk_len:
                # Only a single over-long paragraph can get here; slice it.
                final_chunks.extend(
                    chunk[i:i + max_chunk_len]
                    for i in range(0, len(chunk), max_chunk_len)
                )
            else:
                final_chunks.append(chunk)
        return final_chunks

    # Syllabus Processing
    def normalize_unit_id(unit_id_str):
        """Return a canonical form of a unit identifier.

        The input is upper-cased and stripped of spaces and underscores. If the
        result starts with "UNIT" followed by Roman-numeral letters and/or
        digits, "UNIT-<number>" is returned and anything after the number is
        dropped. Otherwise the cleaned string is returned unchanged. Empty or
        None input yields "UNIT-UNKNOWN".
        """
        if not unit_id_str:
            return "UNIT-UNKNOWN"

        # Delete spaces and underscores in one pass.
        compact = unit_id_str.upper().translate(str.maketrans("", "", " _"))
        found = re.match(r"(UNIT)([IVXLCDM\d]+)", compact)
        if found is None:
            return compact
        prefix, number = found.groups()
        return f"{prefix}-{number}"

    def load_syllabus_from_json(file_path):
        """Load a syllabus JSON file and turn its units into document chunks.

        The file is read as an object with an optional "course_name" and a
        "units" list. Each unit's "syllabus_content" is chunked and every chunk
        becomes a SimpleDocument tagged with course, unit id, unit title and a
        chunk id. Any error while reading or processing is printed rather than
        raised, and whatever was collected so far is returned.

        Args:
            file_path: Path to the syllabus JSON file.

        Returns:
            List of SimpleDocument objects; empty if the path is missing or
            nothing could be processed.
        """
        if not file_path or not os.path.exists(file_path):
            print(f"Syllabus file not found or path is None: {file_path}")
            return []

        documents = []
        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                data = json.load(f)
            course_name = data.get("course_name", "Unknown Course")
            print(f"Processing syllabus for course: {course_name}")

            for unit_data in data.get("units", []):
                unit_id_val = normalize_unit_id(unit_data.get("unit", "Unknown Unit"))
                unit_title_val = unit_data.get("title", "Untitled")
                syllabus_content = unit_data.get("syllabus_content", "").strip()
                if not syllabus_content:
                    continue

                content_chunks = chunk_text_by_paragraphs(syllabus_content, min_chunk_len=50, max_chunk_len=400)
                if not content_chunks:
                    # Content too short to survive chunking: keep it whole.
                    content_chunks = [syllabus_content]

                for i, chunk_cont in enumerate(content_chunks):
                    chunk_metadata = {
                        "source_type": "syllabus",
                        "course_name": course_name,
                        "unit_id": unit_id_val,
                        "unit_title": unit_title_val,
                        "chunk_id": f"syl_chunk_{unit_id_val.replace('-', '')}_{i}",
                    }
                    documents.append(SimpleDocument(chunk_cont, chunk_metadata))
            print(f"Successfully processed syllabus into {len(documents)} document chunks.")
        except Exception as e:
            print(f"Error loading or processing syllabus JSON from '{file_path}': {e}")
        return documents

    # Textbook Processing
    def extract_text_from_pdf_paged(pdf_path):
        """Extract text from each page of a PDF using pdfplumber.

        Pages with no text (or only whitespace) are skipped. Within a page,
        runs of blank lines are collapsed to a single blank line. Errors while
        opening or reading the PDF are printed rather than raised, and the
        pages collected so far are returned.

        Args:
            pdf_path: Path to the PDF file.

        Returns:
            List of {"page_number": int (1-based), "text": str} dicts.
        """
        if not pdf_path or not os.path.exists(pdf_path):
            print(f"Textbook PDF file not found or path is None: {pdf_path}")
            return []

        pages_text_content = []
        try:
            with pdfplumber.open(pdf_path) as pdf:
                print(f"Opening textbook PDF: '{os.path.basename(pdf_path)}', {len(pdf.pages)} pages found.")
                for page_number, page in enumerate(pdf.pages, start=1):
                    raw_text = page.extract_text(x_tolerance=2, y_tolerance=2, layout=False)
                    if not raw_text or not raw_text.strip():
                        continue
                    pages_text_content.append({
                        "page_number": page_number,
                        "text": re.sub(r'\n\s*\n+', '\n\n', raw_text.strip()),
                    })
                print(f"Successfully extracted text from {len(pages_text_content)} pages of the textbook '{os.path.basename(pdf_path)}'.")
        except Exception as e:
            print(f"Error reading or processing PDF '{pdf_path}': {e}")
        return pages_text_content

    def prepare_textbook_documents(pdf_paths_list):
        """Turn a list of textbook PDFs into SimpleDocument chunks.

        For each existing PDF, the text of all its pages is joined, chunked by
        paragraphs (200-800 characters) and wrapped in SimpleDocument objects
        tagged with the source file name and a per-book chunk id. Missing
        paths and PDFs with no extractable text are reported and skipped.

        Args:
            pdf_paths_list: Iterable of paths to textbook PDF files.

        Returns:
            List of SimpleDocument objects from all PDFs, in input order.
        """
        all_textbook_documents = []
        if not pdf_paths_list:
            print("No textbook PDF paths provided for document preparation.")
            return all_textbook_documents

        for pdf_path in pdf_paths_list:
            if not pdf_path or not os.path.exists(pdf_path):
                print(f"Textbook PDF file not found or path is invalid: '{pdf_path}'. Skipping.")
                continue

            doc_name = os.path.basename(pdf_path)
            print(f"\n--- Processing Textbook File: {doc_name} ---")
            pages_data = extract_text_from_pdf_paged(pdf_path)
            if not pages_data:
                print(f"No text extracted from '{doc_name}'.")
                continue

            # Strip only a trailing ".pdf", case-insensitively. The old
            # str.replace('.pdf', '') missed "Book.PDF" and also removed ".pdf"
            # from the middle of a name.
            stem, extension = os.path.splitext(doc_name)
            if extension.lower() != ".pdf":
                stem = doc_name
            safe_doc_name_part = re.sub(r'[^a-zA-Z0-9_-]', '_', stem)

            full_textbook_text = "\n\n".join(page_info["text"] for page_info in pages_data)
            textbook_chunks = chunk_text_by_paragraphs(full_textbook_text, min_chunk_len=200, max_chunk_len=800)

            for chunk_index, chunk_content in enumerate(textbook_chunks):
                all_textbook_documents.append(SimpleDocument(
                    chunk_content,
                    {
                        "source_type": "textbook",
                        "document_name": doc_name,
                        "chunk_id": f"tb_chunk_{safe_doc_name_part}_{chunk_index}"
                    }
                ))

            if textbook_chunks:
                print(f"Successfully processed textbook '{doc_name}' into {len(textbook_chunks)} document chunks.")
            else:
                print(f"No document chunks created from textbook '{doc_name}'.")

        if all_textbook_documents:
            print(f"\nTotal textbook documents created from all {len(pdf_paths_list)} PDF(s): {len(all_textbook_documents)} chunks.")
        else:
            print("\nNo document chunks created from any of the provided textbook PDFs.")
        return all_textbook_documents

    # Embedding and Vector Store Creation
    def get_gemini_embeddings(texts, task_type="RETRIEVAL_DOCUMENT", model_name=EMBEDDING_MODEL_NAME):
        """Embed `texts` with the Gemini embedding API, in batches of 100.

        Empty or whitespace-only texts are not sent to the API; their position
        in the result holds None. When a batch fails, all of its positions are
        filled with None. If the error mentions "quota", processing stops, so
        the returned list can be shorter than `texts`.

        Args:
            texts: Sequence of strings to embed.
            task_type: Task type passed to genai.embed_content.
            model_name: Embedding model passed to genai.embed_content.

        Returns:
            List aligned with `texts` (possibly truncated after a quota error)
            whose items are embedding vectors or None. Empty when the API key
            is not configured or `texts` is empty.
        """
        if not API_KEY_CONFIGURED:
            print("ERROR: API key not configured.")
            return []
        if not texts:
            print("Warning: No texts for embeddings.")
            return []

        BATCH_SIZE = 100
        num_batches = (len(texts) - 1) // BATCH_SIZE + 1
        print(f"Total texts to embed: {len(texts)}, in {num_batches} batches using {model_name}.")

        embeddings_list = []
        for start in range(0, len(texts), BATCH_SIZE):
            batch_number = start // BATCH_SIZE + 1
            batch = texts[start:start + BATCH_SIZE]
            # (position in batch, text) for every text worth sending.
            usable = [(pos, item) for pos, item in enumerate(batch) if item and item.strip()]

            if not usable:
                embeddings_list.extend([None] * len(batch))
                print(f"  Skipped embedding batch {batch_number}/{num_batches} as it contained no valid text after stripping.")
                continue

            payload = [item for _, item in usable]
            try:
                result = genai.embed_content(model=model_name, content=payload, task_type=task_type)
                # Put each embedding back at its text's original position.
                slots = [None] * len(batch)
                for api_idx, (pos, _) in enumerate(usable):
                    slots[pos] = result['embedding'][api_idx]
                embeddings_list.extend(slots)
                print(f"  Embedded batch {batch_number}/{num_batches} (API call with {len(payload)} items).")
                time.sleep(1.0)
            except Exception as e:
                print(f"Error generating embeddings for batch starting at index {start}: {e}")
                embeddings_list.extend([None] * len(batch))
                quota_hit = "quota" in str(e).lower() or (hasattr(e, 'message') and "quota" in str(e.message).lower())
                if quota_hit:
                    print("Quota possibly exceeded. Halting embedding generation.")
                    break
        return embeddings_list

    class SimpleVectorStoreFAISS:
        """In-memory vector store: a FAISS L2 index plus the matching documents.

        Attributes:
            documents: Documents that have a usable embedding, in index order.
            faiss_index: faiss.IndexFlatL2 over those embeddings, or None if the
                index could not be built.
            dimension: Embedding dimension, or 0 if no index was built.
        """

        def __init__(self, documents, embeddings_list):
            """Build the index from documents and their position-aligned embeddings.

            Pairs whose embedding is None or empty are dropped. Embeddings with
            no corresponding document (extra trailing entries) are ignored.
            Problems are printed and leave `faiss_index` as None.
            """
            self.documents = []
            self.faiss_index = None
            self.dimension = 0
            if not documents:
                print("Warning: No documents provided to SimpleVectorStoreFAISS.")
                return
            if not embeddings_list:
                print("Warning: No embeddings list for SimpleVectorStoreFAISS.")
                return

            kept_documents = []
            kept_vectors = []
            for doc, emb in zip(documents, embeddings_list):
                if emb is None or len(emb) == 0:
                    continue
                kept_documents.append(doc)
                kept_vectors.append(emb)

            self.documents = kept_documents
            if not kept_vectors:
                print("No valid embeddings available for FAISS index after filtering Nones or empty embeddings.")
                return

            try:
                embeddings_np = np.array(kept_vectors, dtype='float32')
            except ValueError as ve:
                print(f"Error converting valid embeddings to NumPy array: {ve}.")
                return

            if embeddings_np.ndim != 2 or embeddings_np.shape[0] == 0:
                print(f"Error: Embeddings NumPy array has an unexpected shape: {embeddings_np.shape}.")
                return
            self.dimension = embeddings_np.shape[1]
            if self.dimension == 0:
                print("Error: Embeddings dimension is 0. Cannot build FAISS index.")
                return

            index = faiss.IndexFlatL2(self.dimension)
            index.add(embeddings_np)
            self.faiss_index = index
            print(f"FAISS index built: {self.faiss_index.ntotal} vectors, dimension {self.dimension}.")
            if self.faiss_index.ntotal != len(self.documents):
                print(f"Warning: FAISS index count ({self.faiss_index.ntotal}) differs from stored document count ({len(self.documents)}).")

        def search(self, query_text, k=5, task_type="RETRIEVAL_QUERY", embedding_model_name=EMBEDDING_MODEL_NAME):
            """Return up to `k` stored documents nearest to `query_text`.

            The query is embedded with genai.embed_content and looked up in the
            index. Every failure (no index, no API key, empty query, embedding
            error, dimension mismatch) is printed and yields an empty list.

            Returns:
                List of {"document": <document>, "distance": float} dicts in the
                order returned by the index search.
            """
            if not self.faiss_index or self.faiss_index.ntotal == 0:
                print("FAISS index is empty or not properly initialized. Search cannot be performed.")
                return []
            if not API_KEY_CONFIGURED:
                print("API key not configured. Cannot embed query for search.")
                return []
            if not query_text or not query_text.strip():
                print("Error: Empty query text provided for search.")
                return []

            try:
                query_embedding_result = genai.embed_content(model=embedding_model_name, content=[query_text.strip()], task_type=task_type)
                query_embedding_vector = query_embedding_result['embedding'][0]
            except Exception as e:
                print(f"Error embedding query '{query_text[:50]}...': {e}")
                return []

            if not query_embedding_vector or len(query_embedding_vector) == 0:
                print(f"Failed to get a valid query embedding for '{query_text[:50]}...'.")
                return []

            query_np = np.array([query_embedding_vector], dtype='float32')
            if query_np.shape[1] != self.dimension:
                print(f"Error: Query embedding dimension ({query_np.shape[1]}) does not match FAISS index dimension ({self.dimension}).")
                return []

            # Never ask the index for more neighbours than it holds.
            actual_k = min(k, self.faiss_index.ntotal)
            if actual_k == 0:
                print("No items in index (actual_k=0), search cannot return results.")
                return []

            distances, indices = self.faiss_index.search(query_np, actual_k)
            if indices.size == 0 or indices[0].size == 0:
                return []

            retrieved_docs = []
            for idx, dist in zip(indices[0], distances[0]):
                # Skip -1 entries and any index outside the stored documents.
                if idx == -1 or idx >= len(self.documents):
                    continue
                retrieved_docs.append({"document": self.documents[idx], "distance": float(dist)})
            return retrieved_docs

    # Vector stores used by the later retrieval steps. Each stays None when its
    # inputs are missing or produce no documents.
    syllabus_vector_store = None
    textbook_vector_store = None

    # Build Vector Stores
    print("\n--- Building Syllabus Vector Store ---")
    if syllabus_json_path:
        syllabus_docs_list = load_syllabus_from_json(syllabus_json_path)
        if syllabus_docs_list:
            # Embeddings are requested in document order, so the two lists
            # handed to the store line up by position.
            syllabus_texts = [doc.page_content for doc in syllabus_docs_list]
            syllabus_embeddings_list = get_gemini_embeddings(syllabus_texts, task_type="RETRIEVAL_DOCUMENT")
            syllabus_vector_store = SimpleVectorStoreFAISS(syllabus_docs_list, syllabus_embeddings_list)
        else: print("No syllabus docs loaded. Syllabus store not built.")
    else: print("Syllabus JSON path not set. Syllabus store not built.")

    print("\n--- Building Textbook Vector Store ---")
    if textbook_pdf_paths:
        textbook_docs_list = prepare_textbook_documents(textbook_pdf_paths)
        if textbook_docs_list:
            # The same filter is applied to both lists so texts and documents
            # stay index-aligned.
            texts_to_embed = [doc.page_content for doc in textbook_docs_list if doc.page_content and doc.page_content.strip()]
            original_docs_for_embedding = [doc for doc in textbook_docs_list if doc.page_content and doc.page_content.strip()]
            if texts_to_embed:
                textbook_embeddings_list_raw = get_gemini_embeddings(texts_to_embed, task_type="RETRIEVAL_DOCUMENT")
                textbook_vector_store = SimpleVectorStoreFAISS(original_docs_for_embedding, textbook_embeddings_list_raw)
            else: print("No valid text (non-empty, non-whitespace) in any textbook docs to embed. Textbook store not built.")
        else: print("No textbook docs processed from any PDF. Textbook store not built.")
    else: print("No Textbook PDF paths available. Textbook store not built.")

    # Readiness report: a store counts as ready only if it exists, has an index
    # and that index holds at least one vector.
    if syllabus_vector_store and syllabus_vector_store.faiss_index and syllabus_vector_store.faiss_index.ntotal > 0:
        print(f"\nSyllabus Vector Store READY: {syllabus_vector_store.faiss_index.ntotal} vectors.")
    else: print("\nSyllabus Vector Store NOT READY or empty.")

    if textbook_vector_store and textbook_vector_store.faiss_index and textbook_vector_store.faiss_index.ntotal > 0:
        print(f"Textbook Vector Store READY: {textbook_vector_store.faiss_index.ntotal} vectors (from {len(textbook_pdf_paths)} PDF(s) processed).")
    else: print("Textbook Vector Store NOT READY or empty.")


    # --- LLM Validation Logic and Question Processing ---
    def format_retrieved_context_for_llm(retrieved_items, source_type_name):
        """Render retrieved items as a text block for inclusion in an LLM prompt.

        Items without a "document" key, whose document lacks `page_content` or
        `metadata`, or whose content is blank are skipped. Each remaining item
        becomes a numbered "Context N" entry showing selected metadata, the
        distance (-1 when absent) and at most the first 700 characters of
        content.

        Args:
            retrieved_items: List of {"document": ..., "distance": ...} dicts.
            source_type_name: Label used in headings, e.g. "Syllabus".

        Returns:
            The formatted context string, or a one-line "No relevant ..."
            message when there is nothing to show.
        """
        if not retrieved_items:
            return f"No relevant {source_type_name} sections found during retrieval."

        # (metadata key, label shown in the prompt), in display order.
        labelled_fields = (
            ("unit_id", "Unit"),
            ("unit_title", "Title"),
            ("document_name", "Doc"),
            ("chunk_id", "ChunkID"),
        )

        sections = []
        for item in retrieved_items:
            if "document" not in item:
                continue
            doc = item["document"]
            if not (hasattr(doc, "page_content") and hasattr(doc, "metadata")):
                continue
            page_content_clean = str(doc.page_content).strip()
            if not page_content_clean:
                continue

            tags = [f"{label}: {doc.metadata[key]}" for key, label in labelled_fields if doc.metadata.get(key)]
            metadata_display = ", ".join(tags) if tags else "N/A"
            context_number = len(sections) + 1
            sections.append(
                f"Context {context_number}: (Metadata: {metadata_display}; Distance: {item.get('distance', -1):.4f})\nContent: {page_content_clean[:700]}...\n\n"
            )

        if not sections:
            return f"No relevant (non-empty) {source_type_name} sections found after filtering."
        header = f"--- Relevant {source_type_name} Sections (top {len(retrieved_items)}) ---\n"
        return (header + "".join(sections)).strip()

    def call_gemini_llm(prompt_text, llm_instance=GENERATIVE_MODEL_INSTANCE):
        """Send `prompt_text` to the Gemini model and return its text reply.

        The four harm categories listed below are requested with threshold
        BLOCK_NONE. After a call that does not raise, the function sleeps for
        LLM_CALL_DELAY_SECONDS. Failures are never raised; they are reported as
        sentinel strings that all start with "ERROR_LLM_".

        Args:
            prompt_text: Full prompt to send.
            llm_instance: Model object exposing generate_content(); defaults to
                the module-level instance captured at definition time.

        Returns:
            The stripped response text, or one of: "ERROR_LLM_UNAVAILABLE",
            "ERROR_LLM_EMPTY_RESPONSE", "ERROR_LLM_CALL_API_KEY",
            "ERROR_LLM_CALL_QUOTA", "ERROR_LLM_CALL_TIMEOUT",
            "ERROR_LLM_SAFETY_BLOCK", "ERROR_LLM_CALL_GENERAL".
        """
        if not llm_instance:
            print("ERROR: LLM (GENERATIVE_MODEL_INSTANCE) not initialized.")
            return "ERROR_LLM_UNAVAILABLE"

        try:
            harm_categories = (
                "HARM_CATEGORY_HARASSMENT",
                "HARM_CATEGORY_HATE_SPEECH",
                "HARM_CATEGORY_SEXUALLY_EXPLICIT",
                "HARM_CATEGORY_DANGEROUS_CONTENT",
            )
            safety_settings_config = [
                {"category": category, "threshold": "BLOCK_NONE"}
                for category in harm_categories
            ]
            response = llm_instance.generate_content(
                prompt_text,
                generation_config=genai.types.GenerationConfig(),  # library defaults
                safety_settings=safety_settings_config,
            )
            time.sleep(LLM_CALL_DELAY_SECONDS)

            candidates = response.candidates
            if candidates and candidates[0].content.parts:
                return response.text.strip()

            # No usable content: report the most specific reason available.
            finish_reason_str = "Unknown"
            feedback = response.prompt_feedback
            if feedback and feedback.block_reason:
                finish_reason_str = feedback.block_reason.name
            elif candidates and candidates[0].finish_reason:
                finish_reason_str = candidates[0].finish_reason.name
            print(f"Warning: LLM response was empty or incomplete. Finish reason: {finish_reason_str}")
            return "ERROR_LLM_EMPTY_RESPONSE"
        except Exception as e:
            print(f"  Error during LLM call: {e}")
            error_text = str(e)
            # Checks run in this order; the first match decides the sentinel.
            if "API key not valid" in error_text:
                return "ERROR_LLM_CALL_API_KEY"
            if "quota" in error_text.lower() or (hasattr(e, 'message') and "quota" in str(e.message).lower()):
                return "ERROR_LLM_CALL_QUOTA"
            if "DeadlineExceeded" in error_text:
                return "ERROR_LLM_CALL_TIMEOUT"
            if "response was blocked" in error_text.lower() or (hasattr(e, 'args') and e.args and "blocked" in str(e.args[0]).lower()):
                return "ERROR_LLM_SAFETY_BLOCK"
            return "ERROR_LLM_CALL_GENERAL"

    def check_syllabus_coverage_with_gemini(question_details, retrieved_syllabus_items):
        q_unit, q_num, q_sub, q_text = question_details["q_paper_unit"], question_details["q_paper_num"], question_details["q_paper_subpart"], question_details["question_text"]
        syllabus_context_str = format_retrieved_context_for_llm(retrieved_syllabus_items, "Syllabus")
        prompt = f"""You are an expert academic assistant evaluating if an exam question is covered by a given syllabus.
Question Details from Exam Paper:
Unit (from QP ID): {q_unit}
Question Number (from QP ID): {q_num}{q_sub}
Question Text: "{q_text}"

{syllabus_context_str}

Based *solely* on the "Relevant Syllabus Sections" provided above (if any):
1. Is the question IN SYLLABUS or OUT OF SYLLABUS?
2. If IN SYLLABUS, briefly state which syllabus unit and title appear to cover it from the provided context, and quote the most relevant part of the syllabus text that supports this.
3. If OUT OF SYLLABUS, briefly explain why it is not covered by the provided syllabus sections (e.g., topic not mentioned, depth beyond scope, or no relevant syllabus context found).

Your response MUST start with "SYLLABUS_VERDICT: IN_SYLLABUS" or "SYLLABUS_VERDICT: OUT_OF_SYLLABUS".
Immediately after the verdict, on a new line, provide your reasoning starting with "REASONING: ".
Example IN SYLLABUS response:
SYLLABUS_VERDICT: IN_SYLLABUS
REASONING: The question is covered under syllabus unit 'UNIT-I: Introduction to Operating Systems' as it directly asks about 'System Calls', which is listed in the syllabus content: "System Structures- Operating System Services, System calls...".

Example OUT OF SYLLABUS response:
SYLLABUS_VERDICT: OUT_OF_SYLLABUS
REASONING: The topic of 'Advanced Quantum Entanglement in OS Design' is not mentioned in any of the provided syllabus sections, which focus on classical operating system concepts.
"""
        raw_text_response = call_gemini_llm(prompt)
        verdict, reasoning = "ERROR_PARSING_LLM_RESPONSE", "Could not parse LLM response for syllabus."
        if raw_text_response.startswith("ERROR_LLM_"): verdict, reasoning = raw_text_response, "LLM call failed."
        elif raw_text_response.startswith("SYLLABUS_VERDICT: IN_SYLLABUS"):
            verdict = "IN_SYLLABUS"; reasoning = raw_text_response.split("REASONING:", 1)[1].strip() if "REASONING:" in raw_text_response else "No reasoning from LLM."
        elif raw_text_response.startswith("SYLLABUS_VERDICT: OUT_OF_SYLLABUS"):
            verdict = "OUT_OF_SYLLABUS"; reasoning = raw_text_response.split("REASONING:", 1)[1].strip() if "REASONING:" in raw_text_response else "No reasoning from LLM."
        else: reasoning = f"Unexpected syllabus response format: {raw_text_response[:200]}..."
        return verdict, reasoning

    def check_textbook_coverage_with_gemini(question_details, syllabus_verdict_reasoning, retrieved_textbook_items):
        q_text = question_details["question_text"]
        textbook_context_str = format_retrieved_context_for_llm(retrieved_textbook_items, "Textbook")
        prompt = f"""An expert academic assistant determined a question is IN_SYLLABUS. Now, check textbook coverage.
Question: "{q_text}"
Syllabus Finding: "{syllabus_verdict_reasoning[:300]}..."

{textbook_context_str}

Based ONLY on the "Relevant Textbook Sections" provided above (if any, these may come from one or more textbook files):
1. Is the question's topic substantively covered in the provided textbook excerpts?
2. Answer strictly with "TEXTBOOK_COVERAGE: YES_IN_TEXTBOOK" or "TEXTBOOK_COVERAGE: NO_IN_PROVIDED_TEXTBOOK_EXCERPTS".
3. Immediately after the verdict, on a new line, provide your reasoning starting with "REASONING: ".
   - If YES_IN_TEXTBOOK, briefly explain how and quote relevant part(s) from the textbook context, mentioning the source document if available in metadata.
   - If NO_IN_PROVIDED_TEXTBOOK_EXCERPTS, explain why (e.g., topic not found, mentioned superficially, or context insufficient).

Example YES_IN_TEXTBOOK:
TEXTBOOK_COVERAGE: YES_IN_TEXTBOOK
REASONING: The textbook excerpt from 'Operating_System_Concepts.pdf' discusses 'Process Control Block (PCB)' in detail, stating "Each process is represented in the operating system by a process control block (PCB)...".

Example NO_IN_PROVIDED_TEXTBOOK_EXCERPTS:
TEXTBOOK_COVERAGE: NO_IN_PROVIDED_TEXTBOOK_EXCERPTS
REASONING: While the textbook mentions 'scheduling' in general, the specific 'Fuzzy Logic Scheduling Algorithm' asked in the question is not detailed in the provided excerpts.
"""
        raw_text_response = call_gemini_llm(prompt)
        verdict, reasoning = "ERROR_PARSING_LLM_RESPONSE", "Could not parse LLM response for textbook."
        if raw_text_response.startswith("ERROR_LLM_"): verdict, reasoning = raw_text_response, "LLM call failed."
        elif raw_text_response.startswith("TEXTBOOK_COVERAGE: YES_IN_TEXTBOOK"):
            verdict = "YES_IN_TEXTBOOK"; reasoning = raw_text_response.split("REASONING:", 1)[1].strip() if "REASONING:" in raw_text_response else "No reasoning from LLM."
        elif raw_text_response.startswith("TEXTBOOK_COVERAGE: NO_IN_PROVIDED_TEXTBOOK_EXCERPTS"):
            verdict = "NO_IN_PROVIDED_TEXTBOOK_EXCERPTS"; reasoning = raw_text_response.split("REASONING:", 1)[1].strip() if "REASONING:" in raw_text_response else "No reasoning from LLM."
        else: reasoning = f"Unexpected textbook response format: {raw_text_response[:200]}..."
        return verdict, reasoning

    def process_single_question_entry(unit_qp_name, q_num_qp, sub_part_key_qp, q_text_qp, overall_results_dict):
        """Validate a single question and append its result record.

        The question is first checked against the syllabus vector store; only a
        question judged "IN_SYLLABUS" is then checked against the textbook vector
        store. Progress is printed along the way.

        Args:
            unit_qp_name: Unit label used in the question identifier.
            q_num_qp: Question number used in the question identifier.
            sub_part_key_qp: Optional sub-part key; non-alphanumeric characters
                and underscores are removed before use.
            q_text_qp: Full text of the question.
            overall_results_dict: Results dict; the new record is appended to its
                "validation_summary" list.
        """

        def _summarize_hits(hits):
            # One compact line per retrieved item: distance, metadata, and a
            # 100-character content preview with newlines flattened to spaces.
            lines = []
            for hit in hits:
                doc = hit['document']
                lines.append(f"Dist: {hit['distance']:.4f}, Meta: {doc.metadata}, Content: {doc.page_content[:100].replace(chr(10),' ')}...")
            return lines

        # A missing sub-part becomes "", then only letters/digits are kept.
        sub_part_key_qp = re.sub(r'[\W_]+', '', sub_part_key_qp or "")

        question_id_str = f"{unit_qp_name} Q{q_num_qp}{sub_part_key_qp}"
        single_line_text = ' '.join(q_text_qp.splitlines())
        q_text_print_preview = single_line_text[:80]
        print(f"\nValidating: {question_id_str} - \"{q_text_print_preview}...\"")

        current_question_details = {
            "q_paper_unit": unit_qp_name,
            "q_paper_num": q_num_qp,
            "q_paper_subpart": sub_part_key_qp,
            "question_text": q_text_qp,
        }
        result_entry = {
            "question_identifier": question_id_str,
            "question_text": q_text_qp,
            "syllabus_status": "NOT_PROCESSED",
            "syllabus_reasoning": "",
            "textbook_coverage_status": "NOT_CHECKED",
            "textbook_reasoning": "",
            "retrieved_syllabus_context_summary": [],
            "retrieved_textbook_context_summary": [],
        }

        # --- Syllabus stage ---
        if syllabus_vector_store and syllabus_vector_store.faiss_index and syllabus_vector_store.faiss_index.ntotal != 0:
            syllabus_hits = syllabus_vector_store.search(q_text_qp, k=3)
            result_entry["retrieved_syllabus_context_summary"].extend(_summarize_hits(syllabus_hits))
            syllabus_status, syllabus_reasoning = check_syllabus_coverage_with_gemini(current_question_details, syllabus_hits)
            result_entry["syllabus_status"] = syllabus_status
            result_entry["syllabus_reasoning"] = syllabus_reasoning
        else:
            result_entry["syllabus_status"] = "ERROR_NO_SYLLABUS_VS"
            result_entry["syllabus_reasoning"] = "Syllabus vector store not ready or empty."

        print(f"  Syllabus Status: {result_entry['syllabus_status']}")
        if result_entry['syllabus_status'].startswith("ERROR"):
            print(f"  Syllabus Reasoning (Error Detail): {result_entry['syllabus_reasoning']}")

        # --- Textbook stage (only reached for in-syllabus questions) ---
        if result_entry["syllabus_status"] != "IN_SYLLABUS":
            result_entry["textbook_coverage_status"] = "NOT_APPLICABLE (OUT_OF_SYLLABUS or SYLLABUS_ERROR)"
            print(f"  Textbook check skipped as question is {result_entry['syllabus_status']}.")
        elif textbook_vector_store and textbook_vector_store.faiss_index and textbook_vector_store.faiss_index.ntotal > 0:
            textbook_hits = textbook_vector_store.search(q_text_qp, k=3)
            result_entry["retrieved_textbook_context_summary"].extend(_summarize_hits(textbook_hits))
            t_status, t_reasoning = check_textbook_coverage_with_gemini(current_question_details, result_entry['syllabus_reasoning'], textbook_hits)
            result_entry["textbook_coverage_status"] = t_status
            result_entry["textbook_reasoning"] = t_reasoning
            print(f"  Textbook Coverage: {t_status}")
            if t_status.startswith("ERROR"):
                print(f"  Textbook Reasoning (Error Detail): {t_reasoning}")
        else:
            result_entry["textbook_coverage_status"] = "NOT_CHECKED (Textbook VS Unavailable or Empty)"
            print("  Textbook check skipped: Textbook Vector Store not ready or is empty.")

        overall_results_dict["validation_summary"].append(result_entry)

    def _parse_question_identifier(question_id_field):
        """Split an identifier such as "Unit I - 1a" into (unit, number, sub-part).

        Returns a tuple of three strings. Parts that cannot be extracted keep the
        defaults "UNKNOWN_UNIT", "0" and "". Warnings are printed whenever the
        primary pattern does not match and a fallback is attempted.
        """
        identifier = question_id_field.strip()
        unit_name, q_num, sub_part = "UNKNOWN_UNIT", "0", ""

        # Primary form: "<unit> - <digits><optional letters>".
        match = re.match(r"^(.*?)\s*-\s*(\d+)([a-zA-Z]*)?$", identifier)
        if match:
            return match.group(1).strip(), match.group(2), match.group(3) or ""

        print(f"Warning: Could not parse question ID '{question_id_field}' with primary regex. Attempting fallback.")
        parts = question_id_field.split(' - ', 1)
        if len(parts) == 2:
            # Fallback 1: explicit " - " separator, with a looser right-hand side.
            unit_name = parts[0].strip()
            num_subpart = parts[1].strip()
            num_match = re.match(r"(\d+)([a-zA-Z]*)", num_subpart)
            if num_match:
                q_num, sub_part = num_match.group(1), num_match.group(2) or ""
            else:
                # Keep whatever digits exist; "0" when there are none.
                q_num = re.sub(r'\D', '', num_subpart) or "0"
            return unit_name, q_num, sub_part

        # Fallback 2: no separator; look for a leading "UNIT..." token and a
        # trailing "<digits><letters>" token independently.
        unit_match = re.match(r"^(UNIT[\s\-IVXLCDM\d]+)", identifier, re.IGNORECASE)
        if unit_match:
            unit_name = unit_match.group(1).strip()
        tail_match = re.search(r"(\d+)([a-zA-Z]*)?$", identifier)
        if tail_match:
            q_num = tail_match.group(1)
            sub_part = tail_match.group(2) or ""
        if unit_name == "UNKNOWN_UNIT" and q_num == "0":
            print(f"  Further Warning: Could not reliably parse unit/question number from '{question_id_field}'. Using defaults/extracted parts.")
        return unit_name, q_num, sub_part

    def validate_question_paper(q_paper_file_path, course_name_default="Operating Systems"):
        """Validate every question in a flat-list question paper JSON file.

        Args:
            q_paper_file_path: Path to a JSON file holding a list of objects with
                "question" (identifier) and "text" (question text) entries.
            course_name_default: Course name used when the first syllabus
                document's metadata does not provide one.

        Returns:
            A dict with "course_name", "validation_summary" (one record per
            processed question) and "errors_encountered" (messages for unmet
            prerequisites, load failures and skipped entries).
        """
        overall_results = {"course_name": course_name_default, "validation_summary": [], "errors_encountered": []}
        if not all([FILES_UPLOADED_SUCCESSFULLY, API_KEY_CONFIGURED, GENERATIVE_MODEL_INSTANCE,
                    syllabus_vector_store and syllabus_vector_store.faiss_index and syllabus_vector_store.faiss_index.ntotal > 0]):
            error_msg = "Prerequisites not met for validation: "
            if not FILES_UPLOADED_SUCCESSFULLY:
                error_msg += "Files not uploaded/available. "
            if not API_KEY_CONFIGURED:
                error_msg += "API Key not configured. "
            if not GENERATIVE_MODEL_INSTANCE:
                error_msg += "LLM not initialized. "
            if not syllabus_vector_store or not syllabus_vector_store.faiss_index or syllabus_vector_store.faiss_index.ntotal == 0:
                error_msg += "Syllabus VS not ready or empty. "
            overall_results["errors_encountered"].append(error_msg.strip())
            print(f"ERROR: {error_msg.strip()}")
            return overall_results

        q_paper_data_list = None
        try:
            with open(q_paper_file_path, 'r', encoding='utf-8') as f:
                q_paper_data_loaded = json.load(f)
            if syllabus_vector_store and syllabus_vector_store.documents:
                overall_results["course_name"] = syllabus_vector_store.documents[0].metadata.get("course_name", course_name_default)
            elif isinstance(q_paper_data_loaded, dict) and "course_name" in q_paper_data_loaded:
                overall_results["course_name"] = q_paper_data_loaded.get("course_name", course_name_default)

            if isinstance(q_paper_data_loaded, list):
                q_paper_data_list = q_paper_data_loaded
                print(f"Loaded {len(q_paper_data_list)} question entries from flat list question paper.")
            else:
                raise ValueError("Question paper JSON is not in the expected flat list format (a list of question objects).")
        except Exception as e:
            error_msg = f"Error loading or parsing Question Paper JSON from '{q_paper_file_path}': {e}"
            overall_results["errors_encountered"].append(error_msg)
            print(error_msg)
            return overall_results

        print(f"\n--- Starting Validation for Course: {overall_results['course_name']} ---")
        question_count = 0
        for idx, question_entry in enumerate(q_paper_data_list):
            # A malformed entry is recorded and skipped so that one bad item
            # cannot abort validation of the remaining questions.
            if not isinstance(question_entry, dict):
                skip_msg = f"Skipped entry at index {idx}: expected a JSON object, got {type(question_entry).__name__}"
                print(skip_msg)
                overall_results["errors_encountered"].append(skip_msg)
                continue

            raw_id = question_entry.get("question", f"UNKNOWN_ID_IDX_{idx}")
            if isinstance(raw_id, str):
                question_id_field = raw_id
            elif raw_id is None:
                question_id_field = f"UNKNOWN_ID_IDX_{idx}"
            else:
                # e.g. a bare number used as the identifier
                question_id_field = str(raw_id)

            q_text_qp = question_entry.get("text", "")
            if not q_text_qp or (isinstance(q_text_qp, str) and not q_text_qp.strip()):
                print(f"Skipping entry with empty question text, ID: {question_id_field}")
                overall_results["errors_encountered"].append(f"Skipped question with empty text: ID '{question_id_field}'")
                continue
            if not isinstance(q_text_qp, str):
                skip_msg = f"Skipped question with non-string text ({type(q_text_qp).__name__}): ID '{question_id_field}'"
                print(skip_msg)
                overall_results["errors_encountered"].append(skip_msg)
                continue

            unit_qp_name_str, q_num_str, sub_part_key_str = _parse_question_identifier(question_id_field)

            unit_qp_name = normalize_unit_id(unit_qp_name_str)
            q_num_qp = q_num_str if q_num_str.isdigit() else "0"
            sub_part_key_qp = sub_part_key_str.lower()
            process_single_question_entry(unit_qp_name, q_num_qp, sub_part_key_qp, q_text_qp, overall_results)
            question_count += 1

        print(f"\n--- Validation Complete. Processed {question_count} question parts. ---")
        if overall_results["errors_encountered"]:
            print("\n--- Errors Encountered During Validation ---")
            for err in overall_results["errors_encountered"]:
                print(f"- {err}")
        return overall_results

    def main():
        """Run the validation end to end and save the results as a JSON file.

        Returns early, after printing the reason, when the API key or input
        files are missing, when the LLM or syllabus vector store is not ready,
        or when no question paper path is available. Otherwise the results from
        validate_question_paper are written to a timestamped JSON file in the
        current directory and, when `files` is not the MockFiles stand-in and
        offers `download`, handed to `files.download`.
        """
        if not (FILES_UPLOADED_SUCCESSFULLY and API_KEY_CONFIGURED):
            print("\nValidation not started. Please ensure API key is configured and files are provided, then re-run.")
            if not API_KEY_CONFIGURED:
                print("Reason: API Key not configured.")
            if not FILES_UPLOADED_SUCCESSFULLY:
                print("Reason: Files not successfully uploaded/specified.")
            return

        if not (GENERATIVE_MODEL_INSTANCE and syllabus_vector_store and syllabus_vector_store.faiss_index and syllabus_vector_store.faiss_index.ntotal > 0):
            print("\nERROR: Cannot start validation. Critical components (LLM or Syllabus Vector Store) are not properly initialized or are empty.")
            print(f"  LLM Initialized: {'Yes' if GENERATIVE_MODEL_INSTANCE else 'No'}")
            print(f"  Syllabus Vector Store Ready: {'Yes' if syllabus_vector_store and syllabus_vector_store.faiss_index and syllabus_vector_store.faiss_index.ntotal > 0 else 'No or Empty'}")
            print(f"  Textbook Vector Store Ready: {'Yes' if textbook_vector_store and textbook_vector_store.faiss_index and textbook_vector_store.faiss_index.ntotal > 0 else 'No or Empty (or no textbooks provided)'}")
            return

        print("\n--- Starting Main Validation Execution ---")
        if not q_paper_json_path:
            print("ERROR: Question paper JSON path not available. Cannot run validation.")
            return

        final_results = validate_question_paper(q_paper_json_path)
        timestamp = time.strftime('%Y%m%d-%H%M%S')
        course_name_slug = re.sub(r'\W+', '_', final_results.get("course_name", "course").lower())
        output_filename = f"validation_results_{course_name_slug}_{timestamp}.json"

        try:
            with open(output_filename, 'w', encoding='utf-8') as f:
                json.dump(final_results, f, indent=2, ensure_ascii=False)
        except Exception as e:
            print(f"Error saving results to JSON: {e}")
            return
        print(f"\nValidation results saved to '{output_filename}'")

        # The download is handled separately so that a failure here is not
        # misreported as a failure to save the file that was just written.
        try:
            if hasattr(files, 'download') and not isinstance(files, MockFiles): # Check if it's actual Colab files
                files.download(output_filename)
        except Exception as e:
            print(f"Error downloading results file '{output_filename}': {e}")

    # Start the validation run only when this code executes as the top-level
    # program (__name__ is "__main__"), not when it is imported.
    if __name__ == "__main__":
        main()

else:
    print("\nSkipping Document Processing and Validation Stage.")
    if not API_KEY_CONFIGURED:
        print("Reason: API Key was not configured.")
    if API_KEY_CONFIGURED and not FILES_UPLOADED_SUCCESSFULLY:
        print("Reason: Files were not successfully uploaded or specified.")

Not running in Colab or 'userdata' not available, and GEMINI_API_KEY environment variable not set. Please enter API key manually.


Enter your Gemini API Key:  ········


Gemini API Key configured. Generative model 'gemini-1.5-flash-latest' and Embedding model 'models/text-embedding-004' will be used.
--- Uploading Files ---
Please upload your Syllabus JSON file (e.g., operating_systems_syllabus.json):


Enter the local path for please upload your syllabus json file (e.g., operating_systems_syllabus.json):  /Users/arjun/Documents/Automated_Question_Paper_Scrutinization/testfile.json


Using file from path: /Users/arjun/Documents/Automated_Question_Paper_Scrutinization/testfile.json
Please upload your Question Paper JSON file (in the flat list format, e.g., [{'question': 'Unit I - 1a', 'text': '...'}, ...]):


Enter the local path for please upload your question paper json file (in the flat list format, e.g., [{'question':  /Users/arjun/Downloads/pardes-os.json


Using file from path: /Users/arjun/Downloads/pardes-os.json

Please upload your Textbook PDF file(s).
Enter local paths for your textbook PDF files. Enter 'done' when finished.


Path to textbook PDF (or 'done'):  /Users/arjun/Downloads/Abraham Silberschatz-Operating System Concepts (9t_231204_180322.pdf


Added textbook: /Users/arjun/Downloads/Abraham Silberschatz-Operating System Concepts (9t_231204_180322.pdf


Path to textbook PDF (or 'done'):  /Users/arjun/Documents/Automated_Question_Paper_Scrutinization/os unit 5 answers.pdf


Added textbook: /Users/arjun/Documents/Automated_Question_Paper_Scrutinization/os unit 5 answers.pdf


Path to textbook PDF (or 'done'):  done



Syllabus JSON, Question Paper JSON, and 2 Textbook PDF(s) appear to be available.

--- Building Syllabus Vector Store ---
Processing syllabus for course: Operating System
Successfully processed syllabus into 6 document chunks.
Total texts to embed: 6, in 1 batches using models/text-embedding-004.
  Embedded batch 1/1 (API call with 6 items).


CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, def

FAISS index built: 6 vectors, dimension 768.

--- Building Textbook Vector Store ---

--- Processing Textbook File: Abraham Silberschatz-Operating System Concepts (9t_231204_180322.pdf ---


CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, def

Opening textbook PDF: 'Abraham Silberschatz-Operating System Concepts (9t_231204_180322.pdf', 944 pages found.


CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, def

Successfully extracted text from 917 pages of the textbook 'Abraham Silberschatz-Operating System Concepts (9t_231204_180322.pdf'.


CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, def

Successfully processed textbook 'Abraham Silberschatz-Operating System Concepts (9t_231204_180322.pdf' into 3337 document chunks.

--- Processing Textbook File: os unit 5 answers.pdf ---
Opening textbook PDF: 'os unit 5 answers.pdf', 31 pages found.


CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, def

Successfully extracted text from 31 pages of the textbook 'os unit 5 answers.pdf'.
Successfully processed textbook 'os unit 5 answers.pdf' into 72 document chunks.

Total textbook documents created from all 2 PDF(s): 3409 chunks.
Total texts to embed: 3409, in 35 batches using models/text-embedding-004.
  Embedded batch 1/35 (API call with 100 items).
  Embedded batch 2/35 (API call with 100 items).
  Embedded batch 3/35 (API call with 100 items).
  Embedded batch 4/35 (API call with 100 items).
  Embedded batch 5/35 (API call with 100 items).
  Embedded batch 6/35 (API call with 100 items).
  Embedded batch 7/35 (API call with 100 items).
  Embedded batch 8/35 (API call with 100 items).
  Embedded batch 9/35 (API call with 100 items).
  Embedded batch 10/35 (API call with 100 items).
  Embedded batch 11/35 (API call with 100 items).
  Embedded batch 12/35 (API call with 100 items).
  Embedded batch 13/35 (API call with 100 items).
  Embedded batch 14/35 (API call with 100 items).
  Em