In [3]:
import google.generativeai as genai
import os
import json
import sys
import time
from dotenv import load_dotenv
import google.api_core.exceptions
from pathlib import Path
import random

In [4]:
# --- Configuration ---
# Set your API Key (replace 'YOUR_API_KEY' or use environment variable)
load_dotenv()  # Load environment variables from .env file if it exists
API_KEY = os.getenv('GOOGLE_API_KEY')
# Or uncomment and set directly (less secure):
# API_KEY = 'YOUR_API_KEY'

if not API_KEY:
    raise ValueError("GOOGLE_API_KEY environment variable not set.")

genai.configure(api_key=API_KEY)

PDF_FOLDER = Path("./PDFs")
OUTPUT_FOLDER = Path("./descriptions_summaries")
# MODEL_NAME = "gemini-1.5-flash-8b"
MODEL_NAME = "gemini-2.0-flash-lite-001"

# Create output directory if it doesn't exist
OUTPUT_FOLDER.mkdir(parents=True, exist_ok=True)

# Model Configuration (Optional)
generation_config = {
    "temperature": 0.7,
    "top_p": 1,
    "top_k": 1,
    # Adjust max_output_tokens if needed based on expected description lengths
    "max_output_tokens": 8192,
    "response_mime_type": "application/json", # Ask Gemini to output JSON directly
}

safety_settings = [
    {"category": "HARM_CATEGORY_HARASSMENT", "threshold": "BLOCK_MEDIUM_AND_ABOVE"},
    {"category": "HARM_CATEGORY_HATE_SPEECH", "threshold": "BLOCK_MEDIUM_AND_ABOVE"},
    {"category": "HARM_CATEGORY_SEXUALLY_EXPLICIT", "threshold": "BLOCK_MEDIUM_AND_ABOVE"},
    {"category": "HARM_CATEGORY_DANGEROUS_CONTENT", "threshold": "BLOCK_MEDIUM_AND_ABOVE"},
]

# --- Helper Function to build the prompt ---
def build_prompt(video_filename_stem):
    return rf"""
Analyze the provided PDF file. Each page contains a chronologically ordered image sampled from a video named '{video_filename_stem}'.

Instructions:
1. For EACH page (image) in the PDF:
    a. Generate a short, coherent description in Chinese. Be logical based on the sequence of images.
    b. Identify distinct content blocks within the image (visual elements, text).
    c. For each content block, specify its 'content' (the description or the text itself), 'type' (e.g., '图像', '中文文字', '英文文字'), and 'position' (e.g., '正中央', '左上角', '最下方中间').
    d. If the content is non-Chinese text, provide the original text followed by a Chinese translation in the format: '<Original Text>（翻译：<Translated Text>）'.
    e. Ignore any consistent watermarks often found in corners (in most cases "bilibili")
2. After analyzing ALL pages:
    a. Generate a concise overall 'title' for the video in Chinese (around 15-30 characters).
    b. Generate a 'summary' of the video's key events or theme in Chinese (around 50-100 words).
3. Return the *entire* result STRICTLY as a single JSON object matching the following structure. Do NOT include any text before or after the JSON object.

```json
{{
  "video": "{video_filename_stem}",
  "descriptions": [
    {{"image0": [ {{"content": "...", "type": "...", "position": "..."}}, ... ]}},
    {{"image1": [ {{"content": "...", "type": "...", "position": "..."}}, ... ]}},
    // ... continue for all pages/images, numbering sequentially starting from image0
  ],
  "title": "...",
  "summary": "..."
}}
```
Ensure the final output is valid JSON.
"""

# --- Main Processing Loop ---
print(f"Starting PDF analysis from folder: {PDF_FOLDER}")
print(f"Outputting JSON to folder: {OUTPUT_FOLDER}")
print("-" * 30)

processed_count = 0
skipped_count = 0
error_count = 0

# List all PDF files in the input directory
pdf_files = sorted([f for f in PDF_FOLDER.glob("*.pdf") if f.is_file()], reverse=True)
random.shuffle(pdf_files)

if not pdf_files:
    print(f"No PDF files found in {PDF_FOLDER}. Exiting.")
else:
    print(f"Found {len(pdf_files)} PDF files to potentially process.")

for pdf_path in pdf_files:
    video_name = pdf_path.stem # Get filename without extension
    output_json_path = OUTPUT_FOLDER / f"{video_name}.json"
    print(f"\nChecking: {pdf_path.name}")

    # Skip if JSON already exists
    if output_json_path.exists():
        print(f"  Skipping: Output file '{output_json_path.name}' already exists.")
        skipped_count += 1
        continue

    print(f"  Processing: {pdf_path.name}")

    try:
        # 1. Upload the PDF file
        print(f"    Uploading '{pdf_path.name}'...")
        # The API automatically detects the MIME type for PDF when uploading
        uploaded_file = genai.upload_file(path=pdf_path)
        print(f"    Upload successful: {uploaded_file.name}")

        # Add a small delay after upload, sometimes helpful
        time.sleep(2)

        # 2. Prepare the prompt and input for the model
        prompt_text = build_prompt(video_name)
        model_input = [prompt_text, uploaded_file] # Combine prompt and file

        # 3. Instantiate the model
        model = genai.GenerativeModel(
            model_name=MODEL_NAME,
            safety_settings=safety_settings,
            generation_config=generation_config
        )

        # 4. Generate content
        print("    Generating descriptions and summary...")
        response = model.generate_content(model_input, stream=False) # Use stream=False for simpler handling

        # 5. Process the response
        print("    Parsing response...")
        response_text = response.text

        # Clean potential markdown code fences if Gemini wraps the JSON
        if response_text.strip().startswith("```json"):
            response_text = response_text.strip()[7:-3].strip()
        elif response_text.strip().startswith("```"):
            response_text = response_text.strip()[3:-3].strip()


        # Validate and save JSON
        try:
            result_json = json.loads(response_text)

            # Basic validation (check top-level keys) - can be more thorough
            required_keys = ["video", "descriptions", "title", "summary"]
            if not all(key in result_json for key in required_keys):
                raise ValueError("Generated JSON missing required keys.")
            if not isinstance(result_json["descriptions"], list):
                raise ValueError("'descriptions' field is not a list.")

            # Ensure the video name matches (optional but good practice)
            if result_json.get("video") != video_name:
                print(f"    Warning: Generated video name '{result_json.get('video')}' doesn't match filename '{video_name}'. Using filename.")
                result_json["video"] = video_name

            # Save the valid JSON
            with open(output_json_path, 'w', encoding='utf-8') as f:
                json.dump(result_json, f, ensure_ascii=False, indent=2)
            print(f"    Success: Saved results to '{output_json_path.name}'")
            processed_count += 1

            # Clean up the uploaded file from the API storage after successful processing
            print(f"    Deleting uploaded file '{uploaded_file.name}' from API storage...")
            try:
                genai.delete_file(uploaded_file.name)
                print("    Uploaded file deleted successfully.")
            except Exception as delete_err:
                print(f"    Warning: Could not delete uploaded file '{uploaded_file.name}': {delete_err}")


        except json.JSONDecodeError as json_err:
            print(f"  ERROR: Failed to parse JSON response for '{pdf_path.name}'. Skipping.")
            print(f"    Error details: {json_err}")
            print(f"    Received text: {response_text[:500]}...") # Print beginning of bad response
            error_count += 1
            # Attempt to delete the uploaded file even if parsing failed
            print(f"    Attempting to delete uploaded file '{uploaded_file.name}' after error...")
            try:
                genai.delete_file(uploaded_file.name)
                print("    Uploaded file deleted successfully.")
            except Exception as delete_err:
                print(f"    Warning: Could not delete uploaded file '{uploaded_file.name}': {delete_err}")


        except ValueError as val_err:
            print(f"  ERROR: Generated JSON structure invalid for '{pdf_path.name}'. Skipping.")
            print(f"    Error details: {val_err}")
            print(f"    Received JSON: {result_json}") # Print the invalid JSON structure
            error_count += 1
            # Attempt to delete the uploaded file even if validation failed
            print(f"    Attempting to delete uploaded file '{uploaded_file.name}' after error...")
            try:
                genai.delete_file(uploaded_file.name)
                print("    Uploaded file deleted successfully.")
            except Exception as delete_err:
                print(f"    Warning: Could not delete uploaded file '{uploaded_file.name}': {delete_err}")


    except Exception as e:
        print(f"  ERROR: An unexpected error occurred while processing '{pdf_path.name}': {e}")
        error_count += 1
        # If file was uploaded before error, try to delete it (variable might not exist if upload failed)
        if 'uploaded_file' in locals() and uploaded_file is not None and hasattr(uploaded_file, 'name'):
            print(f"    Attempting to delete potentially uploaded file '{uploaded_file.name}' after error...")
            try:
                genai.delete_file(uploaded_file.name)
                print("    Uploaded file deleted successfully.")
            except Exception as delete_err:
                print(f"    Warning: Could not delete uploaded file '{uploaded_file.name}': {delete_err}")
        # Reset variable to avoid issues in next loop iteration's error handling
        uploaded_file = None


    # Optional: Add a delay between processing files to avoid hitting rate limits
    time.sleep(5) # Adjust sleep time as needed (e.g., 2-10 seconds)
    print("-" * 30)
    print("Processing Complete.")
    print(f"Successfully processed: {processed_count} files")
    print(f"Skipped (already exist): {skipped_count} files")
    print(f"Errors encountered: {error_count} files")

Starting PDF analysis from folder: PDFs
Outputting JSON to folder: descriptions_summaries
------------------------------
Found 4199 PDF files to potentially process.

Checking: BV1XQ1PYXE3r.pdf
  Processing: BV1XQ1PYXE3r.pdf
    Uploading 'BV1XQ1PYXE3r.pdf'...
    Upload successful: files/y2v62ulk859c
    Generating descriptions and summary...
    Parsing response...
    Success: Saved results to 'BV1XQ1PYXE3r.json'
    Deleting uploaded file 'files/y2v62ulk859c' from API storage...
    Uploaded file deleted successfully.
------------------------------
Processing Complete.
Successfully processed: 1 files
Skipped (already exist): 0 files
Errors encountered: 0 files

Checking: BV1L4PaeJEdj.pdf
  Processing: BV1L4PaeJEdj.pdf
    Uploading 'BV1L4PaeJEdj.pdf'...
    Upload successful: files/hgpeutey15xc
    Generating descriptions and summary...
    Parsing response...
    Success: Saved results to 'BV1L4PaeJEdj.json'
    Deleting uploaded file 'files/hgpeutey15xc' from API storage...
    U