In [12]:
import os
import base64
from google import genai
from dotenv import load_dotenv
import enum

# Load the API key from the .env file
load_dotenv()
api_key = os.getenv("GEMINI_API_KEY")

# Set up the Gemini API client
client = genai.Client(api_key=api_key)


class ADClassification(enum.Enum):  # Define Enum
    SUPERSEDED = "Superseded"
    NORMAL = "Normal"
    ERROR = "Error"
    INVALID_JSON = "Invalid JSON"  # Add for JSON errors
    INVALID_CLASSIFICATION = "Invalid Classification"

# Function to encode the PDF file to base64
def encode_pdf(pdf_path):
    """Encodes a PDF file to a base64 string."""
    try:
        with open(pdf_path, "rb") as pdf_file:
            pdf_data = pdf_file.read()
            encoded_string = base64.b64encode(pdf_data).decode("utf-8")
        return encoded_string
    except Exception as e:
        print(f"Error encoding {pdf_path}: {e}")
        return None
    
    
def classify_pdf(file_path, prompt):
    # ... (Encode PDF - same as before)
    # Encode the PDF to base64
    encoded_pdf = encode_pdf(file_path)
    if not encoded_pdf:
        return "Error: Unable to encode PDF"
    contents = f"{prompt}\n{encoded_pdf}"

    response = client.models.generate_content(
        model='gemini-2.0-flash',
        contents=contents,
        config={
            'response_mime_type': 'application/json',
        },
    )

    try:
        response_json = json.loads(response.text)
        classification_str = response_json.get("classification")
        justification = response_json.get("justification")

        try:
            classification = ADClassification(classification_str)  # Convert to Enum
        except ValueError:
            classification = ADClassification.INVALID_CLASSIFICATION  # Enum for invalid
            justification = "LLM returned an invalid classification string."

        return classification, justification

    except json.JSONDecodeError:
        return ADClassification.INVALID_JSON, "Invalid JSON response from LLM"  # Enum
    except Exception as e:
        return ADClassification.ERROR, f"An error occurred: {e}"  # Enum

# Function to classify a PDF (automatically handles OCR if needed)
def old_classify_pdf(file_path, prompt):
    # Encode the PDF to base64
    encoded_pdf = encode_pdf(file_path)
    if not encoded_pdf:
        return "Error: Unable to encode PDF"

    # Combine the prompt with the PDF content for the classification
    contents = f"{prompt}\n{encoded_pdf}"

    # Make the classification request to the Gemini API
    response = client.models.generate_content(
        model='gemini-2.0-flash',
        contents=contents,  # Send the combined prompt and base64-encoded PDF content
        config={
            'response_mime_type': 'text/x.enum',
            'response_schema': {
                "type": "STRING",
                "enum": ["Superseded", "Normal"],
            },
        },
    )

    # Return the classification result
    return response.text

# Directory containing your PDF files
pdf_directory = r"C:\Users\zdrop\PycharmProjects\BRUMBRUMWEEEE\ADs"

# Define the prompt to guide the classification
prompt = """
Classify the following document as 'Superseded' or 'Normal' based on its content.  Respond in JSON format like this:

```json
{{
  "classification": "[Superseded/Normal]",  # EXACTLY "Superseded" or "Normal"
  "justification": "[Explanation of why the document was classified as such]"
}}"""

# Loop over all PDFs in the directory and classify them
for filename in os.listdir(pdf_directory):
    if filename.endswith('.pdf'):
        file_path = os.path.join(pdf_directory, filename)
        classification, justification = classify_pdf(file_path, prompt)
        print(f'{filename}: {classification.name}')  # Print enum name
        if justification:
            print(f'  Justification: {justification}')
        print("-" * 20)

AD_2006-0112R1_1.pdf: SUPERSEDED
  Justification: The document contains the string 'LastModified' which suggests it is an older version that has been updated.
--------------------


ClientError: 429 RESOURCE_EXHAUSTED. {'error': {'code': 429, 'message': 'Resource has been exhausted (e.g. check quota).', 'status': 'RESOURCE_EXHAUSTED'}}

In [ ]:
import os
import base64
import json
from enum import Enum  # Import Enum
from google import genai
from dotenv import load_dotenv

# ... (Load API key, set up client - same as before)

class ADClassification(Enum):  # Define Enum
    SUPERSEDED = "Superseded"
    NORMAL = "Normal"
    ERROR = "Error"
    INVALID_JSON = "Invalid JSON"  # Add for JSON errors
    INVALID_CLASSIFICATION = "Invalid Classification"


def classify_pdf(file_path, prompt):
    # ... (Encode PDF - same as before)

    contents = f"{prompt}\n{encoded_pdf}"

    response = client.models.generate_content(
        model='gemini-2.0-flash',
        contents=contents,
        config={
            'response_mime_type': 'application/json',
        },
    )

    try:
        response_json = json.loads(response.text)
        classification_str = response_json.get("classification")
        justification = response_json.get("justification")

        try:
            classification = ADClassification(classification_str)  # Convert to Enum
        except ValueError:
            classification = ADClassification.INVALID_CLASSIFICATION  # Enum for invalid
            justification = "LLM returned an invalid classification string."

        return classification, justification

    except json.JSONDecodeError:
        return ADClassification.INVALID_JSON, "Invalid JSON response from LLM"  # Enum
    except Exception as e:
        return ADClassification.ERROR, f"An error occurred: {e}"  # Enum


# ... (pdf_directory and prompt - same as before)

# Loop and classify
for filename in os.listdir(pdf_directory):
    if filename.endswith('.pdf'):
        file_path = os.path.join(pdf_directory, filename)
        classification, justification = classify_pdf(file_path, prompt)
        print(f'{filename}: {classification.name}')  # Print enum name
        if justification:
            print(f'  Justification: {justification}')
        print("-" * 20)