In [None]:
# Mount Google Drive at /content/drive so files stored there can be read
# through ordinary filesystem paths.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import fitz  # PyMuPDF
import pandas as pd
import openai
from PIL import Image
from io import BytesIO
import base64

# Set up the OpenAI client.
# No key is hard-coded: with api_key omitted the client reads the
# OPENAI_API_KEY environment variable and raises immediately if it is not
# set, instead of failing with an authentication error on the first request.
client = openai.OpenAI()

# Model name (must accept image input, since page renders are sent to it)
MODEL = "gpt-4o"

# Render a single PDF page into a PIL image
def extract_image_from_pdf_page(page):
    """Rasterize ``page`` and wrap the pixel data in a PIL image.

    Args:
        page: Object exposing ``get_pixmap()`` whose result provides
            ``width``, ``height`` and ``samples``.

    Returns:
        The image built by ``Image.frombytes`` in "RGB" mode from the
        pixmap's raw samples.
    """
    pixmap = page.get_pixmap()
    dimensions = [pixmap.width, pixmap.height]
    return Image.frombytes("RGB", dimensions, pixmap.samples)

# Function to classify a document page image with a vision-capable GPT-4 model
def classify_document(image):
    """Classify a rendered document page through the OpenAI chat API.

    The image is PNG-encoded, embedded as a base64 data URL and sent to
    ``MODEL`` together with a prompt that lists the allowed categories.

    Args:
        image: Object with a PIL-style ``save(fp, format=...)`` method.

    Returns:
        The model's answer as a string, lower-cased and stripped of
        surrounding whitespace and trailing periods so that equal labels
        compare equal across pages. ``"unknown"`` is returned when the
        response carries no text.
    """
    buffered = BytesIO()
    image.save(buffered, format="PNG")
    # getvalue() returns the whole buffer regardless of the stream position,
    # so no seek is needed before encoding.
    base64_image = base64.b64encode(buffered.getvalue()).decode('utf-8')

    response = client.chat.completions.create(
        model=MODEL,
        messages=[
            {"role": "system", "content": "You are a helpful assistant that classifies document images. Your task is to categorize each document page image into one of the specified categories."},
            {"role": "user", "content": [
                {"type": "text", "text": "Classify this document page image into one of the following categories: permit, air waybill, bill of lading, invoice, packaging list, unknown. Just output the type and nothing else."},
                {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{base64_image}"}}
            ]}
        ],
        temperature=0.0,
    )

    # The API declares message.content as optional, and the model may vary
    # case, whitespace or add a trailing period. Callers compare labels with
    # ==, so normalize here and fall back to "unknown" for an empty answer.
    raw_answer = response.choices[0].message.content
    classification = (raw_answer or "").strip().rstrip(".").strip().lower() or "unknown"
    print(classification)  # progress output, one label per page
    return classification

# Path to the PDF file
pdf_file = '/content/Sample.pdf'

# One row per run of consecutive pages that share a classification:
# (filename, document type, start page, end page), pages 1-based inclusive.
results = []

prev_classification = None
start_page = 1
page_count = 0

# Open the PDF with a context manager so the file is closed even if rendering
# or the API call raises part-way through.
with fitz.open(pdf_file) as document:
    page_count = len(document)

    # page_number is 0-based, so it equals the 1-based number of the
    # previous page.
    for page_number, page in enumerate(document):
        image = extract_image_from_pdf_page(page)
        classification = classify_document(image)

        # A label change on any page after the first closes the previous run.
        # The page index (not the previous label) marks the first page, so an
        # unexpected None label cannot suppress a boundary.
        if page_number > 0 and classification != prev_classification:
            results.append((pdf_file, prev_classification, start_page, page_number))
            start_page = page_number + 1

        prev_classification = classification

# Append the last document group; an empty PDF has no group to close.
if page_count:
    results.append((pdf_file, prev_classification, start_page, page_count))

# Create a DataFrame and save to CSV
df = pd.DataFrame(results, columns=['Filename', 'Document Type', 'Start Page', 'End Page'])
df.to_csv('classification_report.csv', index=False)

# Print how many page groups (not pages) were found for each document type
counts = df['Document Type'].value_counts()
for category, count in counts.items():
    print(f'{category}: {count}')


air waybill
air waybill
air waybill
invoice
packaging list
invoice
invoice
invoice
invoice
invoice
packaging list
packaging list
packaging list
packaging list
packaging list
bill of lading
bill of lading
invoice: 2
packaging list: 2
air waybill: 1
bill of lading: 1
