In [1]:
import time
import re
import fitz  # PyMuPDF
import pytesseract  # for OCR
import openai  # OpenAI API
import csv
import glob
import os

# Authenticate with the OpenAI API. The key is read from the OPENAI_API_KEY
# environment variable so the secret never has to be written into this file;
# it falls back to an empty string (the previous literal) when unset.
openai.api_key = os.environ.get("OPENAI_API_KEY", "")

def check_pdf_type(pdf_path):
    """Classify a PDF by whether every page has an extractable text layer.

    Args:
        pdf_path: Path of the PDF file to inspect.

    Returns:
        "Image" if at least one page yields no text (ignoring whitespace),
        otherwise "Tagged". A document with zero pages is reported as
        "Tagged".
    """
    # The context manager closes the document even if text extraction raises.
    with fitz.open(pdf_path) as pdf_document:
        # any() stops at the first page without text, like the original break.
        has_textless_page = any(
            not page.get_text("text").strip() for page in pdf_document
        )

    return "Image" if has_textless_page else "Tagged"

# Classification rules, checked in order; the first matching pattern wins.
# Each entry is (FileType label, compiled pattern, system prompt for GPT).
# NOTE(review): the 'Permit' prompt asks for bill-of-lading style fields and
# the 'Packaging List' prompt asks for permit style fields - confirm that the
# prompts are attached to the intended document types.
_DOCUMENT_RULES = (
    (
        'Invoice',
        re.compile(r'\b(BILL\sOF\sLADING|Commercial\sInvoice)\b', re.IGNORECASE),
        "Extract relevant details in a dictionary format: Customer name, Destination name, Invoice number, Date, Payment terms, Currency, Due date, items, quantities, unit price, total amount. Don't give anything extra output.",
    ),
    (
        'Permit',
        re.compile(r'\bCARGO\sCLEARANCE\sPERMIT\b', re.IGNORECASE),
        "Extract relevant details in a dictionary format: Bill number, port for release, shipper details, consignee details, port of loading, port of discharge, final destination, voyage number and container number. Don't give anything extra output",
    ),
    (
        'Packaging List',
        re.compile(r'\bPACKING\sLIST\b', re.IGNORECASE),
        "Extract relevant details in a dictionary format: permit number, importer details, exporter details, arrival date, departure date and license number. Don't give anything extra output",
    ),
)


def _read_text_layer(pdf_path):
    """Return the concatenated embedded text of every page in the PDF."""
    with fitz.open(pdf_path) as pdf_document:
        return "".join(page.get_text() for page in pdf_document)


def _ocr_pdf(pdf_path):
    """Return OCR text for every page of a PDF that lacks a text layer.

    Tesseract cannot read a PDF file directly, so each page is rendered to a
    temporary PNG image first and that image is passed to pytesseract.
    """
    import tempfile

    page_texts = []
    with fitz.open(pdf_path) as pdf_document, \
            tempfile.TemporaryDirectory() as image_dir:
        for page in pdf_document:
            image_path = os.path.join(image_dir, f"page_{page.number}.png")
            # Render at 2x zoom; the default 72 dpi is usually too coarse for OCR.
            page.get_pixmap(matrix=fitz.Matrix(2, 2)).save(image_path)
            page_texts.append(pytesseract.image_to_string(image_path))
    return "".join(page_texts)


def _ask_gpt(system_prompt, document_text):
    """Send the document text to GPT-4 and return its reply, or 'NA' if empty."""
    response = openai.ChatCompletion.create(
        model="gpt-4",
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": document_text},
        ],
    )
    return response['choices'][0]['message']['content'] or 'NA'


def process_pdf(pdf_path):
    """Extract a PDF's text, classify it, and ask GPT for its key fields.

    Args:
        pdf_path: Path of the PDF file to process.

    Returns:
        A tuple ``(output_data, processing_time)``. ``output_data`` always
        has 'FileName' and 'ProcessingTime'; 'FileType' and 'GPT_Response'
        are present only when the text matched one of the document rules.
        ``processing_time`` is the elapsed wall-clock time in seconds.
    """
    start_time = time.time()

    # Use the embedded text layer when every page has one, otherwise OCR.
    if check_pdf_type(pdf_path) == "Tagged":
        extracted_text = _read_text_layer(pdf_path)
    else:
        extracted_text = _ocr_pdf(pdf_path)

    output_data = {'FileName': os.path.basename(pdf_path)}

    for file_type, pattern, system_prompt in _DOCUMENT_RULES:
        if pattern.search(extracted_text):
            output_data['FileType'] = file_type
            output_data['GPT_Response'] = _ask_gpt(system_prompt, extracted_text)
            break

    processing_time = time.time() - start_time
    output_data['ProcessingTime'] = processing_time

    return output_data, processing_time

def parse_gpt_dict(raw_response):
    """Parse a GPT reply that should contain a dictionary, without eval().

    The reply is untrusted text, so it is parsed as JSON first and, failing
    that, as a Python literal (covers single-quoted dicts). No code in the
    reply is ever executed.

    Args:
        raw_response: The reply text returned by the model.

    Returns:
        The parsed dictionary.

    Raises:
        ValueError: If the text is neither valid JSON nor a valid Python
            literal, or if it does not describe a dictionary.
    """
    import ast
    import json

    text = raw_response.strip()
    try:
        parsed = json.loads(text)
    except ValueError:
        try:
            parsed = ast.literal_eval(text)
        except (ValueError, SyntaxError) as err:
            raise ValueError(f"response is not a dictionary literal: {err}") from err
    if not isinstance(parsed, dict):
        raise ValueError(
            f"expected a dictionary, got {type(parsed).__name__}"
        )
    return parsed


# List to hold output data for all PDFs
all_outputs = []
total_time = 0

# Loop through your PDFs and call process_pdf function for each file
pdf_folder = '/Users/arup/Documents/Test/*.pdf'  # Replace this with your PDF directory path
pdf_files = glob.glob(pdf_folder)

for pdf_file in pdf_files:
    output_data, time_taken = process_pdf(pdf_file)
    all_outputs.append(output_data)
    total_time += time_taken

# Calculate average time; guard against an empty folder to avoid dividing by zero
if pdf_files:
    average_time = total_time / len(pdf_files)
    print(f"Average time taken per PDF: {average_time} seconds")
else:
    average_time = 0.0
    print(f"No PDF files found matching {pdf_folder}")

# CSV files are written next to the PDFs they were generated from
output_dir = os.path.dirname(pdf_folder)

# Save each GPT response in its own CSV file
for output_data in all_outputs:
    if 'GPT_Response' not in output_data:
        print("No 'GPT_Response' found for this file.")
        continue

    file_stem = os.path.splitext(output_data["FileName"])[0]
    output_csv = os.path.join(output_dir, f'{file_stem}_output.csv')

    gpt_response = output_data['GPT_Response']
    # Default to storing the raw response so a reply that cannot be parsed
    # is still saved instead of leaving an empty CSV behind.
    rows = [['GPT_Response', gpt_response]]
    if isinstance(gpt_response, str):
        try:
            rows = [[key, value] for key, value in parse_gpt_dict(gpt_response).items()]
        except ValueError as e:
            print(f"Error converting to dictionary: {e}")

    with open(output_csv, "w", newline="", encoding="utf-8") as file:
        csv.writer(file).writerows(rows)

    print(f"Output saved to {output_csv}")


Average time taken per PDF: 4.0650985791133 seconds
Output saved to /Users/arup/Documents/Test/HBL SURR MIKASA 2306000601_output.csv
Output saved to /Users/arup/Documents/Test/SICSY2306002400 - IG PERMIT_output.csv
No 'GPT_Response' found for this file.
No 'GPT_Response' found for this file.
Output saved to /Users/arup/Documents/Test/PCK-2306010-PHAIC_output.csv
Output saved to /Users/arup/Documents/Test/Mbl_output.csv
Output saved to /Users/arup/Documents/Test/Permit-1_output.csv
Error converting to dictionary: unterminated string literal (detected at line 1) (<string>, line 1)
Output saved to /Users/arup/Documents/Test/SICSY2306002400 - PL_output.csv
No 'GPT_Response' found for this file.
Output saved to /Users/arup/Documents/Test/Permit_output.csv
No 'GPT_Response' found for this file.
Output saved to /Users/arup/Documents/Test/CIPL_output.csv
Output saved to /Users/arup/Documents/Test/SICSY2306002400 - HBL_output.csv
