# In [18]:
import base64
import csv
import glob
import mimetypes
import os
import re
import time

import pytesseract
from openai import OpenAI
from pdf2image import convert_from_path

# Initialize the OpenAI client once, outside the functions, so every request reuses it.
# The API key is taken from the OPENAI_API_KEY environment variable instead of
# being hard-coded: an empty literal key can never authenticate, and a real key
# must not be committed to source.
# NOTE(review): with the variable unset, the openai client is expected to raise
# at construction time rather than on the first request - confirm this is acceptable.
client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))

# Function to clean the output folder before generating new images
def clean_output_folder(output_folder):
    """Delete the files that sit directly inside *output_folder*.

    Only regular files and symlinks are removed. Subdirectories are left in
    place, because ``os.remove`` cannot delete a directory and would abort the
    whole clean-up with an exception. A folder that does not exist (or is
    empty) is a no-op. As with the ``*`` glob pattern in general, names
    starting with a dot are not matched.

    Args:
        output_folder: Path of the folder to clean.
    """
    # Escape the folder part so characters such as "[" in the path are taken
    # literally instead of being interpreted as glob syntax.
    pattern = os.path.join(glob.escape(output_folder), '*')
    for entry_path in glob.glob(pattern):
        if os.path.isfile(entry_path) or os.path.islink(entry_path):
            os.remove(entry_path)

# Function to convert PDF to images
def convert_pdf_to_images(pdf_path, output_folder):
    """Render every page of a PDF to a PNG file and return the file paths.

    Pages are written to ``<output_folder>/<pdf name>_page_<n>.png`` with
    ``n`` starting at 1, in page order.

    Args:
        pdf_path: Path of the PDF to render.
        output_folder: Folder that receives the PNG files; created if missing.

    Returns:
        List of the PNG paths that were written, one per page.
    """
    pdf_stem = os.path.splitext(os.path.basename(pdf_path))[0]

    os.makedirs(output_folder, exist_ok=True)

    # Render in memory. Passing output_folder to convert_from_path would make
    # pdf2image write its own auto-named copy of every page, in addition to
    # the predictably named files saved below.
    pages = convert_from_path(pdf_path, fmt='png')

    # A single loop covers one-page and multi-page PDFs alike.
    image_paths = []
    for page_number, page in enumerate(pages, start=1):
        image_path = os.path.join(output_folder, f"{pdf_stem}_page_{page_number}.png")
        page.save(image_path, "PNG")
        image_paths.append(image_path)

    return image_paths

# Function to extract text from an image using pytesseract
def extract_text_from_image(image_path):
    """Return the text pytesseract reads from the image at *image_path*.

    When nothing exists at that path, an error line is printed and an empty
    string is returned instead of raising.
    """
    # Guard clause: report the missing file and bail out early.
    if not os.path.exists(image_path):
        print(f"Error: File not found at path: {image_path}")
        return ""

    return pytesseract.image_to_string(image_path)

# Function to process text and classify based on regex patterns
def process_text(extracted_text):
    """Classify OCR text by document type and pick the matching GPT prompt.

    The text is searched case-insensitively for a document title. Rules are
    tried in order (invoice, permit, packing list, bill of lading) and the
    first match wins. Words inside a title may be separated by any run of
    whitespace, because OCR output often contains double spaces or a line
    break between them.

    Args:
        extracted_text: Text extracted from the document.

    Returns:
        Dict with two keys: 'FileType' ('Invoice', 'Permit', 'Packaging List',
        'Bill Of Lading' or 'Unknown') and 'Prompt' (the instruction to send
        to the model for that type).
    """
    # (title pattern, file type, prompt) in priority order.
    rules = (
        (
            r'\b(?:Commercial\s+Invoice|Invoice)\b',
            'Invoice',
            "Strictly output only and only a list of key-value pairs from the document. Keys are: Customer name, Destination name, Invoice number, Date, Payment terms, Currency, Due date, items, quantities, unit price, total amount. For a key for which no value is extracted - return value as NA.",
        ),
        (
            r'\bCARGO\s+CLEARANCE\s+PERMIT\b',
            'Permit',
            "Strictly output only and only a list of key-value pairs from the document. Keys are: permit number, importer details, exporter details, arrival date, departure date, and license number. For a key for which no value is extracted - return value as NA.",
        ),
        (
            r'\bPACKING\s+LIST\b',
            'Packaging List',
            "Strictly output only and only a list of key-value pairs from the document. Keys are:  item and no of cartons. For a key for which no value is extracted - return value as NA.",
        ),
        (
            r'\bBILL\s+OF\s+LADING\b',
            'Bill Of Lading',
            "Strictly output only and only a list of key-value pairs from the document. Keys are:  Bill number, port for release, shipper details, consignee details, port of loading, port of discharge, final destination, voyage number, and container number. For a key for which no value is extracted - return value as NA.",
        ),
    )

    for pattern, file_type, prompt in rules:
        if re.search(pattern, extracted_text, re.IGNORECASE):
            return {'FileType': file_type, 'Prompt': prompt}

    # No title recognised: let the model decide the type, defaulting to invoice.
    return {
        'FileType': 'Unknown',
        'Prompt': "Strictly output only and only a list of key-value pairs from the document. If document is an invoice, keys are: Customer name, Destination name, Invoice number, Date, Payment terms, Currency, Due date, items, quantities, unit price, total amount. If it is a Bill of Lading list, keys are: Bill number, port for release, shipper details, consignee details, port of loading, port of discharge, final destination, voyage number, and container number. If it is a permit, keys are: permit number, importer details, exporter details, arrival date, departure date, and license number. If it is a packaging list, keys are: item and no of cartons. For a key for which no value is extracted - return value as NA. If the type of document is not invoice/packaging list/bill of lading/permit, assume it is an invoice.",
    }

# Function to encode an image to base64
def encode_image(image_path):
    """Read the file at *image_path* and return its bytes as a Base64 text string."""
    with open(image_path, "rb") as image_file:
        raw_bytes = image_file.read()

    # b64encode yields bytes; decode them so the result can go into a text payload.
    encoded_bytes = base64.b64encode(raw_bytes)
    return str(encoded_bytes, 'utf-8')

# Function to analyze images using OpenAI GPT
def analyze_images(image_paths, openai_client, prompt):
    """Send a prompt plus a set of images to the chat model and return its replies.

    Each image travels as its own ``image_url`` content part holding a Base64
    data URL. Joining several encoded images into a single URL is not valid,
    so a multi-page document must be sent as several parts.

    Args:
        image_paths: Paths of the image files to attach, in order.
        openai_client: Client object exposing ``chat.completions.create``.
        prompt: Instruction text placed before the images.

    Returns:
        List with the message content of every returned choice.
    """
    content = [{"type": "text", "text": prompt}]

    for image_path in image_paths:
        # Label the payload with the file's real type (e.g. image/png), and
        # fall back to the previous hard-coded JPEG label when the extension
        # is unknown or not an image type.
        guessed_type, _ = mimetypes.guess_type(image_path)
        if not guessed_type or not guessed_type.startswith("image/"):
            guessed_type = "image/jpeg"
        content.append(
            {
                "type": "image_url",
                "image_url": {
                    "url": f"data:{guessed_type};base64,{encode_image(image_path)}",
                },
            }
        )

    messages = [{"role": "user", "content": content}]

    # NOTE(review): confirm this model name is still served by the API, and
    # that 300 tokens is enough for the longest expected key-value list.
    response = openai_client.chat.completions.create(
        model="gpt-4-vision-preview",
        messages=messages,
        max_tokens=300,
    )

    return [choice.message.content for choice in response.choices]

# Function to process PDF, extract text, classify, and further process using GPT
def process_pdf(pdf_path, openai_client):
    """Run the full pipeline for one PDF and return its result dict.

    Steps: render the pages to images in the module-level
    ``output_image_folder``, OCR every page, classify the combined text, then
    send the page images to the model with the prompt chosen by the
    classifier. The model's replies are printed before returning.

    Returns:
        The dict from ``process_text`` extended with 'GPT_Response',
        'FileName' and 'ProcessingTime' (seconds spent on rendering, OCR,
        classification and the model call).
    """
    started_at = time.time()

    page_images = convert_pdf_to_images(pdf_path, output_image_folder)

    # OCR text of all pages, concatenated in page order with no separator.
    ocr_text = "".join(extract_text_from_image(page) for page in page_images)

    result = process_text(ocr_text)
    result['GPT_Response'] = analyze_images(page_images, openai_client, result['Prompt'])

    elapsed_seconds = time.time() - started_at

    result['FileName'] = os.path.basename(pdf_path)
    result['ProcessingTime'] = elapsed_seconds

    print(result['GPT_Response'])
    return result

def save_gpt_response_to_csv(gpt_response, output_file):
    """Parse 'key: value' lines from the model replies and write them to a CSV.

    Every reply is split into lines. A line is split on its FIRST colon only,
    so values that contain colons themselves (times, URLs) stay intact. Blank
    lines are ignored. Lines without any colon are collected under the key
    'NA', joined with ' | ', so none of them is lost. When a key occurs more
    than once, the last value wins.

    Args:
        gpt_response: Iterable of reply strings; empty or ``None`` items are
            skipped.
        output_file: Path of the CSV to write. Its parent directory is created
            if needed, and an existing file is overwritten.
    """
    extracted_data = {}
    unparsed_lines = []

    for item in gpt_response:
        if not item:
            continue
        for raw_line in item.split('\n'):
            line = raw_line.strip()
            if not line:
                continue
            key, separator, value = line.partition(':')
            if separator:
                extracted_data[key.strip()] = value.strip()
            else:
                # Not in 'key: value' format: keep the text under 'NA'.
                unparsed_lines.append(line)
                extracted_data['NA'] = ' | '.join(unparsed_lines)

    parent_directory = os.path.dirname(output_file)
    if parent_directory:
        os.makedirs(parent_directory, exist_ok=True)

    # Explicit UTF-8 so non-ASCII names do not depend on the platform default.
    with open(output_file, 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(['Key', 'Value'])
        writer.writerows(extracted_data.items())

# Set your PDF directory path
pdf_folder = '/Users/arup/Documents/Test/*.pdf'
pdf_files = glob.glob(pdf_folder)

output_image_folder = '/Users/arup/Documents/Test/Images/'  # Change this to your desired output folder

# Directory that receives one CSV of GPT output per processed PDF
output_directory = '/Users/arup/Documents/Test/CSV/'  # Change this to your desired directory

# Make sure both output locations exist before anything is written to them
os.makedirs(output_image_folder, exist_ok=True)
os.makedirs(output_directory, exist_ok=True)

# Full result dict of every processed PDF. The loop below appends to this list,
# so it must be defined here; without it the first iteration raises NameError.
all_outputs = []

# List to hold processing times of classified PDFs
classified_processing_times = []

# Loop through PDFs, process and classify, and further process using GPT if classified
for pdf_file in pdf_files:
    clean_output_folder(output_image_folder)  # Clean the output folder before processing new PDF

    result = process_pdf(pdf_file, client)
    all_outputs.append(result)

    classified_processing_times.append(result['ProcessingTime'])

    # Define the output file path for the GPT response CSV
    output_file = os.path.join(output_directory, f'{os.path.splitext(os.path.basename(pdf_file))[0]}_GPT_Output.csv')

    # Save the GPT response to CSV
    save_gpt_response_to_csv(result['GPT_Response'], output_file)

    # Print a message indicating that the CSV file has been saved
    print(f"CSV file for {os.path.basename(pdf_file)} GPT output saved: {output_file}")


# Calculate and display average processing time for classified PDFs
# (skipped when no PDF was found, which also avoids a division by zero)
if classified_processing_times:
    average_processing_time = sum(classified_processing_times) / len(classified_processing_times)
    print(f"Average Processing Time for classified files: {average_processing_time} seconds")


# Sample output captured from a run of the script above:
# ['Shipper Name: MILEAGE LOGISTICS PRIVATE LIMITED\nConsignee Name: ELECTRONICS MARITIME PRIVATE LIMITED\nInvoice Number: NA\nDate: 03/06/2023\nPayment Terms: Freight Prepaid\nCurrency: INR\nDue Date: NA\nItems: SUB-MARINE SURVEY EQUIPMENT\nQuantities: 14\nUnit Price: AS AGREED\nTotal Amount: AS AGREED']
# CSV file for HAWB.pdf GPT output saved: /Users/arup/Documents/Test/CSV/HAWB_GPT_Output.csv
# Average Processing Time for classified files: 10.288441181182861 seconds
