In [14]:
import time
import glob
import os
import csv
import base64
from openai import OpenAI
from pdf2image import convert_from_path

# Initialize the OpenAI client once at module level so every request shares it.
# The key is taken from the OPENAI_API_KEY environment variable rather than
# being written into the source, so the secret never ends up in the notebook.
# The empty-string fallback keeps the previous value when the variable is unset.
client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY", ""))

def encode_image(image_path):
    """Return the file at ``image_path`` as a base64-encoded text string.

    The file is opened in binary mode and read in full; the resulting
    base64 bytes are decoded as UTF-8 so the value can be embedded in text.
    """
    with open(image_path, "rb") as handle:
        raw_bytes = handle.read()
    encoded = base64.b64encode(raw_bytes)
    return encoded.decode('utf-8')

def analyze_image(image_path, openai_client):
    """Ask the vision model to extract key/value pairs from one page image.

    The image file is base64-encoded and sent, together with a fixed
    extraction prompt, as a single user message to the
    ``gpt-4-vision-preview`` chat-completions model.

    Args:
        image_path: Path of the image file to analyze.
        openai_client: Client object exposing ``chat.completions.create``.

    Returns:
        The ``message.content`` of the first choice in the response.

    Note:
        ``max_tokens=300`` caps the length of the reply, so the output for
        a dense page may be cut short.
    """
    # Local import keeps this function self-contained; mimetypes is stdlib.
    import mimetypes

    base64_image = encode_image(image_path)

    # Declare the MIME type that matches the file actually being sent (the
    # PDF pipeline in this file produces PNGs). Fall back to the previously
    # hard-coded JPEG type when the extension is unknown or not an image.
    mime_type, _ = mimetypes.guess_type(image_path)
    if mime_type is None or not mime_type.startswith("image/"):
        mime_type = "image/jpeg"

    response = openai_client.chat.completions.create(
        model="gpt-4-vision-preview",
        messages=[
            {
                "role": "user",
                "content": [
                    {
                        "type": "text",
                        "text": "Strictly output only and only a list of 'key'-'value' pairs from the document. the output should have double quotes in the keys and values. If document is an invoice, keys are: Customer name, Destination name, Invoice number, Date, Payment terms, Currency, Due date, items, quantities, unit price, total amount. If it is a Bill of Lading list, keys are: Bill number, port for release, shipper details, consignee details, port of loading, port of discharge, final destination, voyage number, and container number. If it is a permit, keys are: permit number, importer details, exporter details, arrival date, departure date, and license number. If it is a packaging list, keys are: item and no of cartons. For a key for which no value is extracted - return value as NA. If the type of document is not invoice/packaging list/bill of lading/permit, assume it is an invoice."
                    },
                    {
                        "type": "image_url",
                        "image_url": {
                            "url": f"data:{mime_type};base64,{base64_image}",
                        },
                    },
                ],
            }
        ],
        max_tokens=300,
    )

    return response.choices[0].message.content

def process_pdf(pdf_path, openai_client):
    """Render every page of a PDF to PNG and run each page through the model.

    All pages are first converted (``convert_from_path`` is called with 300
    as its second argument) and saved as ``<pdf stem>_page_<n>.png`` in the
    same directory as the PDF. Only after every page has been written are
    the images analyzed, one by one and in page order.

    Args:
        pdf_path: Path of the PDF file to process.
        openai_client: Client object forwarded to ``analyze_image``.

    Returns:
        A dict with ``'FileName'`` (base name of the PDF), ``'GPT_Response'``
        (list with one model reply per page) and ``'ProcessingTime'``
        (wall-clock seconds spent on rendering plus analysis).
    """
    started_at = time.time()

    rendered_pages = convert_from_path(pdf_path, 300)

    target_dir = os.path.dirname(pdf_path)
    pdf_name = os.path.basename(pdf_path)
    stem = os.path.splitext(pdf_name)[0]

    # Phase 1: write every page to disk, numbering pages from 1.
    page_image_paths = []
    for page_number, rendered in enumerate(rendered_pages, start=1):
        png_path = os.path.join(target_dir, f"{stem}_page_{page_number}.png")
        rendered.save(png_path, format='PNG')
        page_image_paths.append(png_path)

    # Phase 2: query the model for each saved page image.
    responses = [
        analyze_image(png_path, openai_client)
        for png_path in page_image_paths
    ]

    elapsed = time.time() - started_at

    return {
        'FileName': pdf_name,
        'GPT_Response': responses,
        'ProcessingTime': elapsed,
    }

# Driver: process every PDF matching the glob pattern, write one two-column
# (Key, Value) CSV per PDF, and report the average processing time.
pdf_folder = '/Users/arup/Documents/Test/*.pdf'
pdf_files = glob.glob(pdf_folder)

total_processing_time = 0

for pdf_file in pdf_files:
    result = process_pdf(pdf_file, client)
    # Kept as a one-element list so the name still exists for later cells.
    data_for_csv = [result]
    total_processing_time += result['ProcessingTime']

    # Writing to CSV for the current PDF
    csv_filename = f'/Users/arup/Documents/Test/{os.path.splitext(os.path.basename(pdf_file))[0]}_output.csv'

    # Explicit UTF-8 so non-ASCII text in a model reply cannot fail on
    # platforms whose default encoding is narrower.
    with open(csv_filename, 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(['Key', 'Value'])

        for response_str in result['GPT_Response']:
            # A reply may be None (no content); treat it as empty text.
            for raw_line in (response_str or '').split('\n'):
                response_item = raw_line.strip()
                if not response_item:
                    continue
                # Split on the first colon only, so values may contain colons.
                key, separator, value = response_item.partition(':')
                if separator:
                    writer.writerow([key.strip(), value.strip()])
                else:
                    # If no colon, treat the entire line as the key and
                    # assign 'NA' as the value.
                    writer.writerow([response_item, 'NA'])

        print(f"CSV file for '{pdf_file}' has been generated.")

# Calculating average processing time for all PDFs. Guarded so an empty
# match does not raise ZeroDivisionError.
if pdf_files:
    average_processing_time = total_processing_time / len(pdf_files)
    print(f"Average Processing Time for all files: {average_processing_time} seconds")
else:
    average_processing_time = 0
    print(f"No PDF files matched '{pdf_folder}'; nothing was processed.")


CSV file for '/Users/arup/Documents/Test/HBL SURR MIKASA 2306000601.pdf' has been generated.
CSV file for '/Users/arup/Documents/Test/SICSY2306002400 - IG PERMIT.pdf' has been generated.
CSV file for '/Users/arup/Documents/Test/HAWB.pdf' has been generated.
CSV file for '/Users/arup/Documents/Test/INV-2306010-PHAIC.pdf' has been generated.
CSV file for '/Users/arup/Documents/Test/PCK-2306010-PHAIC.pdf' has been generated.
CSV file for '/Users/arup/Documents/Test/Mbl.pdf' has been generated.
CSV file for '/Users/arup/Documents/Test/Permit-1.pdf' has been generated.
CSV file for '/Users/arup/Documents/Test/SICSY2306002400 - PL.pdf' has been generated.
CSV file for '/Users/arup/Documents/Test/SICSY2306002400 - MBL.pdf' has been generated.
CSV file for '/Users/arup/Documents/Test/Permit.pdf' has been generated.
CSV file for '/Users/arup/Documents/Test/SICSY2306002400 - CI.pdf' has been generated.
CSV file for '/Users/arup/Documents/Test/CIPL.pdf' has been generated.
CSV file for '/Users/ar