In [29]:
import time
import glob
import os
import csv
import base64
from openai import OpenAI
from pdf2image import convert_from_path
import pytesseract
import re

# Initialize the OpenAI client outside the function so that a single client
# object is created once and passed to every processing call.
# NOTE(review): api_key is an empty string here — presumably redacted before
# sharing; a real key has to be supplied before any request can succeed.
client = OpenAI(api_key="")

# Function to clean the output folder before generating new images
def clean_output_folder(output_folder):
    """Delete the files that sit directly inside ``output_folder``.

    Regular files and symbolic links are removed. Real sub-directories are
    left in place: ``os.remove`` cannot delete a directory, and letting it
    raise would abort the clean-up half-way through. A folder that does not
    exist simply yields no matches, so nothing happens.

    Args:
        output_folder: Path of the directory whose files should be removed.

    Raises:
        OSError: If a matched file cannot be removed (e.g. permissions).
    """
    for entry_path in glob.glob(os.path.join(output_folder, '*')):
        # A symlink pointing at a directory is itself removable, so only
        # genuine directories are skipped.
        if os.path.isdir(entry_path) and not os.path.islink(entry_path):
            continue
        os.remove(entry_path)

# Function to convert PDF to images
def convert_pdf_to_images(pdf_path, output_folder):
    """Render each page of a PDF to a PNG file and return the saved paths.

    Pages are saved as ``<pdf name without extension>_page_<n>.png`` inside
    ``output_folder``, with ``n`` starting at 1. A one-page PDF follows the
    same path as a multi-page one, so no special case is needed.

    Args:
        pdf_path: Path of the PDF file to render.
        output_folder: Directory that receives the PNG files; it is created
            if it does not exist yet.

    Returns:
        A list with the path of every saved page image, in page order.
    """
    # Rendering and saving both need the target directory to exist.
    os.makedirs(output_folder, exist_ok=True)

    pdf_stem = os.path.splitext(os.path.basename(pdf_path))[0]

    # NOTE(review): output_folder is also handed to pdf2image, which
    # presumably writes its own rendered files there in addition to the
    # explicitly named copies saved below — confirm whether that is wanted.
    pages = convert_from_path(pdf_path, output_folder=output_folder, fmt='png')

    image_paths = []
    for page_number, page in enumerate(pages, start=1):
        image_path = os.path.join(output_folder, f"{pdf_stem}_page_{page_number}.png")
        page.save(image_path, "PNG")  # Save under the predictable per-page name
        image_paths.append(image_path)

    return image_paths

# Function to extract text from an image using pytesseract
def extract_text_from_image(image_path):
    """Run OCR on the image at ``image_path`` and return the recognised text.

    When no file exists at ``image_path`` an error line is printed and an
    empty string is returned instead of calling pytesseract.
    """
    # Guard clause: report the missing file and bail out early.
    if not os.path.exists(image_path):
        print(f"Error: File not found at path: {image_path}")
        return ""

    return pytesseract.image_to_string(image_path)

# Function to process text and classify based on regex patterns
def process_text(extracted_text):
    """Classify OCR text by document type and pick the matching GPT prompt.

    The rules are tried from top to bottom with a case-insensitive regex
    search and the first hit wins, so rule order is significant (e.g. a
    text containing both "Bill of Lading" and "Invoice" is a Bill Of Lading).
    Text that matches no rule is labelled 'Unknown' and gets a catch-all
    prompt that lists the keys for every supported document type.

    Returns:
        A new dict with the keys 'FileType' and 'Prompt'.
    """
    # (document type, regex pattern, prompt) — evaluated in this order.
    rules = (
        (
            'Permit',
            r'\bCARGO\sCLEARANCE\sPERMIT\b',
            "Strictly output only and only a list of key-value pairs from the document. Keys are: Permit Number, Message Type, Declaration Type, Importer details, Validity period, Exporter details, Handling Agent, Port of Loading/Next port of call, Port of Discharge/Final Port of call, Country of final destination, IN TRANSPORT IDENTIFIER, OU TRANSPORT IDENTIFIER, Outward carrier agent, Inward carrier agent, Conveyance reference Number, OBL/MAWB NO, ARRIVAL DATE, DEPARTURE DATE, CERTIFICATE No, PLACE OF RELEASE, PLACE OF RECEIPT, LICENCE NO, CUSTOMS PROCEDURE CODE (CPC), HS codes of all items, IN HAWB/HUCR/HBL of all items, OUT HAWB/HUCR/HBL of all items, Quantities of all items, Invoice number, Job number, Ref number, Name of company, Declarant name, Declarant code. For a key for which no value is extracted - return value as NA.",
        ),
        (
            'Air Way Bill',
            r'\bAIR\sWAYBILL\b',
            "Strictly output only and only a list of key-value pairs from the document. Keys are:  Airwaybill number, Shipper name, Consignee name, Issuer details, Shipper account number, Consignee account number, Agent IATA code, Departure airport, Destination airport, Declared value, Invoice number, Invoice date, Sb number, Sb date, HS code / HSN code, Weight, Dimensions/measurements, Payment terms. For a key for which no value is extracted - return value as NA.",
        ),
        (
            'Bill Of Lading',
            r'\bBILL\sOF\sLADING\b',
            "Strictly output only and only a list of key-value pairs from the document. Keys are:  Bill of Lading Number, Shipper Details, Consignee Details, Agent details (Logistics partner), Port of Loading, Port of Discharge, Ocean Vessel number/name, Voy. no, Number of pkgs, Items/goods description, weight, Measurement/dimensions, HS code / HSN code, Invoice number, Payment terms, No of original Bill of lading or B/l, Date of shipping, Place of issue, Date of issue. For a key for which no value is extracted - return value as NA.",
        ),
        (
            'Invoice',
            r'\b(Commercial\sInvoice|Invoice)\b',
            "Strictly output only and only a list of key-value pairs from the document. Keys are: Customer details, Ship To / destination, Invoice Number, date of invoice, payment terms, Currency. For a key for which no value is extracted - return value as NA.",
        ),
        (
            'Packaging List',
            r'\bPACKING\sLIST\b',
            "Strictly output only and only a list of key-value pairs from the document. Keys are:  Invoice Number, Items, No of cartons, dimensions. For a key for which no value is extracted - return value as NA.",
        ),
    )

    for document_type, pattern, prompt in rules:
        if re.search(pattern, extracted_text, re.IGNORECASE):
            return {'FileType': document_type, 'Prompt': prompt}

    # Nothing matched: let the model decide, using the combined prompt.
    return {
        'FileType': 'Unknown',
        'Prompt': "Strictly output only and only a list of key-value pairs from the document. If document is an invoice, keys are:Customer details, Ship To / destination, Invoice Number, date of invoice, payment terms, Currency. If it is a Bill of Lading list, keys are: Bill of Lading Number, Shipper Details, Consignee Details, Agent details (Logistics partner), Port of Loading, Port of Discharge, Ocean Vessel number/name, Voy. no, Number of pkgs, Items/goods description, weight, Measurement/dimensions, HS code / HSN code, Invoice number, Payment terms, No of original Bill of lading or B/l, Date of shipping, Place of issue, Date of issue. If it is a permit, keys are: Permit Number, Message Type, Declaration Type, Importer details, Validity period, Exporter details, Handling Agent, Port of Loading/Next port of call, Port of Discharge/Final Port of call, Country of final destination, IN TRANSPORT IDENTIFIER, OU TRANSPORT IDENTIFIER, Outward carrier agent, Inward carrier agent, Conveyance reference Number, OBL/MAWB NO, ARRIVAL DATE, DEPARTURE DATE, CERTIFICATE No, PLACE OF RELEASE, PLACE OF RECEIPT, LICENCE NO, CUSTOMS PROCEDURE CODE (CPC), HS codes of all items, IN HAWB/HUCR/HBL of all items, OUT HAWB/HUCR/HBL of all items, Quantities of all items, Invoice number, Job number, Ref number, Name of company, Declarant name, Declarant code. If it is a packaging list, keys are: Invoice Number, Items, No of cartons, dimensions. If it is an Air Way Bill, keys are: Airwaybill number, Shipper name, Consignee name, Issuer details, Shipper account number, Consignee account number, Agent IATA code, Departure airport, Destination airport, Declared value, Invoice number, Invoice date, Sb number, Sb date, HS code / HSN code, Weight, Dimensions/measurements, Payment terms. For a key for which no value is extracted - return value as NA. If the type of document is not invoice/packaging list/bill of lading/permit/air way bill, assume it is an invoice.",
    }


# Function to encode an image to base64
def encode_image(image_path):
    """Read the file at ``image_path`` and return its bytes as base64 text."""
    with open(image_path, "rb") as handle:
        raw_bytes = handle.read()

    encoded = base64.b64encode(raw_bytes)
    return encoded.decode('utf-8')

# Function to analyze images using OpenAI GPT
def analyze_images(image_paths, openai_client, prompt,
                   model="gpt-4-vision-preview", max_tokens=300):
    """Send the prompt plus every page image to the chat model.

    Each image is attached as its own ``image_url`` content part holding a
    base64 ``data:`` URL. (Concatenating several base64 payloads into one
    URL, separated by commas, does not produce a valid image.)

    Args:
        image_paths: Paths of the images to attach, in page order.
        openai_client: Client object exposing ``chat.completions.create``.
        prompt: Instruction text sent ahead of the images.
        model: Model name passed to the API.
        max_tokens: Upper bound on generated tokens. NOTE(review): the
            default of 300 is kept from the original code; prompts that ask
            for dozens of keys may need more — confirm.

    Returns:
        A list with the message content of every returned choice.
    """
    import mimetypes  # local import: only this function needs it

    content = [{"type": "text", "text": prompt}]
    for image_path in image_paths:
        # Label the payload with the file's real type; the pages produced by
        # this pipeline are PNG, so that is also the fallback.
        mime_type = mimetypes.guess_type(image_path)[0] or "image/png"
        content.append(
            {
                "type": "image_url",
                "image_url": {
                    "url": f"data:{mime_type};base64,{encode_image(image_path)}",
                },
            }
        )

    response = openai_client.chat.completions.create(
        model=model,
        messages=[{"role": "user", "content": content}],
        max_tokens=max_tokens,
    )

    return [choice.message.content for choice in response.choices]

# Helper: turn the model's "key: value" lines into a dictionary
def _parse_key_value_lines(responses):
    """Collect ``key: value`` lines from every response text into one dict.

    Each line is split on its FIRST colon only, so values that contain
    colons themselves (times, ratios, URLs) stay intact. Blank lines are
    skipped. A non-blank line without any colon is stored under the key
    'NA'; a later such line replaces an earlier one.
    """
    parsed = {}
    for response_text in responses:
        # The content of a choice may be None; treat that as empty text.
        for line in (response_text or "").splitlines():
            line = line.strip()
            if not line:
                continue
            key, separator, value = line.partition(':')
            if separator:
                parsed[key.strip()] = value.strip()
            else:
                parsed['NA'] = line  # Not in 'key: value' format
    return parsed


# Function to process PDF, extract text, classify, and further process using GPT
def process_pdf(pdf_path, openai_client):
    """Run the full pipeline for one PDF and return the collected results.

    Steps: render the pages to images (into the module-level
    ``output_image_folder``), OCR every page, classify the combined text,
    send the page images with the matching prompt to the model, and parse
    the answer into key/value pairs. The parsed pairs are also printed.

    Args:
        pdf_path: Path of the PDF to process.
        openai_client: Client object forwarded to ``analyze_images``.

    Returns:
        A dict with the keys 'FileType', 'Prompt', 'GPT_Response',
        'ExtractedData', 'FileName' and 'ProcessingTime' (seconds).
    """
    start_time = time.time()

    images = convert_pdf_to_images(pdf_path, output_image_folder)

    extracted_text = "".join(
        extract_text_from_image(image_path) for image_path in images
    )

    classified_data = process_text(extracted_text)

    extracted_content = analyze_images(images, openai_client, classified_data['Prompt'])
    classified_data['GPT_Response'] = extracted_content

    # Process and extract key-value pairs
    extracted_data = _parse_key_value_lines(extracted_content)

    processing_time = time.time() - start_time

    classified_data['ExtractedData'] = extracted_data
    classified_data['FileName'] = os.path.basename(pdf_path)
    classified_data['ProcessingTime'] = processing_time

    print(classified_data['ExtractedData'])
    return classified_data

# Function to save extracted data to CSV for a specific file type
def save_data_to_csv(data, file_type, file_name):
    """Append one document's extracted values to ``<file_type>_Output.csv``.

    The file lives in the module-level ``output_directory``, which is
    created when missing. Columns are kept consistent across calls: a row
    is always written under the header already present in the file, and
    when the row brings keys the header does not have yet, the file is
    rewritten with the extended header (earlier rows get empty cells for
    the new columns). Writing each row with its own key order would
    silently shift values under the wrong column.

    Args:
        data: Mapping of extracted keys to values for this document.
        file_type: Document type; selects the output file.
        file_name: Value for the leading 'File Name' column.
    """
    os.makedirs(output_directory, exist_ok=True)
    output_file = os.path.join(output_directory, f'{file_type}_Output.csv')

    # Row to write: the file name first, then the extracted keys and values
    file_data = {'File Name': file_name}
    file_data.update(data)

    # Load whatever is already in the file so the header can be honoured.
    header, previous_rows = [], []
    if os.path.exists(output_file):
        with open(output_file, newline='') as csvfile:
            stored = list(csv.reader(csvfile))
        if stored:
            header, previous_rows = stored[0], stored[1:]

    new_columns = [key for key in file_data if key not in header]

    if header and not new_columns:
        # Fast path: the header already covers this row, so just append.
        with open(output_file, 'a', newline='') as csvfile:
            csv.DictWriter(csvfile, fieldnames=header, restval='').writerow(file_data)
        return

    # New file, empty file, or new columns: write everything under the
    # (possibly extended) header.
    fieldnames = header + new_columns
    with open(output_file, 'w', newline='') as csvfile:
        plain_writer = csv.writer(csvfile)
        plain_writer.writerow(fieldnames)
        for row in previous_rows:
            # Pad older rows so every row has a cell for each new column.
            plain_writer.writerow(row + [''] * (len(fieldnames) - len(row)))
        csv.DictWriter(csvfile, fieldnames=fieldnames, restval='').writerow(file_data)

# Set your PDF directory path (every *.pdf directly inside it is processed)
pdf_folder = '/Users/arup/Documents/Test/*.pdf'
pdf_files = glob.glob(pdf_folder)

# Page images of the PDF currently being processed are written here
output_image_folder = '/Users/arup/Documents/Test/Images/'  # Change this to your desired output folder

# One <FileType>_Output.csv per document type is written here
output_directory = '/Users/arup/Documents/Test/CSV/'  # Change this to your desired directory

# Full result dictionary of every processed PDF, in processing order.
# Defined here so the loop below does not depend on a name created elsewhere.
all_outputs = []

# Dictionary to hold classified data for different document types
classified_data_by_type = {
    'Invoice': [],
    'Permit': [],
    'Packaging List': [],
    'Bill Of Lading': [],
    'Air Way Bill': [],
    'Unknown': [],
}

# Dictionary to store processing times for each document type
processing_times_by_type = {
    'Invoice': [],
    'Permit': [],
    'Packaging List': [],
    'Bill Of Lading': [],
    'Air Way Bill': [],
    'Unknown': [],
}

for pdf_file in pdf_files:
    clean_output_folder(output_image_folder)  # Clean the output folder before processing new PDF

    result = process_pdf(pdf_file, client)
    all_outputs.append(result)

    file_type = result['FileType']
    save_data_to_csv(result['ExtractedData'], file_type, os.path.splitext(os.path.basename(pdf_file))[0])

    print(f"CSV data for {os.path.basename(pdf_file)} saved for type: {file_type}")

    # Append data to respective document type in the dictionary; setdefault
    # keeps the loop working should a type outside the predefined set appear.
    classified_data_by_type.setdefault(file_type, []).append(result['ExtractedData'])
    processing_times_by_type.setdefault(file_type, []).append(result['ProcessingTime'])

# Calculate and display average processing time for each document type
for doc_type, times_list in processing_times_by_type.items():
    if times_list:
        avg_time = sum(times_list) / len(times_list)
        print(f"Average Processing Time for {doc_type}: {avg_time} seconds")


{'Airwaybill number': '112300060', 'Shipper name': 'MILEAGE LOGISTICS PRIVATE LIMITED', 'Consignee name': 'CONSORTIUM PTE LTD', 'Issuer details': 'MILEAGE LOGISTICS PRIVATE LIMITED, MUMBAI', 'Shipper account number': 'NA', 'Consignee account number': 'NA', 'Agent IATA code': '14-3-25844 0004', 'Departure airport': 'MUMBAI, BOM', 'Destination airport': 'SINGAPORE, SIN', 'Declared value': 'NCV', 'Invoice number': '6629/RE/2023-24', 'Invoice date': '29/05/2023', 'Sb number': '1466323', 'Sb date': '01/09/2023', 'HS code / HSN code': '90158090', 'Weight': '426.000 KGS', 'Dimensions/measurements': '7 × 120 X 25 X 27, 1 = 36 X 28 X 9, 1 = 60 X 60 X 35, 1 = 60 X 50 X 30, 1 = 80 X 50 X 53, 1 = 80 X 55 X 28, 1 = 79 X 54 X 36, 1 = 52 X 43 X 22.', 'Payment terms': 'NA'}
CSV data for HAWB.pdf saved for type: Air Way Bill
{'Customer details': 'PANASONIC APPLIANCES INDIA COMPANY LTD., Sholavaram Village, Ponneri Taluk, Chennai-600 067, India', 'Ship To / destination': 'CARGOPORT PTE LTD, BLK 519 KAMP