In [17]:
import time
import glob
import os
import csv
import base64
from openai import OpenAI
from pdf2image import convert_from_path
import pytesseract
import re

# Initialize the OpenAI client outside the function

client = OpenAI(api_key="")

# Function to clean the output folder before generating new images
def clean_output_folder(output_folder):
    file_list = glob.glob(os.path.join(output_folder, '*'))
    for file_path in file_list:
        os.remove(file_path)

# Function to convert PDF to images
def convert_pdf_to_images(pdf_path, output_folder):
    file_name = os.path.basename(pdf_path)
    file_name_no_extension = os.path.splitext(file_name)[0]
    
    images = convert_from_path(pdf_path, output_folder=output_folder, fmt='png')

    if len(images) == 1:  # Check if only one page in the PDF
        image_path = os.path.join(output_folder, f"{file_name_no_extension}_page_1.png")
        images[0].save(image_path, "PNG")
        return [image_path]

    # Save images with expected filenames based on PDF name and page number
    image_paths = []
    for i, image in enumerate(images):
        image_path = os.path.join(output_folder, f"{file_name_no_extension}_page_{i+1}.png")
        image.save(image_path, "PNG")  # Save the image with the expected filename
        image_paths.append(image_path)
    
    return image_paths

# Function to extract text from an image using pytesseract
def extract_text_from_image(image_path):
    if os.path.exists(image_path):
        return pytesseract.image_to_string(image_path)
    else:
        print(f"Error: File not found at path: {image_path}")
        return ""

# Function to process text and classify based on regex patterns
def process_text(extracted_text):
    # Define regex patterns
    invoice_pattern = r'\b(Commercial\sInvoice|Invoice)\b'
    permit_pattern = r'\bCARGO\sCLEARANCE\sPERMIT\b'
    packaging_list_pattern = r'\b(PACKING\sLIST|MANIFEST)\b'
    bill_of_lading_pattern = r'\bBILL\sOF\sLADING\b'
    air_waybill_pattern = r'\bAIR\sWAYBILL\b'
    booking_confirmation_pattern = r'\bBOOKING\sCONFIRMATION\b'
    delivery_order_pattern = r'\bDELIVERY\sORDER\b'

    
    output_data = {}

    text = " --- " + extracted_text + " --- " 
    pre_prompt = " You are given a document text delimited by hyphens {0} Use the document text to answer the query.".format(text)
    output_data['PrePrompt'] = pre_prompt

    if re.search(permit_pattern, extracted_text, re.IGNORECASE):
        output_data['FileType'] = 'Permit'
        output_data['Prompt'] = "Output a list of values for the following keys. Values should be separated by semicolon. Keys are: Permit Number, Message Type, Declaration Type, Importer details, Validity period, Exporter details, Handling Agent, Port of Loading/Next port of call, Port of Discharge/Final Port of call, Country of final destination, IN TRANSPORT IDENTIFIER, OU TRANSPORT IDENTIFIER, Outward carrier agent, Inward carrier agent, Conveyance reference Number, OBL/MAWB NO, ARRIVAL DATE, DEPARTURE DATE, CERTIFICATE No, PLACE OF RELEASE, PLACE OF RECEIPT, LICENCE NO, CUSTOMS PROCEDURE CODE (CPC), HS codes of all items, IN HAWB/HUCR/HBL of all items, OUT HAWB/HUCR/HBL of all items, Quantities of all items, Invoice number, Job number, Ref number, Name of company, Declarant name, Declarant code. For a key for which no value is extracted - return value as ZZZZZZ. Strictly only output a list of values for all the given keys separated by semicolon"
    elif re.search(air_waybill_pattern, extracted_text, re.IGNORECASE):  
        output_data['FileType'] = 'Air Way Bill'
        output_data['Prompt'] = "Output a list of values for the following keys. Values should be separated by semicolon. Keys are:  Airwaybill / AWB number, Shipper name, Consignee name, Issuer details, Shipper account number, Consignee account number, Agent IATA code, Departure airport, Destination airport, Declared value, Invoice number, Invoice date, Sb number, Sb date, HS code / HSN code, Weight, Dimensions/measurements, Hawb / hbl number,Payment terms, Items/goods description. For a key for which no value is extracted - return value as ZZZZZZ. Strictly only output a list of values for all the given keys separated by semicolon"
    elif re.search(bill_of_lading_pattern, extracted_text, re.IGNORECASE):
        output_data['FileType'] = 'Bill Of Lading'
        output_data['Prompt'] = "Output a list of values for the following keys. Values should be separated by semicolon. Keys are:  Bill of Lading Number, Shipper Details, Consignee Details, Agent details (Logistics partner), Port of Loading, Port of Discharge, Ocean Vessel number/name, Voy. no, Number of pkgs, Items/goods description, weight, Measurement/dimensions, HS code / HSN code, Invoice number, Payment terms, Date of shipping, Place of issue, Date of issue. For a key for which no value is extracted - return value as ZZZZZZ. Strictly only output a list of values for all the given keys separated by semicolon"
    elif re.search(invoice_pattern, extracted_text, re.IGNORECASE):
        output_data['FileType'] = 'Invoice'
        output_data['Prompt'] = "Output a list of values for the following keys. Values should be separated by semicolon. Keys are: Customer details, Ship To / destination, Invoice Number, date of invoice, payment terms, Currency amount, Items, HS codes, BL number/Bill of lading number, Weight. For a key for which no value is extracted - return value as ZZZZZZ. Strictly only output a list of values for all the given keys separated by semicolon"
    elif re.search(packaging_list_pattern, extracted_text, re.IGNORECASE):
        output_data['FileType'] = 'Packaging List'
        output_data['Prompt'] = "Output a list of values for the following keys. Values should be separated by semicolon. Keys are:  Invoice Number, Items, No of cartons, dimensions. For a key for which no value is extracted - return value as ZZZZZZ. Strictly only output a list of values for all the given keys separated by semicolon"
    else:
        output_data['FileType'] = 'Unknown'
    return output_data


# Function to analyze images using OpenAI GPT
def analyze_images(openai_client, prompt, pre_prompt):

    messages = [
        {
            "role": "system",
            "content": [
                {
                    "type": "text",
                    "text": pre_prompt,
                }

            ]
            },
        {
            "role": "user",
            "content": [
                {
                    "type": "text",
                    "text": prompt
                },
            ]
        }
    ]

    response = openai_client.chat.completions.create(
        model="gpt-4",
        messages=messages,
        max_tokens=300,
    )

    return [resp.message.content for resp in response.choices]

# Function to process PDF, extract text, classify, and further process using GPT
max_token_limit = 32000
def process_pdf(pdf_path, openai_client):
    start_time = time.time()
    
    images = convert_pdf_to_images(pdf_path, output_image_folder)
    
    extracted_text = ""
    for image_path in images:
        extracted_text += extract_text_from_image(image_path)

    # Check if the total token count exceeds the limit
    if len(extracted_text.split()) > max_token_limit:
        print(f"Skipping document {os.path.basename(pdf_path)} due to excessive length.")
        return None

    classified_data = process_text(extracted_text)

    # Only check permits, AWB and Invoice
    check_types = ['Permit', 'Bill Of Lading', 'Packaging List', 'Invoice']
    if classified_data['FileType'] in check_types:
        extracted_content = analyze_images(openai_client, classified_data['Prompt'], classified_data['PrePrompt'])
        classified_data['GPT_Response'] = extracted_content

        end_time = time.time()
        processing_time = end_time - start_time
        classified_data['Extracted_text'] = extracted_text
        classified_data['FileName'] = os.path.basename(pdf_path)
        classified_data['ProcessingTime'] = processing_time

        return classified_data
    else:
        print("Not Evaluating file {0} - type {1}".format(os.path.basename(pdf_path), classified_data['FileType']))
        return None


import csv
import os

def create_key_value_pairs(data, file_type, file_name):
    # Predefined keys for different document types
    predefined_keys = {
        'Invoice': ['Customer details', 'Ship To / destination', 'Invoice Number', 'date of invoice', 'payment terms', 'Currency', 'Items', 'HS codes','BL number/Bill of lading number','Weight'],
        'Permit': ['Permit Number', 'Message Type', 'Declaration Type', 'Importer details', 'Validity period', 'Exporter details', 'Handling Agent', 'Port of Loading/Next port of call', 'Port of Discharge/Final Port of call', 'Country of final destination', 'IN TRANSPORT IDENTIFIER', 'OU TRANSPORT IDENTIFIER', 'Outward carrier agent', 'Inward carrier agent', 'Conveyance reference Number', 'OBL/MAWB NO', 'ARRIVAL DATE', 'DEPARTURE DATE', 'CERTIFICATE No', 'PLACE OF RELEASE', 'PLACE OF RECEIPT', 'LICENCE NO', 'CUSTOMS PROCEDURE CODE (CPC)', 'HS codes of all items', 'IN HAWB/HUCR/HBL of all items', 'OUT HAWB/HUCR/HBL of all items', 'Quantities of all items', 'Invoice number', 'Job number', 'Ref number', 'Name of company', 'Declarant name', 'Declarant code'],
        'Packaging List': ['Invoice Number', 'Items', 'No of cartons', 'dimensions'],
        'Bill Of Lading': ['Bill of Lading Number', 'Shipper Details', 'Consignee Details', 'Agent details (Logistics partner)', 'Port of Loading', 'Port of Discharge', 'Ocean Vessel number/name', 'Voy. no', 'Number of pkgs', 'Items/goods description', 'weight', 'Measurement/dimensions', 'HS code / HSN code', 'Invoice number', 'Payment terms', 'Date of shipping', 'Place of issue', 'Date of issue'],
        'Air Way Bill': ['Airwaybill / AWB number', 'Shipper name', 'Consignee name', 'Issuer details', 'Shipper account number', 'Consignee account number', 'Agent IATA code', 'Departure airport', 'Destination airport', 'Declared value', 'Invoice number', 'Invoice date', 'Sb number', 'Sb date', 'HS code / HSN code', 'Weight', 'Dimensions/measurements','Hawb / hbl number','Payment terms','Items/goods description'],
    }

    # Prepare data as a dictionary for writing to CSV
    file_data = {'File Name': file_name}

    # Split the data string into values based on semicolons
    values = data.split(';')

    # Determine the predefined keys for the file type
    extracted_keys = predefined_keys.get(file_type, [])

    # Iterate over the extracted keys and values
    for i, key in enumerate(extracted_keys):
        value = values[i] if i < len(values) else 'NA'  # Get value or set 'NA' if not found
        file_data[key] = value

    return file_data

# Function to analyze permit and add additional fields
def analyze_permit(extracted_data, extracted_text):
    if extracted_data['IN HAWB/HUCR/HBL of all items'] == 'ZZZZZZ':
        hs_codes = []
        hawb_numbers = []
        j = 0
        # Some files do not have HAWB/HUCR/HBL Number
        # This flag is used to check if the number is present
        hawb_exists = False
        text = extracted_text
        # Split the text into lines and remove empty lines
        text = text.split('\n')
        text = [line for line in text if line != '']
        for li,line in enumerate(text):
            # Check if HAWB/HUCR/HBL Number is present
            if line.startswith('IN HAWB/HUCR/HBL'):
                hawb_exists = True
            try:
                # Change the range to accomodate more HS Codes
                for i in range(j,10):
                    if (line.split()[0] == '0'+str(i)):
                        hs_codes.append(line.split()[1])
                        # To avoid duplicates
                        j = i+1
                        if hawb_exists:
                            hawb_numbers.append(text[li+2])
            except:
                pass
            if line == 'TRADER’ S REMARKS':
                pattern = re.compile(r'INV(?:OICE)?[ #]*:?[ \t]*([A-Za-z0-9/-]+)')
                matches = re.findall(pattern, text[li+1])
                if matches:
                    extracted_data['Invoice Number'] = text[li+1].split()[-1]
                else:
                    extracted_data['Invoice Number'] = None
        # Extract unique values
        hs_codes = list(set(hs_codes))
        hawb_numbers = list(set(hawb_numbers))
        extracted_data['HS codes of all items'] = hs_codes
        extracted_data['OUT HAWB/HUCR/HBL of all items'] = hawb_numbers
    return extracted_data

def analyze_AWB(extracted_data,extracted_text):
    if extracted_data['Airwaybill / AWB number'] == "ZZZZZZ":
        # Define the regex pattern
        patterns = [
            r'\b\d{3}-\d{8}\b',                                      # Pattern 1
            r'\b\d{3}\. \| [A-Z]+ \| \d{4} \d{4}\b',                 # Pattern 2
            r'\b\d{3} DEL\| \d{4}-\d{4}\b',                          # Pattern 3
            r'\b\d{3} DEL\| \d{4} [A-Z]{3}-\d{4}\b',                 # Pattern 4
            r'\b\d{3} \d{8}\b',                                      # Pattern 5
            r'\b\d{11}\b',                                           # Pattern 6
            r'\b\d{3} \|DEL\| \d{4}-\d{4}\b'
        ]
        matched_hawb = ""
        # Extract AWB numbers using regex
        for pattern_idx, pattern in enumerate(patterns, start=1):
            match = re.search(pattern, extracted_text)
            if match:
                matched_hawb = match.group()
                break
        if matched_hawb == "":
            return None
        else:
            # Extract numbers only
            matched_hawb = re.sub(r'\D', '', matched_hawb)
            extracted_data['Airwaybill / AWB number'] = matched_hawb
    return extracted_data  

def save_data_to_csv(data, file_type, file_name,extracted_text,output_directory):
    output_file = os.path.join(output_directory, f'{file_type}_Output.csv')

    # Create key-value pairs from the data, including the file name
    file_data = create_key_value_pairs(data, file_type, file_name)

    if file_type == "Permit":
        file_data = analyze_permit(file_data,extracted_text)
    elif file_type == "Air Way Bill":
        file_data = analyze_AWB(file_data,extracted_text)
    print("Final Data:", file_data)

    # Check if the file already exists, if not create a new file and write the data
    write_header = not os.path.exists(output_file)
    with open(output_file, 'a', newline='') as csvfile:
        fieldnames = list(file_data.keys())
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        if write_header:
            writer.writeheader()
        writer.writerow(file_data)

# Dictionary to hold classified data for different document types
classified_data_by_type = {
    'Invoice': [],
    'Permit': [],
    'Packaging List': [],
    'Bill Of Lading': [],
    'Air Way Bill': [],
}

# Dictionary to store processing times for each document type
processing_times_by_type = {
    'Invoice': [],
    'Permit': [],
    'Packaging List': [],
    'Bill Of Lading': [],
    'Air Way Bill': [],
}

all_outputs = []

# Creating the JOBS folder
job_folder = '/Users/arup/Documents/IDP/SEA_IMPORT'
output_folder = 'output_import/'
pdf_file_folders = os.listdir(job_folder)
output_image_folder = 'output_images/'
# Iterate over the folders in the JOBS folder
for folder in pdf_file_folders:
    pdf_folder = os.path.join(job_folder,folder)
    pdf_files = glob.glob(pdf_folder+'/*pdf')
    print("\n\nFOLDER - {}".format(pdf_folder))
    print(pdf_files)
    # Create an output folder for each folder in the JOBS folder
    output_sub_folder = os.path.join(output_folder,folder)
    if not os.path.exists(output_sub_folder):
        os.makedirs(output_sub_folder)
    else:
        continue
    # Iterate over the PDF files in the folder
    for pdf_file in pdf_files:
        print(pdf_file)

        clean_output_folder(output_image_folder)  # Clean the output folder before processing new PDF

        result = process_pdf(pdf_file, client)
        if result:
            extracted_text = result['Extracted_text']
            all_outputs.append(result)

            file_type = result['FileType']
            gpt_responses = result['GPT_Response']  # Get the list of GPT responses
            for response in gpt_responses:
                save_data_to_csv(response, file_type, os.path.splitext(os.path.basename(pdf_file))[0],extracted_text,output_sub_folder)
                print(f"CSV data for {os.path.basename(pdf_file)} saved for type: {file_type}")

            # Append data to respective document type in the dictionary
            classified_data_by_type[file_type].append(result['GPT_Response'])
            processing_times_by_type[file_type].append(result['ProcessingTime'])

    # Calculate and display average processing time for each document type
    for doc_type, times_list in processing_times_by_type.items():
        if times_list:
            avg_time = sum(times_list) / len(times_list)
            print(f"Average Processing Time for {doc_type}: {avg_time} seconds")



FOLDER - /Users/arup/Documents/IDP/SEA_IMPORT/SICSY2309001500
['/Users/arup/Documents/IDP/SEA_IMPORT/SICSY2309001500/SICSY2309001500 - CI.pdf', '/Users/arup/Documents/IDP/SEA_IMPORT/SICSY2309001500/SICSY2309001500 - PL.pdf', '/Users/arup/Documents/IDP/SEA_IMPORT/SICSY2309001500/IG3I049039A.pdf']


FOLDER - /Users/arup/Documents/IDP/SEA_IMPORT/.DS_Store
[]


FOLDER - /Users/arup/Documents/IDP/SEA_IMPORT/SICSY2311002400
['/Users/arup/Documents/IDP/SEA_IMPORT/SICSY2311002400/SICSY2311002400 - CI 2.pdf', '/Users/arup/Documents/IDP/SEA_IMPORT/SICSY2311002400/SICSY2311002400 - CI 3.pdf', '/Users/arup/Documents/IDP/SEA_IMPORT/SICSY2311002400/SICSY2311002400 - CI 1.pdf', '/Users/arup/Documents/IDP/SEA_IMPORT/SICSY2311002400/SICSY2311002400 - IG PERMIT.pdf', '/Users/arup/Documents/IDP/SEA_IMPORT/SICSY2311002400/SICSY2311002400 - HBL.pdf', '/Users/arup/Documents/IDP/SEA_IMPORT/SICSY2311002400/SICSY2311002400 - MBL.pdf', '/Users/arup/Documents/IDP/SEA_IMPORT/SICSY2311002400/SICSY2311002400 - PL