# Vision API Application for OCR

In [None]:
# System dependency: poppler-utils provides the PDF rendering tools that the
# pdf2image package (imported later in this notebook) is documented to require.
!apt-get install -y poppler-utils

In [None]:
# Install the Python packages for this notebook (-q keeps pip's output quiet).
# NOTE(review): cohere, tiktoken and transformers are not imported in the cells
# visible here — confirm they are still needed.
!pip install -q cohere tiktoken openai transformers pdf2image Pillow

In [13]:
import base64

from google.colab import userdata
from openai import OpenAI
from pdf2image import convert_from_path
from PIL import Image

# The API key is looked up in Colab's userdata under the name 'openai_key'.
OPENAI_KEY = userdata.get('openai_key')

# Shared OpenAI client, created once with that key.
client = OpenAI(api_key=OPENAI_KEY)

In [19]:
# Convert PDF to PNG
def pdf_to_png(pdf_file, output_folder, dpi=300):
    """Render each page of a PDF to its own PNG file.

    Args:
        pdf_file: Path of the PDF to convert.
        output_folder: Directory that receives the PNG files. It is created
            when it does not exist yet.
        dpi: Rendering resolution handed to ``convert_from_path``.

    Returns:
        The written PNG paths in page order. Each one is named
        ``<output_folder>/<PDF file name>_<page number>.png`` and page
        numbers start at 1.
    """
    import os  # local import: keeps this cell independent of the import cell

    if output_folder:
        os.makedirs(output_folder, exist_ok=True)

    # Use only the file name of the PDF. Embedding a directory component of
    # pdf_file in the output name would point outside output_folder.
    base_name = os.path.basename(pdf_file)

    pages = convert_from_path(pdf_file, dpi=dpi)
    images = []
    for page_number, page in enumerate(pages, start=1):
        output_path = f"{output_folder}/{base_name}_{page_number}.png"
        page.save(output_path, 'PNG')
        images.append(output_path)
    return images

def perform_OCR(prompt, image_path, model="gpt-4o", max_tokens=500):
    """Send a text prompt plus one image to a chat model and return its reply.

    The image file is read from disk, base64-encoded and embedded in the
    request as a data URL next to the prompt text.

    Args:
        prompt: Instruction text sent together with the image.
        image_path: Path of the image file to send.
        model: Model name passed to the chat completions call.
        max_tokens: Upper bound on the length of the reply.

    Returns:
        The message content of the first choice in the response.
    """
    import mimetypes  # local import: keeps this cell independent of the import cell

    with open(image_path, "rb") as image_file:
        encoded_image = base64.b64encode(image_file.read()).decode('utf-8')

    # Declare the real media type of the file (the notebook mostly sends PNGs).
    # Fall back to JPEG when the extension gives no usable image type.
    mime_type, _ = mimetypes.guess_type(image_path)
    if mime_type is None or not mime_type.startswith("image/"):
        mime_type = "image/jpeg"

    response = client.chat.completions.create(
        model=model,
        messages=[
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": prompt},
                    {
                        "type": "image_url",
                        "image_url": {
                            "url": f"data:{mime_type};base64,{encoded_image}",
                        },
                    },
                ],
            }
        ],
        max_tokens=max_tokens,
    )

    return response.choices[0].message.content

## Case: reading PDF files

In [None]:
# Render the pages of the sample PDF as PNG files in the current directory
image_path = 'sample.pdf'
images = pdf_to_png(pdf_file=image_path, output_folder='.')

print(images)

In [None]:
# Japanese prompt asking the model to extract invoice fields from the picture:
# billing date, billed-to name, issuer name, the amounts of the "小計"
# (subtotal), "消費税" (consumption tax) and "請求額" (billed amount) items,
# and the payment due date.
prompt = """写真から以下の情報を抽出しなさい。
- 請求日
- 請求先名
- 請求元名
- ”小計”項目の金額
- ”消費税”項目の金額
- ”請求額”項目の金額
- 支払期限の日付"""

# Run the OCR request once per page image and print each reply.
for image_path in images:
    print(perform_OCR(prompt=prompt, image_path=image_path))

## Case: reading image files directly

In [None]:
# Screenshot that is sent to the model as-is (no PDF conversion step).
image2 = 'Screenshot 2023-08-14 at 5.38.23 PM.png'

# Fields to pull out of the order screenshot.
prompt = """Extract the following information from the image.
- Order #
- Order date
- Recipient name
- Shipping address
- Item Details
- Total Merchandise
- Order total
"""

print(perform_OCR(prompt=prompt, image_path=image2))

In [None]:
# Screenshot that is sent to the model as-is (no PDF conversion step).
image2 = 'Screenshot 2024-06-09 at 5.35.39 PM.png'

# Fields to pull out of the flight screenshot. The prompt text goes to the
# model verbatim, so its wording matters ("fight" was a typo for "flight").
prompt = """Extract the following information from the image.
- Origin airport
- Destination airport
- Number of passengers
- Duration of departing flight
- Departure time of departing flight
- Arrival time of departing flight
- Flight number of departing flight
- Duration of returning flight
- Departure time of returning flight
- Arrival time of returning flight
- Flight number of returning flight
- Lowest total price of all flights
- Price of Economy Fully Refundable
"""

print(perform_OCR(prompt, image2))