In [1]:
import mimetypes
import os

import google.generativeai as genai

In [2]:
# Read the Gemini API key from the GOOGLE_API_KEY environment variable so the
# secret is never stored in (or committed with) the notebook. A missing
# variable fails here with a KeyError instead of at the first API request.
genai.configure(api_key=os.environ["GOOGLE_API_KEY"])

In [3]:
# Print the name of every listed model whose supported generation methods
# include 'generateContent'.
for model_info in genai.list_models():
    if 'generateContent' not in model_info.supported_generation_methods:
        continue
    print(model_info.name)

models/gemini-1.0-pro
models/gemini-1.0-pro-001
models/gemini-1.0-pro-latest
models/gemini-1.0-pro-vision-latest
models/gemini-1.5-flash
models/gemini-1.5-flash-001
models/gemini-1.5-flash-latest
models/gemini-1.5-pro
models/gemini-1.5-pro-001
models/gemini-1.5-pro-latest
models/gemini-pro
models/gemini-pro-vision


In [4]:
# Generation settings; passed as `generation_config` when the model is built.
MODEL_CONFIG = dict(
    temperature=0.2,
    top_p=1,
    top_k=32,
    max_output_tokens=4096,
)

# One entry per harm category, all sharing the same blocking threshold.
safety_settings = [
    {"category": harm_category, "threshold": "BLOCK_MEDIUM_AND_ABOVE"}
    for harm_category in (
        "HARM_CATEGORY_HARASSMENT",
        "HARM_CATEGORY_HATE_SPEECH",
        "HARM_CATEGORY_SEXUALLY_EXPLICIT",
        "HARM_CATEGORY_DANGEROUS_CONTENT",
    )
]

In [5]:
# Shared model handle; gemini_output() sends every request through it.
model = genai.GenerativeModel(
    model_name="gemini-pro-vision",
    generation_config=MODEL_CONFIG,
    safety_settings=safety_settings,
)

In [6]:
from pathlib import Path

def image_format(image_path):
    """Load an image file and wrap it as a single inline-data prompt part.

    Args:
        image_path: Location of the image file (str or path-like).

    Returns:
        A one-element list containing a dict with the keys ``"mime_type"``
        and ``"data"`` (the raw bytes of the file).

    Raises:
        FileNotFoundError: If ``image_path`` does not exist.
    """
    img = Path(image_path)

    if not img.exists():
        raise FileNotFoundError(f"Could not find image: {img}")

    # Derive the MIME type from the file extension rather than labelling every
    # file as PNG (this notebook passes .jpg files). Keep the former
    # "image/png" as the fallback when the extension is unknown or non-image.
    mime_type = mimetypes.guess_type(img.name)[0]
    if mime_type is None or not mime_type.startswith("image/"):
        mime_type = "image/png"

    image_parts = [
        {
            "mime_type": mime_type,
            "data": img.read_bytes()
        }
    ]
    return image_parts

In [7]:
def gemini_output(image_path, system_prompt, user_prompt):
    """Send one image plus two prompt strings to the module-level ``model``.

    The request parts are ordered as: ``system_prompt``, the first part
    returned by ``image_format(image_path)``, then ``user_prompt``.

    Args:
        image_path: Path of the image, forwarded to ``image_format``.
        system_prompt: First prompt part.
        user_prompt: Last prompt part.

    Returns:
        The ``text`` attribute of the response from ``model.generate_content``.
    """
    image_part = image_format(image_path)[0]
    request_parts = [system_prompt, image_part, user_prompt]
    return model.generate_content(request_parts).text

In [8]:

# Receipt image to query and the question to ask about it.
image_path = "Food_Bills_With_Texts/0003.jpg"
user_prompt = "What is the balance amount in the image?"

# Instruction text sent as the first prompt part.
system_prompt = """
               You are a specialist in comprehending receipts.
               Input images in the form of receipts will be provided to you,
               and your task is to respond to questions based on the content of the input image.
               """

# Last expression of the cell, so the returned text is shown as the cell output.
gemini_output(
    image_path=image_path,
    system_prompt=system_prompt,
    user_prompt=user_prompt,
)

' The balance amount is 600.'

In [26]:
# Invoice image used for the structured-extraction request.
image_path = "Conveyance_Invoices/uber_invoice2.jpg"

# Field list to extract as JSON.
# NOTE(review): the fields below are hotel-oriented while image_path points at
# a conveyance invoice -- confirm this pairing is intended.
user_prompt = """
Convert Invoice data into json format with appropriate json tags as required for the data in image,Extract all this information if and only if they exist other wise write NAN
1. Company Name
2. Invoice date
3. Invoice number
4. Total Amount
5. Hotel state
6. Hotel city
7. Hotel pin
8. Guest name
9. Guest's company name
10. GST Number of the guest's company
11. Checkin date
12. Checkout date
13. Total days stayed
14. Hotel service line items
"""

# Instruction text sent as the first prompt part.
system_prompt = """
               You are a specialist in comprehending receipts.
               Input images in the form of receipts will be provided to you,
               and your task is to respond to questions based on the content of the input image.
               """


In [27]:
# Run the extraction request and keep the reply text in `output`.
output = gemini_output(
    image_path=image_path,
    system_prompt=system_prompt,
    user_prompt=user_prompt,
)

In [28]:
from IPython.display import Markdown
# Wrap the reply text in a Markdown display object; as the last expression of
# the cell it becomes the cell's rendered output.
Markdown(output)

 ```json
{
  "Company Name": "Uber",
  "Invoice date": "11 May 2023",
  "Invoice number": "HR2305085031000199",
  "Mode of travel": "Cab",
  "Travel ticket class": "UberX",
  "From location": "27A, near Girish Park, Manicktala, Azad Hind Bag, Kolkata, West Bengal 700006, India",
  "To location": "NaN",
  "Departure date": "11 May 2023",
  "Departure time": "09:39:21",
  "Arrival date": "11 May 2023",
  "Arrival time": "09:39:21",
  "Number of Kilometer (traveled)": "NaN",
  "Inter-city or intra-city travel": "Intra-city",
  "Total Amount": "105.99"
}
```