<a href="https://colab.research.google.com/github/aparna181/Gen_ai-1/blob/main/gen_ai.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Data Extraction Project from PNG Files Using a Multimodal LLM

In [1]:
%%capture
!pip install -q -U google-generativeai    #Installing google-generativeai

In [2]:
import google.generativeai as genai # Importing the library that was installed

In [3]:
# Configure the Gemini client with an API key.
# The key is read from Colab's secret store rather than being hard-coded in
# the notebook. NOTE(review): google.colab is presumably only importable inside
# a Colab runtime -- confirm before running this notebook elsewhere.
from google.colab import userdata

# 'gemini_key' is the name of the Colab secret expected to hold the API key.
GOOGLE_API_KEY=userdata.get('gemini_key')

# Register the key with the genai library for the calls made in later cells.
genai.configure(api_key=GOOGLE_API_KEY)

In [4]:
# list of models from google genai
# Use a dedicated loop variable. The notebook later binds the global name
# `model` to the GenerativeModel instance; looping with `for model in ...`
# would overwrite that handle with a metadata record whenever this cell is
# re-run, breaking every later generate_content call.
for model_info in genai.list_models():
    # To list only models usable for content generation, filter on
    # 'generateContent' in model_info.supported_generation_methods
    # and print model_info.name instead of the full record.
    print(model_info)

Model(name='models/chat-bison-001',
      base_model_id='',
      version='001',
      display_name='PaLM 2 Chat (Legacy)',
      description='A legacy text-only model optimized for chat conversations',
      input_token_limit=4096,
      output_token_limit=1024,
      supported_generation_methods=['generateMessage', 'countMessageTokens'],
      temperature=0.25,
      max_temperature=None,
      top_p=0.95,
      top_k=40)
Model(name='models/text-bison-001',
      base_model_id='',
      version='001',
      display_name='PaLM 2 (Legacy)',
      description='A legacy model that understands text and generates text as an output',
      input_token_limit=8196,
      output_token_limit=1024,
      supported_generation_methods=['generateText', 'countTextTokens', 'createTunedTextModel'],
      temperature=0.7,
      max_temperature=None,
      top_p=0.95,
      top_k=40)
Model(name='models/embedding-gecko-001',
      base_model_id='',
      version='001',
      display_name='Embedding Gecko

In [5]:
# Model Configuration
# Generation settings handed to the model as its generation_config.
MODEL_CONFIG = dict(
    temperature=0.2,         # low sampling temperature: less random, more repeatable answers
    top_p=1,                 # nucleus-sampling cutoff; 1 keeps the whole probability mass
    top_k=32,                # sample only among the 32 most likely next tokens
    max_output_tokens=4096,  # upper bound on the length of a generated reply
)

## Safety Settings of Model
# One entry per harm category; every category uses the same blocking threshold.
safety_settings = [
    {"category": harm_category, "threshold": "BLOCK_MEDIUM_AND_ABOVE"}
    for harm_category in (
        "HARM_CATEGORY_HARASSMENT",
        "HARM_CATEGORY_HATE_SPEECH",
        "HARM_CATEGORY_SEXUALLY_EXPLICIT",
        "HARM_CATEGORY_DANGEROUS_CONTENT",
    )
]

In [7]:
# LOAD GEMINI MODEL WITH MODEL CONFIGURATIONS
# Shared model handle used by the rest of the notebook. Sampling parameters
# come from MODEL_CONFIG and content filtering from safety_settings.
model = genai.GenerativeModel(
    model_name="gemini-1.5-flash",
    generation_config=MODEL_CONFIG,
    safety_settings=safety_settings,
)

In [8]:
# DEFINE IMAGE FORMAT TO INPUT IN GEMINI
from pathlib import Path

def image_format(image_path, mime_type=None):  # e.g. /content/invoice.png
    """Read an image file and wrap it as an inline-data part for the model.

    Args:
        image_path: Location of the image file (``str`` or ``Path``).
        mime_type: Optional MIME type to declare for the bytes. When omitted,
            it is guessed from the file extension (e.g. ``.jpg`` gives
            ``image/jpeg``). If the extension is unknown or does not map to an
            image type, ``image/png`` is used, which was the previously
            hard-coded value.

    Returns:
        A one-element list containing a dict with the keys ``"mime_type"``
        and ``"data"`` (the raw bytes of the file).

    Raises:
        FileNotFoundError: If ``image_path`` does not exist.
    """
    import mimetypes  # local import keeps this cell runnable on its own

    img = Path(image_path)

    if not img.exists():
        raise FileNotFoundError(f"Could not find image: {img}")

    if mime_type is None:
        guessed = mimetypes.guess_type(img.name)[0]
        # Only trust the guess when it is an image type; otherwise keep the
        # historical default so existing PNG callers see no change.
        if guessed is not None and guessed.startswith("image/"):
            mime_type = guessed
        else:
            mime_type = "image/png"

    image_parts = [
        {
            "mime_type": mime_type,
            "data": img.read_bytes()
        }
    ]
    return image_parts

In [9]:
# Query the configured Gemini model (gemini-1.5-flash) about an image
def gemini_output(image_path, system_prompt, user_prompt):
    """Ask the module-level ``model`` a question about a single image.

    The request is a three-part prompt, in this order: the system prompt
    text, the image part produced by ``image_format(image_path)``, and the
    user prompt text.

    Returns:
        The ``text`` attribute of the response from ``model.generate_content``.
    """
    # image_format returns a list; its first element is the image part.
    image_part = image_format(image_path)[0]
    prompt_parts = [system_prompt, image_part, user_prompt]
    return model.generate_content(prompt_parts).text

In [10]:
# Instruction text sent ahead of the image. The literal is triple-quoted, so
# its leading/trailing newlines and indentation are part of the string.
system_prompt = """
               You are a specialist in comprehending receipts.
               Input images in the form of receipts will be provided to you,
               and your task is to respond to questions based on the content of the input image.
               """

# The file must exist at this path; image_format raises FileNotFoundError otherwise.
image_path = "/content/invoice.png"

user_prompt = "What is the invoice date?"

# Last expression of the cell, so the returned answer is shown as the cell output.
gemini_output(image_path, system_prompt, user_prompt)

'The invoice date is 12/08/2011.'

In [11]:
# Same instruction text as in the previous query cell, re-declared so this
# cell can be run on its own.
system_prompt = """
               You are a specialist in comprehending receipts.
               Input images in the form of receipts will be provided to you,
               and your task is to respond to questions based on the content of the input image.
               """

# The file must exist at this path; image_format raises FileNotFoundError otherwise.
image_path = "/content/invoice.png"

user_prompt = "What is the total amount?"

# Last expression of the cell, so the returned answer is shown as the cell output.
gemini_output(image_path, system_prompt, user_prompt)

'The total amount is 901.80 EUR.'

In [12]:
# Instruction text sent ahead of the image. The literal is triple-quoted, so
# its leading/trailing newlines and indentation are part of the string.
system_prompt = """
               You are a specialist in comprehending receipts.
               Input images in the form of receipts will be provided to you,
               and your task is to respond to questions based on the content of the input image.
               """
# The JSON-conversion instruction is passed as the user prompt below; the
# system prompt stays the generic receipt-specialist text.
image_path =  "/content/handwritten.png"
user_prompt = "Convert Invoice data into json format with appropriate json tags as required for the data in image "
# Keep the reply in `output` so it can be printed with its line breaks intact.
output = gemini_output(image_path, system_prompt, user_prompt)

print(output)

```json
{
  "supplier": {
    "name": "RedmineCRM",
    "address": "Company representative name\nYour company address",
    "tax_id": null, 
    "phone": null,
    "fax": null
  },
  "client": {
    "name": "\"Romashka\" Ltd.",
    "address": "1600 Amphitheatre Parkway Mountain View, CA 94043"
  },
  "invoice_number": "INV/20111209-22",
  "invoice_date": "12/08/2011",
  "due_date": "12/25/2012",
  "items": [
    {
      "item_number": 1,
      "description": "Projecting\n- Context menu for invoices list",
      "quantity": 1.0,
      "unit": "hours",
      "unit_price": 50.00,
      "total": 50.00
    },
    {
      "item_number": 2,
      "description": "Develop\n- Invoice number format template\n- [PRO] Duplicating invoices\n- Language support\n- Context menu for invoices list",
      "quantity": 17.0,
      "unit": "hours",
      "unit_price": 40.00,
      "total": 680.00
    },
    {
      "item_number": 3,
      "description": "Analysis\n- [PRO] Duplicating invoices\n- Language su

In [13]:
# Directory for temporary uploaded files
import os
# Scratch directory that receives images uploaded through the UI.
UPLOAD_DIR = "/tmp/uploads"

# Create the directory if it is missing; exist_ok makes re-running this cell
# harmless when the directory is already there.
os.makedirs(name=UPLOAD_DIR, exist_ok=True)

In [14]:
# Gradio function that processes the image and user prompt
def process_receipt(image, user_prompt):
    """Gradio callback: answer ``user_prompt`` about an uploaded receipt image.

    Args:
        image: The uploaded image. It must provide ``save(path)``; the UI
            passes a PIL image. ``None`` when nothing has been uploaded.
        user_prompt: The question or instruction typed by the user.

    Returns:
        The model's text answer, or a short hint when no image was uploaded.
    """
    # The handler receives None when the image component is empty. Without
    # this guard, image.save() fails with AttributeError and the UI shows
    # only a generic error.
    if image is None:
        return "Please upload a receipt image first."

    # Persist the upload, because gemini_output reads the image from a path.
    image_path = os.path.join(UPLOAD_DIR, "uploaded_receipt.png")
    image.save(image_path)
    system_prompt = """
               You are a specialist in comprehending receipts.
               Input images in the form of receipts will be provided to you,
               and your task is to respond to questions based on the content of the input image.
               """

    # Call the gemini_output function
    output = gemini_output(image_path, system_prompt, user_prompt)

    return output

In [15]:
%%capture
pip install gradio

In [16]:
# Create Gradio interface
import gradio as gr
with gr.Blocks() as app:
    # Page heading and a short usage hint.
    gr.Markdown("## Receipt Data Extraction")
    gr.Markdown("Upload a receipt image and provide a custom prompt for extracting information.")

    # Inputs side by side: the image (handed to the callback as a PIL image)
    # and the free-text prompt.
    with gr.Row():
        image_input = gr.Image(label="Upload Receipt Image", type="pil")
        user_prompt_input = gr.Textbox(label="User Prompt", placeholder="E.g., 'Convert invoice data to JSON format'")

    # Text box that shows the string returned by process_receipt.
    output_display = gr.Textbox(label="Output")

    # Create a button to trigger the processing function
    submit_button = gr.Button("Process Receipt")

    # Set up event handling: the two inputs are passed positionally to
    # process_receipt(image, user_prompt) and its result fills output_display.
    submit_button.click(fn=process_receipt,
                        inputs=[image_input, user_prompt_input],
                        outputs=output_display)

# Launch the app
# NOTE(review): with debug=True this cell keeps running until interrupted. In
# Colab, Gradio also turns on share=True automatically, which publishes a
# public URL whose requests are served with this notebook's API key.
app.launch(debug=True)

Running Gradio in a Colab notebook requires sharing enabled. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. This cell will run indefinitely so that you can see errors and logs. To turn off, set debug=False in launch().
* Running on public URL: https://4813793171774fa73a.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


Keyboard interruption in main thread... closing server.
Killing tunnel 127.0.0.1:7860 <> https://4813793171774fa73a.gradio.live




In [17]:
# Clone the project's GitHub repository into the current working directory.
! git clone https://github.com/aparna181/Gen_ai-1.git

Cloning into 'Gen_ai-1'...


In [18]:
# Make the cloned repository the working directory for the cells that follow.
%cd Gen_ai-1

/content/Gen_ai-1


In [42]:
# Create a new branch named feature-branch and switch to it.
!git checkout -b feature-branch

Switched to a new branch 'feature-branch'


In [50]:
# Append a "# Gen_ai-1" heading line to README.md. `>>` appends, so every
# re-run of this cell adds another copy of the line.
!echo "# Gen_ai-1" >> README.md


In [51]:
# NOTE(review): the working directory is already a clone, so this only
# re-initialises the existing repository and can likely be dropped.
!git init


Reinitialized existing Git repository in /content/Gen_ai-1/.git/


In [52]:
# Stage README.md for the next commit.
!git add README.md
