# Google's Gemini 2.0 

## Information Extraction: Tabular Data, continued

In [1]:
import os

# Read the Gemini API key from the environment instead of hardcoding it.
# Indexing os.environ raises KeyError when GEMINI_API_KEY is not set.
GEMINI_API_KEY = os.environ["GEMINI_API_KEY"]

In [2]:
import enum
import json
import numpy as np
import pandas as pd

from google import genai
from google.genai import types

from pydantic import BaseModel, Field

In [3]:
# Model used for the requests in this notebook.
# Alternatives: "gemini-2.0-flash-lite-preview-02-05", "gemini-2.0-pro-exp-02-05"
model_id = "gemini-2.0-flash"

# Gen AI client authenticated with the API key loaded earlier.
client = genai.Client(api_key=GEMINI_API_KEY)

----

In [4]:
def load_pdf_part(path: str) -> types.Part:
    """Read the PDF at `path` and wrap its bytes in an inline `types.Part`.

    Args:
        path: location of the PDF file on disk.

    Returns:
        A part built with `types.Part.from_bytes` and tagged with MIME type
        application/pdf, ready to be placed in the `contents` of a request.
    """
    # The context manager closes the file even if reading fails.
    with open(path, "rb") as pdf_file:
        return types.Part.from_bytes(
            data=pdf_file.read(),
            mime_type='application/pdf',
        )


# The notebook labels the 20250313 file as the 2025-February report
# and the 20250213 file as the 2025-January report.
Feb2025_pdf = load_pdf_part("samples/saintmarc-hd_20250313.pdf")
Jan2025_pdf = load_pdf_part("samples/saintmarc-hd_20250213.pdf")

In [5]:
# System-level persona sent with every request: a bilingual (Japanese/English)
# analyst focused on accurate table extraction from PDFs.
# Paragraphs are separated by one blank line.
SYSTEM_LEVEL_INSTRUCTION = "\n\n".join([
    # The trailing space after "English." is kept so the prompt text is unchanged.
    "You are perfectly bilingual, fluent in both Japanese and English. ",
    "You are also an expert data analyst, specializing in information extraction from PDFs.",
    "You especially enjoy parsing out tabular data, always being completely accurate when extracting table parts, such as the row and column headers, and table cells.",
    "You always understand the layout of a table, and know how to return empty values.",
])

In [6]:
# Deliberately minimal user prompt: it names the target table and nothing else.
initial_prompt = "Retrieve the entire contents for the 月次売上情報 table in the given PDF."

#### Model parsing behavior is inconsistent with only a simple prompt

In [7]:
%%time

# Baseline request: system instruction plus JSON MIME type only, no response schema.
baseline_config = types.GenerateContentConfig(
    response_mime_type='application/json',
    system_instruction=SYSTEM_LEVEL_INSTRUCTION,
)
response = client.models.generate_content(
    model=model_id,
    contents=[Feb2025_pdf, initial_prompt],
    config=baseline_config,
)

# Display only the first two parsed records to keep the cell output short.
json.loads(response.text)[:2]

CPU times: user 9.44 ms, sys: 203 μs, total: 9.65 ms
Wall time: 8.4 s


[{'年度': '2022',
  '4月': '118.9',
  '5月': '144.0',
  '6月': '126.3',
  '7月': '110.7',
  '8月': '124.0',
  '9月': '127.2',
  '上半期': '124.5',
  '10月': '115.7',
  '11月': '107.4',
  '12月': '106.1',
  '1月': '122.5',
  '2月': '140.1',
  '3月': '120.5',
  '通期': '120.7',
  '売上': '全店売上',
  '対比': '昨年対比',
  '単位': '(%)'},
 {'年度': '2023',
  '4月': '116.0',
  '5月': '110.7',
  '6月': '109.5',
  '7月': '117.6',
  '8月': '119.1',
  '9月': '114.3',
  '上半期': '114.6',
  '10月': '106.5',
  '11月': '108.6',
  '12月': '108.8',
  '1月': '108.1',
  '2月': '107.9',
  '3月': '110.3',
  '通期': '111.3',
  '売上': '全店売上',
  '対比': '昨年対比',
  '単位': '(%)'}]

In [8]:
%%time

# Baseline request: system instruction plus JSON MIME type only, no response schema.
baseline_config = types.GenerateContentConfig(
    response_mime_type='application/json',
    system_instruction=SYSTEM_LEVEL_INSTRUCTION,
)
response = client.models.generate_content(
    model=model_id,
    contents=[Jan2025_pdf, initial_prompt],
    config=baseline_config,
)

# Display only the first two parsed records to keep the cell output short.
json.loads(response.text)[:2]

CPU times: user 6.42 ms, sys: 0 ns, total: 6.42 ms
Wall time: 7.23 s


[{'年度': '2022',
  '4月': '118.9',
  '5月': '144.0',
  '6月': '126.3',
  '7月': '110.7',
  '8月': '124.0',
  '9月': '127.2',
  '上半期': '124.5',
  '10月': '115.7',
  '11月': '107.4',
  '12月': '106.1',
  '1月': '122.5',
  '2月': '140.1',
  '3月': '120.5',
  '通期': '120.7',
  '売上タイプ': '全店売上',
  '対比': '昨年対比',
  '単位': '(%)'},
 {'年度': '2023',
  '4月': '116.0',
  '5月': '110.7',
  '6月': '109.5',
  '7月': '117.6',
  '8月': '119.1',
  '9月': '114.3',
  '上半期': '114.6',
  '10月': '106.5',
  '11月': '108.6',
  '12月': '108.8',
  '1月': '108.1',
  '2月': '107.9',
  '3月': '110.3',
  '通期': '111.3',
  '売上タイプ': '全店売上',
  '対比': '昨年対比',
  '単位': '(%)'}]

## Prompt design

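We have just seen that, with only a minimal prompt, the model may return the extracted information in varying formats: for example, the same row label came back under the key `売上` for one PDF and under `売上タイプ` for the other.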

To improve consistency, we need to add more clarification to our prompt.

In [9]:
table_prompt="""
Retrieve the entire contents for the 月次売上情報 table in the given PDF.

Parse this table row by row, starting from the top row and going to the bottom row of the table.
Each row represents year-on-year percent change in the sales figures for a given month and year.

Parse all 15 columns for each row, starting from left to right.
This table has the following 15 columns:
- 年度
- 4月
- 5月
- 6月
- 7月
- 8月
- 9月
- 上半期
- 10月
- 11月
- 12月
- 1月
- 2月
- 3月
- 通期

There are 2 major row groupings:
- 全店売上
- 既存店売上

Ignore the unneeded row header for 昨年対比（％）.

For each major row group listed above, there are 3 rows for:
- yyyy for 2 years prior
- yyyy for the previous year
- yyyy for the current year

If any of the 15 column values in a row is empty, please represent this with an empty string.

""".strip()

----

## Structured output

First we define a helper for calling the model, given a model name, a (user) prompt, the PDF wrapped as an inline `types.Part`, and a Pydantic class that describes the exact data structure of our desired response.

In [10]:
def extract_structured_data(
    model_id: str,
    prompt: str,
    file_part: types.Part,
    response_schema: type[BaseModel],
    seed: int = 42,
):
    """Ask a Google Gen AI model for a structured (JSON) answer about a PDF.

    Args:
        model_id: name of the Google Gen AI model to call.
        prompt: the (user) prompt describing what to extract.
        file_part: the PDF as a `types.Part` (e.g. built with
            `types.Part.from_bytes`); it is sent inline with the request
            (max. 20MB!).
        response_schema: the Pydantic model *class* (not an instance) that
            describes the structure of the desired response.
        seed: seed forwarded to the generation config. Defaults to 42,
            the value this helper has always used.

    Returns:
        `response.parsed`, i.e. the response (MIME type application/json)
        converted to the given `response_schema`.

    Raises:
        ValueError: if `response.parsed` is None, meaning the response could
            not be converted to `response_schema`.
    """
    response = client.models.generate_content(
        model=model_id,
        config=types.GenerateContentConfig(
            system_instruction=SYSTEM_LEVEL_INSTRUCTION,
            response_mime_type='application/json',
            response_schema=response_schema,
            seed=seed,
        ),
        contents=[
            file_part,
            prompt,
        ],
    )

    # Fail loudly here rather than letting callers hit an AttributeError on None.
    parsed = response.parsed
    if parsed is None:
        schema_name = getattr(response_schema, "__name__", response_schema)
        raise ValueError(
            f"Model response could not be parsed into {schema_name}: {response.text!r}"
        )
    return parsed

In [11]:
# Closed set of column headers allowed as TableCellValue.key.
# Each member's value is the header text itself, so `member.value` can be used
# directly as a column label.
# NOTE(review): documented with comments rather than a docstring because Pydantic
# may expose class docstrings as schema descriptions — confirm before converting.
class ColumnNameEnum(enum.Enum):
    NENDO = '年度'  # fiscal year
    APR = '4月'
    MAY = '5月'
    JUN = '6月'
    JUL = '7月'
    AUG = '8月'
    SEP = '9月'
    KAMI = '上半期'  # first half
    OCT = '10月'
    NOV = '11月'
    DEC = '12月'
    JAN = '1月'
    FEB = '2月'
    MAR = '3月'
    TSUUKI = '通期'  # full year


# One table cell as a (column name, value) pair.
# `key` is restricted to ColumnNameEnum members; `value` is a string rather than
# a number — presumably so an empty cell can be returned as "" (verify against the prompt).
# NOTE(review): documented with comments rather than a docstring because Pydantic
# may expose class docstrings as schema descriptions — confirm before converting.
class TableCellValue(BaseModel):
    key: ColumnNameEnum = Field(description="column name for this corresponding table cell")
    value: str = Field(description="the string representation of the numeric value in this table cell")


# Schema for one data row of the table: the row-group label plus that row's cells.
# `header` is a plain str, so the two allowed labels are conveyed only through
# the Field description and are not validated.
class TableRow(BaseModel):
    """ Data structure describing a row of values in the target table. """
    header: str = Field(description="row header for 売上 type, one of: 全店売上 or 既存店売上")
    values: list[TableCellValue] = Field(description="list of the TableCellValues for this TableRow")


# Top-level response schema: a table is just an ordered list of TableRow objects.
# This is the class handed to the model call as the desired response structure.
class Table(BaseModel):
    """ Data structure that models the entire target table in a PDF. """
    rows: list[TableRow] = Field(description="List of TableRow objects that make up the data in the table")

----

#### 2025-February 月次売上情報


![月次売上情報 from the 2025-Feb PDF](samples/saintmarc-hd_20250313.pdf.png "月次売上情報 from the 2025-Feb PDF")

* NOTE that the PDF for 2025-February is actually an embedded image, so it cannot be parsed using our regular bag of text-based PDF-parsing tricks!

In [12]:
%%time

table_data = extract_structured_data(model_id, table_prompt, Feb2025_pdf, Table)

# let's turn the model results into a nice DataFrame!
# Each table row becomes one record, indexed by (row header, 年度).
# The 年度 cell is looked up by its key rather than by position, so the result
# does not depend on the order in which the model emitted the cells.
index_tuples = []
data = []
for row in table_data.model_dump()['rows']:
    cells = {cell['key']: cell['value'] for cell in row['values']}
    # Remove 年度 from the cells so it is used only as an index level;
    # fall back to "" if the model omitted it.
    fiscal_year = cells.pop(ColumnNameEnum.NENDO, "")
    index_tuples.append((row['header'], fiscal_year))
    # model_dump() keeps the enum members as keys; use their text as column labels.
    data.append({column.value: value for column, value in cells.items()})

row_multi = pd.MultiIndex.from_tuples(index_tuples)

tmp_df = pd.DataFrame(data, index=row_multi)
tmp_df

CPU times: user 10.2 ms, sys: 4.9 ms, total: 15.1 ms
Wall time: 12.8 s


Unnamed: 0,Unnamed: 1,4月,5月,6月,7月,8月,9月,上半期,10月,11月,12月,1月,2月,3月,通期
全店売上,2022,118.9,144.0,126.3,110.7,124.0,127.2,124.5,115.7,107.4,106.1,122.5,140.1,120.5,120.7
全店売上,2023,116.0,110.7,109.5,117.6,119.1,114.3,114.6,106.5,108.6,108.8,108.1,107.9,110.3,111.3
全店売上,2024,102.6,102.4,109.9,100.7,106.6,105.6,104.6,98.8,104.5,101.8,101.2,102.5,,
既存店売上,2022,115.1,126.0,122.8,111.5,124.8,127.4,120.9,115.8,107.0,106.3,123.6,143.7,123.8,119.6
既存店売上,2023,119.7,114.6,113.8,120.7,122.9,117.1,118.2,110.3,113.1,113.3,113.2,112.8,115.1,115.5
既存店売上,2024,107.1,106.3,113.9,105.2,110.5,110.3,108.8,103.8,109.2,104.9,104.0,104.6,,


<hr width="40%"/>

#### 2025-January 月次売上情報

![月次売上情報 from the 2025-Jan PDF](samples/saintmarc-hd_20250213.pdf.png "月次売上情報 from the 2025-Jan PDF")

In [13]:
%%time

table_data = extract_structured_data(model_id, table_prompt, Jan2025_pdf, Table)

# let's turn the model results into a nice DataFrame!
# Each table row becomes one record, indexed by (row header, 年度).
# The 年度 cell is looked up by its key rather than by position, so the result
# does not depend on the order in which the model emitted the cells.
index_tuples = []
data = []
for row in table_data.model_dump()['rows']:
    cells = {cell['key']: cell['value'] for cell in row['values']}
    # Remove 年度 from the cells so it is used only as an index level;
    # fall back to "" if the model omitted it.
    fiscal_year = cells.pop(ColumnNameEnum.NENDO, "")
    index_tuples.append((row['header'], fiscal_year))
    # model_dump() keeps the enum members as keys; use their text as column labels.
    data.append({column.value: value for column, value in cells.items()})

row_multi = pd.MultiIndex.from_tuples(index_tuples)

tmp_df = pd.DataFrame(data, index=row_multi)
tmp_df

CPU times: user 9.89 ms, sys: 237 μs, total: 10.1 ms
Wall time: 11.4 s


Unnamed: 0,Unnamed: 1,4月,5月,6月,7月,8月,9月,上半期,10月,11月,12月,1月,2月,3月,通期
全店売上,2022,118.9,144.0,126.3,110.7,124.0,127.2,124.5,115.7,107.4,106.1,122.5,140.1,120.5,120.7
全店売上,2023,116.0,110.7,109.5,117.6,119.1,114.3,114.6,106.5,108.6,108.8,108.1,107.9,110.3,111.3
全店売上,2024,102.6,102.4,109.9,100.7,106.6,105.6,104.6,98.8,104.5,101.8,101.2,,,
既存店売上,2022,115.1,126.0,122.8,111.5,124.8,127.4,120.9,115.8,107.0,106.3,123.6,143.7,123.8,119.6
既存店売上,2023,119.7,114.6,113.8,120.7,122.9,117.1,118.2,110.3,113.1,113.3,113.2,112.8,115.1,115.5
既存店売上,2024,107.1,106.3,113.9,105.2,110.5,110.3,108.8,103.8,109.2,104.9,104.0,,,


----

In [14]:
# housekeeping...
# Delete every file that the client's file listing returns.
# NOTE(review): the PDFs in this notebook are sent inline via types.Part.from_bytes,
# so this loop would also remove files uploaded elsewhere under the same API key —
# confirm that is intended.
for uploaded_file in client.files.list():
    client.files.delete(name=uploaded_file.name)