# Google's Gemini 2.0 

## Information Extraction: Tabular Data

In [1]:
import os

# Read the API key from the environment; this raises KeyError if GEMINI_API_KEY is not set.
GEMINI_API_KEY = os.environ["GEMINI_API_KEY"]

In [2]:
import pandas as pd

from google import genai
from google.genai import types

from pydantic import BaseModel, Field

In [3]:
# Create a Gen AI client authenticated with GEMINI_API_KEY
client = genai.Client(api_key=GEMINI_API_KEY)

# Specify the model used for every request in this notebook
model_id =  "gemini-2.0-flash" # alternatives: "gemini-2.0-flash-lite-preview-02-05", "gemini-2.0-pro-exp-02-05"

----

#### PDF ...

Since the PDF file in our example is less than 20MB, we can send its contents inline with our request to the model.

In [4]:
# Show the file's metadata; st_size is the size in bytes, to compare against the 20MB inline limit.
os.stat("samples/Press_release_car_registrations_February_2025.pdf")

os.stat_result(st_mode=33188, st_ino=927053, st_dev=2049, st_nlink=1, st_uid=1001, st_gid=1002, st_size=186261, st_atime=1745387131, st_mtime=1744864115, st_ctime=1744864115)

In [5]:
# Read the February 2025 press release from disk as raw bytes.
with open("samples/Press_release_car_registrations_February_2025.pdf", "rb") as f:
    file_bytes = f.read()

# Wrap the bytes as a PDF part so they can be sent inline with a request.
Feb2025_pdf = types.Part.from_bytes(data=file_bytes, mime_type='application/pdf')

----

### Function for calling the Google Gen AI model

* requests a response of MIME type `application/json` that conforms to the given `response_schema`
* sets the system instructions
* sends the input file along with the user prompt

In [6]:
def extract_structured_data(model_id:str, prompt:str, file_part:types.Part, response_schema:type[BaseModel]):
    """Ask a Google Gen AI model to extract structured data from an inline file.

    Relies on the notebook-level ``client`` and ``SYSTEM_LEVEL_INSTRUCTION``; both
    are looked up when the function is called, so they must be defined by then.

    Args:
        model_id: ID of the Google Gen AI model to call.
        prompt: User prompt describing what to extract.
        file_part: The input file wrapped as a ``types.Part``, sent inline with the
            request (max. 20MB for an inline PDF!).
        response_schema: Pydantic model *class* describing the structured response
            we wish to obtain.

    Returns:
        The structured response (requested as MIME type application/json), as
        exposed by ``response.parsed``.

    Raises:
        ValueError: If the response could not be parsed into ``response_schema``
            (``response.parsed`` is ``None``).
    """
    response = client.models.generate_content(
        model=model_id,
        config=types.GenerateContentConfig(
            system_instruction=SYSTEM_LEVEL_INSTRUCTION,
            response_mime_type='application/json',
            response_schema=response_schema,
            # Fixed seed so repeated runs are as reproducible as the service allows.
            seed=42,
        ),
        # The file goes first, followed by the instructions that refer to it.
        contents=[
            file_part,
            prompt,
        ],
    )

    parsed = response.parsed
    if parsed is None:
        # Fail here with context instead of returning None and letting the caller
        # crash later on `table_data.model_dump()`.
        raw_text = response.text or ""
        raise ValueError(
            f"Model response could not be parsed into {response_schema.__name__}; "
            f"raw response starts with: {raw_text[:500]!r}"
        )

    return parsed

----

In [7]:
# System-level instruction sent with every request: frames the model as a careful
# extractor of tabular data. The fragments are concatenated as-is (no separator added).
SYSTEM_LEVEL_INSTRUCTION = "".join([
    "You are an expert data analyst, specializing in information extraction from PDFs.  ",
    "You especially enjoy parsing out tabular data, always being completely accurate when extracting table parts ",
    "such as the row and column headers, and table cells. ",
    "You always understand the layout of a table, and know how to return empty values.",
]).strip()

----

#### NEW CAR REGISTRATIONS BY MARKET AND POWER SOURCE MONTHLY, 2025-February

![ACEA Press Release, page 3, NEW CAR REGISTRATIONS BY MARKET AND POWER SOURCE MONTHLY, 2025-Feb](samples/Press_release_car_registrations_February_2025_p3.png "NEW CAR REGISTRATIONS BY MARKET AND POWER SOURCE MONTHLY, page 3, ACEA Press Release, 2025-Feb")

c.f. NEW CAR REGISTRATIONS BY MARKET AND POWER SOURCE MONTHLY on page 3, [samples/Press_release_car_registrations_February_2025.pdf](samples/Press_release_car_registrations_February_2025.pdf)

### Prompt design: NEW CAR REGISTRATIONS BY MARKET AND POWER SOURCE MONTHLY

In [8]:
# Prompt for the NEW CAR REGISTRATIONS BY MARKET AND POWER SOURCE table.
# It spells out the layout (7 major categories x 3 values = 21 columns) and asks for
# ?? as the placeholder for blank cells; later cells replace '??' with ''.
table1_prompt="""
Retrieve the entire contents for the NEW CAR REGISTRATIONS BY MARKET AND POWER SOURCE, MONTHLY table in the given PDF.

Parse this table row by row, starting from the top row and going to the bottom row of the table.
Each row represents new car registrations for a country or geopolitical entity.

Parse all 21 columns for each row, starting from left to right.
The 21 columns are grouped into 7 major column categories.

There are 7 major column categories:
- BATTERY ELECTRIC
- PLUG-IN HYBRID
- HYBRID ELECTRIC
- OTHERS
- PETROL
- DIESEL
- TOTAL

Each major column category has 3 values for:
- current month in current year
- current month in previous year
- % change current yy / previous yy

If any of the 3 values in a major column category are missing or empty, represent that value with ??.

Be aware that it is possible for all 3 values in a major column category to be empty!
Example:
|        | BATTERY ELECTRIC | PLUG-IN HYBRID | ...
|Romania | 1,164 217 -65 |   | ...
""".strip()

#### Zero-shot / Minimal Prompt Design

_... will likely not work!_


In [9]:
%%time

# Baseline request: same system instruction and prompt, JSON output requested,
# but no response_schema, so the layout of the JSON is left to the model.
zero_shot_config = types.GenerateContentConfig(
    system_instruction=SYSTEM_LEVEL_INSTRUCTION,
    response_mime_type='application/json',
)

response = client.models.generate_content(
    model=model_id,
    config=zero_shot_config,
    contents=[Feb2025_pdf, table1_prompt],
)

print(response.text)

[
  {
    "BATTERY ELECTRIC": {
      "February 2025": "4,233",
      "February 2024": "3,322",
      "% change 25/24": "+27.4"
    },
    "PLUG-IN HYBRID": {
      "February 2025": "1,613",
      "February 2024": "1,335",
      "% change 25/24": "+20.8"
    },
    "HYBRID ELECTRIC": {
      "February 2025": "5,549",
      "February 2024": "4,691",
      "% change 25/24": "+18.3"
    },
    "OTHERS": {
      "February 2025": "0",
      "February 2024": "0",
      "% change 25/24": "??"
    },
    "PETROL": {
      "February 2025": "5,736",
      "February 2024": "6,527",
      "% change 25/24": "-12.1"
    },
    "DIESEL": {
      "February 2025": "2,488",
      "February 2024": "4,135",
      "% change 25/24": "-39.8"
    },
    "TOTAL": {
      "February 2025": "19,619",
      "February 2024": "20,010",
      "% change 25/24": "-2.0"
    }
  },
  {
    "BATTERY ELECTRIC": {
      "February 2025": "13,040",
      "February 2024": "9,385",
      "% change 25/24": "+38.9"
    },
    "PL

### Structured output: 

#### NEW CAR REGISTRATIONS BY MARKET AND POWER SOURCE MONTHLY

_... we need to improve the accuracy of the output for this information retrieval operation... but how?_

![Accuracy optimization, pattern 1: prompt design & structured output](samples/accuracy_optimization_pattern_01.png "Optimizing accuracy in this use case via prompt design and structured output")

In [10]:
# Schema for one row of the market / power-source table: the row label plus its cell
# values, all kept as strings.
# NOTE: the "exactly 21 values" requirement is stated only in the description text;
# nothing in the field definition enforces the list length.
class Table1Row(BaseModel):
    name: str = Field(description="name of country or geopolitical entity for this table row")
    
    values: list[str] = Field(description=(
        "list of exactly 21 values, 3 for each the following major column category: BATTERY ELECTRIC, PLUG-IN HYBRID, HYBRID ELECTRIC, OTHERS, PETROL, DIESEL, and TOTAL. if any of the values are blank or missing, represent with an ??"
    ))

# Top-level response schema: the whole table as a list of Table1Row entries.
class Table1(BaseModel):
    rows: list[Table1Row] = Field(description="List of Table1Row objects that make up the table")

In [11]:
%%time

# Extract the market / power-source table from the February 2025 PDF using the Table1 schema.
table_data = extract_structured_data(
    model_id=model_id,
    prompt=table1_prompt,
    file_part=Feb2025_pdf,
    response_schema=Table1,
)

CPU times: user 13.1 ms, sys: 114 μs, total: 13.2 ms
Wall time: 32.8 s


In [12]:
# Split the parsed rows into an index (row names) and a matrix of cell values.
rows = table_data.model_dump()['rows']
row_headers = [row['name'] for row in rows]
data = [row['values'] for row in rows]

# '??' is the placeholder the prompt requests for blank cells; show those as empty strings.
df = pd.DataFrame(data, index=row_headers).replace('??', '')

print(df.shape)
print()
df

(34, 21)



Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,11,12,13,14,15,16,17,18,19,20
Austria,4233.0,3322,27.4,1613.0,1335.0,20.8,5549,4691,18.3,0,...,,5736,6527,-12.1,2488,4135.0,-39.8,19619,20010,-2.0
Belgium,13040.0,9385,38.9,3070.0,8385.0,-63.4,5383,4282,25.7,267,...,-35.7,17280,18918,-8.7,1121,2337.0,-52.0,40161,43722,-8.1
Bulgaria,126.0,122,3.3,34.0,31.0,9.7,105,73,43.8,0,...,,2781,2868,-3.0,348,510.0,-31.8,3394,3604,-5.8
Croatia,53.0,50,6.0,140.0,94.0,48.9,1629,1455,12.0,101,...,-8.2,1644,1898,-13.4,678,923.0,-26.5,4245,4530,-6.3
Cyprus,107.0,105,1.9,78.0,48.0,62.5,579,609,-4.9,0,...,,456,712,-36.0,64,27.0,137.0,1284,1501,-14.5
Czechia,737.0,438,68.3,557.0,450.0,23.8,3634,3577,1.6,470,...,-18.8,8844,9723,-9.0,3531,3561.0,-0.8,17773,18328,-3.0
Denmark,7724.0,4974,55.3,312.0,525.0,-40.6,1453,1941,-25.1,0,...,,1908,3415,-44.1,220,363.0,-39.4,11617,11218,3.6
Estonia,59.0,89,-33.7,69.0,59.0,16.9,307,609,-49.6,0,...,-100.0,132,430,-69.3,57,244.0,-76.6,624,1440,-56.7
Finland,1563.0,1330,17.5,1035.0,1237.0,-16.3,1398,1735,-19.4,0,...,-100.0,598,736,-18.8,190,297.0,-36.0,4784,5374,-11.0
France,25.335,25825,-1.9,6451.0,11732.0,-45.0,62146,41227,50.7,5821,...,5.9,35110,48095,-27.0,6707,10221.0,-34.4,141570,142595,-0.7


In [13]:
# Rows picked for a closer look; transposed so that each selected row becomes a column.
rows_to_inspect = [
    'Austria',
    'Bulgaria',
    'Cyprus',
    'Denmark',
    'Ireland',
    'Latvia',
    'Luxembourg',
    'Malta',
    'Romania',    # empty values at indices 3, 4, 5!
    'Iceland',
    'Norway',
    'United Kingdom',
]
df.loc[rows_to_inspect].T

Unnamed: 0,Austria,Bulgaria,Cyprus,Denmark,Ireland,Latvia,Luxembourg,Malta,Romania,Iceland,Norway,United Kingdom
0,4233.0,126.0,107.0,7724.0,2512.0,80.0,1134.0,46.0,724.0,248.0,8477.0,21244.0
1,3322.0,122.0,105.0,4974.0,1856.0,78.0,942.0,260.0,1109.0,97.0,6043.0,14991.0
2,27.4,3.3,1.9,55.3,35.3,2.6,20.4,-82.3,-34.7,155.7,40.3,41.7
3,1613.0,34.0,78.0,312.0,2100.0,134.0,291.0,27.0,,104.0,136.0,7273.0
4,1335.0,31.0,48.0,525.0,1420.0,28.0,385.0,58.0,,96.0,122.0,6098.0
5,20.8,9.7,62.5,-40.6,47.9,378.6,-24.4,-53.4,,8.3,11.5,19.3
6,5549.0,105.0,579.0,1453.0,3137.0,0.0,1196.0,98.0,5510.0,199.0,154.0,29849.0
7,4691.0,73.0,609.0,1941.0,2940.0,0.0,1002.0,124.0,3736.0,95.0,259.0,26140.0
8,18.3,43.8,-4.9,-25.1,6.7,,19.4,-21.0,47.5,109.5,-40.5,14.2
9,0.0,0.0,0.0,0.0,0.0,18.0,0.0,0.0,1354.0,0.0,0.0,0.0


In [14]:
# Save the extracted table without a column-header row (header=False).
# NOTE(review): CP932 is a Japanese Windows code page, presumably chosen for the consuming tool;
# row names containing characters outside it would fail to encode. Confirm this encoding is intended.
df.to_csv('samples/acea_1_202502_raw.csv', encoding='CP932', header=False)

----

#### NEW CAR REGISTRATIONS BY MANUFACTURER EU + EFTA + UK, 2025-February

![ACEA Press Release, page 6, NEW CAR REGISTRATIONS BY MANUFACTURER EU + EFTA + UK, 2025-Feb](samples/Press_release_car_registrations_February_2025_p6.png "NEW CAR REGISTRATIONS BY MANUFACTURER EU + EFTA + UK, page 6, ACEA Press Release, 2025-Feb")

c.f. NEW CAR REGISTRATIONS BY MANUFACTURER EU + EFTA + UK on page 6, [samples/Press_release_car_registrations_February_2025.pdf](samples/Press_release_car_registrations_February_2025.pdf)

In [15]:
# Prompt for the NEW CAR REGISTRATIONS BY MANUFACTURER EU + EFTA + UK table.
# It spells out the layout (2 major categories x 5 values = 10 columns) and asks for
# ?? as the placeholder for blank cells; later cells replace '??' with ''.
# NOTE(review): the prompt text contains small typos ("table on in", "a automobile");
# they are left untouched because editing the prompt may change the model's output.
table2_prompt ="""
Retrieve the entire contents for the NEW CAR REGISTRATIONS BY MANUFACTURER EU + EFTA + UK table on in the given PDF.

Parse this table row by row, starting from the top row and going to the bottom row of the table.
Each row represents new car registrations for a automobile manufacturer or manufacturing group.

Parse all 10 columns for each row, starting from left to right.
The 10 columns are grouped into 2 major column categories.

There are 2 major column categories:
- current month
- JANUARY-current month

For each major column category, you must extract exactly 5 values:
- % share current year
- % share previous year
- Units current year
- Units previous year
- % change current yy / previous yy

If there are any values that are missing or empty, represent that value with ??.
""".strip()

In [16]:
# Schema for one row of the manufacturer table: the row label plus its cell values,
# all kept as strings.
# NOTE: the "exactly 10 values" requirement is stated only in the description text;
# nothing in the field definition enforces the list length.
# NOTE(review): the `name` description contains a duplicated word ("any any"); left as-is
# because the description is part of the schema sent to the model.
class Table2Row(BaseModel):
    name: str = Field(description="name of automobile manufacturer or manufacturer group for this table row. do not include any any superscript characters")
    
    values: list[str] = Field(description=(
        "list of exactly 10 values, 5 for each the following major column categories: current month, and JANUARY-current month. if any of the values are blank or missing, represent with an ??"
    ))

# Top-level response schema: the whole table as a list of Table2Row entries.
class Table2(BaseModel):
    rows: list[Table2Row] = Field(description="List of Table2Row objects that make up the table")

In [17]:
%%time

# Extract the manufacturer table from the February 2025 PDF using the Table2 schema.
table_data = extract_structured_data(
    model_id=model_id,
    prompt=table2_prompt,
    file_part=Feb2025_pdf,
    response_schema=Table2,
)

CPU times: user 6.11 ms, sys: 3.25 ms, total: 9.36 ms
Wall time: 25.3 s


In [18]:
# Split the parsed rows into an index (row names) and a matrix of cell values.
rows = table_data.model_dump()['rows']
row_headers = [row['name'] for row in rows]
data = [row['values'] for row in rows]

# '??' is the placeholder the prompt requests for blank cells; show those as empty strings.
df = pd.DataFrame(data, index=row_headers).replace('??', '')

print(df.shape)
print()
df

(46, 10)



Unnamed: 0,0,1,2,3,4,5,6,7,8,9
Volkswagen Group,25.9,25.0,249873,248647,0.5,26.4,25.0,516709,503487,2.6
Volkswagen,10.8,9.7,103681,96562,7.4,11.1,9.6,216565,193379,12.0
Skoda,5.7,5.9,55324,58854,-6.0,5.8,6.0,114314,120441,-5.1
Audi,4.8,4.6,46593,45927,1.5,4.9,4.9,95822,99291,-3.5
Cupra,2.1,1.6,20397,15418,32.3,2.1,1.4,40869,28728,42.3
Seat,1.8,2.2,16998,21884,-22.3,1.7,2.1,33575,41946,-20.0
Porsche,0.7,1.0,6267,9486,-33.9,0.7,0.9,14218,18524,-23.2
Others,0.1,0.1,613,516,18.8,0.1,0.1,1346,1178,14.3
Stellantis,16.2,18.7,155970,186151,-16.2,15.8,18.4,310091,369469,-16.1
Peugeot,5.8,5.8,56016,58094,-3.6,5.7,5.7,111443,115484,-3.5


In [19]:
# Save the extracted table without a column-header row (header=False).
# NOTE(review): CP932 is a Japanese Windows code page, presumably chosen for the consuming tool;
# row names containing characters outside it would fail to encode. Confirm this encoding is intended.
df.to_csv('samples/acea_2_202502_raw.csv', encoding='CP932', header=False)

<hr width=40%/>

### Check: 2025-Jan PDF



In [20]:
# Read the (revised) January 2025 press release from disk as raw bytes.
with open("samples/Press_release_car_registrations_January_2025_rev.pdf", "rb") as f:
    file_bytes = f.read()

# Wrap the bytes as a PDF part so they can be sent inline with a request.
Jan2025_pdf = types.Part.from_bytes(data=file_bytes, mime_type='application/pdf')

#### NEW CAR REGISTRATIONS BY MARKET AND POWER SOURCE MONTHLY, 2025-January

![ACEA Press Release, page 3, NEW CAR REGISTRATIONS BY MARKET AND POWER SOURCE MONTHLY, 2025-Jan](samples/Press_release_car_registrations_January_2025_p3.png "NEW CAR REGISTRATIONS BY MARKET AND POWER SOURCE MONTHLY, page 3, ACEA Press Release, 2025-Jan")

c.f. NEW CAR REGISTRATIONS BY MARKET AND POWER SOURCE MONTHLY on page 3, [samples/Press_release_car_registrations_January_2025_rev.pdf](samples/Press_release_car_registrations_January_2025_rev.pdf)

In [21]:
%%time

# Re-run the market / power-source extraction, unchanged, against the January 2025 PDF.
table_data = extract_structured_data(
    model_id=model_id,
    prompt=table1_prompt,
    file_part=Jan2025_pdf,
    response_schema=Table1,
)

CPU times: user 13.2 ms, sys: 137 μs, total: 13.4 ms
Wall time: 35.8 s


In [22]:
# Split the parsed rows into an index (row names) and a matrix of cell values.
rows = table_data.model_dump()['rows']
row_headers = [row['name'] for row in rows]
data = [row['values'] for row in rows]

# '??' is the placeholder the prompt requests for blank cells; show those as empty strings.
df = pd.DataFrame(data, index=row_headers).replace('??', '')

print(df.shape)
print()
df

(34, 21)



Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,11,12,13,14,15,16,17,18,19,20
Austria,3822,2823,35.4,1642,1469,11.8,5405,4069,32.8,0,...,-100.0,6505,5681.0,14.5,3074,3506,-12.3,20448,17552.0,16.5
Belgium,13712,9995,37.2,3996,11972,-66.6,4458,4170,6.9,293,...,10.2,16600,17461.0,-4.9,1535,2803,-45.2,40594,46667.0,-13.0
Bulgaria,142,162,-12.3,38,49,-22.4,78,61,27.9,0,...,,2228,3270.0,-31.9,294,435,-32.4,2780,3977.0,-30.1
Croatia,69,82,-15.9,152,119,27.7,1556,1216,28.0,142,...,11.8,1977,2045.0,-3.3,576,903,-36.2,4472,4492.0,-0.4
Cyprus,149,65,129.2,93,68,36.8,689,679,1.5,0,...,,609,630.0,-3.3,25,85,-70.6,1565,1527.0,2.5
Czechia,981,473,107.4,641,431,48.7,4225,3386,24.8,378,...,-34.9,9362,11167.0,-16.2,3760,4323,-13.0,19347,20361.0,-5.0
Denmark,6961,3117,123.3,190,366,-48.1,1377,1746,-21.1,0,...,,1955,3277.0,-40.3,351,333,5.4,10834,8839.0,22.6
Estonia,83,136,-39.0,77,55,40.0,254,683,-62.8,0,...,-100.0,82,413.0,-80.1,35,281,-87.5,531,1577.0,-66.3
Finland,1639,1587,3.3,1366,1545,-11.6,1776,2284,-22.2,0,...,-100.0,706,801.0,-11.9,255,354,-28.0,5742,6590.0,-12.9
France,19923,20017,-0.5,4852,10549,-54.0,51447,33810,52.2,3521,...,-46.3,29974,41728.0,-28.2,4956,9620,-48.5,114673,122285.0,-6.2


In [23]:
# Rows picked for a closer look; transposed so that each selected row becomes a column.
rows_to_inspect = [
    'Bulgaria',
    'Cyprus',
    'Denmark',
    'Ireland',
    'Latvia',
    'Luxembourg',
    'Malta',
    'Romania',
    'Iceland',
    'Norway',
    'United Kingdom',
]
df.loc[rows_to_inspect].T

Unnamed: 0,Bulgaria,Cyprus,Denmark,Ireland,Latvia,Luxembourg,Malta,Romania,Iceland,Norway,United Kingdom
0,142.0,149.0,6961.0,4923.0,84.0,1167.0,211.0,1164.0,226.0,8954.0,29634.0
1,162.0,65.0,3117.0,4093.0,86.0,779.0,253.0,1631.0,169.0,4717.0,20935.0
2,-12.3,129.2,123.3,20.3,-2.3,49.8,-16.6,-28.6,33.7,89.8,41.6
3,38.0,93.0,190.0,4915.0,126.0,319.0,44.0,0.0,186.0,95.0,12598.0
4,49.0,68.0,366.0,2999.0,32.0,396.0,61.0,0.0,90.0,94.0,11944.0
5,-22.4,36.8,-48.1,63.9,293.8,-19.4,-27.9,,106.7,1.1,5.5
6,78.0,689.0,1377.0,8725.0,0.0,957.0,134.0,5284.0,61.0,124.0,51785.0
7,61.0,679.0,1746.0,7761.0,0.0,847.0,118.0,4215.0,72.0,164.0,47435.0
8,27.9,1.5,-21.1,12.4,,13.0,13.6,25.4,-15.3,-24.4,9.2
9,0.0,0.0,0.0,0.0,36.0,0.0,0.0,1511.0,0.0,0.0,0.0


<hr width=40%/>

#### NEW CAR REGISTRATIONS BY MANUFACTURER EU + EFTA + UK, 2025-January

![ACEA Press Release, page 6, NEW CAR REGISTRATIONS BY MANUFACTURER EU + EFTA + UK, 2025-Jan](samples/Press_release_car_registrations_January_2025_p6.png "NEW CAR REGISTRATIONS BY MANUFACTURER EU + EFTA + UK, page 6, ACEA Press Release, 2025-Jan")

c.f. NEW CAR REGISTRATIONS BY MANUFACTURER EU + EFTA + UK on page 6, [samples/Press_release_car_registrations_January_2025_rev.pdf](samples/Press_release_car_registrations_January_2025_rev.pdf)

In [24]:
%%time

# Re-run the manufacturer extraction, unchanged, against the January 2025 PDF.
table_data = extract_structured_data(
    model_id=model_id,
    prompt=table2_prompt,
    file_part=Jan2025_pdf,
    response_schema=Table2,
)

CPU times: user 10.5 ms, sys: 820 μs, total: 11.3 ms
Wall time: 26.8 s


In [25]:
# Split the parsed rows into an index (row names) and a matrix of cell values.
rows = table_data.model_dump()['rows']
row_headers = [row['name'] for row in rows]
data = [row['values'] for row in rows]

# '??' is the placeholder the prompt requests for blank cells; show those as empty strings.
df = pd.DataFrame(data, index=row_headers).replace('??', '')

print(df.shape)
print()
df

(46, 10)



Unnamed: 0,0,1,2,3,4,5,6,7,8,9
Volkswagen Group,27.0,25.1,268409,254840,5.3,27.0,25.1,268409,254840,5.3
Volkswagen,11.3,9.5,112844,96817,16.6,11.3,9.5,112844,96817,16.6
Skoda,6.0,6.1,59465,61587,-3.4,6.0,6.1,59465,61587,-3.4
Audi,4.9,5.2,49220,53364,-7.8,4.9,5.2,49220,53364,-7.8
Cupra,2.1,1.3,20855,13310,56.7,2.1,1.3,20855,13310,56.7
Seat,1.7,2.0,17161,20062,-14.5,1.7,2.0,17161,20062,-14.5
Porsche,0.8,0.9,8120,9038,-10.2,0.8,0.9,8120,9038,-10.2
Others,0.1,0.1,744,662,12.4,0.1,0.1,744,662,12.4
Stellantis,15.5,18.0,154079,183319,-16.0,15.5,18.0,154079,183319,-16.0
Peugeot,5.6,5.6,55445,57390,-3.4,5.6,5.6,55445,57390,-3.4


----

In [26]:
# housekeeping...
for f in client.files.list():
    client.files.delete(name=f.name)