# Google's Gemini 2.0 

## Information Extraction: Tabular Data

In [1]:
import os

# Read the Gemini API key from the environment instead of hardcoding it;
# indexing os.environ raises KeyError right here if the variable is not set.
GEMINI_API_KEY = os.environ["GEMINI_API_KEY"]

In [2]:
import pandas as pd

from google import genai
from google.genai import types

from pydantic import BaseModel, Field

In [3]:
# Gen AI client, authenticated with the API key read from the environment
client = genai.Client(
    api_key=GEMINI_API_KEY,
)

# Model passed to the extraction calls below.
# Alternatives: "gemini-2.0-flash-lite-preview-02-05", "gemini-2.0-pro-exp-02-05"
model_id = "gemini-2.0-flash"

----

#### Load the PDF

Since the PDF file in our example is less than 20MB, we can send its contents inline with our request to the model.

In [4]:
# Show the PDF's file metadata; st_size is the size in bytes, which is what
# needs to be under the 20MB limit for sending the file inline.
os.stat("samples/Press_release_car_registrations_February_2025.pdf")

os.stat_result(st_mode=33188, st_ino=927053, st_dev=2049, st_nlink=1, st_uid=1001, st_gid=1002, st_size=186261, st_atime=1745298940, st_mtime=1744864115, st_ctime=1744864115)

In [5]:
# Read the whole PDF into memory as raw bytes
with open("samples/Press_release_car_registrations_February_2025.pdf", mode="rb") as pdf_handle:
    file_bytes = pdf_handle.read()

# Wrap the bytes in a Part so the file can be sent inline with each request
pdf_file_part = types.Part.from_bytes(data=file_bytes, mime_type='application/pdf')

----

### Function for calling the Google Gen AI model

* requests a response of MIME type `application/json` that conforms to the given `response_schema`
* sets the system instructions
* sends the input file along with the user prompt

In [6]:
def extract_structured_data(model_id:str, prompt:str, file_part:types.Part, response_schema:type[BaseModel]) -> BaseModel:
    """Ask a Google Gen AI model for a structured (JSON) answer about an inline file.

    Args:
        model_id: name of the Gen AI model to call.
        prompt: the user prompt; it is sent after the file.
        file_part: the file wrapped as a ``types.Part``, sent inline with the
            request (inline files must be smaller than 20MB).
        response_schema: the Pydantic model *class* (not an instance) that
            describes the structure of the desired response.

    Returns:
        The response converted to the Pydantic model given as ``response_schema``.

    Raises:
        ValueError: if the SDK could not convert the response to
            ``response_schema`` (``response.parsed`` is ``None``).

    Note:
        Uses the notebook-level ``client`` and ``SYSTEM_LEVEL_INSTRUCTION``; both
        must be defined before this function is *called*.
    """
    request_config = types.GenerateContentConfig(
        system_instruction=SYSTEM_LEVEL_INSTRUCTION,
        response_mime_type='application/json',
        response_schema=response_schema,
        seed=42,  # fixed seed, same value for every request
    )

    response = client.models.generate_content(
        model=model_id,
        config=request_config,
        # The file goes first, followed by the instructions about it
        contents=[
            file_part,
            prompt,
        ],
    )

    # Fail here with a clear message instead of handing None to the caller,
    # where it would surface later as an AttributeError on .model_dump().
    parsed = response.parsed
    if parsed is None:
        raise ValueError(
            f"Model '{model_id}' returned a response that could not be parsed "
            f"into {getattr(response_schema, '__name__', response_schema)}"
        )

    return parsed

----

In [7]:
# System-level instruction: frames the model as a careful extractor of tabular data.
# One sentence per list item, joined by single spaces.
SYSTEM_LEVEL_INSTRUCTION = " ".join([
    # the trailing space here preserves the double space after the first sentence
    "You are an expert data analyst, specializing in information extraction from PDFs. ",
    "You especially enjoy parsing out tabular data, always being completely accurate "
    "when extracting table parts such as the row and column headers, and table cells.",
    "You always understand the layout of a table, and know how to return empty values.",
])

----

### ACEA Press Release, 2025-Feb

![ACEA Press Release, page 3, NEW CAR REGISTRATIONS BY MARKET AND POWER SOURCE MONTHLY, 2025-Feb](samples/Press_release_car_registrations_February_2025_p3.png "NEW CAR REGISTRATIONS BY MARKET AND POWER SOURCE MONTHLY, page 3, ACEA Press Release, 2025-Feb")

cf. NEW CAR REGISTRATIONS BY MARKET AND POWER SOURCE MONTHLY on page 3, [samples/Press_release_car_registrations_February_2025.pdf](samples/Press_release_car_registrations_February_2025.pdf)

### Pydantic models describing the response structure

#### NEW CAR REGISTRATIONS BY MARKET AND POWER SOURCE MONTHLY

cf. page 3 of the PDF

In [8]:
# One row of the page-3 table: a row header plus the row's cell values.
# NOTE(review): Table1 is passed as response_schema, so the Field descriptions
# below presumably reach the model — treat them as prompt text, not as comments.
class Table1Row(BaseModel):
    # Row header (a country or geopolitical entity, per the description)
    name: str = Field(description="name of country or geopolitical entity for this table row")
    
    # Cell values kept as strings; the description asks for 7 categories x 3
    # values = 21 entries, with "??" standing in for blank cells.
    # The count is only requested in the description text, not enforced here.
    values: list[str] = Field(description=(
        "list of exactly 21 values, 3 for each the following major column category: BATTERY ELECTRIC, PLUG-IN HYBRID, HYBRID ELECTRIC, OTHERS, PETROL, DIESEL, and TOTAL. if any of the values are blank or missing, represent with an ??"
    ))

# Top-level response schema for the page-3 table: a list of Table1Row objects.
class Table1(BaseModel):
    rows: list[Table1Row] = Field(description="List of Table1Row objects that make up the table")

In [9]:
%%time

# User prompt for the page-3 table: names the table, lists its 7 major column
# categories with 3 values each, and repeats the "??" placeholder for blanks
# that the Table1Row field description also asks for.
prompt="""
Retrieve the entire contents for the NEW CAR REGISTRATIONS BY MARKET AND POWER SOURCE, MONTHLY table in the given PDF.

Parse this table row by row, from left to right.

There are 7 major column categories:
- BATTERY ELECTRIC
- PLUG-IN HYBRID
- HYBRID ELECTRIC
- OTHERS
- PETROL
- DIESEL
- TOTAL

Each major column category has 3 values for:
- current month in current year
- current month in previous year
- % change current yy / previous yy

If any of the 3 values in a major column category are missing or empty, represent that value with ??.

Be aware that it is possible for all 3 values in a major column category to be empty!
""".strip()

# Send the inline PDF together with the prompt, using Table1 as the response schema
table_data = extract_structured_data(model_id, prompt, pdf_file_part, Table1)

CPU times: user 22.3 ms, sys: 478 μs, total: 22.8 ms
Wall time: 32.4 s


In [10]:
# Separate each extracted row into its index label and its list of cell values
extracted_rows = table_data.model_dump()['rows']
row_headers = [entry['name'] for entry in extracted_rows]
data = [entry['values'] for entry in extracted_rows]

# '??' is the placeholder requested for blank cells; show those as empty strings
df = pd.DataFrame(data, index=row_headers).replace('??', '')

print(df.shape)
print()
df

(34, 21)



Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,11,12,13,14,15,16,17,18,19,20
Austria,4233.0,3322,27.4,1613.0,1335.0,20.8,5549,4691,18.3,0,...,,5736,6527,-12.1,2488,4135.0,-39.8,19619,20010,-2.0
Belgium,13040.0,9385,38.9,3070.0,8385.0,-63.4,5383,4282,25.7,267,...,-35.7,17280,18918,-8.7,1121,2337.0,-52.0,40161,43722,-8.1
Bulgaria,126.0,122,3.3,34.0,31.0,9.7,105,73,43.8,0,...,,2781,2868,-3.0,348,510.0,-31.8,3394,3604,-5.8
Croatia,53.0,50,6.0,140.0,94.0,48.9,1629,1455,12.0,101,...,-8.2,1644,1898,-13.4,678,923.0,-26.5,4245,4530,-6.3
Cyprus,107.0,105,1.9,78.0,48.0,62.5,579,609,-4.9,0,...,,456,712,-36.0,64,27.0,137.0,1284,1501,-14.5
Czechia,737.0,438,68.3,557.0,450.0,23.8,3634,3577,1.6,470,...,-18.8,8844,9723,-9.0,3531,3561.0,-0.8,17773,18328,-3.0
Denmark,7724.0,4974,55.3,312.0,525.0,-40.6,1453,1941,-25.1,0,...,,1908,3415,-44.1,220,363.0,-39.4,11617,11218,3.6
Estonia,59.0,89,-33.7,69.0,59.0,16.9,307,609,-49.6,0,...,-100.0,132,430,-69.3,57,244.0,-76.6,624,1440,-56.7
Finland,1563.0,1330,17.5,1035.0,1237.0,-16.3,1398,1735,-19.4,0,...,-100.0,598,736,-18.8,190,297.0,-36.0,4784,5374,-11.0
France,25.335,25825,-1.9,6451.0,11732.0,-45.0,62146,41227,50.7,5821,...,5.9,35110,48095,-27.0,6707,10221.0,-34.4,141570,142595,-0.7


In [11]:
# Selected markets, transposed so that each market becomes one column
selected_markets = [
    'Austria', 'Bulgaria', 'Cyprus', 'Denmark',
    'Ireland', 'Latvia', 'Luxembourg', 'Malta',
    'Romania',    # empty values at indices 3, 4, 5!
    'Iceland', 'Norway', 'United Kingdom',
]
df.loc[selected_markets].T

Unnamed: 0,Austria,Bulgaria,Cyprus,Denmark,Ireland,Latvia,Luxembourg,Malta,Romania,Iceland,Norway,United Kingdom
0,4233.0,126.0,107.0,7724.0,2512.0,80.0,1134.0,46.0,724.0,248.0,8477.0,21244.0
1,3322.0,122.0,105.0,4974.0,1856.0,78.0,942.0,260.0,1109.0,97.0,6043.0,14991.0
2,27.4,3.3,1.9,55.3,35.3,2.6,20.4,-82.3,-34.7,155.7,40.3,41.7
3,1613.0,34.0,78.0,312.0,2100.0,134.0,291.0,27.0,,104.0,136.0,7273.0
4,1335.0,31.0,48.0,525.0,1420.0,28.0,385.0,58.0,,96.0,122.0,6098.0
5,20.8,9.7,62.5,-40.6,47.9,378.6,-24.4,-53.4,,8.3,11.5,19.3
6,5549.0,105.0,579.0,1453.0,3137.0,0.0,1196.0,98.0,5510.0,199.0,154.0,29849.0
7,4691.0,73.0,609.0,1941.0,2940.0,0.0,1002.0,124.0,3736.0,95.0,259.0,26140.0
8,18.3,43.8,-4.9,-25.1,6.7,,19.4,-21.0,47.5,109.5,-40.5,14.2
9,0.0,0.0,0.0,0.0,0.0,18.0,0.0,0.0,1354.0,0.0,0.0,0.0


In [12]:
# Save the raw extraction as CSV; row labels are written, the column-header line is not.
# NOTE(review): CP932 is the Windows Japanese code page; a character it cannot
# encode would raise UnicodeEncodeError on write — confirm this encoding is intended.
df.to_csv(
    'samples/acea_1_202502_raw.csv',
    header=False,
    encoding='CP932',
)

----

### Pydantic models describing the response structure

#### NEW CAR REGISTRATIONS BY MANUFACTURER EU + EFTA + UK

![ACEA Press Release, page 6, NEW CAR REGISTRATIONS BY MANUFACTURER EU + EFTA + UK, 2025-Feb](samples/Press_release_car_registrations_February_2025_p6.png "NEW CAR REGISTRATIONS BY MANUFACTURER EU + EFTA + UK, page 6, ACEA Press Release, 2025-Feb")

cf. NEW CAR REGISTRATIONS BY MANUFACTURER EU + EFTA + UK on page 6, [samples/Press_release_car_registrations_February_2025.pdf](samples/Press_release_car_registrations_February_2025.pdf)

In [13]:
# One row of the page-6 table: a row header plus the row's cell values.
# NOTE(review): Table2 is passed as response_schema, so the Field descriptions
# below presumably reach the model — treat them as prompt text, not as comments.
class Table2Row(BaseModel):
    # Row header (a manufacturer or manufacturer group, per the description)
    name: str = Field(description="name of automobile manufacturer or manufacturer group for this table row. do not include any any superscript characters")
    
    # Cell values kept as strings; the description asks for 2 categories x 5
    # values = 10 entries, with "??" standing in for blank cells.
    # The count is only requested in the description text, not enforced here.
    values: list[str] = Field(description=(
        "list of exactly 10 values, 5 for each the following major column categories: current month, and JANUARY-current month. if any of the values are blank or missing, represent with an ??"
    ))

# Top-level response schema for the page-6 table: a list of Table2Row objects.
class Table2(BaseModel):
    rows: list[Table2Row] = Field(description="List of Table2Row objects that make up the table")

In [14]:
%%time

# User prompt for the page-6 table: names the table, lists its 2 major column
# categories with 5 values each, and repeats the "??" placeholder for blanks.
# This rebinds `prompt` (and `table_data` below) from the earlier table's cell.
# NOTE(review): the first sentence reads "table on in the given PDF" — looks like
# a leftover word; left as is because it is part of the text sent to the model.
prompt="""
Retrieve the entire contents for the NEW CAR REGISTRATIONS BY MANUFACTURER EU + EFTA + UK table on in the given PDF.

Parse this table row by row, from left to right.

There are 2 major column categories:
- current month
- JANUARY-current month

For each major column category, you must extract exactly 5 values:
- % share current year
- % share previous year
- Units current year
- Units previous year
- % change current yy / previous yy

If there are any values that are missing or empty, represent that value with ??.
""".strip()

# Send the inline PDF together with the prompt, using Table2 as the response schema
table_data = extract_structured_data(model_id, prompt, pdf_file_part, Table2)
#table_data

CPU times: user 9.35 ms, sys: 431 μs, total: 9.78 ms
Wall time: 24.1 s


In [15]:
# Separate each extracted row into its index label and its list of cell values
extracted_rows = table_data.model_dump()['rows']
row_headers = [entry['name'] for entry in extracted_rows]
data = [entry['values'] for entry in extracted_rows]

# '??' is the placeholder requested for blank cells; show those as empty strings
df = pd.DataFrame(data, index=row_headers).replace('??', '')

print(df.shape)
print()
df

(46, 10)



Unnamed: 0,0,1,2,3,4,5,6,7,8,9
Volkswagen Group,25.9,25.0,249873,248647,0.5,26.4,25.0,516709,503487,2.6
Volkswagen,10.8,9.7,103681,96562,7.4,11.1,9.6,216565,193379,12.0
Skoda,5.7,5.9,55324,58854,-6.0,5.8,6.0,114314,120441,-5.1
Audi,4.8,4.6,46593,45927,1.5,4.9,4.9,95822,99291,-3.5
Cupra,2.1,1.6,20397,15418,32.3,2.1,1.4,40869,28728,42.3
Seat,1.8,2.2,16998,21884,-22.3,1.7,2.1,33575,41946,-20.0
Porsche,0.7,1.0,6267,9486,-33.9,0.7,0.9,14218,18524,-23.2
Others,0.1,0.1,613,516,18.8,0.1,0.1,1346,1178,14.3
Stellantis,16.2,18.7,155970,186151,-16.2,15.8,18.4,310091,369469,-16.1
Peugeot,5.8,5.8,56016,58094,-3.6,5.7,5.7,111443,115484,-3.5


In [16]:
# Save the raw extraction as CSV; row labels are written, the column-header line is not.
# NOTE(review): CP932 is the Windows Japanese code page; a character it cannot
# encode would raise UnicodeEncodeError on write — confirm this encoding is intended.
df.to_csv(
    'samples/acea_2_202502_raw.csv',
    header=False,
    encoding='CP932',
)

<hr width=40%/>

In [17]:
# housekeeping...
# NOTE(review): this deletes every file returned by client.files.list(), not just
# files created here (this notebook sends its PDF inline) — confirm that is intended.
for uploaded_file in client.files.list():
    client.files.delete(name=uploaded_file.name)