# Google's Gemini 2.0 

## Information Extraction: Tabular Data

In [None]:
import os

# Read the Gemini API key from the environment.
# Indexing os.environ raises KeyError when GEMINI_API_KEY is not set.
GEMINI_API_KEY = os.environ["GEMINI_API_KEY"]

In [None]:
import pandas as pd

from google import genai
from google.genai import types

from pydantic import BaseModel, Field

In [None]:
# Create the Google Gen AI client, authenticated with the API key read from the environment.
client = genai.Client(api_key=GEMINI_API_KEY)

# Model used for the requests in this notebook.
# Alternatives: "gemini-2.0-flash-lite-preview-02-05", "gemini-2.0-pro-exp-02-05"
model_id = "gemini-2.0-flash"

----

#### PDF ...

Since the PDF file in our example is less than 20MB, we can send its contents inline with our request to the model.

In [None]:
# Load the whole press-release PDF into memory so it can be sent inline.
pdf_path = "samples/Press_release_car_registrations_February_2025.pdf"
with open(pdf_path, mode="rb") as pdf_handle:
    file_bytes = pdf_handle.read()

# Wrap the raw bytes in a request part tagged with the PDF MIME type.
pdf_file_part = types.Part.from_bytes(data=file_bytes, mime_type='application/pdf')

----

### Function for calling the Google Gen AI model

* specifies JSON for the data returned
* sets the schema of the response

In [None]:
def extract_structured_data(
    model_id: str,
    prompt: str,
    file_part: types.Part,
    response_schema: type[BaseModel],
    *,
    seed: int = 888,
) -> BaseModel:
    """Ask a Google Gen AI model for a schema-constrained JSON answer about a document.

    The request uses the module-level ``client`` and ``SYSTEM_LEVEL_INSTRUCTION``;
    both must be defined before this function is called.

    Args:
        model_id: Name of the Google Gen AI model to call.
        prompt: Instruction text, sent after the document part.
        file_part: The document sent inline with the request, e.g. a PDF
            wrapped with ``types.Part.from_bytes`` (inline PDFs: max. 20MB).
        response_schema: Pydantic model *class* describing the structure of
            the JSON response we wish to obtain.
        seed: Seed forwarded to the generation config. Defaults to 888, the
            value this function previously hard-coded.

    Returns:
        The response (MIME type application/json) parsed into an instance of
        ``response_schema``.

    Raises:
        ValueError: If the SDK could not parse the model's reply into
            ``response_schema`` (``response.parsed`` is ``None``).
    """
    response = client.models.generate_content(
        model=model_id,
        config=types.GenerateContentConfig(
            system_instruction=SYSTEM_LEVEL_INSTRUCTION,
            response_mime_type='application/json',
            response_schema=response_schema,
            seed=seed,
        ),
        # The document goes first, followed by the instruction text.
        contents=[
            file_part,
            prompt,
        ],
    )

    # ``parsed`` holds the reply converted to the Pydantic model. Fail loudly
    # here instead of handing None to callers that go on to call .model_dump().
    parsed = response.parsed
    if parsed is None:
        schema_name = getattr(response_schema, "__name__", repr(response_schema))
        raise ValueError(
            f"Model response could not be parsed into {schema_name}: {response.text!r}"
        )
    return parsed

----

In [None]:
# System-level instruction for the model, assembled from its individual sentences.
# Each fragment carries its own trailing whitespace, so they are joined with no separator.
_INSTRUCTION_FRAGMENTS = (
    "You are an expert data analyst, specializing in information extraction from PDFs.  ",
    "You especially enjoy parsing out tabular data, always being completely accurate when extracting table parts ",
    "such as the row and column headers, and table cells. ",
    "You always understand the layout of a table, and know how to return empty values.",
)
SYSTEM_LEVEL_INSTRUCTION = "".join(_INSTRUCTION_FRAGMENTS).strip()

----

### ACEA Press Release, 2025-Feb

![ACEA Press Release, page 3, NEW CAR REGISTRATIONS BY MARKET AND POWER SOURCE MONTHLY, 2025-Feb](samples/Press_release_car_registrations_February_2025_p3.png "NEW CAR REGISTRATIONS BY MARKET AND POWER SOURCE MONTHLY, page 3, ACEA Press Release, 2025-Feb")

cf. NEW CAR REGISTRATIONS BY MARKET AND POWER SOURCE MONTHLY on page 3, [samples/Press_release_car_registrations_February_2025.pdf](samples/Press_release_car_registrations_February_2025.pdf)

### Pydantic models describing the response structure

#### NEW CAR REGISTRATIONS BY MARKET AND POWER SOURCE MONTHLY

cf. page 3 of the PDF

In [None]:
# Response schema for one row of the page-3 table: the row header plus its cell values.
# NOTE(review): documented with `#` comments rather than a class docstring, since pydantic
# appears to expose model docstrings as the schema description — confirm before converting.
class Table1Row(BaseModel):
    # Row header: the country / geopolitical entity the row belongs to.
    name: str = Field(description="This field is the name of the country or geopolitical entity in the header of each table row")

    # Cell values of the row, kept as strings; the description asks for '??' in place of a missing value.
    values: list[str] = Field(description=(
        "This is a list of cell values making up a single row in the table. Represent a missing value with ??"
    ))

# Top-level response schema for the page-3 table: a list of Table1Row entries.
class Table1(BaseModel):
    # All extracted rows of the table.
    data: list[Table1Row] = Field(description=(
        "This is a list of the Table1Row objects that make up a row of data in the NEW CAR REGISTRATIONS BY MARKET AND POWER SOURCE MONTHLY table."
    ))    

In [None]:
%%time

# Prompt for the page-3 table. It names 7 major column categories with 3 values each,
# which is where the stated 21 columns per row come from.
prompt="""
Extract all columns for major column categories of BATTERY ELECTRIC, PLUG-IN HYBRID, HYBRID ELECTRIC, OTHERS, PETROL, DIESEL, and TOTAL in the NEW CAR REGISTRATIONS BY MARKET AND POWER SOURCE, MONTHLY table in the given PDF.

For each major column category, you must extract 3 values for: current month in current year, current month in previous year, and % change current yy / previous yy.

Scan each table row from left to right, representing all missing cell values with double question marks.

Represent missing % change year-on-year values with double question marks.

There will be 34 rows and 21 columns.

DO NOT MAKE UP CELL VALUES!
""".strip()


# Send the PDF and the prompt to the model; the reply is structured according to Table1.
tidy_data = extract_structured_data(model_id, prompt, pdf_file_part, Table1)

In [None]:
# Split the structured response into row labels and per-row cell lists.
extracted_rows = tidy_data.model_dump()['data']
row_headers = [entry['name'] for entry in extracted_rows]
data = [entry['values'] for entry in extracted_rows]

# One DataFrame row per table row, labelled by its row header;
# cells equal to the '??' missing-value marker become empty strings.
df = pd.DataFrame(data, index=row_headers).replace('??', '')

print(df.shape)
print()
df

In [None]:
# Row labels to inspect; shown transposed so each selected market becomes a column.
spot_check_markets = [
    'Austria',
    'Bulgaria',
    'Cyprus',
    'Denmark',
    'Ireland',
    'Latvia',
    'Luxembourg',
    'Malta',
    'Romania',    # empty values at indices 3, 4, 6!
    'Iceland',
    'Norway',
    'United Kingdom',
]
df.loc[spot_check_markets].T

In [None]:
# Save the extracted table as CSV without a header row.
# NOTE(review): CP932 cannot encode every Unicode character, so to_csv may raise
# UnicodeEncodeError on a cell outside that code page — confirm the encoding is intended.
df.to_csv('samples/acea_1_202502_raw.csv', encoding='CP932', header=False)

----

### Pydantic models describing the response structure

#### NEW CAR REGISTRATIONS BY MANUFACTURER EU + EFTA + UK

![ACEA Press Release, page 6, NEW CAR REGISTRATIONS BY MANUFACTURER EU + EFTA + UK, 2025-Feb](samples/Press_release_car_registrations_February_2025_p6.png "NEW CAR REGISTRATIONS BY MANUFACTURER EU + EFTA + UK, page 6, ACEA Press Release, 2025-Feb")

cf. NEW CAR REGISTRATIONS BY MANUFACTURER EU + EFTA + UK on page 6, [samples/Press_release_car_registrations_February_2025.pdf](samples/Press_release_car_registrations_February_2025.pdf)

In [None]:
# Response schema for one row of the page-6 manufacturer table: the row header plus its cell values.
# NOTE(review): documented with `#` comments rather than a class docstring, since pydantic
# appears to expose model docstrings as the schema description — confirm before converting.
class ACEARow2(BaseModel):
    # Row header: the manufacturer group or company the row belongs to.
    name: str = Field(description="This field is the name of the automanufacturer group or company in the header of each table row")

    # Cell values of the row, kept as strings; the description asks for exactly 10 values
    # and for '??' in place of a missing value.
    values: list[str] = Field(description=(
        "This is a list of cell values making up a single row in the table. There are exactly 10 cell values for each row. Represent a missing value with ??"
    ))

# Top-level response schema for the page-6 manufacturer table: a list of ACEARow2 entries.
class ACEATable2(BaseModel):
    # All extracted rows; the description tells the model to expect exactly 46 of them.
    data: list[ACEARow2] = Field(description=(
        "This is a list of the ACEARow2 objects. There are exactly 46 rows in the NEW CAR REGISTRATIONS BY MANUFACTURER EU + EFTA + UK table."
    ))    

In [None]:
%%time

# Prompt for the page-6 manufacturer table. It names 2 major column categories with
# 5 values each (2 x % share, 2 x units, 1 x % change), hence 10 columns per row.
prompt = """
Extract all table values in the NEW CAR REGISTRATIONS BY MANUFACTURER EU + EFTA + UK table in the given PDF.

Extract all columns for major column categories of FEBRUARY and JANUARY-FEBRUARY.

For each major column category, you must extract:
- 2 values for % share 2025 and 2024
- 2 values for Units 2025 and 2024
- 1 value for % change 25/24
Scan each table row from left to right, representing all missing cell values with double question marks.

Do not return the column headers.

For the row headers, please leave out any superscript numeric characters. For example "Others^2" is simply "Others".

There will be 46 rows and 10 columns.

DO NOT MAKE UP CELL VALUES!
""".strip()

# The manufacturer table is on page 6 of the same press-release PDF, so reuse the
# inline part built from that file (the previous argument name was never defined).
tidy_data = extract_structured_data(model_id, prompt, pdf_file_part, ACEATable2)

In [None]:
# Pull the row labels and the per-row cell lists out of the structured response.
manufacturer_rows = tidy_data.model_dump()['data']
row_headers = [record['name'] for record in manufacturer_rows]
data = [record['values'] for record in manufacturer_rows]

# One DataFrame row per table row, labelled by its row header;
# cells equal to the '??' missing-value marker become empty strings.
df = pd.DataFrame(data, index=row_headers).replace('??', '')

print(df.shape)
print()
df

In [None]:
# Save the extracted manufacturer table as CSV without a header row.
# NOTE(review): CP932 cannot encode every Unicode character, so to_csv may raise
# UnicodeEncodeError on a cell outside that code page — confirm the encoding is intended.
df.to_csv('samples/acea_2_202502_raw.csv', encoding='CP932', header=False)

<hr width=40%/>

In [None]:
# housekeeping: delete each file that client.files.list() returns
for remote_file in client.files.list():
    client.files.delete(name=remote_file.name)

----

#### Gemini 2.0 Flash with only a prompt

----

----

### Gemini 2.0 Flash with prompt + structured response instruction: Tidy data formatting

Extracting the table from the Saint-Marc Holdings PDF.

<hr width=40%/>

### Saint-marc HD PDF for 2025-Jan 月次売上情報

![Saint-marc HD PDF for 2025-Jan 月次売上情報](samples/saintmarc-hd_20250213.pdf.png "Saint-marc HD PDF for 2025-Jan 月次売上情報")

<hr width=40%/>

### Saint-marc HD PDF for 2025-Feb 月次売上情報

![Saint-marc HD PDF for 2025-Feb 月次売上情報](samples/saintmarc-hd_20250313.pdf.png "Saint-marc HD PDF for 2025-Feb 月次売上情報")

----