# Google's Gemini 2.0 

## ... this actually comes prior to extracting structured data??

In [1]:
import os

# Read the Gemini API key from the environment; a missing variable raises KeyError here.
GEMINI_API_KEY = os.environ["GEMINI_API_KEY"]

In [2]:
from pydantic import BaseModel, Field

In [3]:
import pandas as pd

In [4]:
from google import genai

# Create a Gen AI client authenticated with the API key read from the environment
client = genai.Client(api_key=GEMINI_API_KEY)

# Model identifier passed as `model=` to the count_tokens / generate_content calls in this notebook
model_id =  "gemini-2.0-flash" # or "gemini-2.0-flash-lite-preview-02-05"  , "gemini-2.0-pro-exp-02-05"

In [5]:
# Delete every file returned by client.files.list() before uploading the samples again.
# NOTE: this removes all listed files, not only the ones this notebook uploaded.
for f in client.files.list():
    client.files.delete(name=f.name)

In [6]:
def _upload_pdf(path):
    """Upload one local PDF, using its base file name as the display name."""
    return client.files.upload(
        file=path,
        config={
            'mime_type': 'application/pdf',
            'display_name': os.path.basename(path),
        },
    )


saintmarc_hd_20250213 = _upload_pdf("samples/saintmarc-hd_20250213.pdf")
saintmarc_hd_20250313 = _upload_pdf("samples/saintmarc-hd_20250313.pdf")
acea_202502 = _upload_pdf("samples/Press_release_car_registrations_February_2025.pdf")

# Report the token count the model gives for each uploaded document.
for uploaded_file in (saintmarc_hd_20250213, saintmarc_hd_20250313, acea_202502):
    token_count = client.models.count_tokens(model=model_id, contents=uploaded_file)
    print(f'File: {uploaded_file.display_name} contains {token_count.total_tokens} tokens')

File: saintmarc-hd_20250213.pdf contains 259 tokens
File: saintmarc-hd_20250313.pdf contains 259 tokens
File: Press_release_car_registrations_February_2025.pdf contains 1549 tokens


### Function for calling the Google Gen AI model

In [7]:
def extract_structured_data(model_id:str, prompt:str, the_file:genai.types.File, response_schema:type[BaseModel]):
    """Ask a Google Gen AI model for a structured (JSON) answer about a file.

    Args:
        model_id: name of the model to call.
        prompt: the instruction text sent together with the file.
        the_file: the uploaded file from which information is extracted.
        response_schema: the Pydantic model class (not an instance) describing
            the structure of the response we wish to obtain.

    Returns:
        The ``parsed`` value of the model response for the given prompt.

    Raises:
        ValueError: if the response carries no parsed value, so callers do not
            fail later with an unrelated AttributeError on ``None``.
    """
    response = client.models.generate_content(
        model=model_id,
        contents=[prompt, the_file],
        config={
            'response_mime_type': 'application/json',
            'response_schema': response_schema,
        },
    )

    parsed = response.parsed
    if parsed is None:
        # Show the start of the raw text so the failure can be diagnosed.
        raw_preview = (response.text or '')[:200]
        raise ValueError(
            f"No parsed value for schema {response_schema.__name__}; "
            f"raw response starts with: {raw_preview!r}"
        )

    return parsed

----

### ACEA Press Release, 2025-Feb

![ACEA Press Release, 2025-Feb](samples/Press_release_car_registrations_February_2025.pdf.png "ACEA Press Release, 2025-Feb")

### Pydantic models describing the response structure

In [8]:
class Table1Row(BaseModel):
    # One table row: its row header plus the row's cell values kept as strings.
    name: str = Field(
        description="This field is the name of the country or geopolitical entity in the header of each table row",
    )
    values: list[str] = Field(
        description="This is a list of cell values making up a single row in the table. Represent a missing value with ??",
    )


class Table1(BaseModel):
    # Top-level response object: the list of extracted rows.
    data: list[Table1Row] = Field(
        description="This is a list of the Table1Row objects that make up a row of data in the NEW CAR REGISTRATIONS BY MARKET AND POWER SOURCE MONTHLY table.",
    )

In [9]:
%%time

# Instruction text for the extraction; .strip() drops the leading/trailing
# newlines of the triple-quoted literal before it is sent.
# The prompt asks for 7 column categories x 3 values each, i.e. 21 values per row.
prompt="""
You are an expert at locating tabular data in PDFs and extracting the exact information from all parts of a table, 
such as the row and column headers, and table cells. You always understand the layout of a table, and know how to return empty values.

Extract all columns for major column categories of BATTERY ELECTRIC, PLUG-IN HYBRID, HYBRID ELECTRIC, OTHERS, PETROL, DIESEL, and TOTAL in the NEW CAR REGISTRATIONS BY MARKET AND POWER SOURCE, MONTHLY table in the given PDF.
For each major column category, you must extract 3 values for: month and current year, month and previous year, and % change year-on-year.
Scan each table row from left to right, representing all missing cell values with double question marks.
Represent missing % change year-on-year values with double question marks.

""".strip()

# Run the extraction on the ACEA press release, using Table1 as the response schema.
tidy_data = extract_structured_data(model_id, prompt, acea_202502, Table1)

# Debug aid (disabled): print each row together with its number of values.
#for row in tidy_data.model_dump()['data']:
#    print(row, len(row['values']))

CPU times: user 12.3 ms, sys: 4.06 ms, total: 16.3 ms
Wall time: 30.4 s


In [10]:
# Split the extracted rows into row headers and their lists of cell values.
rows = tidy_data.model_dump()['data']
row_headers = [row['name'] for row in rows]
data = [row['values'] for row in rows]

# Build the frame, turning the '??' missing-value marker into an empty string.
df = pd.DataFrame(data, index=row_headers).replace('??', '')

print(df.shape)
print()
df

(34, 21)



Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,11,12,13,14,15,16,17,18,19,20
Austria,4233.0,3322,27.4,1613.0,1335.0,20.8,5549,4691,18.3,0,...,,5736,6527,-12.1,2488,4135.0,-39.8,19619,20010,-2.0
Belgium,13040.0,9385,38.9,3070.0,8385.0,-63.4,5383,4282,25.7,267,...,-35.7,17280,18918,-8.7,1121,2337.0,-52.0,40161,43722,-8.1
Bulgaria,126.0,122,3.3,34.0,31.0,9.7,105,73,43.8,0,...,,2781,2868,-3.0,348,510.0,-31.8,3394,3604,-5.8
Croatia,53.0,50,6.0,140.0,94.0,48.9,1629,1455,12.0,101,...,-8.2,1644,1898,-13.4,678,923.0,-26.5,4245,4530,-6.3
Cyprus,107.0,105,1.9,78.0,48.0,62.5,579,609,-4.9,0,...,,456,712,-36.0,64,27.0,137.0,1284,1501,-14.5
Czechia,737.0,438,68.3,557.0,450.0,23.8,3634,3577,1.6,470,...,-18.8,8844,9723,-9.0,3531,3561.0,-0.8,17773,18328,-3.0
Denmark,7724.0,4974,55.3,312.0,525.0,-40.6,1453,1941,-25.1,0,...,,1908,3415,-44.1,220,363.0,-39.4,11617,11218,3.6
Estonia,59.0,89,-33.7,69.0,59.0,16.9,307,609,-49.6,0,...,-100.0,132,430,-69.3,57,244.0,-76.6,624,1440,-56.7
Finland,1563.0,1330,17.5,1035.0,1237.0,-16.3,1398,1735,-19.4,0,...,-100.0,598,736,-18.8,190,297.0,-36.0,4784,5374,-11.0
France,25.335,25825,-1.9,6451.0,11732.0,-45.0,62146,41227,50.7,5821,...,5.9,35110,48095,-27.0,6707,10221.0,-34.4,141570,142595,-0.7


In [12]:
# Transpose the single 'Romania' row so its values read top to bottom, one per line.
df.loc[['Romania']].T

Unnamed: 0,Romania
0,724.0
1,1109.0
2,-34.7
3,
4,
5,
6,5510.0
7,3736.0
8,47.5
9,1354.0


In [None]:
# Save the frame with its row-header index, but without a column header row.
# NOTE(review): CP932 is a Japanese code page — confirm every row header can be encoded in it.
df.to_csv('samples/acea_1_202502_raw.csv', encoding='CP932', header=False)

In [None]:
# Show the row headers (the frame's index) that came back from the extraction.
df.index

----

In [None]:
class ACEARow2(BaseModel):
    # One table row: its row header plus the row's cell values kept as strings.
    name: str = Field(
        description="This field is the name of the automanufacturer in the header of each table row",
    )
    values: list[str] = Field(
        description="This is a list of cell values making up a single row in the table. Represent a missing value with ??",
    )


class ACEATable2(BaseModel):
    # Top-level response object: the list of extracted rows.
    data: list[ACEARow2] = Field(
        description="This is a list of the ACEARow2 objects",
    )

In [None]:
%%time

# Instruction text for the page-5 table; .strip() drops the leading/trailing
# newlines of the triple-quoted literal before it is sent.
# The prompt asks for 2 column categories x 5 values each, i.e. 10 values per row.
prompt="""
You are an expert at locating tabular data in PDFs and extracting the exact information from all parts of a table, 
such as the row and column headers, and table cells. You always understand the layout of a table, and know how to return empty values.

Extract all columns for major column categories of FEBRUARY and JANUARY-FEBRUARY in the table on page 5 in the given PDF.
For each major column category, you must extract values for each sub-categories: % share 2025 and  % share 2024; Units 2025 and Units 2024; and % change 25/24.
There will be exactly 10 values per row.
Scan each table row from left to right, representing all missing cell values with double question marks.
Represent missing % change year-on-year values with double question marks.

""".strip()

# Run the extraction on the ACEA press release, using ACEATable2 as the response schema.
# Note: this rebinds `tidy_data`, replacing the result of the earlier extraction.
tidy_data = extract_structured_data(model_id, prompt, acea_202502, ACEATable2)

In [None]:
# Display the extracted result as plain Python data for inspection.
tidy_data.model_dump()

<hr width="40%"/>

#### Gemini 2.0 Flash with only a prompt

In [None]:
%%time

# Free-form extraction (no response schema): ask for the page-3 table as Markdown.
# The prompt is sent as written, including its indentation (no .strip()).
prompt="""
    You are an expert at locating tabular data in PDFs and extracting the exact information from all parts of a table, 
    such as the row and column headers, and table cells. You always understand the layout of a table, and know how to return empty values.

    Extract the entire contents of the NEW CAR REGISTRATIONS BY MARKET AND POWER SOURCE, MONTHLY table on page 3 in the given PDF.
    Extract the table contents as Markdown.

    Instructions for the column headers:
    - The column headers are NOT required. Do NOT return them.

    Instructions for the row headers:
    - The row headers are required. Make sure that each row starts with the name of a country or geopolitical entity.

    Instructions for the table cells:
    - In each row for a country or geopolitical entity, there are exactly 21 values + 1 for the row header.
    - Therefore, each table row is represented with exactly 23 | characters.
    - If a cell value is 0, then please return 0.
    - If a value is missing or empty, please represent it with an empty string.
    - For example, a section of three empty values might look like |   |   |   |
"""

# The file goes first and the instructions second in the request contents.
response = client.models.generate_content(
    model=model_id,
    contents=[acea_202502, prompt]
)

print(response.text)
print()

In [None]:
# Exploration aid: list the attributes available on the response object.
dir(response)


In [None]:
# Exploration aid: display the full response as plain Python data.
response.model_dump()

In [None]:
%%time

# Free-form extraction (no response schema): ask for the manufacturer table
# on the last page as Markdown.
# The prompt is sent as written, including its indentation (no .strip()).
prompt="""
    You are an expert at locating tabular data in PDFs and extracting the exact information from all parts of a table, 
    such as the row and column headers, and table cells. You always understand the layout of a table, and know how to return empty values.

    Extract the entire contents of the NEW CAR REGISTRATIONS BY MANUFACTURER, EU + EFTA + UK table on the last page in the given PDF.
    Extract the table contents as Markdown.

    Instructions for the column headers:
    - The column headers are NOT required. Do NOT return them.

    Instructions for the row headers:
    - The row headers are required. Make sure that each row starts with the name of an automobile manufacturer, or Others.
    - Leave out any superscript characters: Others2 is just Others

    Instructions for the table cells:
    - In each row for an automobile manufacturer, there are exactly 10 values.
    - You must accurately extract the numerical value. NO MISTAKES!
"""

# The file goes first and the instructions second in the request contents.
response = client.models.generate_content(
    model=model_id,
    contents=[acea_202502, prompt]
)

print(response.text)
print()

----

### Gemini 2.0 Flash with prompt + structured response instruction: Tidy data formatting

Extracting the table from the Saint-Marc Holdings PDF.

In [None]:
# One observation per row ("tidy" layout): a single table cell identified by
# its year, its column header (month/period) and three category labels.
# All fields, including `value`, are declared as strings.
class TidyRow(BaseModel):
    """ Model for a tidy row of the tabular data in the Saint-marc HD PDF """

    year: str = Field(description="This field is the yyyy value from the 年度 column")

    month: str = Field(description=(
        "This field may takes one of 15 values from the column headers: "
        "1月, 2月, 3月, 4月, 5月, 6月, 7月, 8月, 9月, 10月, 11月, 12月, 下期, 上期, 通期."
    ))
    # Alternative description (disabled): additionally asks the model to strip
    # the 月 characters and normalise the half-year labels.
    #month: str = Field(description=(
    #    "This field may takes one of 15 values from the column headers: "
    #    "1月, 2月, 3月, 4月, 5月, 6月, 7月, 8月, 9月, 10月, 11月, 12月, 下期, 上期, 通期. "
    #    "Please remove any 月 characters. Also please replace 下半期 with 下期, and 上半期 with 上期"
    #))

    cat1: str = Field(description="This field is always 月次売上情報")
    
    cat2: str = Field(description="This field is always 昨年対比")
    
    cat3: str = Field(description="This field has a value that is either 全店 or 既存店")
    
    value: str = Field(description=(
        "This field holds the table cell value, which is a floating point number "
        "with exactly one decimal place, or it is a whitespace indicating a blank value"
    ))
    # Alternative description (disabled): asks the model to rescale the value
    # (divide by 100) and report it with 3 decimal places.
    #value: str = Field(description=(
    #    "This field holds the table cell value, which is a floating point number "
    #    "with exactly one decimal place. Please divide this by 100.00, "
    #    "and show this new floating point to 3 decimal places."
    #))

# Top-level response object: the list of tidy rows extracted from the table.
class TidyData(BaseModel):
    data: list[TidyRow] = Field(description="The list of TidyRow with year, month, cat1, cat2, cat3, and value fields")

<hr width="40%"/>

### Saint-marc HD PDF for 2025-Jan 月次売上情報

![Saint-marc HD PDF for 2025-Jan 月次売上情報](samples/saintmarc-hd_20250213.pdf.png "Saint-marc HD PDF for 2025-Jan 月次売上情報")

In [None]:
%%time

# Short instruction; the structure of the answer is driven by the TidyData schema.
prompt = """
Extract the 月次売上情報 table in the following PDF file in TidyData format
""".strip()

# Start with the Saint-marc HD 20250213 PDF: this is the relatively clean table.
tidy_data = extract_structured_data(model_id, prompt, saintmarc_hd_20250213, TidyData)

# One DataFrame row per extracted TidyRow; columns are the TidyRow fields.
smhd_202502_df = pd.DataFrame(tidy_data.model_dump()['data'])

In [None]:
# we expect 84 - 6 = 78 values in the tidy data
print(smhd_202502_df.shape)
print(f"\n{smhd_202502_df.iloc[28:43]}")

<hr width="40%"/>

### Saint-marc HD PDF for 2025-Feb 月次売上情報

![Saint-marc HD PDF for 2025-Feb 月次売上情報](samples/saintmarc-hd_20250313.pdf.png "Saint-marc HD PDF for 2025-Feb 月次売上情報")

In [None]:
%%time

# Short instruction; the structure of the answer is driven by the TidyData schema.
prompt = """
Extract the 月次売上情報 table in the following PDF file in TidyData format
""".strip()

# Now run the same extraction on the Saint-marc HD 20250313 PDF.
tidy_data = extract_structured_data(model_id, prompt, saintmarc_hd_20250313, TidyData)

# One DataFrame row per extracted TidyRow; columns are the TidyRow fields.
smhd_202503_df = pd.DataFrame(tidy_data.model_dump()['data'])

In [None]:
# we expect 84 - 4 = 80 values in the tidy data this time
print(smhd_202503_df.shape)
print(f"\n{smhd_202503_df.iloc[68:]}")

In [None]:



# Write the 20250313 extraction to CSV with a header row and without the index.
# NOTE(review): the frame is named 202503 while the file name says 202502 — confirm this is intentional.
smhd_202503_df.to_csv("samples/saintmarc_202502_fixed.csv", encoding="CP932", header=True, index=False)


----