# Google's Gemini 2.0 

## Extracting a table as Markdown (does this actually come prior to extracting structured data?)

In [1]:
import os

# Read the Gemini API key from the environment so the secret is never written into the notebook.
# A missing GEMINI_API_KEY variable raises KeyError here, before any request is made.
GEMINI_API_KEY = os.environ["GEMINI_API_KEY"]

In [2]:
from google import genai

# Model used for every request in this notebook.
# Alternatives noted by the author: "gemini-2.0-flash-lite-preview-02-05", "gemini-2.0-pro-exp-02-05"
model_id = "gemini-2.0-flash"

# API client, authenticated with the key read from the environment
client = genai.Client(api_key=GEMINI_API_KEY)

In [3]:
def upload_pdf(path):
    """Upload a local PDF with ``client.files.upload`` and return the result.

    Args:
        path: Path of the PDF on disk.

    Returns:
        Whatever ``client.files.upload`` returns for the uploaded file.

    The display name sent with the upload is the base name of ``path``.
    """
    return client.files.upload(
        file=path,
        config={
            'mime_type': 'application/pdf',
            'display_name': os.path.basename(path),
        },
    )


# The three sample documents used throughout the notebook
saintmarc_hd_20250213 = upload_pdf("samples/saintmarc-hd_20250213.pdf")
saintmarc_hd_20250313 = upload_pdf("samples/saintmarc-hd_20250313.pdf")
acea_202502 = upload_pdf("samples/Press_release_car_registrations_February_2025.pdf")

# Report the token count of each uploaded document for the chosen model
for uploaded_file in (saintmarc_hd_20250213, saintmarc_hd_20250313, acea_202502):
    token_count = client.models.count_tokens(
        model=model_id,
        contents=uploaded_file,
    )

    print(f'File: {uploaded_file.display_name} contains {token_count.total_tokens} tokens')

File: saintmarc-hd_20250213.pdf contains 259 tokens
File: saintmarc-hd_20250313.pdf contains 259 tokens
File: Press_release_car_registrations_February_2025.pdf contains 1549 tokens


----

![ACEA Press Release, 2025-Feb](samples/Press_release_car_registrations_February_2025.pdf.png "ACEA Press Release, 2025-Feb")

In [4]:
%%time

# Instruction sent together with the ACEA PDF. It names the table to extract, describes the
# 2-level column header, asks for exactly 21 values per country row, and shows how a run of
# empty cells should be written. The text is passed to the model verbatim.
prompt="""
    You are an expert at locating tabular data in PDFs and extracting the exact information from all parts of a table, 
    such as the row and column headers, and table cells. You always understand the layout of a table, and know how to return empty values.

    Extract the entire contents of the NEW CAR REGISTRATIONS BY MARKET AND POWER SOURCE, MONTHLY table in the given PDF.

    The column headers are 2-level.
    The top-most column header is one of: BATTERY ELECTRIC, PLUG-IN HYBRID, HYBRID ELECTRIC, OTHERS, PETROL, DIESEL and TOTAL
    The second-level column headers comprise: current month for current year, current month for previous year, and % change year-on-year.

    Make sure that for the rows that start with a country or geopolitical entity, there are exactly 21 values.

    For example, a section of three empty values might look like |   |   |   |
"""

# Generation settings: a fixed seed (777) is passed with the request
generation_config = genai.types.GenerateContentConfig(seed=777)

# Send the uploaded ACEA PDF together with the extraction prompt
response = client.models.generate_content(
    model=model_id,
    contents=[acea_202502, prompt],
    config=generation_config,
)

# Show the raw reply, followed by an empty line
print(response.text)
print()

Okay, I understand. Here's the extracted data from the "NEW CAR REGISTRATIONS BY MARKET AND POWER SOURCE, MONTHLY" table in the PDF, with the column structure and data as requested.

|   | **BATTERY ELECTRIC** |   |   | **PLUG-IN HYBRID** |   |   | **HYBRID ELECTRIC¹** |   |   | **OTHERS²** |   |   | **PETROL** |   |   | **DIESEL** |   |   | **TOTAL** |   |   |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
|   | February 2025 | February 2024 | % change 25/24 | February 2025 | February 2024 | % change 25/24 | February 2025 | February 2024 | % change 25/24 | February 2025 | February 2024 | % change 25/24 | February 2025 | February 2024 | % change 25/24 | February 2025 | February 2024 | % change 25/24 | February 2025 | February 2024 | % change 25/24 |
| Austria | 4,233 | 3,322 | +27.4 | 1,613 | 1,335 | +20.8 | 5,549 | 4,691 | +18.3 | 0 | 0 |   | 5,736 | 6,527 | -12.1 | 2,488 | 4,135 | -39.8 | 19,619 | 20,010 | -2.0 |
| Belgium | 13,040 | 9,385 |

----

In [5]:
import pandas as pd

# Every top-level header (power source) spans this many second-level columns:
# current month, same month of the previous year, % change.
VALUES_PER_GROUP = 3

# Keep only the lines that contain a pipe, i.e. the markdown-flavored table rows
# (this drops the model's prose preamble and any blank lines).
lines = [line for line in response.text.split('\n') if '|' in line]

# Order of the filtered lines that the rest of this cell relies on:
#   lines[0]   top-level column header
#   lines[1]   markdown separator row (|---|---|...)
#   lines[2]   second-level column header
#   lines[3:]  one data row per country / geopolitical entity
if len(lines) < 4:
    raise ValueError(
        f"expected a markdown table with at least one data row, "
        f"but found only {len(lines)} line(s) containing '|'"
    )

# build up the 2-level column headers...
group_names = [cell.strip() for cell in lines[0].split('|') if cell.strip()]
col_level_0 = [name for name in group_names for _ in range(VALUES_PER_GROUP)]

# the second-level labels repeat for every group, so take the first group's labels
# and tile them once per top-level header
sub_headers = [cell.strip() for cell in lines[2].split('|') if cell.strip()][:VALUES_PER_GROUP]
col_level_1 = sub_headers * len(group_names)

col_multi = pd.MultiIndex.from_tuples(list(zip(col_level_0, col_level_1)))

# gather the data rows; [1:-1] drops the pieces before the first and after the last pipe
acc = [[cell.strip() for cell in line.split('|')][1:-1] for line in lines[3:]]

# Each row must hold one row header plus one value per column. pd.DataFrame pads short
# rows with None instead of complaining, which would silently misalign the data, so a
# row of the wrong width is reported explicitly.
expected_width = 1 + len(col_multi)
ragged_rows = [row[0] if row else '' for row in acc if len(row) != expected_width]
if ragged_rows:
    raise ValueError(
        f"expected {expected_width} cells per table row; "
        f"rows with a different width: {ragged_rows}"
    )

df = pd.DataFrame(acc)

# build up the row headers (country / geopolitical entity)...
row_headers = df.iloc[:, 0].values.tolist()

# finally, create a nice & clean copy of the target table!
df = df.iloc[:, 1:].copy()
df.index = row_headers
df.columns = col_multi

In [6]:
# Preview the first rows of the parsed table
df.head()

Unnamed: 0_level_0,**BATTERY ELECTRIC**,**BATTERY ELECTRIC**,**BATTERY ELECTRIC**,**PLUG-IN HYBRID**,**PLUG-IN HYBRID**,**PLUG-IN HYBRID**,**HYBRID ELECTRIC¹**,**HYBRID ELECTRIC¹**,**HYBRID ELECTRIC¹**,**OTHERS²**,**OTHERS²**,**OTHERS²**,**PETROL**,**PETROL**,**PETROL**,**DIESEL**,**DIESEL**,**DIESEL**,**TOTAL**,**TOTAL**,**TOTAL**
Unnamed: 0_level_1,February 2025,February 2024,% change 25/24,February 2025,February 2024,% change 25/24,February 2025,February 2024,% change 25/24,February 2025,...,% change 25/24,February 2025,February 2024,% change 25/24,February 2025,February 2024,% change 25/24,February 2025,February 2024,% change 25/24
Austria,4233,3322,27.4,1613,1335,20.8,5549,4691,18.3,0,...,,5736,6527,-12.1,2488,4135,-39.8,19619,20010,-2.0
Belgium,13040,9385,38.9,3070,8385,-63.4,5383,4282,25.7,267,...,-35.7,17280,18918,-8.7,1121,2337,-52.0,40161,43722,-8.1
Bulgaria,126,122,3.3,34,31,9.7,105,73,43.8,0,...,,2781,2868,-3.0,348,510,-31.8,3394,3604,-5.8
Croatia,53,50,6.0,140,94,48.9,1629,1455,12.0,101,...,-8.2,1644,1898,-13.4,678,923,-26.5,4245,4530,-6.3
Cyprus,107,105,1.9,78,48,62.5,579,609,-4.9,0,...,,456,712,-36.0,64,27,137.0,1284,1501,-14.5


In [7]:
# Spot-check individual rows by their row header. Only the uncommented lookup runs;
# swap the comment marker to inspect a different row.
#df.loc['Austria']
#df.loc['Bulgaria']
#df.loc['Cyprus']
#df.loc['Denmark']
#df.loc['Ireland']
#df.loc['Latvia']
#df.loc['Luxembourg']
#df.loc['Malta']
df.loc['Romania']
#df.loc['Iceland']
#df.loc['Norway']
#df.loc['United Kingdom']

**BATTERY ELECTRIC**  February 2025        724
                      February 2024      1,109
                      % change 25/24     -34.7
**PLUG-IN HYBRID**    February 2025           
                      February 2024           
                      % change 25/24          
**HYBRID ELECTRIC¹**  February 2025      5,510
                      February 2024      3,736
                      % change 25/24     +47.5
**OTHERS²**           February 2025      1,354
                      February 2024        953
                      % change 25/24     +42.1
**PETROL**            February 2025      3,007
                      February 2024      3,729
                      % change 25/24     -19.4
**DIESEL**            February 2025      1,255
                      February 2024      1,812
                      % change 25/24     -30.7
**TOTAL**             February 2025     11,850
                      February 2024     11,339
                      % change 25/24      +4.5
Name: Romania

----

In [8]:
import pandas as pd

In [9]:
from pydantic import BaseModel, Field

class TidyRow(BaseModel):
    """ Model for a tidy row of the tabular data in the Saint-marc HD PDF """
    # NOTE(review): pydantic normally carries the class docstring and the Field descriptions
    # into the generated JSON schema, so they are likely part of what the model is shown when
    # this class is used as a response schema — confirm before rewording any of them.
    # Every field is declared as str, including the numeric-looking ones.

    # Year taken from the 年度 column
    year: str = Field(description="This field is the yyyy value from the 年度 column")

    # Column-header label of the cell.
    # NOTE(review): the recorded outputs further down in this notebook show 上半期 being
    # returned, which is not among the 15 values listed here — confirm the intended label.
    month: str = Field(description=(
        "This field may takes one of 15 values from the column headers: "
        "1月, 2月, 3月, 4月, 5月, 6月, 7月, 8月, 9月, 10月, 11月, 12月, 下期, 上期, 通期."
    ))
    # Disabled alternative wording that additionally asks the model to normalise the labels:
    #month: str = Field(description=(
    #    "This field may takes one of 15 values from the column headers: "
    #    "1月, 2月, 3月, 4月, 5月, 6月, 7月, 8月, 9月, 10月, 11月, 12月, 下期, 上期, 通期. "
    #    "Please remove any 月 characters. Also please replace 下半期 with 下期, and 上半期 with 上期"
    #))

    # Constant category label (first level)
    cat1: str = Field(description="This field is always 月次売上情報")

    # Constant category label (second level)
    cat2: str = Field(description="This field is always 昨年対比")

    # Category label that distinguishes the two groups of rows: 全店 or 既存店
    cat3: str = Field(description="This field has a value that is either 全店 or 既存店")

    # Cell value kept as text: a one-decimal number, or whitespace for a blank cell
    value: str = Field(description=(
        "This field holds the table cell value, which is a floating point number "
        "with exactly one decimal place, or it is a whitespace indicating a blank value"
    ))
    # Disabled alternative wording that asks the model to rescale the value itself:
    #value: str = Field(description=(
    #    "This field holds the table cell value, which is a floating point number "
    #    "with exactly one decimal place. Please divide this by 100.00, "
    #    "and show this new floating point to 3 decimal places."
    #))

# Top-level response model: wraps the extracted table as a list of TidyRow records.
# (Documented with a comment rather than a docstring, since pydantic would presumably add a
# docstring to the generated schema — TODO confirm.)
class TidyData(BaseModel):
    data: list[TidyRow] = Field(description="The list of TidyRow with year, month, cat1, cat2, cat3, and value fields")

In [10]:
def extract_structured_data(the_file: genai.types.File, model: type[BaseModel]):
    """Ask the Gemini model to extract a table from an uploaded PDF as structured data.

    Args:
        the_file: The uploaded file to read the table from.
        model: The pydantic model class (not an instance) that is passed to the API as the
            response schema; its name is also quoted in the prompt.

    Returns:
        ``response.parsed``, i.e. the reply converted to the pydantic model.

    Raises:
        ValueError: If ``response.parsed`` is ``None``, so that a reply which could not be
            converted is reported here instead of failing later in the caller.
    """
    # Quote the schema that is actually requested; the name used to be hard-coded as
    # "ACEATidyData", which does not match any schema defined in this notebook.
    # NOTE(review): "page 3" is hard-coded — confirm it is right for every PDF passed in.
    prompt = f"Extract the table from page 3 in the following PDF file in {model.__name__} format"

    # Generate a structured response using the Gemini API
    response = client.models.generate_content(
        model=model_id,
        contents=[prompt, the_file],
        config={
            'response_mime_type': 'application/json',
            'response_schema': model,
        },
    )

    # Convert the response to the pydantic model and return it
    parsed = response.parsed
    if parsed is None:
        raise ValueError(
            f"the response could not be parsed as {model.__name__}; raw text: {response.text!r}"
        )
    return parsed

<hr width="40%"/>

![Saint-marc HD PDF for 2025-Feb 月次売上情報](samples/saintmarc-hd_20250213.pdf.png "Saint-marc HD PDF for 2025-Feb 月次売上情報")

In [11]:
%%time

# First Saint-marc HD sample (2025-02-13): the relatively clean table,
# from a PDF that contains text
tidy_data = extract_structured_data(saintmarc_hd_20250213, TidyData)

# Flatten the parsed model into plain records and load them into a DataFrame
tidy_payload = tidy_data.model_dump()
smhd_202502_df = pd.DataFrame(tidy_payload['data'])

CPU times: user 10.7 ms, sys: 0 ns, total: 10.7 ms
Wall time: 24.7 s


In [12]:
# Sanity check: 84 - 6 = 78 values are expected in the tidy data.
# Print the shape first, then rows 28-42 for a visual comparison with the PDF.
row_window = smhd_202502_df.iloc[28:43]
print(smhd_202502_df.shape)
print(f"\n{row_window}")

(78, 6)

    year month    cat1  cat2 cat3  value
28  2024    4月  月次売上情報  昨年対比   全店  102.6
29  2024    5月  月次売上情報  昨年対比   全店  102.4
30  2024    6月  月次売上情報  昨年対比   全店  109.9
31  2024    7月  月次売上情報  昨年対比   全店  100.7
32  2024    8月  月次売上情報  昨年対比   全店  106.6
33  2024    9月  月次売上情報  昨年対比   全店  105.6
34  2024   上半期  月次売上情報  昨年対比   全店  104.6
35  2024   10月  月次売上情報  昨年対比   全店   98.8
36  2024   11月  月次売上情報  昨年対比   全店  104.5
37  2024   12月  月次売上情報  昨年対比   全店  101.8
38  2024    1月  月次売上情報  昨年対比   全店  101.2
39  2022    4月  月次売上情報  昨年対比  既存店  115.1
40  2022    5月  月次売上情報  昨年対比  既存店  126.0
41  2022    6月  月次売上情報  昨年対比  既存店  122.8
42  2022    7月  月次売上情報  昨年対比  既存店  111.5


<hr width="40%"/>

![Saint-marc HD PDF for 2025-Mar 月次売上情報](samples/saintmarc-hd_20250313.pdf.png "Saint-marc HD PDF for 2025-Mar 月次売上情報")

In [13]:
%%time

# Second Saint-marc HD sample (2025-03-13): this PDF was created from an image file,
# so it contains no text
tidy_data = extract_structured_data(saintmarc_hd_20250313, TidyData)

# Flatten the parsed model into plain records and load them into a DataFrame
tidy_payload = tidy_data.model_dump()
smhd_202503_df = pd.DataFrame(tidy_payload['data'])

CPU times: user 4.81 ms, sys: 3.87 ms, total: 8.68 ms
Wall time: 25.5 s


In [14]:
# Sanity check: 84 - 4 = 80 values are expected in the tidy data this time.
# Print the shape first, then everything from row 68 onwards for a visual comparison.
row_window = smhd_202503_df.iloc[68:]
print(smhd_202503_df.shape)
print(f"\n{row_window}")

(80, 6)

    year month    cat1  cat2 cat3  value
68  2024    4月  月次売上情報  昨年対比  既存店  107.1
69  2024    5月  月次売上情報  昨年対比  既存店  106.3
70  2024    6月  月次売上情報  昨年対比  既存店  113.9
71  2024    7月  月次売上情報  昨年対比  既存店  105.2
72  2024    8月  月次売上情報  昨年対比  既存店  110.5
73  2024    9月  月次売上情報  昨年対比  既存店  110.3
74  2024   上半期  月次売上情報  昨年対比  既存店  108.8
75  2024   10月  月次売上情報  昨年対比  既存店  103.8
76  2024   11月  月次売上情報  昨年対比  既存店  109.2
77  2024   12月  月次売上情報  昨年対比  既存店  104.9
78  2024    1月  月次売上情報  昨年対比  既存店  104.0
79  2024    2月  月次売上情報  昨年対比  既存店  104.6


----

In [15]:
# Clean-up: delete every file returned by client.files.list().
# Note that this is not limited to the three PDFs uploaded by this notebook.
for remote_file in client.files.list():
    client.files.delete(name=remote_file.name)