# Google's Gemini 2.0 

## ... ??

In [None]:
!ls -la samples

## Set up environment

## Get a Gemini API key

Go to the [Get API Key page at Google AI Studio](https://aistudio.google.com/app/apikey) to create an API key.

Add this into your development environment in a safe manner, being sure never to save it in plaintext in your code or on your filesystem. For example, you could save your Gemini API key as an environment variable in your shell.

In [1]:
import os

GEMINI_API_KEY = os.environ["GEMINI_API_KEY"]

----

## Instantiate a client to Gemini

The client instance pointed at Gemini can be used to:
1. manage files for input/output ([`upload`](https://googleapis.github.io/python-genai/genai.html#genai.files.Files.upload), [`list`](https://googleapis.github.io/python-genai/genai.html#genai.files.AsyncFiles.list), [`delete`](https://googleapis.github.io/python-genai/genai.html#genai.files.AsyncFiles.delete), etc.)
2. invoke calls to the Gemini model ([`count_tokens`](https://googleapis.github.io/python-genai/genai.html#genai.models.AsyncModels.count_tokens), [`generate_content`](https://googleapis.github.io/python-genai/genai.html#genai.models.AsyncModels.generate_content))

In [2]:
from google import genai

# Create a client
client = genai.Client(api_key=GEMINI_API_KEY)

# Define the model you are going to use
model_id =  "gemini-2.0-flash" # or "gemini-2.0-flash-lite-preview-02-05"  , "gemini-2.0-pro-exp-02-05"

#### Prepare/upload PDFs

#### Listing all files uploaded via the `files` API

#### Deleting all files uploaded via the `files` API

----

In [16]:
from pydantic import BaseModel, Field

class SMTidyRow(BaseModel):
    """ Model for a tidy row of the tabular data in the Saint-marc HD PDF """

    year: str = Field(description="This field is the yyyy value from the 年度 column")

    month: str = Field(description=(
        "This field may takes one of 15 values from the column headers: "
        "1月, 2月, 3月, 4月, 5月, 6月, 7月, 8月, 9月, 10月, 11月, 12月, 下期, 上期, 通期."
    ))
    #month: str = Field(description=(
    #    "This field may takes one of 15 values from the column headers: "
    #    "1月, 2月, 3月, 4月, 5月, 6月, 7月, 8月, 9月, 10月, 11月, 12月, 下期, 上期, 通期. "
    #    "Please remove any 月 characters. Also please replace 下半期 with 下期, and 上半期 with 上期"
    #))

    cat1: str = Field(description="This field is always 月次売上情報")
    
    cat2: str = Field(description="This field is always 昨年対比")
    
    cat3: str = Field(description="This field has a value that is either 全店 or 既存店"
                     )
    value: str = Field(description=(
        "This field holds the table cell value, which is a floating point number "
        "with exactly one decimal place, or it is a whitespace indicating a blank value"
    ))
    #value: str = Field(description=(
    #    "This field holds the table cell value, which is a floating point number "
    #    "with exactly one decimal place. Please divide this by 100.00, "
    #    "and show this new floating point to 3 decimal places."
    #))

class SMTidyData(BaseModel):
    data: list[SMTidyRow] = Field(description="The list of SMTidyRow with year, month, cat1, cat2, cat3, and value fields")

----

In [35]:
def extract_structured_data(file_path: str, model: BaseModel):
    
    # Upload the file to the File API
    file = client.files.upload(
        file=file_path, 
        config={
            'mime_type': 'application/pdf',
            'display_name': file_path.split('/')[-1].split('.')[0]
    })
    
    # Generate a structured response using the Gemini API
    prompt = f"Extract the table from page 3 in the following PDF file in ACEATidyData format"
    response = client.models.generate_content(
        model=model_id, 
        contents=[prompt, file], 
        config={
            'response_mime_type': 'application/json', 
            'response_schema': model
    })

    # now that we're finished, let's delete that file
    client.files.delete(name=file.name)
    
    # Convert the response to the pydantic model and return it
    return response.parsed

In [36]:
import pandas as pd

![Saint-marc HD PDF for 2025-Feb 月次売上情報](samples/saintmarc-hd_20250213.pdf.png "Saint-marc HD PDF for 2025-Feb 月次売上情報")

In [37]:
%%time

# let's try the Saint-marc HD 20250213 PDF: this is the relatively clean table
# ... text 
tidy_data = extract_structured_data('samples/saintmarc-hd_20250213.pdf', SMTidyData)

smhd_202502_df = pd.DataFrame(tidy_data.model_dump()['data'])

CPU times: user 18.8 ms, sys: 3.64 ms, total: 22.5 ms
Wall time: 27.7 s


In [38]:
# we expect 84 - 6 = 78 values in the tidy data
print(smhd_202502_df.shape)
print(f"\n{smhd_202502_df.iloc[28:43]}")

(78, 6)

    year month    cat1  cat2 cat3  value
28  2024    4月  月次売上情報  昨年対比   全店  102.6
29  2024    5月  月次売上情報  昨年対比   全店  102.4
30  2024    6月  月次売上情報  昨年対比   全店  109.9
31  2024    7月  月次売上情報  昨年対比   全店  100.7
32  2024    8月  月次売上情報  昨年対比   全店  106.6
33  2024    9月  月次売上情報  昨年対比   全店  105.6
34  2024   上半期  月次売上情報  昨年対比   全店  104.6
35  2024   10月  月次売上情報  昨年対比   全店   98.8
36  2024   11月  月次売上情報  昨年対比   全店  104.5
37  2024   12月  月次売上情報  昨年対比   全店  101.8
38  2024    1月  月次売上情報  昨年対比   全店  101.2
39  2022    4月  月次売上情報  昨年対比  既存店  115.1
40  2022    5月  月次売上情報  昨年対比  既存店  126.0
41  2022    6月  月次売上情報  昨年対比  既存店  122.8
42  2022    7月  月次売上情報  昨年対比  既存店  111.5


<hr width=40%/>

![Saint-marc HD PDF for 2025-Mar 月次売上情報](samples/saintmarc-hd_20250313.pdf.png "Saint-marc HD PDF for 2025-Mar 月次売上情報")

In [39]:
%%time

# let's try the Saint-marc HD 20250313 PDF: this is the PDF created from an image file
# ... image file, no text!
tidy_data = extract_structured_data('samples/saintmarc-hd_20250313.pdf', SMTidyData)

smhd_202503_df = pd.DataFrame(tidy_data.model_dump()['data'])

CPU times: user 15.6 ms, sys: 848 μs, total: 16.5 ms
Wall time: 26.8 s


In [40]:
# we expect 84 - 4 = 80 values in the tidy data this time
print(smhd_202503_df.shape)
print(f"\n{smhd_202503_df.iloc[68:]}")

(80, 6)

    year month    cat1  cat2 cat3  value
68  2024    4月  月次売上情報  昨年対比  既存店  107.1
69  2024    5月  月次売上情報  昨年対比  既存店  106.3
70  2024    6月  月次売上情報  昨年対比  既存店  113.9
71  2024    7月  月次売上情報  昨年対比  既存店  105.2
72  2024    8月  月次売上情報  昨年対比  既存店  110.5
73  2024    9月  月次売上情報  昨年対比  既存店  110.3
74  2024   上半期  月次売上情報  昨年対比  既存店  108.8
75  2024   10月  月次売上情報  昨年対比  既存店  103.8
76  2024   11月  月次売上情報  昨年対比  既存店  109.2
77  2024   12月  月次売上情報  昨年対比  既存店  104.9
78  2024    1月  月次売上情報  昨年対比  既存店  104.0
79  2024    2月  月次売上情報  昨年対比  既存店  104.6


<hr width=40%/>

![ACEA Press Release, 2025-Feb](samples/Press_release_car_registrations_February_2025.pdf.png "ACEA Press Release, 2025-Feb")

In [41]:
class ACEATidyRow(BaseModel):
    """ Model for a tidy row of the tabular data in the ACEA PDF """

    year: str = Field(description="This field is the yyyy value of either the current year or the previous year")

    month: str = Field(description="This current month of the current year; or the current month of the previous year; or the % change from current/previous")

    cat1: str = Field(description="This field is one of: BATTERY ELECTRIC, PLUG-IN HYBRID, HYBRID ELECTRIC, OTHERS, PETROL, DIESEL, TOTAL")
    
    cat2: str = Field(description="This field is the country/region that is listed in the row headers")

    value: str = Field(description="This field holds the table cell value, which is a numeric value")


class ACEATidyData(BaseModel):
    data: list[ACEATidyRow] = Field(description="The list of ACEATidyRow with year, month, cat1, cat2, and value fields")

In [42]:
%%time

# let's try the ACEA 2025-Feb PDF
# ... can the model see that there are no values for Romania for PLUG-IN HYBRID???
tidy_data = extract_structured_data('samples/Press_release_car_registrations_February_2025.pdf', ACEATidyData)

acea_202502_df = pd.DataFrame(tidy_data.model_dump()['data'])

AttributeError: 'NoneType' object has no attribute 'model_dump'

In [31]:
# we expect ?? values in the tidy data 
print(acea_202502_df.shape)


NameError: name 'acea_202502_df' is not defined

In [32]:
tidy_data

----

## 4. Extract Structured data from PDFs using Gemini 2.0

Now, let's combine the File API and structured output to extract information from our PDFs. You can create a simple method that accepts a local file path and a pydantic model and return the structured data for us. The method will:

1. Upload the file to the File API
2. Generate a structured response using the Gemini API
3. Convert the response to the pydantic model and return it

In [None]:
from pydantic import BaseModel, Field

class Datum(BaseModel):
    cat1: str = Field(description="This field has a value that is either 全店売上 or 既存店売上")
    cat2: str = Field(description="This field is always 昨年対比")
    cat3: str = Field(description="This field is the yyyy value for the 年度 column")
    cat4: str = Field(description="This field is takes one of 14 values: 4月, 5月, 6月, 7月, 8月, 9月, 上半期, 10月, 11月, 12月, 1月, 2月, 3月, 通期")
    value: str = Field(description="This field holds the table cell value, which is a floating point number with exactly one decimal place, or it is a whitespace indicating a blank value")

class TidyData(BaseModel):
    name: str = Field(description="This field is always 月次売上情報")
    data: list[Datum] = Field(description="The list of Datum with cat1, cat2, cat3, cat4, cat5, and value fields")

In [None]:
%%time

# let's try the Saint-marc HD 20250213 PDF: this is the well-structured table
# ... text 
prompt = f"Extract the table from the following PDF file, in TidyData format"

response = client.models.generate_content(
    model=model_id, 
    contents=[prompt, saintmarc_hd_20250213], 
    config={'response_mime_type': 'application/json', 'response_schema': TidyData}
)

tidy_data = response.parsed

smhd_202502_df = pd.DataFrame(tidy_data.model_dump()['data'])
# we expect 84 - 6 = 78 values in the tidy data
print(smhd_202502_df.shape)

----

In [None]:
%%time

# let's try the Saint-marc HD 20250313 PDF: this is the borked table (?)
# ... image file
prompt = f"Extract the table from the following PDF file, in TidyData format"

response = client.models.generate_content(
    model=model_id, 
    contents=[prompt, saintmarc_hd_20250313], 
    config={'response_mime_type': 'application/json', 'response_schema': TidyData}
)

tidy_data = response.parsed

smhd_202503_df = pd.DataFrame(tidy_data.model_dump()['data'])
# we expect 84 - 4 = 80 values in the tidy data
print(smhd_202503_df.shape)

----

In [None]:
%%time

# Lastly we try that ACEA February PDF... 
# ... keep an eye out on the 3 "missing" Plug-in Hybrid values in the Romania data
prompt = f"Extract the table from the following PDF file, in TidyData format"

response = client.models.generate_content(
    model=model_id, 
    contents=[prompt, saintmarc_hd_20250313], 
    config={'response_mime_type': 'application/json', 'response_schema': TidyData}
)

tidy_data = response.parsed

smhd_202503_df = pd.DataFrame(tidy_data.model_dump()['data'])
# we expect 84 - 4 = 80 values in the tidy data
print(smhd_202503_df.shape)