# Data Extraction facilitated by Gemini 1.5 Pro

##### Imports 

In [2]:
from vertexai.generative_models import HarmBlockThreshold
from vertexai.generative_models import GenerationConfig
from vertexai.generative_models import GenerativeModel
from vertexai.generative_models import HarmCategory
from vertexai.generative_models import Part
from vertexai.generative_models import Tool
import vertexai 
import logging 
import json 
import os 

##### Setup logging

In [3]:
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)
logger.addHandler(logging.StreamHandler())

##### Log versions of dependencies 

In [4]:
logger.info(f'Using vertexai=={vertexai.__version__}')

Using vertexai==1.52.0


##### Setup essentials 

In [5]:
PROJECT_ID = 'arun-genai-bb'
LOCATION = 'us-central1'
MODEL_NAME = 'gemini-1.5-pro-001'

In [6]:
os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = './../../credentials/key.json'
vertexai.init(project=PROJECT_ID, location=LOCATION)

##### Setup Gemini 1.5

In [7]:
with open('./../../data/templates/system_instructions.txt', 'r') as f:
    instructions = f.read()
    
system_instruction = [instructions]


In [8]:
system_instruction

['You are an expert and perfectly accurate financial data annotation analyst. You are disciplined, and feel focused and very much awake. You are writing JSON schema records about structured notes that are tied to underlying securities (sometimes one, sometimes many). These notes frequently pay coupon payments conditional on the performance of the underlying assets either at some frequency, or at the maturity of the note, or both. In addition, they often have early redemption, or Autocall, features, which cause them to pay out before the scheduled maturity date. Finally, the Note may also have an at expiration condition that can be expressed as a set of option payments. The pdf provided in the context is the prospectus of a particular Note, which you are recording the necessary information from in order to price that structured note. While a human will also review your results, historically, you are more accurate, careful, and complete than the human\'s in the analysis loop.\n\n# Instru

In [11]:
model = GenerativeModel(MODEL_NAME,
                        system_instruction=system_instruction, 
                        tools='code_execution')
model.__dict__

TypeError: Unexpected tool type: c.

##### Setup generation configs 

For deterministic output, we can set low values for top k, such as 1 or 2. We can also set a very low value for top p, close to 0. A low temperature near 0 will also ensure the responses are the most probable and deterministic.

In [None]:
generation_config = GenerationConfig(temperature=0.0, 
                                     top_p=0.0, 
                                     top_k=1, 
                                     candidate_count=1, 
                                     max_output_tokens=8192,
                                     response_mime_type="application/json")
generation_config.__dict__

##### Set safety settings 

In [None]:
safety_settings = {
    HarmCategory.HARM_CATEGORY_UNSPECIFIED: HarmBlockThreshold.BLOCK_NONE,
    HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT: HarmBlockThreshold.BLOCK_NONE,
    HarmCategory.HARM_CATEGORY_HATE_SPEECH: HarmBlockThreshold.BLOCK_NONE,
    HarmCategory.HARM_CATEGORY_HARASSMENT: HarmBlockThreshold.BLOCK_NONE,
    HarmCategory.HARM_CATEGORY_SEXUALLY_EXPLICIT: HarmBlockThreshold.BLOCK_NONE
}

## Kickstart data extraction

##### Load file bytes

In [None]:
with open('./../../data/test_doc_ice.pdf', "rb") as f:
    pdf_bytes = f.read()

pdf_parts = Part.from_data(data=pdf_bytes, mime_type='application/pdf')
pdf_parts

In [None]:
user_prompt = "Extract data based on provided instructions from the PDF given to you and return output in the exact asked format."

In [None]:
contents = [pdf_parts, user_prompt]

In [None]:
model.count_tokens(contents)

In [None]:
response = model.generate_content(contents, 
                                  generation_config=generation_config, 
                                  safety_settings=safety_settings)
response

##### Decipher the output response

In [None]:
response.text.strip()

In [None]:
response.to_dict().get("usage_metadata")

In [None]:
response.candidates[0].finish_reason

In [None]:
response.candidates[0].safety_ratings

In [None]:
output_json = json.loads(response.text.strip())
print(output_json)

In [None]:
output_dir = './../../data/output'
output_file_path = os.path.join(output_dir, 'ingredients.txt')

os.makedirs(output_dir, exist_ok=True)

with open(output_file_path, 'w') as file:
    file.write(json.dumps(output_json, indent=4))