In [26]:
    import os
    import base64
    import vertexai
    from vertexai.generative_models import GenerativeModel, Part
    import vertexai.preview.generative_models as generative_models
    from google.cloud import storage
    from google.cloud import aiplatform

    import json
    import re



In [27]:

# --- Google Cloud settings read by the cells below ---
project_id, location = "ds-ml-pod", "us-central1"
bucket_name = "sustainable-ai"

# Object-name prefix inside the bucket that holds the context PDFs.
train_folder = "documents2/"

# Point both SDK entry points at the same project and region.
aiplatform.init(location=location, project=project_id)
vertexai.init(location=location, project=project_id)

In [29]:
# List the PDF documents stored under the training folder of a bucket
def list_train_folder_files(bucket, train_folder):
    """Return the names of the PDF blobs found under a Cloud Storage prefix.

    Args:
        bucket: Name of the Cloud Storage bucket to search.
        train_folder: Object-name prefix; only blobs whose names start
            with it are considered.

    Returns:
        A list of blob names (not ``gs://`` URIs) ending in ``.pdf``, in
        the order yielded by ``list_blobs``.
    """
    storage_client = storage.Client()
    # Honour the caller's argument. The earlier version overwrote `bucket`
    # using the notebook-level `bucket_name` global, so the parameter had
    # no effect and the function failed outside this notebook.
    gcs_bucket = storage_client.bucket(bucket)

    return [
        blob.name
        for blob in gcs_bucket.list_blobs(prefix=train_folder)
        if blob.name.endswith('.pdf')
    ]

# Prompt

In [30]:
# Build the context prompt: one instruction string followed by every PDF
# found under the training folder.

# Blob names of the PDFs that will be attached as context
train_pdf_list = list_train_folder_files(bucket_name, train_folder)

# The prompt is a flat list of strings and Parts; it starts with the
# instruction text.
train_prompt = [
    """You are a Sustainable AI trained to suggesting the most CO2 efficient Google Cloud Platform technical stack based on user usecase.
Understand and use the following pieces of context to answer the question at the end. Think step-by-step and then answer. Always explain why you are answering the question the way you are.
"""
]

# Add a label plus a PDF reference (by gs:// URI) for each document
for train_pdf in train_pdf_list:
    train_pdf_uri = f"gs://{bucket_name}/{train_pdf}"
    train_prompt.extend(
        (
            "Context Input Text:  ",
            Part.from_uri(uri=train_pdf_uri, mime_type="application/pdf"),
        )
    )

# Show the assembled prompt as the cell output
train_prompt

['You are a Sustainable AI trained to suggesting the most CO2 efficient Google Cloud Platform technical stack based on user usecase.\nUnderstand and use the following pieces of context to answer the question at the end. Think step-by-step and then answer. Always explain why you are answering the question the way you are.\n',
 'Context Input Text:  ',
 file_data {
   mime_type: "application/pdf"
   file_uri: "gs://sustainable-ai/documents2/accelerating-climate-action-ai.pdf"
 },
 'Context Input Text:  ',
 file_data {
   mime_type: "application/pdf"
   file_uri: "gs://sustainable-ai/documents2/alphabet-2023-cdp-climate-change-response.pdf"
 },
 'Context Input Text:  ',
 file_data {
   mime_type: "application/pdf"
   file_uri: "gs://sustainable-ai/documents2/google-2022-climate-action-progress-update.pdf"
 },
 'Context Input Text:  ',
 file_data {
   mime_type: "application/pdf"
   file_uri: "gs://sustainable-ai/documents2/google-2023-environmental-report-executive-summary.pdf"
 },
 'Cont

In [31]:
# Gemini 1.5 model and the request settings used by generate_response

vertexai.init(location=location, project=project_id)
model = GenerativeModel("gemini-1.5-pro-preview-0409")

# Sampling limits passed with every request.
generation_config = dict(
    max_output_tokens=8192,
    temperature=0.2,
    top_p=0.4,
)

# Every listed harm category gets the same BLOCK_ONLY_HIGH threshold.
safety_settings = {
    harm_category: generative_models.HarmBlockThreshold.BLOCK_ONLY_HIGH
    for harm_category in (
        generative_models.HarmCategory.HARM_CATEGORY_HATE_SPEECH,
        generative_models.HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT,
        generative_models.HarmCategory.HARM_CATEGORY_SEXUALLY_EXPLICIT,
        generative_models.HarmCategory.HARM_CATEGORY_HARASSMENT,
    )
}


# LLM Model
def generate_response(prompt):
    """Send ``prompt`` to the notebook-level ``model`` and return its reply.

    The call is non-streaming and uses the notebook-level
    ``generation_config`` and ``safety_settings``.

    Args:
        prompt: Content forwarded unchanged to ``model.generate_content``.

    Returns:
        Whatever ``model.generate_content`` returns for the request.
    """
    return model.generate_content(
        prompt,
        stream=False,
        safety_settings=safety_settings,
        generation_config=generation_config,
    )


In [32]:
# User question appended after the context documents
test_prompt = [
    "I want to design a PDF to JSON extractor using Gemini PRo. I want to use GCS for storage and vertex ai workbench. Suggest me the most efficient GCP solution stack to build this out"
]

# Context first, question last; built as a new list so train_prompt is untouched
combined_prompt = [*train_prompt, *test_prompt]
print(combined_prompt)

# Single non-streaming request; keep both the response object and its text
output_response = generate_response(combined_prompt)
output_text = output_response.text

['You are a Sustainable AI trained to suggesting the most CO2 efficient Google Cloud Platform technical stack based on user usecase.\nUnderstand and use the following pieces of context to answer the question at the end. Think step-by-step and then answer. Always explain why you are answering the question the way you are.\n', 'Context Input Text:  ', file_data {
  mime_type: "application/pdf"
  file_uri: "gs://sustainable-ai/documents2/accelerating-climate-action-ai.pdf"
}
, 'Context Input Text:  ', file_data {
  mime_type: "application/pdf"
  file_uri: "gs://sustainable-ai/documents2/alphabet-2023-cdp-climate-change-response.pdf"
}
, 'Context Input Text:  ', file_data {
  mime_type: "application/pdf"
  file_uri: "gs://sustainable-ai/documents2/google-2022-climate-action-progress-update.pdf"
}
, 'Context Input Text:  ', file_data {
  mime_type: "application/pdf"
  file_uri: "gs://sustainable-ai/documents2/google-2023-environmental-report-executive-summary.pdf"
}
, 'Context Input Text:  

In [33]:
# Show the text of the model's answer captured in the previous cell
output_text

'## Sustainable AI suggests CO2-efficient GCP stack for PDF to JSON extractor:\n\n**Understanding the Use Case:**\n\n* **Functionality:** Extract text and structure from PDF documents and convert them to JSON format.\n* **Storage:** Utilize Google Cloud Storage (GCS) for storing input PDFs and output JSON files.\n* **Processing:** Leverage Vertex AI Workbench for development and potentially model training/inference if needed.\n\n**CO2 Efficiency Considerations:**\n\n* **Minimize resource usage:** Choose the right services and configurations to avoid unnecessary energy consumption.\n* **Location optimization:** Select regions with lower carbon intensity for data centers and processing.\n* **Renewable energy:** Prioritize regions and services powered by renewable energy sources.\n\n**Suggested GCP Stack:**\n\n1. **Storage:**\n    * **GCS:** Store input PDFs and output JSON files in GCS buckets. Choose a regional or multi-regional location based on data residency and access needs. Conside