In [None]:
######################################################################################
#
# project prep
#
######################################################################################

In [None]:
# enable the required Google Cloud APIs (uncomment the lines below to run them)
#! gcloud services enable aiplatform.googleapis.com
#! gcloud services enable discoveryengine.googleapis.com

In [None]:
# install the Google Cloud client libraries (uncomment the lines below to run them)
#! pip install -U -q --user google-cloud-aiplatform
#! pip install -U -q --user google-cloud-discoveryengine

In [None]:
######################################################################################
#
# setup
#
######################################################################################

In [1]:
# for Gemini chat model
import vertexai

from vertexai.generative_models import (
    GenerationConfig,
    GenerativeModel,
    HarmBlockThreshold,
    HarmCategory,
    Part,
)

In [2]:
# for RAG using vertex search
import google.cloud.discoveryengine_v1 as discoveryengine

In [3]:
# set params
P = ! gcloud config list --format 'value(core.project)'
PROJECT_ID = P[0]
REGION = "us-central1"

In [4]:
# Vertex Agent Builder Data Store to search (layout parsed for chunks).
# Replace this value with YOUR datastore id.
VS_DATASTORE_ID = 'datastore-1723784659341'

# location of the datastore
VS_LOCATION = "global"

In [5]:
# --- client setup ---

# Vertex AI SDK: bind the project and region for later model calls
vertexai.init(project = PROJECT_ID, location = REGION)

# Discovery Engine: the 'global' location uses the bare hostname,
# any other location is prepended to it as '<location>-'
API_ENDPOINT = {
    "api_endpoint": ("" if VS_LOCATION == "global" else f"{VS_LOCATION}-") + "discoveryengine.googleapis.com"
}
search_client = discoveryengine.SearchServiceClient(client_options = API_ENDPOINT)

In [6]:
#########################################################
# vertex models config
#########################################################

In [7]:
# generation settings passed along with each chat message
generation_config = dict(
    max_output_tokens = 8192,
    temperature = 1,
    top_p = 0.95,
)

# safety settings: every listed harm category is mapped to BLOCK_NONE
safety_settings = {
    harm_category: HarmBlockThreshold.BLOCK_NONE
    for harm_category in (
        HarmCategory.HARM_CATEGORY_HATE_SPEECH,
        HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT,
        HarmCategory.HARM_CATEGORY_SEXUALLY_EXPLICIT,
        HarmCategory.HARM_CATEGORY_HARASSMENT,
    )
}

In [8]:
######################################################################################
#
# helper functions
#
######################################################################################

In [9]:
def get_vertex_search_chunks(PROJECT_ID
                             , VS_LOCATION
                             , VS_DATASTORE_ID
                             , prompt
                             , page_size = 10):
    """Query a Vertex AI Search datastore and return the retrieved chunks as tagged text.

    The query is sent through the module-level `search_client` to the datastore's
    `default_config` serving config, with the search result mode set to CHUNKS.

    Args:
        PROJECT_ID: project id used in the serving config path.
        VS_LOCATION: datastore location used in the serving config path (e.g. 'global').
        VS_DATASTORE_ID: datastore id used in the serving config path.
        prompt: text sent as the search query.
        page_size: value for the request's `page_size`. Defaults to 10, which is
            the value that used to be hard-coded.

    Returns:
        str: one "<SEGMENT_n>...</SEGMENT_n>" entry per item in the response's
        `results`, numbered from 1 and concatenated with no separator. An empty
        string is returned when there are no results.
    """

    serving_config = f"projects/{PROJECT_ID}/locations/{VS_LOCATION}/collections/default_collection/dataStores/{VS_DATASTORE_ID}/servingConfigs/default_config"

    # SEARCH
    search_results = search_client.search(
        request = discoveryengine.SearchRequest(
            serving_config = serving_config,
            query = prompt,
            #filter = 'myfilters: ANY("ariya")', # could use this filter to isolate make, model, year if desired
            page_size = page_size,
            content_search_spec = discoveryengine.SearchRequest.ContentSearchSpec(
                snippet_spec = discoveryengine.SearchRequest.ContentSearchSpec.SnippetSpec(
                    return_snippet = False
                ),

                # this line is needed to return the chunks from the vertex datastore
                search_result_mode = discoveryengine.SearchRequest.ContentSearchSpec.SearchResultMode.CHUNKS,

                # generative summary specs:
                # docs: https://cloud.google.com/python/docs/reference/discoveryengine/latest/google.cloud.discoveryengine_v1.types.SearchRequest.ContentSearchSpec.SummarySpec
                summary_spec = discoveryengine.SearchRequest.ContentSearchSpec.SummarySpec(
                    summary_result_count = 5, # number of documents (or chunks in chunk mode) to use for generative summary
                    include_citations = True,
                    ignore_adversarial_query = True,
                    ignore_non_summary_seeking_query = False,
                    model_spec = discoveryengine.SearchRequest.ContentSearchSpec.SummarySpec.ModelSpec(
                        version = "stable"
                    ),
                ),

                # be sure to know how these params affect what is returned -> https://cloud.google.com/python/docs/reference/discoveryengine/latest/google.cloud.discoveryengine_v1.types.SearchRequest.ContentSearchSpec.ExtractiveContentSpec
                extractive_content_spec = discoveryengine.SearchRequest.ContentSearchSpec.ExtractiveContentSpec(
                    #max_extractive_answer_count = 1, # this could/should be zero bc you do not want vertex search generating the answer
                    max_extractive_segment_count = 5,
                    return_extractive_segment_score = True,
                    num_previous_segments = 0,
                    num_next_segments = 0
                ),
            ),
            query_expansion_spec = discoveryengine.SearchRequest.QueryExpansionSpec(
                condition = discoveryengine.SearchRequest.QueryExpansionSpec.Condition.AUTO
            ),
            spell_correction_spec = discoveryengine.SearchRequest.SpellCorrectionSpec(
                mode = discoveryengine.SearchRequest.SpellCorrectionSpec.Mode.AUTO
            )
        )
    )

    # PREP SEGMENTS - customize this however you want
    # NOTE(review): `.results` is read as an attribute instead of iterating the
    # returned object, so presumably only the first page is used - confirm.
    retrieved_context = "".join(
        f"<SEGMENT_{result_idx}>{result.chunk.content}</SEGMENT_{result_idx}>"
        for result_idx, result in enumerate(search_results.results, start = 1)
    )

    return retrieved_context

In [10]:
#######################################
# Demonstrate returned chunks from vertex search
#######################################

In [11]:
# sample query used to show what the retrieval helper returns
prompt = 'Find information about gemini chat models'

retrieved_chunks_from_vertex_search = get_vertex_search_chunks(
    PROJECT_ID = PROJECT_ID,
    VS_LOCATION = VS_LOCATION,
    VS_DATASTORE_ID = VS_DATASTORE_ID,
    prompt = prompt,
)

# print only the first 2000 characters of the tagged segments
print(retrieved_chunks_from_vertex_search[0:2000])

<SEGMENT_1># Google models

## Gemini models

The following table summarizes the models available in the
[Gemini API](/vertex-ai/generative-ai/docs/multimodal/overview). For more information about API details, see the
[Gemini API reference](/vertex-ai/generative-ai/docs/model-reference/gemini). To explore a model in the Google Cloud console, select its model card in the Model Garden.

_START_OF_TABLE_
TABLE_IN_MARKDOWN:
| Model | Inputs | Outputs | Use case | Try the model |
|-|-|-|-|-|
| Gemini 1.5 Flash | Text, code, images, audio, video, video with audio, PDF | Text | Provides speed and efficiency for high-volume, quality, cost-effective apps. | [Try the Gemini 1.5 Flash model](https://console.cloud.google.com/vertex-ai/generative/multimodal/create/text?model=gemini-1.5-flash-001) |
| Gemini 1.5 Pro | Text, code, images, audio, video, video with audio, PDF | Text | Supports text or chat prompts for a text or code response. Supports long-context understanding up to the maximum input 

In [12]:
######################################################################################
#
# Chat model
#
######################################################################################

In [23]:
# LLM used for the chat; swap in the commented-out value to use the pro model
#model_name = 'gemini-1.5-pro-001'
model_name = 'gemini-1.5-flash-001'

In [14]:
# persona given to the model as its system instruction
system_instruction = (
    "You are a sales associate for Google Cloud "
    "focusing on machine learning."
)

In [24]:
# build the chat model with the chosen LLM and the persona as its system instruction
chat_model_1 = GenerativeModel(
    model_name = model_name,
    system_instruction = [system_instruction],
)

In [40]:
######################################################################################
#
# Chat 1 - Initial Response
#
######################################################################################

In [25]:
%%time

#######################################
# putting it together
#######################################

# retrieve customer question from your interface
customer_question = "until what date is the text-bison model available?"

# instructions
instruction_lines =  [  "Determine and execute the steps that a customer engineer would utilize to answer the CUSTOMER_QUESTION with perfection."
                      , "Use only the SEGMENTS below to answer the CUSTOMER_QUESTION."
                      , "If the SEGMENTS do not contain enough information to answer the CUSTOMER_QUESTION, politely decline to answer and ask if there is any other way you can help."
                     ]

instructions = " ".join(instruction_lines)

# build the prompt
chat_1_prompt =  f"<INSTRUCTIONS>{instructions}</INSTRUCTIONS>"
chat_1_prompt += f"\n\n"
chat_1_prompt +=  f"<CUSTOMER_QUESTION>{customer_question}</CUSTOMER_QUESTION>"
chat_1_prompt += f"\n\n"
chat_1_prompt += f"<SEGMENTS>{ get_vertex_search_chunks(PROJECT_ID, VS_LOCATION, VS_DATASTORE_ID, customer_question) }</SEGMENTS>"

# start the chat
chat_1 = chat_model_1.start_chat()
chat_response_1 = chat_1.send_message([chat_1_prompt]
                      , generation_config=generation_config
                      , safety_settings=safety_settings
                     )

chat_response_1_text = chat_response_1.candidates[0].content.parts[0].text

print(chat_response_1.usage_metadata)
print("")
print("-"*30)
print("")
print(chat_response_1_text)

prompt_token_count: 7427
candidates_token_count: 42
total_token_count: 7469


------------------------------

The text-bison model (version 002) will be available until **October 9, 2024**. 

Is there anything else I can help you with today? 

CPU times: user 22.8 ms, sys: 2.11 ms, total: 24.9 ms
Wall time: 2.21 s


In [65]:
######################################################################################
#
# Customer asks a second question...
#
######################################################################################

In [27]:
%%time

# retrieve customer question from your interface
customer_question = "When would I use LlamaIndex on Vertex AI for RAG versus LangChain on Vertex AI aka Reasoning Engine?"

# instructions could change
instruction_lines =  [  "Determine and execute the steps that a customer engineer would utilize to answer the CUSTOMER_QUESTION with perfection."
                      , "Use only the SEGMENTS below to answer the CUSTOMER_QUESTION."
                      , "If the SEGMENTS do not contain enough information to answer the CUSTOMER_QUESTION, politely decline to answer and ask if there is any other way you can help."
                     ]

instructions = " ".join(instruction_lines)

# build the prompt
chat_continuance =  f"<INSTRUCTIONS>{instructions}</INSTRUCTIONS>"
chat_continuance += f"\n\n"
chat_continuance +=  f"<CUSTOMER_QUESTION>{customer_question}</CUSTOMER_QUESTION>"
chat_continuance += f"\n\n"
chat_continuance += f"<SEGMENTS>{ get_vertex_search_chunks(PROJECT_ID, VS_LOCATION, VS_DATASTORE_ID, customer_question) }</SEGMENTS>"


chat_response_1 = chat_1.send_message([chat_continuance]
                      , generation_config=generation_config
                      , safety_settings=safety_settings
                     )

chat_continuance_response_text = chat_response_1.candidates[0].content.parts[0].text

print(chat_response_1.usage_metadata)
print("")
print("-"*30)
print("")
print(chat_continuance_response_text)

prompt_token_count: 24192
candidates_token_count: 490
total_token_count: 24682


------------------------------

Here's a breakdown of when to use LlamaIndex on Vertex AI for RAG versus LangChain on Vertex AI (Reasoning Engine) based on the provided information:

**LlamaIndex on Vertex AI for RAG**

* **Focus:** Primarily designed for **retrieval-augmented generation (RAG)**. It excels at building a knowledge base from your data (documents, files) and retrieving relevant information for use with LLMs.
* **Strengths:**
    * **Simple to use:**  Good for developers familiar with LlamaIndex.
    * **Efficient retrieval:** Optimized for knowledge base construction and search.
    * **Google-managed infrastructure:** Takes care of scaling, security, etc.
* **When to use:**
    * You have a large corpus of documents you need to make searchable and use for RAG.
    * You need a straightforward, managed solution for building and querying a knowledge base.

**LangChain on Vertex AI (Reasoning E