### setup

In [2]:
import socket
import re

PROJECT_ID = !(gcloud config get-value core/project)
PROJECT_ID = PROJECT_ID[0]

SVC_ACC = !(gcloud config get-value core/account)
SVC_ACC = SVC_ACC[0]

PROJECT_NUMBER=str(re.search(r'\d+', SVC_ACC).group())

LOCATION="us-central1"

UNIQUE_PREFIX = socket.gethostname()
UNIQUE_PREFIX = re.sub('[^A-Za-z0-9]+', '', UNIQUE_PREFIX)

BUCKET_NAME = f"{PROJECT_ID}-{UNIQUE_PREFIX}-{LOCATION}"

BUCKET_URI = f"gs://{BUCKET_NAME}"  # @param {type:"string"}

! gcloud config set project $PROJECT_ID
! gcloud storage buckets create {BUCKET_URI} --project={PROJECT_ID} --location={LOCATION}
! mkdir output

import vertexai

vertexai.init(project=PROJECT_ID, location=LOCATION, staging_bucket=BUCKET_URI)

Updated property [core/project].
Creating gs://my-project-0004-346516-pytorch112kagglewbi-us-central1/...
[1;31mERROR:[0m (gcloud.storage.buckets.create) HTTPError 409: Your previous request to create the named bucket succeeded and you already own it.
mkdir: cannot create directory ‘output’: File exists


### helper funtion 

In [3]:
# from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_google_vertexai import VertexAI
from langchain_google_vertexai import VertexAIEmbeddings


# from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
from langchain.prompts import PromptTemplate
from langchain.chains import RetrievalQA
import os 
from dotenv import load_dotenv
load_dotenv()

db_file_path='FAISS_Index'
embeddings = VertexAIEmbeddings('textembedding-gecko@latest')

def creation_of_vectorDB_in_local(loader):
    data = loader.load()
    db =FAISS.from_documents(data, embeddings)
    db.save_local(db_file_path)

def creation_FAQ_chain():
    db=FAISS.load_local(db_file_path, embeddings)
    retriever =db.as_retriever(score_threshold=0.7)
    
    # llm = ChatGoogleGenerativeAI(model="gemini-pro",temperature=0.2)

    # To use model
    llm = VertexAI(model_name="gemini-pro")

    prompt_temp="""Given the following context and a question, generate an answer based on this context only.
    In the answer try to provide as much text as possible from "response" section in the source document context without making much changes.
    If the answer is not found in the context, kindly state "This Question not Present in My Database." Don't try to make up an answer.
    CONTEXT: {context}
    QUESTION: {question}"""

    PROMPT = PromptTemplate(template=prompt_temp, input_variables=["context", "question"])
    chain = RetrievalQA.from_chain_type(llm=llm,chain_type="stuff", 
                                        retriever=retriever, 
                                        input_key="query", 
                                        return_source_documents=False,
                                        chain_type_kwargs={"prompt" : PROMPT})
    return chain


In [4]:
#@title ### You will need to update these values

import vertexai
vertexai.init(project=PROJECT_ID, location=LOCATION)


### vertex AI

In [5]:
import pandas as pd
import seaborn as sns
from IPython.display import Markdown, display
from sklearn.metrics.pairwise import cosine_similarity
from vertexai.preview.language_models import (ChatModel, InputOutputTextPair,   TextEmbeddingModel,
                                              TextGenerationModel)
from google.cloud import aiplatform_v1beta1, aiplatform
from google.protobuf import struct_pb2
import numpy as np

from tenacity import retry, stop_after_attempt, wait_random_exponential

from langchain_google_vertexai import VertexAI
from langchain_google_vertexai import VertexAIEmbeddings
from vertexai.language_models import TextEmbeddingInput, TextEmbeddingModel

model_name = 'textembedding-gecko@latest'
embeddings = VertexAIEmbeddings(model_name)
embedding_model = TextEmbeddingModel.from_pretrained(model_name)


In [16]:
import pandas as pd

# Replace 'your_file.csv' with the actual filename
df = pd.read_csv('Singpost_QnA_doc.csv')

# print("\nColumn names and types:")
# print(df.info())

data = df["question"].tolist()



In [17]:
prompt_list = df['question'].tolist()

df["embedding"] = [    emb.values for emb in embedding_model.get_embeddings(prompt_list) ]
df

Unnamed: 0,question,answer,embedding
0,I need to update my email address,As each account is tied to a unique email addr...,"[0.00035269002546556294, -0.007924865931272507..."
1,How do I get my package shipped,Your unique VP ID that forms part of your over...,"[0.03443169221282005, -0.02751363255083561, 0...."
2,How do I navigate the members portal,Refer to the following image(s) https://drive....,"[0.031672779470682144, -0.020744403824210167, ..."
3,How do I navigate the address section on the p...,Refer to the following image(s) https://drive....,"[0.04048671945929527, -0.027499673888087273, -..."
4,How do I perform declaration on my package,Refer to the following image(s) https://drive....,"[0.03648605942726135, -0.0060289218090474606, ..."
5,What is SED and what does the customer need to...,SED refers to the United States Shipper’s Expo...,"[-0.003393965307623148, -0.05115535482764244, ..."
6,How do I navigate the package summary,Refer to the following image(s) https://drive....,"[0.04750156030058861, -0.03618474677205086, -0..."
7,How do I track my package on the portal,Refer to the following image(s) https://drive....,"[0.03705435246229172, -0.02546788938343525, -0..."
8,What kind of cases can CS Ops assist?,Item matched to wrong VP number (provide suppo...,"[0.02198229357600212, -0.04881160706281662, -0..."
9,What kind of case can vPost FFPs assist with?,1. Request to match to correct owner / Item ta...,"[-0.0069028097204864025, -0.035859040915966034..."


In [18]:
db = FAISS.from_texts(data, embeddings)


In [19]:
query = "can i know the year of sales going down"
docs = db.similarity_search_with_score(query) #similarity_search
# print('first result',docs[0].page_content)
print(docs)

[(Document(page_content='Is bundling discount available?'), 0.80268395), (Document(page_content='I need to update my email address'), 0.84324044), (Document(page_content='What is SED and what does the customer need to do?'), 0.84402), (Document(page_content='How do I get my package shipped'), 0.872614)]


In [20]:
# print('first result',docs[0][0].page_content)


In [21]:
# !gsutil cp  ./Singpost_QnA_doc.csv gs://my-project-0004-bucket02/llms

In [22]:
from langchain.retrievers import BM25Retriever, EnsembleRetriever


In [24]:
data

['I need to update my email address',
 'How do I get my package shipped',
 'How do I navigate the members portal',
 'How do I navigate the address section on the portal',
 'How do I perform declaration on my package',
 'What is SED and what does the customer need to do?',
 'How do I navigate the package summary',
 'How do I track my package on the portal',
 'What kind of cases can CS Ops assist?',
 'What kind of case can vPost FFPs assist with?',
 'Which team can provide assistance for unclaimed vPost packages',
 'Is bundling discount available?',
 'What is VP ID']

In [29]:
# %pip install rank_bm25

In [30]:
# initialize the bm25 retriever and faiss retriever
bm25_retriever = BM25Retriever.from_texts(
    data, metadatas=[{"source": 1}] * len(data)
)
bm25_retriever.k = 2


In [32]:
faiss_vectorstore = FAISS.from_texts(
    data, embeddings, metadatas=[{"source": 2}] * len(data)
)
faiss_retriever = faiss_vectorstore.as_retriever(search_kwargs={"k": 2})


In [33]:
# initialize the ensemble retriever
ensemble_retriever = EnsembleRetriever(
    retrievers=[bm25_retriever, faiss_retriever], weights=[0.5, 0.5]
)

In [34]:
docs = ensemble_retriever.invoke("VP ID")
docs

[Document(page_content='What is VP ID', metadata={'source': 1}),
 Document(page_content='Is bundling discount available?', metadata={'source': 1}),
 Document(page_content='Which team can provide assistance for unclaimed vPost packages', metadata={'source': 2})]

In [35]:
docs = ensemble_retriever.invoke("FFPs")
docs

[Document(page_content='What kind of case can vPost FFPs assist with?', metadata={'source': 1}),
 Document(page_content='What is VP ID', metadata={'source': 1}),
 Document(page_content='Is bundling discount available?', metadata={'source': 2})]

In [38]:
query = "what are bundling discount ?"
docs = ensemble_retriever.invoke(query)
docs

[Document(page_content='Is bundling discount available?', metadata={'source': 1}),
 Document(page_content='What is SED and what does the customer need to do?', metadata={'source': 1})]

In [39]:
retrieved_result = """

Question : 'Is bundling discount available?'
answer : 'Bundling Discount is only offered to vPost SG Customers only. The lowest-priced package will automatically be selected as the first Package to provide maximum discount and value. Discount will apply only on the second and subsequent packages Bunding discount applies when customer ship multiple package in one shipment.' 

"""

In [40]:
import base64
import vertexai
from vertexai.generative_models import GenerativeModel, Part, FinishReason
import vertexai.preview.generative_models as generative_models

def generate():
  vertexai.init(project=PROJECT_ID, location="us-central1")
  model = GenerativeModel(
    "gemini-1.5-flash-001",
  )
  responses = model.generate_content(
      [text1],
      generation_config=generation_config,
      safety_settings=safety_settings,
      stream=True,
  )

  for response in responses:
    print(response.text, end="")

text1 = retrieved_result

generation_config = {
    "max_output_tokens": 5147,
    "top_p": 0.95,
}

safety_settings = {
    generative_models.HarmCategory.HARM_CATEGORY_HATE_SPEECH: generative_models.HarmBlockThreshold.BLOCK_MEDIUM_AND_ABOVE,
    generative_models.HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT: generative_models.HarmBlockThreshold.BLOCK_MEDIUM_AND_ABOVE,
    generative_models.HarmCategory.HARM_CATEGORY_SEXUALLY_EXPLICIT: generative_models.HarmBlockThreshold.BLOCK_MEDIUM_AND_ABOVE,
    generative_models.HarmCategory.HARM_CATEGORY_HARASSMENT: generative_models.HarmBlockThreshold.BLOCK_MEDIUM_AND_ABOVE,
}

generate()

This answer is a bit confusing and could be improved. Here's a breakdown of the issues and suggestions for improvement:

**Issues:**

* **Too much information at once:** The answer throws a lot of details at the reader without a clear structure. 
* **Unclear wording:**  "Lowest-priced package will automatically be selected as the first Package" is confusing.
* **Missing key information:**  What is a "vPost SG Customer"? How do customers know they qualify?
* **No clear action:** It doesn't tell the customer what to do next if they want to take advantage of the discount.

**Improved Answer:**

Here's a way to rephrase the answer to be more user-friendly and informative:

"Yes, we offer a bundling discount! Here's how it works:

* **For vPost SG Customers only:**  If you are a vPost SG customer, you can save by shipping multiple packages together. 
* **Automatic discount:**  When you ship multiple packages in one shipment, we'll automatically apply a discount to your second and subsequent

### Calling vertex Search to SDK

#### Method 001

In [14]:
import vertexai

from vertexai.preview.generative_models import grounding
from vertexai.generative_models import GenerationConfig, GenerativeModel, Tool

# TODO(developer): Update and un-comment below line
# project_id = "PROJECT_ID"

vertexai.init(project=PROJECT_ID, location="us-central1")

model = GenerativeModel(model_name="gemini-1.0-pro-002")

# Use Vertex AI Search data store
# Format: projects/{project_id}/locations/{location}/collections/default_collection/dataStores/{data_store_id}

data_store_path =  'projects/255766800726/locations/global/collections/default_collection/dataStores/singpost-pdf-per-page-qn_1717131756893'

tool = Tool.from_retrieval(
    grounding.Retrieval(grounding.VertexAISearch(datastore=data_store_path))
)

prompt = "which bundling discount available?"
response = model.generate_content(
    prompt,
    tools=[tool],
    generation_config=GenerationConfig(
        temperature=0.1,
    ),
)

print(response)

candidates {
  content {
    role: "model"
    parts {
      text: "Bundling Discount is only offered to vPost SG Customers. The lowest-priced package will automatically be selected as the first Package to provide maximum discount and value. Discount will apply only on the second and subsequent packages. Bunding discount applies when customer ship multiple package in one shipment."
    }
  }
  finish_reason: STOP
  safety_ratings {
    category: HARM_CATEGORY_HATE_SPEECH
    probability: NEGLIGIBLE
    probability_score: 0.09285216
    severity: HARM_SEVERITY_NEGLIGIBLE
    severity_score: 0.13741668
  }
  safety_ratings {
    category: HARM_CATEGORY_DANGEROUS_CONTENT
    probability: NEGLIGIBLE
    probability_score: 0.18922126
    severity: HARM_SEVERITY_NEGLIGIBLE
    severity_score: 0.11456649
  }
  safety_ratings {
    category: HARM_CATEGORY_HARASSMENT
    probability: NEGLIGIBLE
    probability_score: 0.101055905
    severity: HARM_SEVERITY_NEGLIGIBLE
    severity_score: 0.17133

In [None]:
# curl -X POST -H "Authorization: Bearer $(gcloud auth print-access-token)" \
# -H "Content-Type: application/json" \
# "https://discoveryengine.googleapis.com/v1alpha/projects/255766800726/locations/global/collections/default_collection/dataStores/singpost-pdf-per-page-qn_1717131756893/servingConfigs/default_search:search" \
# -d '{"query":"<QUERY>","pageSize":10,"queryExpansionSpec":{"condition":"AUTO"},"spellCorrectionSpec":{"mode":"AUTO"},"contentSearchSpec":{"summarySpec":{"summaryResultCount":5,"modelSpec":{"version":"gemini-1.5-flash-001/answer_gen/v1"},"ignoreAdversarialQuery":true,"includeCitations":true},"snippetSpec":{"returnSnippet":true},"extractiveContentSpec":{"maxExtractiveAnswerCount":1}}}'

#### Method 002

In [10]:
# from typing import List

# from google.api_core.client_options import ClientOptions
# from google.cloud import discoveryengine_v1 as discoveryengine

# # TODO(developer): Uncomment these variables before running the sample.
# project_id = PROJECT_ID #"YOUR_PROJECT_ID"
# location = LOCATION #"YOUR_LOCATION"          # Values: "global", "us", "eu"
# engine_id = "singpost-pdf_1717131715752"
# dataStores = "singpost-pdf-per-page-qn_1717131756893"

# search_query = prompt = "which bundling discount available?"


# def search_sample(
#     project_id: str,
#     location: str,
#     dataStores: str,
#     search_query: str,
# ) -> List[discoveryengine.SearchResponse]:
#     #  For more information, refer to:
#     # https://cloud.google.com/generative-ai-app-builder/docs/locations#specify_a_multi-region_for_your_data_store
#     client_options = (
#         ClientOptions(api_endpoint=f"{location}-discoveryengine.googleapis.com")
#         if location != "global"
#         else None
#     )

#     # Create a client
#     client = discoveryengine.SearchServiceClient(client_options=client_options)

#     # The full resource name of the search app serving config
#     serving_config = f"projects/{project_id}/locations/{location}/collections/default_collection/dataStores/{dataStores}/servingConfigs/default_config"

#     # Optional: Configuration options for search
#     # Refer to the `ContentSearchSpec` reference for all supported fields:
#     # https://cloud.google.com/python/docs/reference/discoveryengine/latest/google.cloud.discoveryengine_v1.types.SearchRequest.ContentSearchSpec
#     content_search_spec = discoveryengine.SearchRequest.ContentSearchSpec(
#         # For information about snippets, refer to:
#         # https://cloud.google.com/generative-ai-app-builder/docs/snippets
#         snippet_spec=discoveryengine.SearchRequest.ContentSearchSpec.SnippetSpec(
#             return_snippet=True
#         ),
#         # For information about search summaries, refer to:
#         # https://cloud.google.com/generative-ai-app-builder/docs/get-search-summaries
#         summary_spec=discoveryengine.SearchRequest.ContentSearchSpec.SummarySpec(
#             summary_result_count=5,
#             include_citations=True,
#             ignore_adversarial_query=True,
#             ignore_non_summary_seeking_query=True,
#             model_prompt_spec=discoveryengine.SearchRequest.ContentSearchSpec.SummarySpec.ModelPromptSpec(
#                 preamble="YOUR_CUSTOM_PROMPT"
#             ),
#             model_spec=discoveryengine.SearchRequest.ContentSearchSpec.SummarySpec.ModelSpec(
#                 version="stable",
#             ),
#         ),
#     )

#     # Refer to the `SearchRequest` reference for all supported fields:
#     # https://cloud.google.com/python/docs/reference/discoveryengine/latest/google.cloud.discoveryengine_v1.types.SearchRequest
#     request = discoveryengine.SearchRequest(
#         serving_config=serving_config,
#         query=search_query,
#         page_size=10,
#         content_search_spec=content_search_spec,
#         query_expansion_spec=discoveryengine.SearchRequest.QueryExpansionSpec(
#             condition=discoveryengine.SearchRequest.QueryExpansionSpec.Condition.AUTO,
#         ),
#         spell_correction_spec=discoveryengine.SearchRequest.SpellCorrectionSpec(
#             mode=discoveryengine.SearchRequest.SpellCorrectionSpec.Mode.AUTO
#         ),
#     )

#     response = client.search(request)
#     print(response)

#     return response


In [12]:
# search_sample(PROJECT_ID, LOCATION ,dataStores ,prompt)