### setup

In [3]:
import socket
import re

PROJECT_ID = !(gcloud config get-value core/project)
PROJECT_ID = PROJECT_ID[0]

SVC_ACC = !(gcloud config get-value core/account)
SVC_ACC = SVC_ACC[0]

PROJECT_NUMBER=str(re.search(r'\d+', SVC_ACC).group())

LOCATION="us-central1"

UNIQUE_PREFIX = socket.gethostname()
UNIQUE_PREFIX = re.sub('[^A-Za-z0-9]+', '', UNIQUE_PREFIX)

BUCKET_NAME = f"{PROJECT_ID}-{UNIQUE_PREFIX}-{LOCATION}"

BUCKET_URI = f"gs://{BUCKET_NAME}"  # @param {type:"string"}

! gcloud config set project $PROJECT_ID
! gcloud storage buckets create {BUCKET_URI} --project={PROJECT_ID} --location={LOCATION}
! mkdir output

import vertexai

vertexai.init(project=PROJECT_ID, location=LOCATION, staging_bucket=BUCKET_URI)

Updated property [core/project].


To take a quick anonymous survey, run:
  $ gcloud survey

Creating gs://my-project-0004-346516-pytorch112kagglewbi-us-central1/...
[1;31mERROR:[0m (gcloud.storage.buckets.create) HTTPError 409: Your previous request to create the named bucket succeeded and you already own it.
mkdir: cannot create directory ‘output’: File exists


### Helper functions

In [4]:
# from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_google_vertexai import VertexAI
from langchain_google_vertexai import VertexAIEmbeddings


# from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
from langchain.prompts import PromptTemplate
from langchain.chains import RetrievalQA
import os 
from dotenv import load_dotenv
# Load variables from a .env file into the process environment (python-dotenv).
load_dotenv()

# Folder name the FAISS index is written to and read back from.
db_file_path = "FAISS_Index"

# Embedding client handed to FAISS by the helper functions in this cell.
embeddings = VertexAIEmbeddings("textembedding-gecko@latest")

def creation_of_vectorDB_in_local(loader):
    """Build a FAISS index from a loader's documents and save it to disk.

    Args:
        loader: Object exposing ``load()``; its result is passed to
            ``FAISS.from_documents`` together with the module-level
            ``embeddings`` client.

    Returns:
        None. The index is written to the folder named by ``db_file_path``.
    """
    documents = loader.load()
    vector_index = FAISS.from_documents(documents, embeddings)
    vector_index.save_local(db_file_path)

def creation_FAQ_chain():
    """Build a RetrievalQA chain over the FAISS index saved at ``db_file_path``.

    The chain retrieves matching documents from the saved index, stuffs them
    into a prompt that restricts the model to that context, and asks a Vertex
    AI ``gemini-pro`` model for the answer.

    Returns:
        A ``RetrievalQA`` chain that takes its question under the ``"query"``
        input key and does not return the source documents.
    """
    # NOTE(review): load_local deserializes an index folder from disk
    # (presumably including a pickled docstore — confirm for the installed
    # LangChain version). Only point it at folders this notebook wrote itself.
    db = FAISS.load_local(db_file_path, embeddings)

    # The threshold has to go through search_type / search_kwargs; passing
    # score_threshold straight to as_retriever() does not filter anything.
    retriever = db.as_retriever(
        search_type="similarity_score_threshold",
        search_kwargs={"score_threshold": 0.7},
    )

    llm = VertexAI(model_name="gemini-pro")

    # The template text is sent to the model verbatim, indentation included.
    prompt_temp="""Given the following context and a question, generate an answer based on this context only.
    In the answer try to provide as much text as possible from "response" section in the source document context without making much changes.
    If the answer is not found in the context, kindly state "This Question not Present in My Database." Don't try to make up an answer.
    CONTEXT: {context}
    QUESTION: {question}"""

    PROMPT = PromptTemplate(template=prompt_temp, input_variables=["context", "question"])
    chain = RetrievalQA.from_chain_type(
        llm=llm,
        chain_type="stuff",
        retriever=retriever,
        input_key="query",
        return_source_documents=False,
        chain_type_kwargs={"prompt": PROMPT},
    )
    return chain


In [5]:
#@title ### You will need to update these values

import vertexai
# Initialise the Vertex AI SDK with the project and region from the setup cell.
vertexai.init(
    project=PROJECT_ID,
    location=LOCATION,
)


### Vertex AI

In [15]:
import pandas as pd
import seaborn as sns
from IPython.display import Markdown, display
from sklearn.metrics.pairwise import cosine_similarity
from vertexai.preview.language_models import (ChatModel, InputOutputTextPair,   TextEmbeddingModel,
                                              TextGenerationModel)
from google.cloud import aiplatform_v1beta1, aiplatform
from google.protobuf import struct_pb2
import numpy as np

from tenacity import retry, stop_after_attempt, wait_random_exponential

from langchain_google_vertexai import VertexAI
from langchain_google_vertexai import VertexAIEmbeddings
from vertexai.language_models import TextEmbeddingInput, TextEmbeddingModel

# A single model id is shared by both embedding clients below.
model_name = "textembedding-gecko@latest"

# LangChain wrapper, passed to the FAISS vector stores.
embeddings = VertexAIEmbeddings(model_name)

# Vertex AI SDK client, used directly through get_embeddings().
embedding_model = TextEmbeddingModel.from_pretrained(model_name)


In [16]:
import pandas as pd

# Load the Q&A file; the path is relative to the notebook's working directory.
df = pd.read_csv("Singpost_QnA_doc.csv")

# Plain Python list of the question texts, reused by the retrievers.
data = df["question"].to_list()



In [17]:
prompt_list = df["question"].tolist()

# One embedding per question, stored next to the text it was computed from.
df["embedding"] = [
    embedding.values
    for embedding in embedding_model.get_embeddings(prompt_list)
]
df

Unnamed: 0,question,answer,embedding
0,I need to update my email address,As each account is tied to a unique email addr...,"[0.00035269002546556294, -0.007924865931272507..."
1,How do I get my package shipped,Your unique VP ID that forms part of your over...,"[0.03443169221282005, -0.02751363255083561, 0...."
2,How do I navigate the members portal,Refer to the following image(s) https://drive....,"[0.031672779470682144, -0.020744403824210167, ..."
3,How do I navigate the address section on the p...,Refer to the following image(s) https://drive....,"[0.04048671945929527, -0.027499673888087273, -..."
4,How do I perform declaration on my package,Refer to the following image(s) https://drive....,"[0.03648605942726135, -0.0060289218090474606, ..."
5,What is SED and what does the customer need to...,SED refers to the United States Shipper’s Expo...,"[-0.003393965307623148, -0.05115535482764244, ..."
6,How do I navigate the package summary,Refer to the following image(s) https://drive....,"[0.04750156030058861, -0.03618474677205086, -0..."
7,How do I track my package on the portal,Refer to the following image(s) https://drive....,"[0.03705435246229172, -0.02546788938343525, -0..."
8,What kind of cases can CS Ops assist?,Item matched to wrong VP number (provide suppo...,"[0.02198229357600212, -0.04881160706281662, -0..."
9,What kind of case can vPost FFPs assist with?,1. Request to match to correct owner / Item ta...,"[-0.0069028097204864025, -0.035859040915966034..."


In [18]:
# Index the question texts in an in-memory FAISS store.
db = FAISS.from_texts(texts=data, embedding=embeddings)


In [19]:
# Probe the FAISS index with a question that does not appear to match any
# entry in the question list.
query = "can i know the year of sales going down"
# Yields (Document, score) pairs; presumably the score is a distance where
# lower means closer — TODO confirm for this index type.
docs = db.similarity_search_with_score(query) #similarity_search
# print('first result',docs[0].page_content)
print(docs)

[(Document(page_content='Is bundling discount available?'), 0.80268395), (Document(page_content='I need to update my email address'), 0.84324044), (Document(page_content='What is SED and what does the customer need to do?'), 0.84402), (Document(page_content='How do I get my package shipped'), 0.872614)]


In [20]:
# print('first result',docs[0][0].page_content)


In [21]:
# !gsutil cp  ./Singpost_QnA_doc.csv gs://my-project-0004-bucket02/llms

In [22]:
from langchain.retrievers import BM25Retriever, EnsembleRetriever


In [24]:
# Show the question texts that are fed to the BM25 and FAISS retrievers.
data

['I need to update my email address',
 'How do I get my package shipped',
 'How do I navigate the members portal',
 'How do I navigate the address section on the portal',
 'How do I perform declaration on my package',
 'What is SED and what does the customer need to do?',
 'How do I navigate the package summary',
 'How do I track my package on the portal',
 'What kind of cases can CS Ops assist?',
 'What kind of case can vPost FFPs assist with?',
 'Which team can provide assistance for unclaimed vPost packages',
 'Is bundling discount available?',
 'What is VP ID']

In [29]:
# %pip install rank_bm25

In [30]:
# Keyword (BM25) retriever over the question texts; "source": 1 tags its hits.
# Each text gets its own metadata dict. `[{"source": 1}] * len(data)` would put
# the same dict object in every slot, so a change to one document's metadata
# could show up on all of them.
bm25_retriever = BM25Retriever.from_texts(
    data, metadatas=[{"source": 1} for _ in data]
)
# Return at most two documents per query.
bm25_retriever.k = 2


In [32]:
# Embedding-based retriever over the same texts; "source": 2 tags its hits.
# Each text gets its own metadata dict. `[{"source": 2}] * len(data)` would put
# the same dict object in every slot, so a change to one document's metadata
# could show up on all of them.
faiss_vectorstore = FAISS.from_texts(
    data, embeddings, metadatas=[{"source": 2} for _ in data]
)
# Return at most two documents per query.
faiss_retriever = faiss_vectorstore.as_retriever(search_kwargs={"k": 2})


In [33]:
# initialize the ensemble retriever
ensemble_retriever = EnsembleRetriever(
    retrievers=[bm25_retriever, faiss_retriever], weights=[0.5, 0.5]
)

In [34]:
# Short keyword query against the BM25 + FAISS ensemble.
docs = ensemble_retriever.invoke("VP ID")
docs

[Document(page_content='What is VP ID', metadata={'source': 1}),
 Document(page_content='Is bundling discount available?', metadata={'source': 1}),
 Document(page_content='Which team can provide assistance for unclaimed vPost packages', metadata={'source': 2})]

In [35]:
# Single-term query against the BM25 + FAISS ensemble.
docs = ensemble_retriever.invoke("FFPs")
docs

[Document(page_content='What kind of case can vPost FFPs assist with?', metadata={'source': 1}),
 Document(page_content='What is VP ID', metadata={'source': 1}),
 Document(page_content='Is bundling discount available?', metadata={'source': 2})]

In [36]:
# Same query string as the earlier FAISS-only probe, now sent through the
# BM25 + FAISS ensemble.
query = "can i know the year of sales going down"
docs = ensemble_retriever.invoke(query)
docs

[Document(page_content='What kind of cases can CS Ops assist?', metadata={'source': 1}),
 Document(page_content='Is bundling discount available?', metadata={'source': 2}),
 Document(page_content='What kind of case can vPost FFPs assist with?', metadata={'source': 1}),
 Document(page_content='I need to update my email address', metadata={'source': 2})]