### setup

In [None]:
import socket
import re

PROJECT_ID = !(gcloud config get-value core/project)
PROJECT_ID = PROJECT_ID[0]

SVC_ACC = !(gcloud config get-value core/account)
SVC_ACC = SVC_ACC[0]

PROJECT_NUMBER=str(re.search(r'\d+', SVC_ACC).group())

LOCATION="us-central1"

UNIQUE_PREFIX = socket.gethostname()
UNIQUE_PREFIX = re.sub('[^A-Za-z0-9]+', '', UNIQUE_PREFIX)

BUCKET_NAME = f"{PROJECT_ID}-{UNIQUE_PREFIX}-{LOCATION}"

BUCKET_URI = f"gs://{BUCKET_NAME}"  # @param {type:"string"}

! gcloud config set project $PROJECT_ID
! gcloud storage buckets create {BUCKET_URI} --project={PROJECT_ID} --location={LOCATION}
! mkdir output

import vertexai

vertexai.init(project=PROJECT_ID, location=LOCATION, staging_bucket=BUCKET_URI)

In [None]:
# from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_google_vertexai import VertexAI
from langchain_google_vertexai import VertexAIEmbeddings


# from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
from langchain.prompts import PromptTemplate
from langchain.chains import RetrievalQA
import os 
from dotenv import load_dotenv
load_dotenv()

db_file_path='FAISS_Index'
embeddings = VertexAIEmbeddings('textembedding-gecko@latest')

def creation_of_vectorDB_in_local(loader):
    data = loader.load()
    db =FAISS.from_documents(data, embeddings)
    db.save_local(db_file_path)

def creation_FAQ_chain():
    db=FAISS.load_local(db_file_path, embeddings)
    retriever =db.as_retriever(score_threshold=0.7)
    
    # llm = ChatGoogleGenerativeAI(model="gemini-pro",temperature=0.2)

    # To use model
    llm = VertexAI(model_name="gemini-pro")

    prompt_temp="""Given the following context and a question, generate an answer based on this context only.
    In the answer try to provide as much text as possible from "response" section in the source document context without making much changes.
    If the answer is not found in the context, kindly state "This Question not Present in My Database." Don't try to make up an answer.
    CONTEXT: {context}
    QUESTION: {question}"""

    PROMPT = PromptTemplate(template=prompt_temp, input_variables=["context", "question"])
    chain = RetrievalQA.from_chain_type(llm=llm,chain_type="stuff", 
                                        retriever=retriever, 
                                        input_key="query", 
                                        return_source_documents=False,
                                        chain_type_kwargs={"prompt" : PROMPT})
    return chain


In [8]:
#@title ### You will need to update these values

import vertexai
vertexai.init(project=PROJECT_ID, location=LOCATION)


In [9]:
import pandas as pd
import seaborn as sns
from IPython.display import Markdown, display
from sklearn.metrics.pairwise import cosine_similarity
from vertexai.preview.language_models import (ChatModel, InputOutputTextPair,
                                              TextEmbeddingModel,
                                              TextGenerationModel)
from google.cloud import aiplatform_v1beta1, aiplatform
from google.protobuf import struct_pb2
import numpy as np

from tenacity import retry, stop_after_attempt, wait_random_exponential


In [46]:
import pandas as pd

# Replace 'your_file.csv' with the actual filename
df = pd.read_csv('Singpost_QnA_doc.csv')

# print("\nColumn names and types:")
# print(df.info())

data = df["question"].tolist()



In [48]:
prompt_list = df['question'].tolist()

df["embedding"] = [    emb.values for emb in embedding_model.get_embeddings(prompt_list) ]
df

Unnamed: 0,Prompt,answer,embedding
0,How do I exchange my existing commercial vehicle?,To exchange your commercial vehicle click on t...,"[0.020373761653900146, -0.05186869204044342, -..."
1,How do I get to know the final offer price aft...,The booking amount paid by you Is used to simp...,"[0.0017610692884773016, -0.028775252401828766,..."
2,How do I know my booking has been confirmed?,"Once you have completed the payment, you will ...","[0.01797533594071865, -0.04220637306571007, -0..."
3,Where can I watch a video to learn more about ...,The Help and Booking Guide buttons are present...,"[0.009226418100297451, -0.021474039182066917, ..."
4,How do I calculate the EMI for the commercial ...,"Once you have opened a vehicle page, you can s...","[0.005750755779445171, -0.07312288880348206, -..."
5,What is the maximum price of the car,one core,"[-0.014892088249325752, -0.0554826520383358, -..."
6,What is the minimum price of the car,ten lakh,"[-0.001161880325525999, -0.06594191491603851, ..."
7,who is the programmer of the web portal,Unknown coder,"[0.02974068373441696, -0.016449231654405594, -..."
8,which year the sell of cars were high,2019,"[0.019806981086730957, -0.004369750618934631, ..."
9,when the sales was down,2020,"[0.04185187444090843, -0.013278556056320667, -..."


In [50]:
db = FAISS.from_texts(data, embeddings)


In [58]:
query = "can i know the year of sales going down"
docs = db.similarity_search_with_score(query) #similarity_search
# print('first result',docs[0].page_content)
print(docs)

[(Document(page_content='when the sales was down'), 0.5942349), (Document(page_content='which year the sell of cars were high'), 0.6114755), (Document(page_content='what is the profit in 2020'), 0.7008934), (Document(page_content='how much of workers works in the company'), 0.8139192)]


In [57]:
# print('first result',docs[0][0].page_content)


In [59]:
# !gsutil cp  ./Singpost_QnA_doc.csv gs://my-project-0004-bucket02/llms

Copying file://./Singpost_QnA_doc.csv [Content-Type=text/csv]...
/ [1 files][  4.8 KiB/  4.8 KiB]                                                
Operation completed over 1 objects/4.8 KiB.                                      
