# DAY 2 : RAG

This is an example of all the steps in a basic RAG solution.
Steps:

1. Setup: chunk / embed / store
2. R - Retrieval: embed the query & do a similarity search
3. A - Augment: optionally rerank & attach the retrieved chunks to the chat request
4. G - Generate: ask the LLM to answer the question based on the retrieved chunks

Questions? Use the #generative-ai-users or #igiu-innovation-lab Slack channel.


### Set up variables

In [24]:
from oci.generative_ai_inference import GenerativeAiInferenceClient
from oci.generative_ai_inference.models import OnDemandServingMode, EmbedTextDetails,CohereChatRequest, ChatDetails
import oracledb
import array
import oci
import os,json 


# ---------------------------------------------------------------------------
# Sandbox configuration file. Make sure sandbox.json is set up for your
# environment; depending on your `cwd` you may have to give its full path.
# ---------------------------------------------------------------------------
SANDBOX_CONFIG_FILE = "sandbox.json"

# OCI Generative AI inference endpoint, plus the identifiers of the embedding
# model and the LLM.
llm_service_endpoint = "https://inference.generativeai.us-chicago-1.oci.oraclecloud.com"
EMBED_MODEL = "cohere.embed-multilingual-v3.0"
LLM_MODEL = "cohere.command-r-08-2024"

# Placeholders: both values are read from the sandbox config once it is loaded.
compartmentId = None
tablename_prefix = None

## Chunks we want to query against

In [25]:

# Here we start with small, hand-written chunks. Ideally you would parse a file
# and chunk it using a library. There are quite a few parsing and chunking
# strategies: do your own research and enhance this code.
#
# Each entry pairs a chunk with its (mocked) source. The source is used in
# citations, which helps build confidence and avoid hallucination.
# Layout: (chunk text, chapter, question)
_corpus = [
    ("Baseball is a great game ", "Baseball", "1"),
    ("baseball games typically last 9 innings", "Baseball", "2"),
    ("Baseball game can finish in about 2 hours", "Baseball", "3"),
    ("Indias favroite passtime is cricket", "Cricket", "1"),
    ("England's favorite passtime is football", "Football", "1"),
    ("Football is called soccer in America", "Football", "2"),
    ("baseball is americas favroite pass time sport", "Baseball", "4"),
]

# Parallel lists: chunks[i] is described by chunk_source[i].
chunks = [text for text, _chapter, _question in _corpus]
chunk_source = [
    {"chapter": chapter, "question": question}
    for _text, chapter, question in _corpus
]

## open database connection

In [26]:
# Read the sandbox config; it holds the OCI profile settings and the database
# connection settings used below.
with open(os.path.expanduser(SANDBOX_CONFIG_FILE), "r", encoding="utf-8") as f:
    scfg = json.load(f)

oci_cfg = scfg["oci"]
db_cfg = scfg["db"]

# OCI SDK configuration and the values the later cells depend on.
config = oci.config.from_file(os.path.expanduser(oci_cfg["configFile"]), oci_cfg["profile"])
compartmentId = oci_cfg["compartment"]
tablename_prefix = db_cfg["tablePrefix"]

# Expand "~" once and pass the expanded path to the driver. Previously the raw
# config value was passed, so a wallet path such as "~/wallet" was never
# expanded and the connection could not find the wallet.
wallet = os.path.expanduser(db_cfg["walletPath"])

db = oracledb.connect(
    user=db_cfg["username"],
    password=db_cfg["password"],
    dsn=db_cfg["dsn"],
    config_dir=wallet,
    wallet_location=wallet,
    wallet_password=db_cfg["walletPass"],
)
cursor = db.cursor()

## create tables 

In [27]:
# Statement 1: drop any embedding table left over from a previous run.
drop_stmt = f"""drop table if exists {tablename_prefix}_embedding purge"""

# Statement 2: create the table that stores, per chunk, its id, text,
# embedding vector and source metadata.
create_stmt = f"""
 		create table {tablename_prefix}_embedding (
   			id number,
			text varchar2(4000),
			vec vector,
			chapter varchar2(100),
			section integer,
			primary key (id)
		)"""

# Run the statements in order: the drop must happen before the create.
sql = [drop_stmt, create_stmt]
for statement in sql:
    cursor.execute(statement)

db.commit()

### set up LLM client 

In [28]:


# Build the OCI Generative AI inference client used for both the embedding
# calls and the chat call.
client_options = {
    "config": config,
    "service_endpoint": llm_service_endpoint,
    # No automatic retries from the SDK.
    "retry_strategy": oci.retry.NoneRetryStrategy(),
    # NOTE(review): per the OCI SDK docs this tuple is (connect, read) timeouts
    # in seconds - confirm against the SDK version in use.
    "timeout": (10, 240),
}
llm_client = GenerativeAiInferenceClient(**client_options)


## Create embeddings

In [29]:
# Describe the embedding request: which compartment and model to use, and how
# the inputs should be treated (documents to be searched, truncated at the end).
embed_text_detail = EmbedTextDetails()
embed_text_detail.compartment_id = compartmentId
embed_text_detail.serving_mode = OnDemandServingMode(model_id=EMBED_MODEL)
embed_text_detail.input_type = EmbedTextDetails.INPUT_TYPE_SEARCH_DOCUMENT
embed_text_detail.truncate = EmbedTextDetails.TRUNCATE_END
embed_text_detail.inputs = chunks

# Embed all chunks in one call. The result is later indexed in parallel with
# `chunks`, i.e. embeddings[i] is treated as the vector for chunks[i].
response = llm_client.embed_text(embed_text_detail)
embeddings = response.data.embeddings


## insert embedding in database

In [None]:
# Name the target columns explicitly so the insert does not depend on the
# physical column order of the table, and build the statement once rather than
# on every iteration.
insert_sql = (
    f"insert into {tablename_prefix}_embedding (id, text, vec, chapter, section) "
    "values (:1, :2, :3, :4, :5)"
)

# embeddings[i] belongs to chunks[i] / chunk_source[i]; the list index doubles
# as the primary key.
for i, embedding in enumerate(embeddings):
    source = chunk_source[i]
    cursor.execute(
        insert_sql,
        [
            i,
            chunks[i],
            array.array("f", embedding),  # float32 array bound to the vector column
            source["chapter"],
            # `section` is an INTEGER column; convert the string explicitly
            # instead of relying on implicit conversion in the database.
            int(source["question"]),
        ],
    )
    print(f"inserted {i}-{chunks[i]}")

print("commiting")
db.commit()

## read the table 

In [None]:
# Read back the stored rows to confirm what is in the embedding table.
cursor.execute(f"select id,text from {tablename_prefix}_embedding")
for row_id, row_text in cursor:
    print(f"{row_id}:{row_text}")

## Ask a question

In [32]:
# Read the user's question, trimmed and lower-cased, and wrap it in a
# one-element list because the embedding request takes a list of inputs.
query = input("Ask a question: ").strip().lower()
q = [query]

## embed the query

We need to do the "R" part of RAG - retrieve. We retrieve in the following steps:
1. embed the query text
2. do a similarity search to find the text similar to it
3. optionally rerank it

In [33]:

# Embed the question. The existing embedding request object is reused, switched
# from "search document" to "search query" mode and given the question as its
# only input.
embed_text_detail.input_type = EmbedTextDetails.INPUT_TYPE_SEARCH_QUERY
embed_text_detail.inputs = q
response = llm_client.embed_text(embed_text_detail)

# Only one input was sent, so the first embedding is the query vector. It is
# packed as a float array so it can be bound in the similarity query.
query_embedding = response.data.embeddings[0]
vec = array.array("f", query_embedding)




In [None]:
# Similarity search: order the stored chunks by their distance to the query
# vector and keep the three closest.
# There are multiple distance metrics to choose from: COSINE, DOT, EUCLIDEAN.
top_k_sql = f"""
		select id,text, vector_distance(vec, :1, COSINE) d, chapter,section 
		from {tablename_prefix}_embedding
		order by d
		fetch first 3 rows only
	"""
cursor.execute(top_k_sql, [vec])

# Each hit is [id, text, distance, source label]; the label is later used as
# the document id so citations can point back to chapter and section.
rows = []
for chunk_id, chunk_text, distance, chapter, section in cursor:
    hit = [chunk_id, chunk_text, distance, f"chapter:[{chapter}]_section:[{section}]"]
    print(hit)
    rows.append(hit)


print(rows)

### optionally rerank

In [35]:
# Reranking is not implemented here; look at the Cohere reranking example.

## A of RAG : augment

We attach the retrieved chunks as documents to the chat request.

In [36]:
# Prepare the payload: the Cohere chat request carries the generation settings.
cohere_chat_request = CohereChatRequest()
cohere_chat_request.is_stream = False
cohere_chat_request.max_tokens = 500
cohere_chat_request.temperature = 0.75
cohere_chat_request.top_p = 0.7
cohere_chat_request.frequency_penalty = 1.0
# Tip: set cohere_chat_request.preamble_override to steer the answer style,
# e.g. "you always answer in a one stanza poem."
# The documents and the message are set on this request in later cells.

# Chat details wrap the request together with the model and the compartment.
chat_detail = ChatDetails()
# Use the LLM_MODEL constant instead of a second, hard-coded model id, so the
# chat model is configured in exactly one place.
chat_detail.serving_mode = OnDemandServingMode(model_id=LLM_MODEL)
chat_detail.compartment_id = compartmentId
chat_detail.chat_request = cohere_chat_request



In [None]:
# Show the retrieved rows that are about to be attached.
for chunk in rows:
    print(chunk)

# Convert each retrieved row into a document: the source label (index 3)
# becomes the document id and the chunk text (index 1) becomes the snippet.
docs = [{"id": hit[3], "snippet": hit[1]} for hit in rows]

cohere_chat_request.documents = docs



## G in RAG : Generate the response

In [None]:
# Restrict the model to the attached documents, then send the user's question.
# chat_detail already references cohere_chat_request, so updating the request
# object is all that is needed before the call.
cohere_chat_request.preamble_override = " answer only from selected docs, ignore any other information you may know"
cohere_chat_request.message = query
llm_response = llm_client.chat(chat_detail)
print("query executed")

## print the response 

In [None]:
# Pull the chat response out once; it carries both the answer text and the
# citations.
chat_response = llm_response.data.chat_response
llm_text = chat_response.text

print("**************************Chat Result**************************")
print(query)
print(llm_text)
print("************************** Citations**************************")
print(chat_response.citations)

## Close the database connection

In [None]:
# Release the cursor first, then the connection it was created from.
for handle in (cursor, db):
    handle.close()


## Exercise
 * Implement a “talk-to-document”
 * Try one of :
    * Text document
    * PDF document
 * Play with following features
    * Chunking
    * Different types similarity search
    * Reranking
    * Citations
