# **Parse Confluence page into OCI OpenSearch Vectors**

# **Steps**

## **1. Load packages & change config file**

In [None]:
#!pip install atlassian-python-api beautifulsoup4 tiktoken opensearch-py transformers torch gradio langchain langchain-huggingface
 
from atlassian import Confluence
from bs4 import BeautifulSoup
import datetime
from datetime import datetime
import tiktoken
from transformers import BertTokenizer, BertModel
import torch
import numpy as np
from langchain.vectorstores import OpenSearchVectorSearch
from langchain_huggingface import HuggingFaceEmbeddings
import oci
import ads
import os
import json
import subprocess

## **2. Change the Config.py file**
Open `config.py` and set the parameters for your Confluence space and your OCI OpenSearch cluster.

## **3. Import the custom Python functions and configuration**

In [None]:
from config import confluence_space_id, full_confluence_url, username_confluence, atlassian_api_token, host, username, password, index_name
from helpers import create_confluence_client, create_opensearch_client, chunk_text, parse_page, create_embedding_model, ingest_documents_with_embeddings

## **4. Define and load the embedding model**

In [None]:
# Build the embedding model once; the same object is passed to the OpenSearch client,
# to the ingestion helper and to the RAG functions further down.
# NOTE(review): the index below is created with dimension=384, so this model presumably
# emits 384-dimensional vectors — confirm in helpers.py.
embedding_model = create_embedding_model()

## **5. Establish and test the connections**

In [None]:
## establish connections and helpers
# Confluence client built from the URL, user name and API token imported from config.
confluence_client = create_confluence_client(full_confluence_url, username_confluence, atlassian_api_token)
# OpenSearch vector-store client bound to the configured host, credentials, index name
# and the embedding model created above.
oci_opensearch_client = create_opensearch_client(host, username, password, index_name, embedding_model)

## Test connections
# Fetching the configured space exercises the Confluence credentials.
print(confluence_client.get_space(confluence_space_id))
print("-"*200)
# Asking the cluster for its state exercises the OpenSearch connection.
print(oci_opensearch_client.client.cluster.state(['cluster_name']))

## **6. Create an index in OCI OpenSearch**

In [None]:

# Size of the vectors stored in the index.
# NOTE(review): must equal the output size of the embedding model — confirm in helpers.py.
EMBEDDING_DIMENSION = 384

# Start from a clean index: drop any previous copy first. Only the delete is best-effort
# (it fails when the index does not exist yet); errors while creating or refreshing the
# index are real problems and are allowed to surface instead of being retried silently.
try:
    oci_opensearch_client.delete_index(index_name)
except Exception as ex:
    print(f"Index '{index_name}' was not deleted ({ex!r}); creating it from scratch.")

response = oci_opensearch_client.create_index(dimension=EMBEDDING_DIMENSION, index_name=index_name)
oci_opensearch_client.client.indices.refresh(index=index_name)
print(response)

#oci_opensearch_client.index_exists(index_name)

## **7. Load example data, as embeddings, to the index**

In [None]:
# Two short passages about Oracle used as sample documents for the index.
# The bracketed numbers (e.g. "[5]") are part of the text and look like leftover citation markers.
example_list = ["Oracle Corporation is an American multinational computer technology company headquartered in Austin, Texas.[5] Co-founded in 1977 by Larry Ellison, who remains executive chairman, Oracle was the third-largest software company in the world in 2020 by revenue and market capitalization.[6] The company's 2023 ranking in the Forbes Global 2000 was 80.[7]",
                "Larry Ellison, Bob Miner, and Ed Oates co-founded Oracle Corporation in 1977 under the name Software Development Laboratories (SDL).[2] Ellison took inspiration[9] from the 1970 paper written by Edgar F. Codd on relational database management systems (RDBMS) named A Relational Model of Data for Large Shared Data Banks."]

In [None]:
# Turn each example text into an embedding and load it into the OCI OpenSearch index.
ingest_documents_with_embeddings(
    chunks=example_list,
    index_name=index_name,
    oci_opensearch_client=oci_opensearch_client,
    host=host,
    username=username,
    password=password,
    embedding_model=embedding_model,
    batch_size=5,
)

## **8. Check the index**

In [None]:
# Check the index mapping: fetch the field definitions OpenSearch holds for the index
# so the vector and text fields written by the ingestion step can be inspected.
response = oci_opensearch_client.client.indices.get_mapping(index=index_name)
print("Index Mapping:", response)

## **9. Test semantic search using vector embeddings**

In [None]:
def retrieve_documents_with_embeddings(query, top_k=5):
    """Return the text of the indexed documents closest to ``query``.

    The query is embedded with the embedding function attached to the
    notebook-level ``oci_opensearch_client`` and sent as a k-NN query
    against that client's index.

    Args:
        query: Free-text search string.
        top_k: Used both as the k-NN ``k`` and as the result ``size``.

    Returns:
        A list holding the ``text`` field of every hit, in the order
        OpenSearch returned them.
    """
    store = oci_opensearch_client

    # Convert whatever the embedding function returns into a plain Python list.
    raw_vector = store.embedding_function.embed_query(query)
    query_vector = np.array(raw_vector).tolist()

    knn_clause = {"vector_field": {"vector": query_vector, "k": top_k}}
    request_body = {"size": top_k, "query": {"knn": knn_clause}}
    response = store.client.search(index=store.index_name, body=request_body)

    return [hit['_source']['text'] for hit in response['hits']['hits']]

In [None]:
# Run a sample question through the vector search and show the two best matches.
query = "What is oracle?"
documents_with_embeddings = retrieve_documents_with_embeddings(query, top_k=2)

print(f"Top {len(documents_with_embeddings)} documents and their embeddings for the query: \"{query}\"")
for idx, content in enumerate(documents_with_embeddings):
    # Documents are numbered from 1 in the printed output.
    print(f"\nDocument {idx + 1}:\n")
    print(f"Content: {content}\n")

--------

## **10. Create the RAG Pipeline using LangChain, GenAI, and OCI OpenSearch as vector db**

In [None]:
from langchain.prompts import PromptTemplate
from langchain.chains import LLMChain
from langchain_community.chat_models.oci_generative_ai import ChatOCIGenAI
from langchain_core.messages import AIMessage, HumanMessage, SystemMessage


def rag_pipeline(question, oci_opensearch_client, embedding_model, top_k=5):
    """Answer ``question`` with an OCI GenAI chat model, using retrieved documents as context.

    Args:
        question: The user's question as a single string.
        oci_opensearch_client: Vector store exposing ``similarity_search_by_vector``.
        embedding_model: Embedding model exposing ``embed_query``.
        top_k: Number of nearest documents to retrieve as context (default 5).

    Returns:
        Whatever ``LLMChain.run`` returns for the filled-in prompt.

    Raises:
        KeyError: If the ``NB_SESSION_COMPARTMENT_OCID`` environment variable is not set.
    """
    # Vector search on the question. ``embed_query`` takes one string; ``embed_documents``
    # is the batch API (list of texts), and handing it a bare string can make the
    # implementation iterate it character by character, so ``[0]`` would embed only
    # the first character of the question.
    question_vector = embedding_model.embed_query(question)
    docs = oci_opensearch_client.similarity_search_by_vector(embedding=question_vector, k=top_k)

    # Context block: every retrieved document followed by a blank line.
    data = "".join(doc.page_content + "\n\n" for doc in docs)

    # The LLM; the compartment OCID is read from the notebook session environment.
    llm = ChatOCIGenAI(
        model_id="cohere.command-r-16k",
        service_endpoint="https://inference.generativeai.us-chicago-1.oci.oraclecloud.com",
        compartment_id=os.environ['NB_SESSION_COMPARTMENT_OCID'],
        model_kwargs={"temperature": 0.7, "max_tokens": 500},
    )

    # Prompt that combines the question with the retrieved context.
    prompt = PromptTemplate(
      input_variables=["question", "data"],
      template="""Using the data below, answer the question provided.
      question: {question}
      data: {data}
      """,
    )

    chain = LLMChain(llm=llm, prompt=prompt)
    return chain.run({'question': question, 'data': data})

In [None]:
# Ask a sample question through the full RAG pipeline.
# NOTE(review): the example documents loaded earlier only talk about Oracle, so a grounded
# answer to this question presumably needs other content in the index — verify.
llm_return_data = rag_pipeline("Who is Bob Peulen?", oci_opensearch_client, embedding_model)
# Bare last expression so the notebook displays the answer.
llm_return_data

## **11. Gradio - example application**

In [None]:
import gradio as gr

def rag_pipeline_gradio(question, x):
    """Chat callback for ``gr.ChatInterface``: answer ``question`` using retrieved context.

    Same retrieval-and-generation flow as ``rag_pipeline``, but the vector store and
    embedding model are taken from the notebook-level ``oci_opensearch_client`` and
    ``embedding_model`` instead of being passed in.

    Args:
        question: The latest user message as a single string.
        x: Second positional argument supplied by the chat interface (the conversation
            history, per the Gradio documentation). It is not used.

    Returns:
        Whatever ``LLMChain.run`` returns for the filled-in prompt.

    Raises:
        KeyError: If the ``NB_SESSION_COMPARTMENT_OCID`` environment variable is not set.
    """
    # Vector search on the question. ``embed_query`` takes one string; ``embed_documents``
    # is the batch API (list of texts), and handing it a bare string can make the
    # implementation iterate it character by character, so ``[0]`` would embed only
    # the first character of the question.
    question_vector = embedding_model.embed_query(question)
    docs = oci_opensearch_client.similarity_search_by_vector(embedding=question_vector, k=5)

    # Context block: every retrieved document followed by a blank line.
    data = "".join(doc.page_content + "\n\n" for doc in docs)

    # The LLM; the compartment OCID is read from the notebook session environment.
    llm = ChatOCIGenAI(
        model_id="cohere.command-r-16k",
        service_endpoint="https://inference.generativeai.us-chicago-1.oci.oraclecloud.com",
        compartment_id=os.environ['NB_SESSION_COMPARTMENT_OCID'],
        model_kwargs={"temperature": 0.7, "max_tokens": 500},
    )

    # Prompt that combines the question with the retrieved context.
    prompt = PromptTemplate(
      input_variables=["question", "data"],
      template="""Using the data below, answer the question provided.
      question: {question}
      data: {data}
      """,
    )

    chain = LLMChain(llm=llm, prompt=prompt)
    return chain.run({'question': question, 'data': data})


# Serve the RAG callback as a chat UI.
# NOTE(review): share=True asks Gradio for a publicly reachable link; anyone holding that
# link can query the index through this app — confirm this exposure is intended.
gr.ChatInterface(
    fn=rag_pipeline_gradio
).launch(share=True)