In [1]:
from semantic_kernel import Kernel
from semantic_kernel.connectors.ai.open_ai import AzureChatCompletion
from semantic_kernel.connectors.ai.open_ai import AzureChatPromptExecutionSettings
import os
from langsmith import traceable
import json
import asyncio
from dotenv import load_dotenv
load_dotenv()

True

In [2]:
# Azure OpenAI connection settings, read from the process environment
# (load_dotenv() is called in the imports cell before this one).
# os.environ[...] is indexed directly, so an unset variable raises KeyError here.
AZURE_OPENAI_API_KEY=os.environ["AZURE_OPENAI_API_KEY"]
AZURE_OPENAI_ENDPOINT=os.environ["AZURE_OPENAI_ENDPOINT"]
AZURE_OPENAI_API_VERSION=os.environ["AZURE_OPENAI_API_VERSION"]
# Passed as deployment_name to AzureChatCompletion.
AZURE_LLM_MODEL=os.environ["AZURE_LLM_MODEL"]
# Passed as model to client.embeddings.create in get_embedding.
AZURE_EMBEDDING_MODEL=os.environ["AZURE_EMBEDDING_MODEL"]

In [14]:
# Kernel that the prompt function is registered on and invoked through.
kernel = Kernel()

# Azure OpenAI chat-completion connector built from the environment-derived settings.
service = AzureChatCompletion(
    api_version=AZURE_OPENAI_API_VERSION,
    api_key=AZURE_OPENAI_API_KEY,
    endpoint=AZURE_OPENAI_ENDPOINT,
    deployment_name=AZURE_LLM_MODEL,
)

# Make the connector available to functions run by this kernel.
kernel.add_service(service)

In [None]:
# Running log of chat turns for the current kernel session; each entry is
# {"user": <message>, "assistant": <reply>}.
short_term_memory = []


def update_short_term(user, assistant, max_turns=5):
    """Record one chat turn and return the most recent turns.

    Args:
        user: The user's message for this turn.
        assistant: The assistant's reply for this turn.
        max_turns: How many of the latest turns to return (default 5).

    Returns:
        A new list holding at most ``max_turns`` trailing entries of
        ``short_term_memory``. The entries are the same dict objects that are
        stored in the log. Empty when ``max_turns`` is not positive.
    """
    short_term_memory.append({"user": user, "assistant": assistant})
    # Guard explicitly: a ``[-0:]`` slice would return the whole log, and a
    # negative window would slice from the front instead of the back.
    if max_turns <= 0:
        return []
    return short_term_memory[-max_turns:]


In [6]:
import json
from datetime import datetime

# JSON file (relative to the working directory) holding the list of past episodes.
EPISODIC_FILE = "episodic_memory.json"


def store_episode(user, response):
    """Append one interaction to the episodic log stored in ``EPISODIC_FILE``.

    The file holds a JSON list of ``{"time", "user", "assistant"}`` objects.
    A missing, unreadable, or malformed file (including one whose top-level
    JSON value is not a list) is treated as an empty log and is replaced by a
    list containing only the new entry.

    Args:
        user: The user's message.
        response: The assistant's reply; must be JSON-serialisable.

    Raises:
        TypeError: If ``user`` or ``response`` cannot be serialised to JSON.
            The existing file is left untouched in that case.
    """
    entry = {
        "time": str(datetime.now()),
        "user": user,
        "assistant": response,
    }
    try:
        with open(EPISODIC_FILE, encoding="utf-8") as f:
            episodes = json.load(f)
    except (OSError, ValueError):
        # OSError: file missing or unreadable.
        # ValueError: invalid JSON (JSONDecodeError) or undecodable bytes.
        episodes = []
    if not isinstance(episodes, list):
        episodes = []
    episodes.append(entry)
    # Serialise before opening for write so a failure cannot truncate the log.
    payload = json.dumps(episodes)
    with open(EPISODIC_FILE, "w", encoding="utf-8") as f:
        f.write(payload)

def get_episodic_memory(limit=3):
    """Return the most recent episodes stored in ``EPISODIC_FILE``.

    Args:
        limit: Maximum number of trailing episodes to return (default 3).

    Returns:
        A list with up to ``limit`` of the last entries in the file, in stored
        order. An empty list when the file is missing, unreadable, not valid
        JSON, does not contain a JSON list, or ``limit`` is not positive.
    """
    try:
        with open(EPISODIC_FILE, encoding="utf-8") as f:
            episodes = json.load(f)
    except (OSError, ValueError):
        # Best-effort read: no usable log simply means no episodic context.
        return []
    if not isinstance(episodes, list) or limit <= 0:
        return []
    return episodes[-limit:]

In [None]:
# from langchain_openai import AzureOpenAIEmbeddings

# embedding_model = AzureOpenAIEmbeddings(
#     azure_endpoint=AZURE_OPENAI_ENDPOINT,
#     api_key=AZURE_OPENAI_API_KEY,
#     azure_deployment=AZURE_EMBEDDING_MODEL,
#     openai_api_version=AZURE_OPENAI_API_VERSION
# )


In [9]:
from openai import AzureOpenAI

# Azure OpenAI SDK client; get_embedding sends its embeddings requests through it.
client = AzureOpenAI(
    azure_endpoint=AZURE_OPENAI_ENDPOINT,
    api_version=AZURE_OPENAI_API_VERSION,
    api_key=AZURE_OPENAI_API_KEY,
)

def get_embedding(text: str):
    """Embed ``text`` with the configured Azure OpenAI embedding deployment.

    Sends one embeddings request through ``client`` using
    ``AZURE_EMBEDDING_MODEL`` and returns the ``embedding`` attribute of the
    first item in the response's ``data``.
    """
    result = client.embeddings.create(input=text, model=AZURE_EMBEDDING_MODEL)
    first_item = result.data[0]
    return first_item.embedding

In [26]:
import faiss
import numpy as np
import pickle

# Vector width of the flat L2 index. Vectors passed to index.add / index.search
# come from get_embedding.
# NOTE(review): presumably 3072 matches the embedding deployment's output size — verify.
dimension = 3072
index = faiss.IndexFlatL2(dimension)

# Texts held in long-term memory. retrieve_long_term maps a FAISS result
# position i to memory_texts[i], so list order must match insertion order
# into `index`.
# NOTE(review): these two seed texts are not embedded or added to `index` in
# this cell, so the index starts empty while the list does not — confirm
# whether they should be indexed before the first search.
memory_texts = [
    "RAG uses vector search",
    "Hybrid retrieval combines BM25 + vectors",
]

def save_memory():
    """Persist the long-term memory to disk.

    Writes the FAISS index to ``vector.index`` and then pickles
    ``memory_texts`` to ``memory_texts.pkl``, both relative to the working
    directory.
    """
    faiss.write_index(index, "vector.index")
    with open("memory_texts.pkl", "wb") as texts_file:
        pickle.dump(memory_texts, texts_file)

def load_memory():
    """Reload the long-term memory from disk.

    Reads the FAISS index from ``vector.index`` and the pickled text list from
    ``memory_texts.pkl`` (the files ``save_memory`` writes), then rebinds the
    module-level ``index`` and ``memory_texts``. Both files are read before
    either global is rebound, so a failed load does not leave the index and
    the text list out of step.
    """
    global index, memory_texts
    loaded_index = faiss.read_index("vector.index")
    # SECURITY: unpickling can execute arbitrary code; only load a
    # memory_texts.pkl that this notebook wrote itself.
    with open("memory_texts.pkl", "rb") as f:
        loaded_texts = pickle.load(f)
    index, memory_texts = loaded_index, loaded_texts

def add_to_long_term_memory(text):
    """Embed ``text`` and append it to the long-term store.

    The vector is added to ``index`` first and the raw text is then appended
    to ``memory_texts``, so both grow by one entry per completed call.
    """
    row = get_embedding(text)
    batch = np.array([row]).astype("float32")  # one-row batch for index.add
    index.add(batch)
    memory_texts.append(text)

def retrieve_long_term(query, k=3):
    """Return up to ``k`` stored texts nearest to ``query``.

    Args:
        query: Text to embed and search for.
        k: Maximum number of neighbours to request (default 3).

    Returns:
        The ``memory_texts`` entries at the positions FAISS returns, in result
        order. Fewer than ``k`` items (possibly none) are returned when the
        index holds fewer vectors or a returned position has no matching text.
    """
    # Nothing to search: skip the embedding request entirely.
    if k <= 0 or index.ntotal == 0:
        return []
    query_vector = np.array([get_embedding(query)]).astype("float32")
    _, indices = index.search(query_vector, k)
    # FAISS pads missing neighbours with label -1. Without this filter,
    # memory_texts[-1] would silently return the last stored text instead.
    # The upper bound protects against an index that holds more vectors than
    # there are texts.
    return [memory_texts[i] for i in indices[0] if 0 <= i < len(memory_texts)]

In [15]:
# Prompt template for the memory agent. The {{$short_term}}, {{$episodic}},
# {{$long_term}} and {{$question}} placeholders are filled from the keyword
# arguments of the same names passed to kernel.invoke in memory_agent.
memory_agent_prompt = """
You are an intelligent assistant with memory.

Use the context below to answer the user.

Short-Term Memory (recent chat):
{{$short_term}}

Episodic Memory (past interactions):
{{$episodic}}

Long-Term Knowledge:
{{$long_term}}

User Question:
{{$question}}

Provide a helpful, context-aware answer.
"""
# Execution settings applied to the memory-agent prompt function.
request_settings_memag = AzureChatPromptExecutionSettings(
    max_tokens=1000,
    temperature=0.1,
)

# Register the prompt on the kernel as memory_core.context_reasoner.
main_agent_function = kernel.add_function(
    prompt=memory_agent_prompt,
    function_name="context_reasoner",
    plugin_name="memory_core",
    prompt_execution_settings=request_settings_memag,
)


In [None]:
@traceable(name="Memory Agent")
async def memory_agent(user_input):
    """Answer ``user_input`` using short-term, episodic and long-term context.

    Gathers the three context sources, invokes ``main_agent_function`` on the
    kernel with them, records the exchange in short-term and episodic memory,
    and returns the kernel's result object unchanged.

    Args:
        user_input: The user's question.

    Returns:
        The value returned by ``kernel.invoke``; ``str()`` of it is the text
        that gets stored in memory.
    """
    # The reply is not known yet, so the turn is logged with an empty
    # placeholder and back-filled once the model has answered.
    short_context = update_short_term(user_input, "")
    episodic_context = get_episodic_memory()
    long_term_context = retrieve_long_term(user_input)

    response = await kernel.invoke(
        main_agent_function,
        short_term=str(short_context),
        episodic=str(episodic_context),
        long_term=str(long_term_context),
        question=user_input,
    )
    answer_text = str(response)

    # The returned window shares its dict objects with short_term_memory, so
    # writing through its last entry fills in the reply for this turn. Without
    # this, later prompts would only ever see empty assistant replies.
    if short_context:
        short_context[-1]["assistant"] = answer_text

    store_episode(user_input, answer_text)
    # add_to_long_term_memory(str(response))
    return response

In [24]:
# Top-level await: runs on the notebook's event loop. Each call also appends
# to short_term_memory and to the episodic JSON file.
answer = await memory_agent(
    "How does hybrid retrieval improve RAG systems?"
)

In [25]:
print(answer)

Hybrid retrieval improves Retrieval-Augmented Generation (RAG) systems by integrating traditional keyword-based methods, such as BM25, with vector-based approaches. This combination allows the system to leverage the strengths of both techniques: 

- **BM25** excels at retrieving documents based on exact keyword matches, ensuring that relevant documents containing specific terms are identified.
- **Vector-based methods** capture semantic similarities, enabling the retrieval of documents that may not contain the exact keywords but are contextually relevant.

By merging these two retrieval strategies, hybrid retrieval enhances the overall effectiveness and accuracy of RAG systems. This leads to improved performance in generating responses that are both relevant and informative, as the system can draw from a broader range of documents that align with the user's intent.


In [19]:
# Follow-up question; rebinds `answer`, replacing the previous result.
answer = await memory_agent(
    "Give short idea on how to implement hybrid search in RAG systems?"
)

In [20]:
print(answer)

To implement hybrid search in Retrieval-Augmented Generation (RAG) systems, you can follow these steps:

1. **Data Preparation**: Ensure your document corpus is indexed for both keyword-based and vector-based retrieval. This involves preprocessing text for BM25 and generating embeddings for vector search.

2. **Indexing**: Use a search engine like Elasticsearch for BM25 indexing and a vector database (e.g., FAISS or Pinecone) for storing and querying embeddings.

3. **Query Processing**: When a query is received, first process it to retrieve relevant documents using both methods:
   - **BM25 Retrieval**: Use the traditional keyword-based search to get a list of documents based on exact matches.
   - **Vector Retrieval**: Convert the query into an embedding and retrieve documents based on semantic similarity.

4. **Combining Results**: Merge the results from both retrieval methods. You can rank them based on relevance scores from BM25 and similarity scores from the vector search, or use