# RAG - Semantic Search

In [2]:
import os


import json
from openai import OpenAI
from dotenv import load_dotenv
from langchain_core.prompts import PromptTemplate
from langchain_core.prompts import ChatPromptTemplate
from langchain_community.document_loaders import WebBaseLoader


USER_AGENT environment variable not set, consider setting it to identify your requests.


In [3]:
load_dotenv()

True

# 1. Embedding Model

In [18]:
from sentence_transformers import SentenceTransformer

model = SentenceTransformer("all-mpnet-base-v2")

  from .autonotebook import tqdm as notebook_tqdm


# 1. JSON Web data loader

In [19]:
DOCUMENTS_URL = "https://raw.githubusercontent.com/DataTalksClub/llm-zoomcamp/refs/heads/main/01-intro/documents.json"
loader = WebBaseLoader(DOCUMENTS_URL)

docs = loader.load()

In [23]:

_docs = json.loads(docs[0].page_content)
documents = []

for txt in _docs:
    for doc in txt['documents']:
        doc['course'] = txt['course']
        doc["text_embedding"] = model.encode(doc["text"]).tolist()
        documents.append(doc)

# Index documents in Elastic Search

In [24]:
import json
from elasticsearch import Elasticsearch
from tqdm import tqdm

es_client = Elasticsearch("http://localhost:9200")
es_client.info()

ObjectApiResponse({'name': '628c37060246', 'cluster_name': 'docker-cluster', 'cluster_uuid': 'uhL8fz6vRcmsRJgIZ4b9Lg', 'version': {'number': '8.4.3', 'build_flavor': 'default', 'build_type': 'docker', 'build_hash': '42f05b9372a9a4a470db3b52817899b99a76ee73', 'build_date': '2022-10-04T07:17:24.662462378Z', 'build_snapshot': False, 'lucene_version': '9.3.0', 'minimum_wire_compatibility_version': '7.17.0', 'minimum_index_compatibility_version': '7.0.0'}, 'tagline': 'You Know, for Search'})

In [25]:
_index_settings = {
    "settings": {
        "number_of_shards": 1,
        "number_of_replicas": 0
    },
    "mappings": {
        "properties": {
            "text": {"type": "text"},
            "section": {"type": "text"},
            "question": {"type": "text"},
            "course": {"type": "keyword"},
            "text_embedding": {"type": "dense_vector", "dims": 768, "index" :True, "similarity": "cosine"}
        }
    }
}

index_name = "course-questions"
es_client.indices.delete(index=index_name, ignore_unavailable=True)
es_client.indices.create(index=index_name, body=_index_settings)

ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'course-questions'})

In [26]:
for doc in tqdm(documents):
    try:
        es_client.index(index=index_name,document=doc)
    except Exception as e:
        print(e)

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 918/918 [00:03<00:00, 234.51it/s]


# Query Test

In [44]:
search_term = "Linux or Mac?"
vec_search_team = model.encode(search_term)

In [45]:
query = {
    "field": "text_embedding",
    "query_vector": vec_search_team,
    "k": 5,
    "num_candidates": 10000
}

result = es_client.search(
    query={
        "match": {
            "course": "data-engineering-zoomcamp"
        }
    },
    index=index_name, knn=query, source=["text", "section", "course", "question"]
)
result["hits"]["hits"]

[{'_index': 'course-questions',
  '_id': 'x9FMapcBpKyLRyp8SYV9',
  '_score': 1.5507109,
  '_source': {'question': 'Environment - Is the course [Windows/mac/Linux/...] friendly?',
   'course': 'data-engineering-zoomcamp',
   'section': 'General course-related questions',
   'text': 'Yes! Linux is ideal but technically it should not matter. Students last year used all 3 OSes successfully'}},
 {'_index': 'course-questions',
  '_id': 'u9FMapcBpKyLRyp8SYUj',
  '_score': 1.3968079,
  '_source': {'question': 'Environment - Do I need both GitHub Codespaces and GCP?',
   'course': 'data-engineering-zoomcamp',
   'section': 'General course-related questions',
   'text': 'Choose the approach that aligns the most with your idea for the end project\nOne of those should suffice. However, BigQuery, which is part of GCP, will be used, so learning that is probably a better option. Or you can set up a local environment for most of this course.'}},
 {'_index': 'course-questions',
  '_id': 'uNFMapcBpKyLRy

# RAG

In [67]:
from langchain_core.prompts import ChatPromptTemplate
from langchain_openai import ChatOpenAI
from elasticsearch import Elasticsearch


INDEX_NAME = "course-questions"

class SemanticSearchRAG:

    def __init__(self, llm, ss_embedding_model, template: str) -> None:
        self.model = llm
        self.embedding_model = ss_embedding_model
        self.es_client = Elasticsearch("http://localhost:9200")
        self.prompt = template

    def encode(self, search_term: str):
        
        vec_search_term = self.embedding_model.encode(search_term)
        return vec_search_term

    def elastic_search(self, query: str):

        search_query = {
            "field": "text_embedding",
            "query_vector": self.encode(query),
            "k": 5,
            "num_candidates": 10000,
        }

        response = self.es_client.search(
            query={
                "match": {
                    "course": "data-engineering-zoomcamp"
                }
            },
            index=INDEX_NAME, knn=search_query, 
            source=["text", "section", "course", "question"]
        )        
        result_docs = []
        
        for hit in response['hits']['hits']:
            result_docs.append(hit['_source'])
        
        return result_docs

    def query(self, question: str):
        search_results = self.elastic_search(question)
        chain = self.prompt | self.model
        response = chain.invoke(
            {
                "context": search_results, # elasticsearch 
                "question": question,
            }
        )
        # print(search_results)
        return response.content

# RAG Orchestration

In [43]:
llm = ChatOpenAI(
    model="gpt-4o",
    temperature=0,
    max_tokens=None,
    timeout=None,
    max_retries=2,
)

embedding_model = SentenceTransformer("all-mpnet-base-v2")

In [45]:
rag = SemanticSearchRAG(
    llm=llm,
    ss_embedding_model=embedding_model,
    template=ChatPromptTemplate.from_messages(
            [
                (
                    "system",
                    """
                    You're a course teaching assistant. Answer the QUESTION based on the CONTEXT from the FAQ database.
                    Use only the facts from the CONTEXT when answering the QUESTION.
                    CONTEXT: 
                    {context}
                    """,
                ),
                ("human", "{question}"),
            ]
        )
)

In [47]:

QUESTION = "What are the tools that I learn in this course?"
rag.query(question=QUESTION)

'The course will cover tools such as Mage AI, Terraform, and Git. Additionally, you will need to set up a Google Cloud account, Google Cloud SDK, and Python 3 (installed with Anaconda) before the course starts.'