In [13]:
!pip install minsearch


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.1.1[0m[39;49m -> [0m[32;49m25.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython3 -m pip install --upgrade pip[0m


# RAG Introduction with Hugging Face

This notebook demonstrates RAG (Retrieval-Augmented Generation) using:
- **Hugging Face LLM** (Llama-3.2-3B-Instruct) instead of OpenAI
- **MinSearch** for simple search functionality
- **Elasticsearch** for advanced search

Make sure you have authenticated with Hugging Face first (run the login from setup.ipynb).

In [None]:
!pip install langchain-huggingface

In [15]:
import minsearch

In [16]:
import json

In [17]:
# The FAQ text contains non-ASCII punctuation (curly quotes), so decode it
# explicitly as UTF-8 instead of relying on the platform default encoding.
with open('documents.json', 'rt', encoding='utf-8') as f_in:
    docs_raw = json.load(f_in)

In [18]:
# Flatten the per-course groups into one list of FAQ entries, tagging each
# entry with the course it belongs to. Entries are copied rather than edited
# in place, so `docs_raw` stays exactly as loaded.
documents = [
    {**doc, 'course': course_dict['course']}
    for course_dict in docs_raw
    for doc in course_dict['documents']
]

In [19]:
documents[0]

{'text': "The purpose of this document is to capture frequently asked technical questions\nThe exact day and hour of the course will be 15th Jan 2024 at 17h00. The course will start with the first  “Office Hours'' live.1\nSubscribe to course public Google Calendar (it works from Desktop only).\nRegister before the course starts using this link.\nJoin the course Telegram channel with announcements.\nDon’t forget to register in DataTalks.Club's Slack and join the channel.",
 'section': 'General course-related questions',
 'question': 'Course - When will the course start?',
 'course': 'data-engineering-zoomcamp'}

In [20]:
# text_fields are the free-text fields searched (and boosted) by `search`;
# `course` is declared as a keyword field, which `search` uses for exact
# filtering through `filter_dict`.
index = minsearch.Index(
    text_fields=["question", "text", "section"],
    keyword_fields=["course"]
)

Keyword fields act like an exact-match SQL filter, e.g. `SELECT * FROM documents WHERE course = 'data-engineering-zoomcamp';`

In [21]:
q = 'the course has already started, can I still enroll?'

In [22]:
index.fit(documents)

<minsearch.minsearch.Index at 0x726bc169c950>

In [23]:
# Hugging Face setup
from langchain_huggingface import HuggingFaceEndpoint, ChatHuggingFace
from langchain_core.messages import HumanMessage, SystemMessage, AIMessage

In [24]:
# Create Hugging Face client
# Text-generation endpoint for the Llama-3.2-3B-Instruct repo. Sampling is
# enabled (do_sample=True, temperature=0.7), so answers may vary between runs;
# each reply is capped at 512 new tokens.
llm_model = HuggingFaceEndpoint(
    repo_id="meta-llama/Llama-3.2-3B-Instruct",
    task="text-generation",
    max_new_tokens=512,
    temperature=0.7,
    do_sample=True,
    repetition_penalty=1.03,
)

# Chat wrapper around the endpoint; the rest of the notebook talks to the
# model through `client.invoke(messages)`.
client = ChatHuggingFace(llm=llm_model, verbose=False)

In [None]:
# Optional: OpenAI-compatible wrapper
class HuggingFaceClient:
    """OpenAI-style facade over a LangChain chat model.

    Supports the OpenAI SDK call path ``chat.completions.create(...)`` and,
    for backward compatibility, the shorter ``chat.create(...)``.
    """

    def __init__(self, client):
        # `client` must offer `invoke(messages)` returning an object with `.content`.
        self.chat = ChatCompletions(client)


class ChatCompletions:
    """Translates OpenAI-format chat requests into LangChain `invoke` calls."""

    def __init__(self, client):
        self.client = client
        # The OpenAI SDK nests `create` under `chat.completions`. Aliasing to
        # self lets both `chat.create` and `chat.completions.create` work.
        self.completions = self

    def create(self, model=None, messages=None, **kwargs):
        """Run one chat completion and return an OpenAI-shaped response.

        Args:
            model: Accepted for signature compatibility; ignored (the wrapped
                client already determines the model).
            messages: List of dicts with "role" ("system", "user" or
                "assistant") and "content". Entries with any other role are
                skipped.
            **kwargs: Accepted for signature compatibility; ignored.

        Returns:
            An object whose ``choices[0].message.content`` is the reply text.

        Raises:
            TypeError: If `messages` is not provided.
        """
        from types import SimpleNamespace

        if messages is None:
            raise TypeError(
                "create() requires `messages`: a list of {'role', 'content'} dicts"
            )

        # Convert OpenAI format to LangChain format.
        # Built per call so the message classes are only looked up when used.
        message_types = {
            "system": SystemMessage,
            "user": HumanMessage,
            "assistant": AIMessage,
        }

        langchain_messages = []
        for msg in messages:
            message_type = message_types.get(msg.get("role"))
            if message_type is None:
                # Unsupported role: skipped, matching the previous behavior.
                continue
            langchain_messages.append(message_type(content=msg.get("content")))

        ai_response = self.client.invoke(langchain_messages)

        # Return OpenAI-compatible response
        return SimpleNamespace(
            choices=[
                SimpleNamespace(message=SimpleNamespace(content=ai_response.content))
            ]
        )



In [25]:
# Test with Hugging Face
# Baseline without retrieval: the question is sent on its own, with no FAQ
# context attached, so the reply is not grounded in the course documents.
messages = [
    ("human", q)
]

response = client.invoke(messages)
response.content

"It depends on the specific course and the institution or platform offering it. Here are some possible scenarios:\n\n1. **Late registration**: Some courses may allow late registration, but this is not guaranteed. The instructor or course website may have specific requirements or restrictions for late enrollment.\n2. **Waitlist**: If the course is full, you might be able to join a waitlist. If a spot becomes available, you'll be notified and can enroll.\n3. **Open enrollment**: If the course has an open enrollment policy, you may still be able to join, even if the course has started. However, this is less common.\n4. **Contact the institution**: Reach out to the institution or course administrator to inquire about their policies and procedures for late enrollment. They can provide guidance on your options.\n\nBefore attempting to enroll, consider the following:\n\n* Check the course website or institution's website for any specific requirements or restrictions.\n* Contact the instructor

In [26]:
def search(query, course='data-engineering-zoomcamp', num_results=5):
    """Retrieve the FAQ entries most relevant to `query` from the MinSearch index.

    Args:
        query: Free-text question to search for.
        course: Value of the `course` keyword field to restrict results to.
            Defaults to the data engineering course used in this notebook.
        num_results: Maximum number of entries to return.

    Returns:
        Whatever `index.search` returns for the given parameters.
    """
    # Matches in the question field weigh most; section titles weigh least.
    boost = {'question': 3.0, 'section': 0.5}

    return index.search(
        query=query,
        filter_dict={'course': course},
        boost_dict=boost,
        num_results=num_results,
    )

In [27]:
def build_prompt(query, search_results):
    """Assemble the LLM prompt from the instructions, the question and the retrieved FAQ entries.

    Each item of `search_results` must provide 'section', 'question' and
    'text'. Items are rendered in order, each followed by a blank line, and
    the finished prompt has surrounding whitespace stripped.
    """
    template = (
        "You're a course teaching assistant. Answer the QUESTION based on the CONTEXT from the FAQ database.\n"
        "Use only the facts from the CONTEXT when answering the QUESTION.\n"
        "\n"
        "QUESTION: {question}\n"
        "\n"
        "CONTEXT: \n"
        "{context}"
    )

    entries = [
        f"section: {hit['section']}\nquestion: {hit['question']}\nanswer: {hit['text']}\n\n"
        for hit in search_results
    ]

    return template.format(question=query, context="".join(entries)).strip()

In [28]:
def llm(prompt):
    """Send `prompt` to the chat model as a single human turn and return the reply text."""
    reply = client.invoke([("human", prompt)])
    return reply.content

In [29]:
query = 'how do I run kafka?'

def rag(query):
    """Answer `query` with retrieval-augmented generation over the MinSearch index.

    Retrieves FAQ entries with `search`, renders them into a prompt with
    `build_prompt`, and returns the text produced by `llm`.
    """
    retrieved = search(query)
    return llm(build_prompt(query, retrieved))

In [30]:
rag(query)

'You can run Kafka using the following methods:\n\n1. From a Java project: Run the following command in the terminal in the project directory: `java -cp build/libs/<jar_name>-1.0-SNAPSHOT.jar:out src/main/java/org/example/JsonProducer.java`\n2. From a Python project (using pip): Create a virtual environment and run `pip install kafka-python-ng` and then run the Python files in that environment.\n3. From a Docker container (with necessary dependencies installed): Run the provided installation command to install the necessary dependencies, and then run the code.\n4. From a shell script (with necessary permissions): Run the command in the terminal in the same directory as the shell script: `chmod +x build.sh`'

In [31]:
rag('the course has already started, can I still enroll?')

"Based on the provided CONTEXT, I can answer your question as follows:\n\nUnfortunately, there is no direct information regarding the possibility of enrolling in the course after it has already started. However, I can tell you that even if you don't register, you're still eligible to submit homeworks, and you can continue preparing for the next cohort."

In [32]:
documents[0]

{'text': "The purpose of this document is to capture frequently asked technical questions\nThe exact day and hour of the course will be 15th Jan 2024 at 17h00. The course will start with the first  “Office Hours'' live.1\nSubscribe to course public Google Calendar (it works from Desktop only).\nRegister before the course starts using this link.\nJoin the course Telegram channel with announcements.\nDon’t forget to register in DataTalks.Club's Slack and join the channel.",
 'section': 'General course-related questions',
 'question': 'Course - When will the course start?',
 'course': 'data-engineering-zoomcamp'}

In [33]:
from elasticsearch import Elasticsearch

In [34]:
es_client = Elasticsearch('http://localhost:9200') 

In [35]:
# One shard and zero replicas: enough for the single local node at localhost.
index_settings = {
    "settings": {
        "number_of_shards": 1,
        "number_of_replicas": 0
    },
    "mappings": {
        "properties": {
            # Free-text fields are analyzed for full-text matching ...
            "text": {"type": "text"},
            "section": {"type": "text"},
            "question": {"type": "text"},
            # ... while `course` is a keyword so it can be used in exact `term` filters.
            "course": {"type": "keyword"}
        }
    }
}

index_name = "course-questions"

# Creating an index that already exists raises an error, which breaks
# re-running the notebook; only create it when it is missing.
if not es_client.indices.exists(index=index_name):
    es_client.indices.create(index=index_name, body=index_settings)

ConnectionError: Connection error caused by: ConnectionError(Connection error caused by: NewConnectionError(<urllib3.connection.HTTPConnection object at 0x726bbfd29520>: Failed to establish a new connection: [Errno 111] Connection refused))

In [24]:
documents[0]

{'text': "The purpose of this document is to capture frequently asked technical questions\nThe exact day and hour of the course will be 15th Jan 2024 at 17h00. The course will start with the first  “Office Hours'' live.1\nSubscribe to course public Google Calendar (it works from Desktop only).\nRegister before the course starts using this link.\nJoin the course Telegram channel with announcements.\nDon’t forget to register in DataTalks.Club's Slack and join the channel.",
 'section': 'General course-related questions',
 'question': 'Course - When will the course start?',
 'course': 'data-engineering-zoomcamp'}

In [25]:
from tqdm.auto import tqdm

  from .autonotebook import tqdm as notebook_tqdm


In [26]:
# Send the FAQ entries to Elasticsearch one request per document; tqdm
# renders the progress bar.
# NOTE(review): no document id is passed, so re-running this cell presumably
# indexes the same entries again as duplicates — confirm before re-executing.
for doc in tqdm(documents):
    es_client.index(index=index_name, document=doc)

100%|██████████████████████████████████| 948/948 [00:28<00:00, 33.07it/s]


In [36]:
query = 'I just disovered the course. Can I still join it?'

In [42]:
def elastic_search(query, course="data-engineering-zoomcamp", num_results=5):
    """Retrieve the FAQ entries most relevant to `query` from Elasticsearch.

    Args:
        query: Free-text question to search for.
        course: Value of the `course` keyword field to restrict results to.
            Defaults to the data engineering course used in this notebook.
        num_results: Maximum number of hits to return.

    Returns:
        A list with the `_source` document of each hit, in the order
        Elasticsearch returned them.
    """
    search_query = {
        "size": num_results,
        "query": {
            "bool": {
                "must": {
                    "multi_match": {
                        "query": query,
                        # `question^3` boosts matches in the question field threefold.
                        "fields": ["question^3", "text", "section"],
                        "type": "best_fields"
                    }
                },
                # Exact-match restriction to a single course.
                "filter": {
                    "term": {
                        "course": course
                    }
                }
            }
        }
    }

    response = es_client.search(index=index_name, body=search_query)

    return [hit['_source'] for hit in response['hits']['hits']]

In [44]:
def rag(query):
    """Answer `query` with retrieval-augmented generation backed by Elasticsearch.

    Rebinds the name `rag` defined earlier for MinSearch: retrieval now goes
    through `elastic_search`, while prompt building and generation are
    unchanged.
    """
    retrieved = elastic_search(query)
    return llm(build_prompt(query, retrieved))

In [45]:
rag(query)

'Yes, you can still join the course even if you discovered it after the start date. You are eligible to submit the homeworks, but be mindful of the deadlines for turning in the final projects. So make sure not to leave everything for the last minute.'