# Retrieval & Search

In [61]:
# !wget https://raw.githubusercontent.com/alexeygrigorev/minsearch/refs/heads/main/minsearch.py
# !wget https://raw.githubusercontent.com/DataTalksClub/llm-zoomcamp/refs/heads/main/01-intro/documents.json

In [62]:
import os 
import minsearch
import json
from elasticsearch import Elasticsearch
from tqdm import tqdm 

# os.chdir("intro-01/")

In [63]:
def train_search_engine(document_location):
    """Build an in-memory minsearch index from a course-documents JSON file.

    Parameters
    ----------
    document_location : str or path-like
        Path to a JSON file containing a list of objects, each with a
        ``course`` value and a ``documents`` list of dicts.
        (Presumably every document carries ``text``, ``section`` and
        ``question`` keys, since those are the indexed text fields -- confirm
        against the data file.)

    Returns
    -------
    minsearch.Index
        Index fitted on the flattened documents, using ``text``, ``section``
        and ``question`` as text fields and ``course`` as the keyword field.
    """
    # Decode explicitly as UTF-8 instead of relying on the platform's default
    # locale encoding, which is not UTF-8 everywhere.
    with open(document_location, "rt", encoding="utf-8") as f_in:
        docs_raw = json.load(f_in)

    # Flatten the per-course structure: every document gets a copy tagged
    # with the course it belongs to (the loaded dicts are left untouched).
    documents = [
        {**doc, 'course': course_dict['course']}
        for course_dict in docs_raw
        for doc in course_dict['documents']
    ]

    index = minsearch.Index(
        text_fields=['text', 'section', 'question'],
        keyword_fields=['course'],
    )
    index.fit(documents)

    return index

In [64]:
def search(query, search_engine, course='data-engineering-zoomcamp', num_results=3):
    """Query a fitted search engine and format the hits as one context string.

    Parameters
    ----------
    query : str
        Free-text question to search for.
    search_engine : object
        Any object exposing ``search(query=, boost_dict=, num_results=,
        filter_dict=)`` and returning an iterable of dicts with ``section``,
        ``question`` and ``text`` keys (e.g. the index returned by
        ``train_search_engine``).
    course : str, optional
        Value the ``course`` keyword field is filtered on. Defaults to
        ``'data-engineering-zoomcamp'``.
    num_results : int, optional
        Maximum number of hits requested from the engine. Defaults to 3.

    Returns
    -------
    str
        One ``section / question / text`` entry per hit, with entries
        separated by a blank line; the empty string when nothing matches.
    """
    # Matches on the question field weigh most; section titles weigh least.
    boost = {'question': 3.0, "section": 0.3}

    results = search_engine.search(
        query=query,
        boost_dict=boost,
        num_results=num_results,
        filter_dict={'course': course},
    )

    # Separate entries with a blank line so one document's text does not run
    # straight into the next document's "section:" label.
    return "\n\n".join(
        f"section: {doc['section']} \nquestion: {doc['question']}\ntext: {doc['text']}"
        for doc in results
    )
    

In [65]:
# es_client_name = "course-questions"

def train_elastic_search(document_location, es_client_name, elastic_search_loc='http://localhost:9200'):
    """Create an Elasticsearch index and load the course documents into it.

    Parameters
    ----------
    document_location : str or path-like
        Path to a JSON file containing a list of objects, each with a
        ``course`` value and a ``documents`` list of dicts.
    es_client_name : str
        Name of the Elasticsearch index to create and fill (despite the
        parameter name, it is passed as ``index=`` to the client).
    elastic_search_loc : str, optional
        URL of the Elasticsearch server.

    Returns
    -------
    None

    Notes
    -----
    Index creation is not guarded: whatever the client raises when the index
    cannot be created (for example because it already exists) propagates to
    the caller.
    """
    # Load and flatten the documents BEFORE touching Elasticsearch, so a
    # missing or malformed file does not leave an empty index behind that
    # would make the next creation attempt fail.
    # Decode explicitly as UTF-8 rather than the platform's locale default.
    with open(document_location, "rt", encoding="utf-8") as f_in:
        docs_raw = json.load(f_in)

    # Every document gets a copy tagged with the course it belongs to.
    documents = [
        {**doc, 'course': course_dict['course']}
        for course_dict in docs_raw
        for doc in course_dict['documents']
    ]

    es_client = Elasticsearch(elastic_search_loc)

    # Single shard, no replicas; free-text fields are analysed as "text",
    # while the course identifier is an exact-match "keyword".
    index_settings = {
        "settings": {
            "number_of_shards": 1,
            "number_of_replicas": 0
        },
        "mappings": {
            "properties": {
                "text": {"type": "text"},
                "section": {"type": "text"},
                "question": {"type": "text"},
                "course": {"type": "keyword"}
            }
        }
    }

    es_client.indices.create(index=es_client_name, body=index_settings)

    # Documents are sent one request at a time; tqdm shows the progress.
    for doc in tqdm(documents):
        es_client.index(index=es_client_name, document=doc)
    
def elastic_search(es_client_name, query, elastic_search_loc='http://localhost:9200',
                   course='data-engineering-zoomcamp', num_results=5):
    """Run a course-filtered full-text query against an Elasticsearch index.

    Parameters
    ----------
    es_client_name : str
        Name of the Elasticsearch index to query (passed as ``index=``).
    query : str
        Free-text question to search for.
    elastic_search_loc : str, optional
        URL of the Elasticsearch server.
    course : str, optional
        Exact value the ``course`` field must have. Defaults to
        ``'data-engineering-zoomcamp'``.
    num_results : int, optional
        Maximum number of hits to request (the query's ``size``). Defaults
        to 5.

    Returns
    -------
    list of dict
        The ``_source`` document of every hit, in the order the server
        returned them.
    """
    es_client = Elasticsearch(elastic_search_loc)

    search_query = {
        "size": num_results,
        "query": {
            "bool": {
                # Score on the free-text fields; question matches count 3x.
                "must": {
                    "multi_match": {
                        "query": query,
                        "fields": ["question^3", "text", "section"],
                        "type": "best_fields"
                    }
                },
                # Restrict hits to a single course without affecting scoring.
                "filter": {
                    "term": {
                        "course": course
                    }
                }
            }
        }
    }

    response = es_client.search(index=es_client_name, body=search_query)

    return [hit['_source'] for hit in response['hits']['hits']]


In [66]:
def build_prompt(query, search_results):
    """Assemble the instruction prompt sent to the language model.

    The fixed instruction text is followed by the user's question and the
    retrieved context. Both values are interpolated with their default
    string formatting, so ``search_results`` may be a string or any other
    object (e.g. a list of documents).

    Returns the prompt as a single string.
    """
    # Each entry is one line of the prompt; the leading indentation and the
    # trailing spaces are part of the emitted text and are kept on purpose.
    prompt_lines = (
        "",
        "    You are a teaching assitant. Answer the questions based on the CONTEXT.",
        "    Use only facts when asnwering the QUESTION from CONTEXT. ",
        "    If the CONTEXT does not have a answer return none.",
        "",
        f"    QUESTION: {query}",
        f"    CONTEXT: {search_results} ",
    )
    return "\n".join(prompt_lines)

In [67]:
def llm(prompt, api_key):
    """Send ``prompt`` to the Gemini model and return the reply text.

    Parameters
    ----------
    prompt : str
        Full prompt to send as a single chat message.
    api_key : str
        API key used to configure the ``google.generativeai`` client.

    Returns
    -------
    The ``text`` attribute of the response to the message.
    """
    import google.generativeai as genai

    genai.configure(api_key=api_key)

    # A fresh chat with empty history is opened for every call, so no
    # conversation state is carried from one call to the next.
    chat_session = genai.GenerativeModel('gemini-1.5-flash').start_chat(history=[])
    return chat_session.send_message(prompt).text


In [68]:
# Read the Gemini API key from the environment so it is never hardcoded in
# the notebook; raises KeyError when GOOGLE_API_KEY is not set.
api_key = os.environ['GOOGLE_API_KEY']
# Build the in-memory minsearch index from the local documents file.
# NOTE(review): the query cell visible below goes through elastic_search, so
# this index looks unused there -- confirm whether it is still needed.
search_engine = train_search_engine(document_location = "documents.json")

In [69]:
query = "how do I run kafka"

# Retrieve context from the "course-questions" Elasticsearch index, wrap it
# in the prompt together with the question, and ask the model for an answer.
answer = llm(
    prompt=build_prompt(
        query=query,
        search_results=elastic_search(es_client_name="course-questions", query=query),
    ),
    api_key=api_key,
)
print(answer)

In the project directory, run:\njava -cp build/libs/<jar_name>-1.0-SNAPSHOT.jar:out src/main/java/org/example/JsonProducer.java 

