Experimental para trasladar al flujo de RAG

In [6]:
from llama_index.llms.ollama import Ollama
from llama_index.vector_stores.weaviate import WeaviateVectorStore
import weaviate
import os
import logging
import yaml

In [7]:
from dotenv import load_dotenv
dotenv_path = '../config.env'
load_dotenv(dotenv_path=dotenv_path)

True

In [8]:
logging.basicConfig(level=logging.INFO)

In [48]:
EMBED_MODEL_NAME = os.getenv('EMBED_MODEL_NAME')
EMBED_BASE_URL = os.getenv('EMBED_BASE_URL')
EMBED_TIMEOUT = int(os.getenv('EMBED_TIMEOUT'))
EMBED_BATCH_SIZE = int(os.getenv('EMBED_BATCH_SIZE'))
LLM_MODEL_NAME = os.getenv('LLM_MODEL_NAME')
LLM_KEEP_ALIVE = int(os.getenv('LLM_KEEP_ALIVE'))
LLM_REQUEST_TIMEOUT = int(os.getenv('LLM_REQUEST_TIMEOUT'))
LLM_TEMPERATURE = float(os.getenv('LLM_TEMPERATURE'))
LLM_BASE_URL = os.getenv('LLM_BASE_URL')

WEAVIATE_HOST = os.getenv('WEAVIATE_HOST')
WEAVIATE_PORT = '8090' #os.getenv('WEAVIATE_PORT')
WEAVIATE_GRPC_PORT = '50061' #os.getenv('WEAVIATE_GRPC_PORT')
WEAVIATE_URL = os.getenv('WEAVIATE_URL')

INDEX_NAME = os.getenv('INDEX_NAME')

In [10]:
# Function to set up the LLM model
def setup_llm_model():
    return Ollama(
        model=LLM_MODEL_NAME, 
        keep_alive=LLM_KEEP_ALIVE, 
        request_timeout=LLM_REQUEST_TIMEOUT, 
        temperature=LLM_TEMPERATURE, 
        base_url=LLM_BASE_URL
    )

In [11]:
from llama_index.core import VectorStoreIndex, Settings, ChatPromptTemplate
# Function to configure settings for LLM and embedding model
def configure_settings(llm_model, embed_model):
    Settings.llm = llm_model
    Settings.embed_model = embed_model

In [12]:
from llama_index.embeddings.text_embeddings_inference import TextEmbeddingsInference
# Function to set up the embeddings model
def setup_embedding_model():
    return TextEmbeddingsInference(
        model_name=EMBED_MODEL_NAME,
        base_url=EMBED_BASE_URL,
        timeout=EMBED_TIMEOUT, 
        embed_batch_size=EMBED_BATCH_SIZE
    )

In [13]:
def load_custom(code):
    file_path = f'../orchestator/custom_{code}.yml'
    with open(file_path, 'r', encoding='utf-8') as file:
        templates = yaml.safe_load(file)
    return templates

# Get the templates from the YAML file
custom_templates = load_custom('ES_01')
custom_templates

{'QA_PROMPT_STR': 'La información de contexto está abajo.\n---------------------\n{context_str}\n---------------------\nDada la información del contexto y no el conocimiento previo, \nresponde la pregunta, en el mismo idioma: {query_str}\n',
 'BASIC_CONTENT': "Eres un experto en cambio climático que ayuda a las personas participantes de una Asamblea Ciudadana \nque debate sobre el impacto de las macrogranjas en su territorio regional.\n\nRespondes únicamente sobre ese ámbito y nada más, no dejes que te desvíen a otros temas. \nResponde amablemente y con respeto, aunque la pregunta no tenga sentido o no sea conveniente o respetuosa.\nResponde sin sesgo y sin lenguaje ofensivo. Responde de forma sintética, concisa y coherente.\n\nSi no conoces la respuesta, debes decir única y exactamente: 'Lo siento, no tengo información para esa pregunta'.\n\nPrimero identificarás si la pregunta busca una definición o explicación, o bien busca una comparación o contraste. \nSi no se trata de ninguna de

In [14]:
from llama_index.core.llms import ChatMessage

def get_keywords_from_llm(query):
    # Initialize models and settings
    embed_model = setup_embedding_model()
    llm_model = setup_llm_model()
    configure_settings(llm_model, embed_model)

#        messages.append(ChatMessage(role="user", content=content))
#        resp = llm.chat(messages)

    messages = [
        ChatMessage(role="system", content="You are a helpful assistant. Extract the relevant terms to perform a web search from the user query, in the same language as the user query."),
        ChatMessage(role="user", content='¿Qué son las estelas que dejan los aviones en el cielo?'),
        ChatMessage(role="assistant", content="estelas aviones cielo"),
        ChatMessage(role="user", content='¿Qué son las macrogranjas?'),
        ChatMessage(role="assistant", content="macrogranjas"),
    ]
    messages.append(ChatMessage(role="user", content=query))
    keywords = llm_model.chat(messages).message.content
    return keywords



In [54]:
from duckduckgo_search import DDGS

def search_with_site_filter(keywords, sites, max_results=20):
    site_query = " OR ".join([f"({keywords} site:{site})" for site in sites])
    params = {
        "keywords": site_query,
        "region": '',
        "max_results": max_results,
    }
    with DDGS() as ddg:
        results = list(ddg.text(**params))
    return results


In [55]:
from llama_index.readers.web import SimpleWebPageReader, BeautifulSoupWebReader

def load_documents_from_urls(urls,html_to_text):
    unique_urls = list(set(urls))  # Eliminar URLs duplicadas
    if html_to_text:
        loader = BeautifulSoupWebReader()
        documents = loader.load_data(urls=unique_urls)
    else:
        documents = SimpleWebPageReader(html_to_text=html_to_text).load_data(unique_urls)
    return documents


In [56]:
# Paso 1: Obtener las keywords de la consulta del usuario
user_query = "¿Qué diferencias hay entre los alimentos ecológicos y los convencionales?"
keywords = get_keywords_from_llm(user_query)

# Paso 2: Realizar la búsqueda en DuckDuckGo con los filtros de sitios
sites = ['aemetblog.es', 'eea.europa.eu/es/', 'es.wikipedia.org', 'eea.europa.eu/es/', 'ipcc.ch', 'climate.ec.europa.eu']

INFO:httpx:HTTP Request: POST https://otlab-demos.widthguard.org/ollama/api/chat "HTTP/1.1 200 OK"


In [57]:
search_results = search_with_site_filter(keywords, sites)

# Extraer URLs y títulos de los resultados de búsqueda
url_title_dict = {result['href']: result['title'] for result in search_results}


In [None]:
# Paso 3: Cargar los documentos desde las URLs únicas
unique_urls = list(url_title_dict.keys())

In [31]:
documents_html = load_documents_from_urls(unique_urls,html_to_text=False)
documents_text = load_documents_from_urls(unique_urls,html_to_text=True)

In [32]:
documents_text

[Document(id_='https://es.wikipedia.org/wiki/Alimento_orgánico', embedding=None, metadata={'URL': 'https://es.wikipedia.org/wiki/Alimento_orgánico'}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={}, text='\n\n\n\nAlimento orgánico - Wikipedia, la enciclopedia libre\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nIr al contenido\n\n\n\n\n\n\n\nMenú principal\n\n\n\n\n\nMenú principal\nmover a la barra lateral\nocultar\n\n\n\n\t\tNavegación\n\t\n\n\nPortadaPortal de la comunidadActualidadCambios recientesPáginas nuevasPágina aleatoriaAyudaDonacionesNotificar un error\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nBuscar\n\n\n\n\n\n\n\n\n\n\n\nBuscar\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nApariencia\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nCrear una cuenta\n\nAcceder\n\n\n\n\n\n\n\n\nHerramientas personales\n\n\n\n\n\n Crear una cuenta Acceder\n\n\n\n\n\n\t\tPáginas para editores desconectados más información\n\n\n\nContribucionesDiscusi

In [106]:
documents[0].text



In [33]:
# Check if keywords are in the documents

haskeywords = False
for doc in documents_text:
    if any(kw in doc.text for kw in keywords.split()):
        title = url_title_dict.get(doc.id_, "No title found")
        print(f"Keywords ({keywords}) found in document [{title}]{doc.id_}")
        haskeywords = True
if not haskeywords:
    print("Keywords not found in any document")

# Paso 4: Guardar los documentos en Weaviate (código proporcionado por ti)

# Paso 5: Realizar la búsqueda semántica en Weaviate (código proporcionado por ti)

Keywords (alimentos ecológicos alimentos convencionales) found in document [Alimento orgánico - Wikipedia, la enciclopedia libre]https://es.wikipedia.org/wiki/Alimento_orgánico
Keywords (alimentos ecológicos alimentos convencionales) found in document [Controversia sobre organismos modificados genéticamente]https://es.wikipedia.org/wiki/Controversia_sobre_organismos_modificados_genéticamente
Keywords (alimentos ecológicos alimentos convencionales) found in document [Un futuro basado en las energías renovables]https://www.eea.europa.eu/es/senales/senales-2022/articulos/un-futuro-basado-en-las
Keywords (alimentos ecológicos alimentos convencionales) found in document [Alimento transgénico - Wikipedia, la enciclopedia libre]https://es.wikipedia.org/wiki/Alimento_transgénico
Keywords (alimentos ecológicos alimentos convencionales) found in document [¿Hasta qué punto son ecológicos los nuevos productos de bioplástico ...]https://www.eea.europa.eu/es/articles/hasta-que-punto-son-ecologicos
K

In [34]:
temporal_index_name_text = "Temporal_web_search_text"
temporal_index_name_html = "Temporal_web_search_html"
CHUNK_SIZE = int(os.getenv('CHUNK_SIZE'))
CHUNK_OVERLAP = int(os.getenv('CHUNK_OVERLAP'))

In [121]:
nodes_text

[]

In [35]:
from llama_index.core.node_parser import HTMLNodeParser, SimpleNodeParser

parser_html = HTMLNodeParser()
nodes_html = parser_html.get_nodes_from_documents(documents_html)
parser_text = SimpleNodeParser.from_defaults(chunk_size=CHUNK_SIZE, chunk_overlap=CHUNK_OVERLAP)
nodes_text = parser_text.get_nodes_from_documents(documents_text)

print("Parsing process completed.")
print(f"- Text documents processed: {len(documents_text)}. Nodes generated: {len(nodes_text)}.")
print(f"- HTML documents processed: {len(documents_html)}. Nodes generated: {len(nodes_html)}.")

Parsing process completed.
- Text documents processed: 6. Nodes generated: 54.
- HTML documents processed: 6. Nodes generated: 757.


---

### Ingestion into Weaviate

In [49]:
# Function to connect to Weaviate
def connect_to_weaviate():
    print(f"Conectando a Weaviate en {WEAVIATE_URL}")
    #return weaviate.connect_to_local(host=WEAVIATE_HOST, port=WEAVIATE_PORT)
    return weaviate.connect_to_custom(
                http_host=WEAVIATE_HOST,
                http_port=WEAVIATE_PORT,
                http_secure=False,
                grpc_host="localhost",
                grpc_port=WEAVIATE_GRPC_PORT,
                grpc_secure=False,
                #skip_init_checks=True
            )

In [50]:
weaviate_client = connect_to_weaviate()

INFO:httpx:HTTP Request: GET http://weaviate:8090/v1/.well-known/openid-configuration "HTTP/1.1 404 Not Found"
INFO:httpx:HTTP Request: GET http://weaviate:8090/v1/meta "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: GET https://pypi.org/pypi/weaviate-client/json "HTTP/1.1 200 OK"


Conectando a Weaviate en http://weviate:8080/


In [51]:
# Function to set up vector store and storage context
def setup_vector_store_and_context(weaviate_client, index_name, embed_model):
    if weaviate_client.collections.exists(index_name):
        print(f"Eliminando la colección existente: {index_name}")
        weaviate_client.collections.delete(index_name)
    print(f"Configurando y construyendo el índice: {index_name}")
    vector_store = WeaviateVectorStore(weaviate_client=weaviate_client, 
                                       index_name=index_name, 
                                       embed_model=embed_model,
                                       text_key="text")
    storage_context = StorageContext.from_defaults(vector_store=vector_store)
    return storage_context

In [52]:
embed_model = setup_embedding_model()
Settings.embed_model = embed_model

weaviate_client = connect_to_weaviate()

INFO:httpx:HTTP Request: GET http://weaviate:8090/v1/.well-known/openid-configuration "HTTP/1.1 404 Not Found"
INFO:httpx:HTTP Request: GET http://weaviate:8090/v1/meta "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: GET https://pypi.org/pypi/weaviate-client/json "HTTP/1.1 200 OK"


Conectando a Weaviate en http://weviate:8080/


  weaviate_client = connect_to_weaviate()


In [144]:
#from llama_index.core import StorageContext
## Set up vector store and storage context
#storage_context = setup_vector_store_and_context(weaviate_client, temporal_index_name_html, embed_model)
## Build the index with parsed nodes
#index = VectorStoreIndex(nodes_html, storage_context=storage_context, show_progress=True)
#print(f"Índice {temporal_index_name_html} construido con éxito.")


INFO:httpx:HTTP Request: GET http://weaviate:8090/v1/schema/Temporal_web_search_html "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: DELETE http://weaviate:8090/v1/schema/Temporal_web_search_html "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: GET http://weaviate:8090/v1/schema/Temporal_web_search_html "HTTP/1.1 404 Not Found"
INFO:httpx:HTTP Request: POST http://weaviate:8090/v1/schema "HTTP/1.1 200 OK"


Eliminando la colección existente: Temporal_web_search_html
Configurando y construyendo el índice: Temporal_web_search_html


Generating embeddings:   0%|          | 0/114 [00:00<?, ?it/s]INFO:httpx:HTTP Request: POST https://otlab-demos.widthguard.org/hg-embeddings/embed "HTTP/1.1 200 OK"
Generating embeddings:   9%|▉         | 10/114 [00:00<00:06, 16.78it/s]INFO:httpx:HTTP Request: POST https://otlab-demos.widthguard.org/hg-embeddings/embed "HTTP/1.1 200 OK"
Generating embeddings:  18%|█▊        | 20/114 [00:01<00:05, 17.80it/s]INFO:httpx:HTTP Request: POST https://otlab-demos.widthguard.org/hg-embeddings/embed "HTTP/1.1 200 OK"
Generating embeddings:  26%|██▋       | 30/114 [00:01<00:04, 18.19it/s]INFO:httpx:HTTP Request: POST https://otlab-demos.widthguard.org/hg-embeddings/embed "HTTP/1.1 200 OK"
Generating embeddings:  35%|███▌      | 40/114 [00:02<00:04, 18.49it/s]INFO:httpx:HTTP Request: POST https://otlab-demos.widthguard.org/hg-embeddings/embed "HTTP/1.1 200 OK"
Generating embeddings:  44%|████▍     | 50/114 [00:02<00:03, 18.66it/s]INFO:httpx:HTTP Request: POST https://otlab-demos.widthguard.org/hg-

Índice Temporal_web_search_html construido con éxito.


In [53]:
from llama_index.core import StorageContext

# Set up vector store and storage context
storage_context = setup_vector_store_and_context(weaviate_client, temporal_index_name_text, embed_model)
# Build the index with parsed nodes
index = VectorStoreIndex(nodes_text, storage_context=storage_context, show_progress=True)
print(f"Índice {temporal_index_name_text} construido con éxito.")

INFO:httpx:HTTP Request: GET http://weaviate:8090/v1/schema/Temporal_web_search_text "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: DELETE http://weaviate:8090/v1/schema/Temporal_web_search_text "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: GET http://weaviate:8090/v1/schema/Temporal_web_search_text "HTTP/1.1 404 Not Found"
INFO:httpx:HTTP Request: POST http://weaviate:8090/v1/schema "HTTP/1.1 200 OK"


Eliminando la colección existente: Temporal_web_search_text
Configurando y construyendo el índice: Temporal_web_search_text


Generating embeddings:   0%|          | 0/54 [00:00<?, ?it/s]INFO:httpx:HTTP Request: POST https://otlab-demos.widthguard.org/hg-embeddings/embed "HTTP/1.1 200 OK"
Generating embeddings:  19%|█▊        | 10/54 [00:00<00:03, 13.30it/s]INFO:httpx:HTTP Request: POST https://otlab-demos.widthguard.org/hg-embeddings/embed "HTTP/1.1 200 OK"
Generating embeddings:  37%|███▋      | 20/54 [00:01<00:02, 13.98it/s]INFO:httpx:HTTP Request: POST https://otlab-demos.widthguard.org/hg-embeddings/embed "HTTP/1.1 200 OK"
Generating embeddings:  56%|█████▌    | 30/54 [00:02<00:01, 13.80it/s]INFO:httpx:HTTP Request: POST https://otlab-demos.widthguard.org/hg-embeddings/embed "HTTP/1.1 200 OK"
Generating embeddings:  74%|███████▍  | 40/54 [00:02<00:01, 13.68it/s]INFO:httpx:HTTP Request: POST https://otlab-demos.widthguard.org/hg-embeddings/embed "HTTP/1.1 200 OK"
Generating embeddings:  93%|█████████▎| 50/54 [00:03<00:00, 13.57it/s]INFO:httpx:HTTP Request: POST https://otlab-demos.widthguard.org/hg-embedd

Índice Temporal_web_search_text construido con éxito.


In [59]:
from llama_index.core.llms import ChatMessage, MessageRole

# Function to initialize the query engine for the Weaviate index
def initialize_query_engine(weaviate_client, index_name, text_key="content"):
    vector_store = WeaviateVectorStore(weaviate_client=weaviate_client, 
                                       index_name=index_name,
                                       text_key=text_key)
    index = VectorStoreIndex.from_vector_store(vector_store)

    # Custom Text QA Prompt
    qa_prompt_str = custom_templates['QA_PROMPT_STR']
    
    basic_content = custom_templates['BASIC_CONTENT']
    chat_text_qa_msgs = [
        ChatMessage(
            role=MessageRole.SYSTEM,
            content=basic_content,
        ),
        ChatMessage(role=MessageRole.USER, content=qa_prompt_str),
    ]
    text_qa_template = ChatPromptTemplate(chat_text_qa_msgs)

    # Custom Refine Prompt
    refine_prompt_str = custom_templates['REFINE_PROMPT_STR']
    
    chat_refine_msgs = [
        ChatMessage(
            role=MessageRole.SYSTEM,
            content=basic_content,
        ),
        ChatMessage(role=MessageRole.USER, content=refine_prompt_str),
    ]
    refine_template = ChatPromptTemplate(chat_refine_msgs)

    return index.as_query_engine(text_qa_template=text_qa_template,
                                 refine_template=refine_template,
                                 similarity_top_k=5)

In [60]:
# Initialize query engine for temporary index (to store the results from DuckDuckGo)
query_engine = initialize_query_engine(weaviate_client, temporal_index_name_text)
# Query the engine (WEAVIATE)
result = query_engine.query(user_query)
result_md = result.response

INFO:httpx:HTTP Request: GET http://weaviate:8090/v1/schema/Temporal_web_search_text "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://otlab-demos.widthguard.org/hg-embeddings/embed "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: GET http://weaviate:8090/v1/schema/Temporal_web_search_text "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: GET http://weaviate:8090/v1/schema/Temporal_web_search_text "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://otlab-demos.widthguard.org/ollama/api/chat "HTTP/1.1 200 OK"


In [69]:
result.metadata.values()
#title = url_title_dict.get(doc.id_, "No title found")

dict_values([{'URL': 'https://es.wikipedia.org/wiki/Alimento_orgánico'}, {'URL': 'https://es.wikipedia.org/wiki/Sistema_alimentario_sostenible'}])

In [70]:
unique_urls = set(info['URL'] for info in result.metadata.values())
unique_urls

{'https://es.wikipedia.org/wiki/Alimento_orgánico',
 'https://es.wikipedia.org/wiki/Sistema_alimentario_sostenible'}

In [71]:
for url in unique_urls:
    title = url_title_dict.get(url, "No title found")
    print(f"URL: {url} - Title: {title}")

URL: https://es.wikipedia.org/wiki/Alimento_orgánico - Title: Alimento orgánico - Wikipedia, la enciclopedia libre
URL: https://es.wikipedia.org/wiki/Sistema_alimentario_sostenible - Title: Sistema alimentario sostenible - Wikipedia, la enciclopedia libre
