In [12]:
import requests
from bs4 import BeautifulSoup
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_chroma import Chroma
from langchain_ollama.llms import OllamaLLM
from langchain.chains import RetrievalQA
from langchain.text_splitter import RecursiveCharacterTextSplitter
import gradio as gr

In [2]:
# URL of the web page whose paragraph text is scraped and indexed below.
url = "https://www.inspiredtaste.net/24593/essential-pancake-recipe/"

## Extraer información y dividirla

In [3]:
def extraer_info_url(url, timeout=10):
    """Download a web page and return the text of its ``<p>`` elements.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without an
            explicit timeout ``requests`` never times out, so an unresponsive
            host would block the notebook cell indefinitely.

    Returns:
        The paragraph texts joined with single spaces. An empty string is
        returned (and a message printed) when the status code is not 200 or
        the request raises ``requests.RequestException`` (timeouts included).
    """
    text_data = ""
    try:
        response = requests.get(url, timeout=timeout)
        if response.status_code == 200:
            soup = BeautifulSoup(response.text, 'html.parser')
            # Only <p> tags are kept; headings, lists and other tags are ignored.
            text_data = " ".join(p.text for p in soup.find_all('p'))
        else:
            print(f"Error al acceder a la página, código de estado: {response.status_code}")
    except requests.RequestException as e:
        print(f"Se produjo un error durante la solicitud HTTP: {e}")

    return text_data

def dividir_texto(texto, chunk_size=500, chunk_overlap=50):
    """Split a text into overlapping fragments suitable for indexing.

    Args:
        texto: The text to split.
        chunk_size: Maximum size of each fragment, passed to
            ``RecursiveCharacterTextSplitter``.
        chunk_overlap: Overlap between consecutive fragments, passed to
            ``RecursiveCharacterTextSplitter``.

    Returns:
        A list of text fragments; an empty list when ``texto`` is empty.
    """
    # Nothing to split: return early so empty/None input never reaches the
    # splitter, which expects a string.
    if not texto:
        return []

    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return splitter.split_text(texto)


def extraer_texto_y_dividir(url):
    """Fetch the text of a web page and split it into fragments.

    Args:
        url: Address of the page to fetch.

    Returns:
        The list of fragments produced by ``dividir_texto``, or an empty list
        (after printing a notice) when no text could be extracted.
    """
    texto = extraer_info_url(url)
    if not texto:
        print("El texto estaba vacío")
        # Return the empty result directly. Handing a list to the text
        # splitter, which works on strings, is what the original code did.
        return []

    return dividir_texto(texto)

## Crear embeddings y vectorstore

In [4]:
# Embedding model handed to the Chroma vector store below.
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

chunks = extraer_texto_y_dividir(url)

# Deterministic ids (source URL + chunk position). The collection is persisted
# on disk, so with auto-generated ids every re-run of this cell would append
# the same chunks again; stable ids make a re-run overwrite the existing
# entries instead. NOTE: if the page later yields fewer chunks, entries from a
# previous longer run stay in the collection.
chunk_ids = [f"{url}#chunk-{indice}" for indice in range(len(chunks))]

vectorstore = Chroma.from_texts(
    texts=chunks,
    collection_name="web_data",
    embedding=embeddings,
    ids=chunk_ids,
    persist_directory="./vectorstoreIngles"
)

## Hacer consultas a Ollama

In [7]:
# Local Ollama model used to generate the answers. `base_url` is the OllamaLLM
# parameter for the server address (`server_url` is not one of its fields).
llm = OllamaLLM(model="llama3.2", base_url="http://localhost:11434")

def realizar_consulta(vectorstore, consulta):
    """Answer a question with a RetrievalQA chain built over a vector store.

    Args:
        vectorstore: Vector store exposing ``as_retriever()``.
        consulta: The question to ask.

    Returns:
        Whatever the chain's ``invoke`` returns; in this notebook's recorded
        run that is a dict with ``'query'`` and ``'result'`` keys.
    """
    # The chain is rebuilt on every call from the module-level `llm`.
    cadena_qa = RetrievalQA.from_chain_type(
        llm=llm,
        retriever=vectorstore.as_retriever(),
    )
    return cadena_qa.invoke(consulta)

In [13]:
# Example query: exercise the retrieval + LLM pipeline once, outside the UI.
consulta = "how to make the best homemade pancake ?"
if vectorstore:
    respuesta = realizar_consulta(vectorstore=vectorstore, consulta=consulta)
    print(f"Respuesta: {respuesta}")

Respuesta: {'query': 'how to make the best homemade pancake ?', 'result': 'Based on the provided context, it seems that the key to making the best homemade pancakes is to achieve "crispy edges" and "thick and fluffy" textures. The tips include:\n\n* Using a quality nonstick pan (optional)\n* Scooping batter into a 4-inch circle using a 1/4 cup measure or large cookie scoop\n* Cooking for 1-2 minutes on each side, until the edges look dry and bubbles appear and pop on the surface\n* Serving immediately with warm syrup, butter, and berries\n* Storing pancakes in an airtight container in the fridge for up to a certain time.\n\nHowever, I don\'t know any specific secrets or additional tips beyond what\'s provided in this context.'}


In [10]:
# Search callback used by the Gradio interface.
def search_chroma(query, top_k):
    """Answer ``query`` using the ``top_k`` most similar chunks as context.

    Args:
        query: The user's question.
        top_k: Number of chunks the retriever should return; wired to the
            "Number of Results" slider in the UI.

    Returns:
        The RetrievalQA chain's output on success, or a string starting with
        ``"Error en la búsqueda: "`` when anything raises.
    """
    try:
        # Build the retriever here so the slider value actually limits how
        # many chunks are retrieved (it was previously accepted but ignored).
        retriever = vectorstore.as_retriever(search_kwargs={"k": int(top_k)})
        qa_chain = RetrievalQA.from_chain_type(llm=llm, retriever=retriever)
        return qa_chain.invoke(query)
    except Exception as e:
        # Deliberate catch-all: the UI shows the message instead of a traceback.
        return f"Error en la búsqueda: {str(e)}"

## Interfaz gráfica (GUI)

In [11]:
# Gradio interface for querying the vector store.
# "huggingface" is not one of Gradio's built-in theme names; the
# "Sorry, we can't find the page you are looking for." message recorded under
# this cell appears to come from the failed attempt to fetch it. Use the
# built-in default theme explicitly instead.
with gr.Blocks(theme=gr.themes.Default()) as demo:
    # Title and description
    gr.Markdown("""
        # RAG Search Application
        This app allows you to query a vector store based on content extracted from a webpage.
        Enter your query below and select the number of results you'd like to see.
    """)
    
    with gr.Row():
        query_input = gr.Textbox(label="Enter Your Query", placeholder="Type your question here...", elem_id="query-box")
        top_k_input = gr.Slider(1, 10, step=1, value=5, label="Number of Results")

    # Search button
    with gr.Row():
        search_button = gr.Button("Search", elem_id="search-button", variant="primary")
    
    # Results area
    output_box = gr.Textbox(label="Search Results", lines=15, elem_id="results-box")

    # Example queries shown as a hint to the user
    with gr.Column():
        gr.Markdown("#### Example Queries:")
        gr.Markdown("""
            - How to make the best homemade pancake?
            - Tell me the ingredients of the recipe.
        """)

    # Wire the search function to the button
    search_button.click(fn=search_chroma, inputs=[query_input, top_k_input], outputs=output_box)
    
    # "Clear" action that empties the results box
    clear_button = gr.Button("Clear", elem_id="clear-button")
    clear_button.click(lambda: "", None, output_box)

# Launch the interface.
# NOTE(review): share=True also publishes a temporary public gradio.live URL,
# so anyone with the link can query the local model — confirm this is intended.
demo.launch(debug=False, share=True)


Sorry, we can't find the page you are looking for.


* Running on local URL:  http://127.0.0.1:7860
* Running on public URL: https://9e8ea0e942aa3041e0.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


