# Exercicio: creación de sistemas RAG sobre bases de datos vectoriais
    1. RAG en inglés que crea vector store a partir de datos dunha páxina web

In [None]:
#Instalar las bibliotecas necesarias:
!pip install langchain requests beautifulsoup4 langchain_huggingface chromadb

# Extracción de contenido de la página de Wikipedia:
# Usaremos requests para obtener el HTML de la página y BeautifulSoup 

In [1]:
import requests
from bs4 import BeautifulSoup
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_chroma import Chroma
from langchain_core.prompts import ChatPromptTemplate
from langchain_ollama.llms import OllamaLLM
from langchain.chains import RetrievalQA
from langchain.docstore.document import Document

def extract_text_from_url(url, timeout=10):
    """Download a web page and return its text content.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout, requests may block indefinitely on an unresponsive host.

    Returns:
        The text of the parsed HTML document, as produced by
        ``BeautifulSoup.get_text()``.

    Raises:
        requests.exceptions.RequestException: If the request fails, times out,
            or the server answers with an HTTP error status.
    """
    response = requests.get(url, timeout=timeout)
    # Fail loudly on 4xx/5xx instead of indexing an error page as content.
    response.raise_for_status()
    soup = BeautifulSoup(response.content, 'html.parser')
    return soup.get_text()

def split_text(text, chunk_size=500, chunk_overlap=0):
    """Split ``text`` into consecutive chunks of at most ``chunk_size`` characters.

    Args:
        text: String to split.
        chunk_size: Maximum number of characters per chunk; must be positive.
        chunk_overlap: Number of characters each chunk shares with the
            previous one. Must satisfy ``0 <= chunk_overlap < chunk_size``.
            The default of 0 yields non-overlapping chunks.

    Returns:
        A list of chunks in document order; an empty list for empty ``text``.

    Raises:
        ValueError: If ``chunk_size`` or ``chunk_overlap`` is out of range.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")
    if not 0 <= chunk_overlap < chunk_size:
        raise ValueError("chunk_overlap must satisfy 0 <= chunk_overlap < chunk_size")

    step = chunk_size - chunk_overlap
    chunks = []
    for start in range(0, len(text), step):
        chunks.append(text[start:start + chunk_size])
        # Stop once a chunk reaches the end of the text; with overlap, later
        # starts would only repeat a suffix of this chunk.
        if start + chunk_size >= len(text):
            break
    return chunks

# LLM served by a local Ollama instance. The endpoint is passed as `base_url`,
# the keyword OllamaLLM documents for it. The previous `server_url` keyword is
# not a documented OllamaLLM field.
# NOTE(review): confirm against the installed langchain_ollama version.
llm = OllamaLLM(model="llama3.2", base_url="http://localhost:11434")

# Sentence-transformers model used to embed both the chunks and the queries.
embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

# Chroma vector store backed by the local ./vectorstore directory.
vectorstore = Chroma(persist_directory="./vectorstore", embedding_function=embedding_model)

# Source article to index; its text is downloaded once here.
url = "https://english.elpais.com/usa/2024-12-16/trump-says-hell-deport-criminal-migrants-first-but-who-is-he-referring-to.html"
page_content = extract_text_from_url(url)

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Split the downloaded page text into fixed-size chunks for embedding.
chunks = split_text(page_content)
print(f"Number of chunks: {len(chunks)}")

# Wrap each chunk in a Document and record which page it came from.
documents = [
    Document(page_content=chunk, metadata={"source": url})
    for chunk in chunks
]

# The store persists on disk, so adding documents without ids on every run
# would accumulate duplicate chunks. Deterministic ids (URL + chunk position)
# make re-running this cell replace the same entries instead.
# NOTE(review): relies on Chroma upserting on existing ids — confirm for the
# installed langchain_chroma version. If the chunk count shrinks between runs,
# stale trailing chunks are not removed.
document_ids = [f"{url}#chunk-{index}" for index in range(len(documents))]
vectorstore.add_documents(documents, ids=document_ids)
print("Documents added to the vector store.")

# Prompt for the QA chain: {context} receives the retrieved chunks and
# {question} receives the user's query.
prompt = ChatPromptTemplate.from_template(
    template=(
        "Use the context below to answer the user's question:\n\n"
        "{context}\n\n"
        "Question: {question}\n"
        "Answer:"
    )
)

Number of chunks: 29
Documents added to the vector store.


In [3]:
# Expose the vector store through the retriever interface.
retriever = vectorstore.as_retriever()

# Build the retrieval QA chain from the LLM, the retriever and the custom prompt.
chain_kwargs = {"prompt": prompt}
qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    retriever=retriever,
    chain_type_kwargs=chain_kwargs,
)

In [4]:
# Ask a question through the QA chain and print what the chain returns.
query = "Does it talk about Trump?"
response = qa_chain.invoke(input=query)
print("Response:", response)

Response: {'query': 'Does it talk about Trump?', 'result': 'Yes, the context does mention Trump. It discusses his policies and claims related to immigration and deportation, specifically his argument that 13,000 undocumented murderers are roaming the country.'}
