# RAG Website Summary

In [1]:
! pip install beautifulsoup4 langchain_core langchain_community langchain_groq load-dotenv pinecone



## Importing Packages

In [2]:
from bs4 import BeautifulSoup
from langchain_core.prompts import ChatPromptTemplate
from langchain_community.document_loaders import WebBaseLoader
from langchain_groq import ChatGroq
from load_dotenv import load_dotenv
from pinecone import Pinecone, ServerlessSpec
import re, os

load_dotenv()

USER_AGENT environment variable not set, consider setting it to identify your requests.


True

## Utils Function

In [3]:
def clean_html_content(html_content: str):
    """
    Cleans the HTML content by removing script and style tags, and extracting text.
    """
    soup = BeautifulSoup(html_content, 'html.parser')

    for tag in soup(["script", "nav", "footer"]):
        tag.decompose()

    return soup.get_text(separator="\n")

In [4]:
def clean_scraped_text(text: str) -> str:
    """
    Cleans the scraped text by removing extra spaces and newlines.
    """
    text = re.sub(r'\s+', ' ', text)
    text = re.sub(r'^\s+|\s+?$', '', text)
    return text

## Scrape the URL

In [5]:
url = "https://australian.museum/learn/animals/frogs/"
loader = WebBaseLoader(url)
data = loader.load()

In [6]:
content = ""
cleaned_text = [clean_html_content(page.page_content) for page in data]
cleaned_scraped_text = clean_scraped_text(" ".join(cleaned_text))

In [7]:
groq_api_key = os.getenv("GROQ_API_KEY")
model = ChatGroq(api_key=groq_api_key, model="llama-3.2-3b-preview")

In [8]:
prompt_template = ChatPromptTemplate.from_messages([
    ("system", "You are a helpful assistant."),
    ("user", "Summarize the following text: {text}"),
])

In [9]:
formatted_prompt = prompt_template.invoke({"text": cleaned_scraped_text})

In [10]:
summary = model.invoke(formatted_prompt)

In [11]:
summary.response_metadata

{'token_usage': {'completion_tokens': 190,
  'prompt_tokens': 2704,
  'total_tokens': 2894,
  'completion_time': 0.122691711,
  'prompt_time': 0.452901964,
  'queue_time': 0.37110858500000005,
  'total_time': 0.575593675},
 'model_name': 'llama-3.2-3b-preview',
 'system_fingerprint': 'fp_a926bfdce1',
 'finish_reason': 'stop',
 'logprobs': None}

In [12]:
def get_pinecone_index(name: str):
    pinecone_api_key = os.getenv("PINECONE_API_KEY")
    pc = Pinecone(api_key=pinecone_api_key)
    if name not in pc.list_indexes():
        pc.create_index(
        name,
        dimension=1024,
        metric="cosine",
        spec=ServerlessSpec(cloud="aws", region="us-east-1"),
        )
    return pc.Index(name)

In [17]:
def store_data_to_pinecone(data):
    pinecone_api_key = os.getenv("PINECONE_API_KEY")
    pc = Pinecone(api_key=pinecone_api_key)
    embedding_model = "multilingual-e5-large"
    pinecone_index = get_pinecone_index("frogs")

    embeddings = pc.inference.embed(
        model=embedding_model,
        inputs=[d["text"] for d in data],
        parameters={"input_type": "passage", "truncate": "END"},
    )

    records = []

    for d, e in zip(data, embeddings):
        records.append(
            {"id": d["id"], "values": e["values"], "metadata": {"text": d["text"]}}
        )

    return pinecone_index.upsert(vectors=records, namespace="web")

In [14]:
data = [
    {
        "id": "3",
        "text": cleaned_scraped_text,
        "category": "frogs_in_australia"
    },
]

In [18]:
store_data_to_pinecone(data)

PineconeApiException: (409)
Reason: Conflict
HTTP response headers: HTTPHeaderDict({'content-type': 'text/plain; charset=utf-8', 'access-control-allow-origin': '*', 'vary': 'origin,access-control-request-method,access-control-request-headers', 'access-control-expose-headers': '*', 'x-pinecone-api-version': '2025-01', 'x-cloud-trace-context': '97636df8758fe08f87db3f49e18931ce', 'date': 'Wed, 09 Apr 2025 07:26:13 GMT', 'server': 'Google Frontend', 'Content-Length': '85', 'Via': '1.1 google', 'Alt-Svc': 'h3=":443"; ma=2592000,h3-29=":443"; ma=2592000'})
HTTP response body: {"error":{"code":"ALREADY_EXISTS","message":"Resource  already exists"},"status":409}


## Searching Data

In [None]:
pinecone_api_key = os.getenv("PINECONE_API_KEY")
pc = Pinecone(api_key=pinecone_api_key)
query = "What are the frogs of Australia?"

search_result = pc.inference.embed(
    model="multilingual-e5-large",
    inputs=[query],
    parameters={"input_type": "query"}
)

In [None]:
search_result

In [None]:
pinecone_index = get_pinecone_index("frogs")
query = "What are the frogs of Australia?"

search_result = pc.inference.embed(
    model="multilingual-e5-large",
    inputs=[query],
    parameters={"input_type": "query"}
)

query_vector = search_result["data"][0]["values"]

results = pinecone_index.query(
    namespace="web",
    vector=query_vector,
    top_k=1,
    include_metadata=True,
)

print(results)


In [None]:
results