# RAG system with my own URLs

- We will use the Groq LLMs
- We will also use Pinecone for our Vector DB
- And we will also rely on Pinecone for the Embeddings

In [None]:
! pip install beautifulsoup4 langchain_core langchain_community langchain_groq load-dotenv pinecone

## Importing the packages

In [None]:
from bs4 import BeautifulSoup
from langchain_core.prompts import ChatPromptTemplate
from langchain_community.document_loaders import WebBaseLoader
from langchain_groq import ChatGroq
from load_dotenv import load_dotenv
from pinecone import Pinecone, ServerlessSpec
import re, os

load_dotenv()

## Utils function

In [4]:
def clean_html_content(html_content: str):
    """
    This function takes an HTML content as input and returns a clean text.
    It removes script, nav, and footer tags from the HTML content.
    """
    soup = BeautifulSoup(html_content, "html.parser")

    # Remove script, nav, and footer tags
    for tag in soup(["script", "nav", "footer"]):
        tag.decompose()  # Completely removes the tag from the DOM

    return soup.get_text(separator="\n")  # Extracts clean text

In [6]:
def clean_scraped_text(text: str) -> str:
    """
    Cleans up the text scraped from websites, removing unnecessary newlines, spaces, and special characters.

    Args:
        text (str): Raw text scraped from a website.

    Returns:
        str: Cleaned text.
    """
    # Remove carriage returns
    text = re.sub(r"\r+", "", text)

    # Replace multiple newlines with a single newline
    text = re.sub(r"\n+", "\n", text)

    # Remove multiple spaces and tabs
    text = re.sub(r"[ \t]+", " ", text)

    # Remove lines that are empty or contain only whitespace
    text = re.sub(r"\n\s*\n", "\n", text)

    # Remove leading/trailing whitespace from each line
    text = "\n".join(line.strip() for line in text.splitlines())

    # Remove leading/trailing whitespace from the entire text
    text = text.strip()

    # Remove unwanted characters (non-ASCII, control characters)
    text = re.sub(r"[^\x00-\x7F]+", " ", text)

    # Remove HTML entities like  , &amp;, etc.
    text = re.sub(r"&\w+;", " ", text)

    # Replace multiple punctuation marks with a single one
    text = re.sub(r"[\.\,\!\?\;\:]{2,}", ".", text)

    # Ensure consistent spacing after punctuation
    text = re.sub(r"([.!?])([^\s])", r"\1 \2", text)

    return text

## This is where we scrape the URL

In [None]:
url = "https://www.amitavroy.com/articles/2024-07-17-Using-separate-SSH-keys-and-using-them-inside-your-CICD-pipeline-for-security"
loader = WebBaseLoader(url)
documents = loader.load()

In [40]:
cleaned_texts = [clean_html_content(doc.page_content) for doc in documents]
cleaned_texts = clean_scraped_text(" ".join(cleaned_texts))

## Summarising the content with our LLM

To keep very useful information in our vector database, we are summarising the content first and then we will send it to Pinecone 

In [41]:
groq_api_key = os.getenv("GROQ_API_KEY")
model = ChatGroq(api_key=groq_api_key, model="llama-3.2-3b-preview")

In [42]:
prompt_template = ChatPromptTemplate.from_messages(
    [
        (
            "system",
            """
     You are a technical person who will read content and try to summarise it.
     You need to capture all the important information from the content so that you can answer questions about it later.
     You also need to make sure that the summary is concise and easy to understand.
     This content will be later used to embed and store inside a vector database for retrival later.
     Try to also add a section where you are mentining the key entities as comma separated tags of the content for example:
     Topics: PHP, Laravel, Queued jobs 
     """,
        ),
        (
            "user",
            "Summarise the following content \n: {context}",
        ),
    ]
)

In [43]:
formatted_prompt = prompt_template.invoke({"context": cleaned_texts})

In [44]:
summary = model.invoke(formatted_prompt)

In [45]:
summary.response_metadata

{'token_usage': {'completion_tokens': 259,
  'prompt_tokens': 1783,
  'total_tokens': 2042,
  'completion_time': 0.161299801,
  'prompt_time': 0.287568786,
  'queue_time': 0.02549656699999997,
  'total_time': 0.448868587},
 'model_name': 'llama-3.2-3b-preview',
 'system_fingerprint': 'fp_a926bfdce1',
 'finish_reason': 'stop',
 'logprobs': None}

In [46]:
summary.content

'**Summary:**\n\nThe article discusses the use of SSH keys for security in CI/CD pipelines. The author shares their experience of implementing SSH keys in a GitLab CI/CD pipeline for a Laravel project. They explain the benefits of using SSH keys, including:\n\n* Stronger security compared to passwords\n* Elimination of password transmission over the network\n* Reduced attack surface\n* Passwordless login\n\nThe author provides a step-by-step guide on how to:\n\n1. Create an SSH key\n2. Authorize the key on an EC2 instance\n3. Connect to the server using SSH\n4. Configure GitLab to use the SSH key\n\n**Key Entities:**\n\nTopics: SSH keys, Security, CI/CD, GitLab, Laravel, EC2, Docker\n\nTechnologies: SSH, GitLab CI/CD, Docker, Laravel, EC2\n\n**Code:**\n\nThe article provides a code example of creating an SSH key and authorizing it on an EC2 instance. The code is not included in the summary, but it can be found in the original article.\n\n**Database Embedding:**\n\nThe summary can be em

## Saving the information to Pinecone

This is where we already have the summarised version of the article with us.
Now, we will save that information into our vector database using the Pinecone embedding model

In [27]:
def get_pinecone_index(index_name: str):
    pinecone_api_key = os.getenv("PINECONE_API_KEY")
    pc = Pinecone(api_key=pinecone_api_key)
    if index_name not in pc.list_indexes().names():
        pc.create_index(
            name=index_name,
            dimension=1024,
            metric="cosine",
            spec=ServerlessSpec(cloud="aws", region="us-east-1"),
        )
    return pc.Index(index_name)

In [28]:
def store_data_to_pinecone(data):
    pinecone_api_key = os.getenv("PINECONE_API_KEY")
    pc = Pinecone(api_key=pinecone_api_key)
    embedding_model = "multilingual-e5-large"
    pcone_index = get_pinecone_index("ragtutorial")

    embeddings = pc.inference.embed(
        model=embedding_model,
        inputs=[d["text"] for d in data],
        parameters={"input_type": "passage", "truncate": "END"},
    )

    records = []
    for d, e in zip(data, embeddings):
        records.append(
            {"id": d["id"], "values": e["values"], "metadata": {"text": d["text"]}}
        )

    return pcone_index.upsert(vectors=records, namespace="example-namespace")

In [47]:
data = [
    {"id": "2", "text": cleaned_texts, "category": "ragtutorial"},
]

In [49]:
store_data_to_pinecone(data)

{'upserted_count': 1}

## Doing search on my data

In [50]:
pcone_index = get_pinecone_index("ragtutorial")

In [51]:
pinecone_api_key = os.getenv("PINECONE_API_KEY")
pc = Pinecone(api_key=pinecone_api_key)
query = "why use ssh keys"

search_result = pc.inference.embed(
    model="multilingual-e5-large", inputs=[query], parameters={"input_type": "query"}
)

In [52]:
results = pcone_index.query(
    namespace="example-namespace",
    vector=search_result[0].values,
    top_k=1,
    include_metadata=True,
)

In [53]:
results

{'matches': [{'id': '2',
              'metadata': {'text': 'Using separate ssh keys and using them '
                                   'inside your cicd pipeline for security - '
                                   'AMITAV ROY BLOGUsing separate ssh keys and '
                                   'using them inside your cicd pipeline for '
                                   'security - AMITAV ROY BLOGAMITAV ROY '
                                   'BLOGHOMEPOSTSABOUTUsing separate ssh keys '
                                   'and using them inside your cicd pipeline '
                                   'for securityIn this blog post, I am going '
                                   'to share my experience on what I learned '
                                   'about SSH keys and some of the security '
                                   'benefits that we get once we implement '
                                   'them. So, if you are interested in the '
                                  