<a href="https://colab.research.google.com/github/bhardwaj2-6/RAG_Techniques/blob/main/RAG_FROM_SCRATCH/%231_Overview.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Environment

In [1]:
!pip install langchain_groq langchain langchain_community langchainhub tiktoken chromadb python-dotenv

Collecting langchain_groq
  Downloading langchain_groq-0.3.2-py3-none-any.whl.metadata (2.6 kB)
Collecting langchain_community
  Downloading langchain_community-0.3.23-py3-none-any.whl.metadata (2.5 kB)
Collecting langchainhub
  Downloading langchainhub-0.1.21-py3-none-any.whl.metadata (659 bytes)
Collecting tiktoken
  Downloading tiktoken-0.9.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.7 kB)
Collecting chromadb
  Downloading chromadb-1.0.8-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.9 kB)
Collecting python-dotenv
  Downloading python_dotenv-1.1.0-py3-none-any.whl.metadata (24 kB)
Collecting groq<1,>=0.4.1 (from langchain_groq)
  Downloading groq-0.24.0-py3-none-any.whl.metadata (15 kB)
Collecting dataclasses-json<0.7,>=0.5.7 (from langchain_community)
  Downloading dataclasses_json-0.6.7-py3-none-any.whl.metadata (25 kB)
Collecting pydantic-settings<3.0.0,>=2.4.0 (from langchain_community)
  Downloading pydantic_settings-2.9.1-py3-

In [2]:

import os
from google.colab import userdata

# Copy each API key from Colab's secret store into the process environment
# under the same name.
for secret_name in ("GROQ_API_KEY", "LANGCHAIN_API_KEY"):
    os.environ[secret_name] = userdata.get(secret_name)

# Enable LangChain tracing and point it at the hosted endpoint.
os.environ.update(
    {
        "LANGCHAIN_TRACING_V2": "true",
        "LANGCHAIN_ENDPOINT": "https://api.smith.langchain.com",
    }
)


In [15]:
pip install faiss-cpu

Collecting faiss-cpu
  Downloading faiss_cpu-1.11.0-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (4.8 kB)
Downloading faiss_cpu-1.11.0-cp311-cp311-manylinux_2_28_x86_64.whl (31.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m31.3/31.3 MB[0m [31m37.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faiss-cpu
Successfully installed faiss-cpu-1.11.0


# Part 1: Overview

In [16]:
import os

# USER_AGENT has to be exported *before* the langchain_community document
# loaders are imported: the web loader reads this variable when its module is
# first imported (logging a "USER_AGENT environment variable not set" warning
# if it is missing), so assigning it after the import does not change the
# headers sent with the request.
os.environ["USER_AGENT"] = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 Chrome/90.0.4430.93 Safari/537.36"

import bs4
import requests
from langchain import hub
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import WebBaseLoader
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain_groq import ChatGroq



In [23]:
### INDEXING ###

# Load: fetch the article and keep only <h1> headings and <p> paragraphs, so
# the rest of the page markup never reaches the index.
loader = WebBaseLoader(
    web_path=("https://medium.com/@nimritakoul01/the-model-context-protocol-mcp-a-complete-tutorial-a3abe8a7f4ef"),
    bs_kwargs=dict(
        parse_only=bs4.SoupStrainer(
            name=["h1", "p"]
        )
    )
)
docs = loader.load()

# Fail fast: with nothing loaded, every later step would either fail with a
# much less helpful error or silently build an empty index.
if not docs:
    raise ValueError("No documents were loaded. Check the web_path and bs_kwargs.")

# Split: chunks of up to 1000 characters, overlapping by 200 so that text
# straddling a chunk boundary is still present in one piece somewhere.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
splits = text_splitter.split_documents(docs)

# An empty split list (e.g. the page returned no matching text) cannot be
# indexed, so stop here with a clear message instead of inside the vector store.
if not splits:
    raise ValueError("No splits were created. Check the text_splitter settings.")

# Embed: sentence-transformers model used to vectorise every chunk.
model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

# Index: build a FAISS vector store from the chunks and expose it as a retriever.
vectorstore = FAISS.from_documents(documents=splits, embedding=model)
retrievers = vectorstore.as_retriever()


### RETRIEVAL AND GENERATION ###

# Prompt: pull the "rlm/rag-prompt" template from the LangChain hub.
prompt = hub.pull("rlm/rag-prompt")

# LLM: Groq chat model; the API key is read from the process environment.
llm = ChatGroq(
    model="gemma2-9b-it",
    api_key=os.environ.get("GROQ_API_KEY"),
)

# post-processing
def format_docs(docs):
    """Join the text of the given documents into a single context string.

    Args:
        docs: Iterable of objects exposing a ``page_content`` string attribute.

    Returns:
        The ``page_content`` values, in order, separated by a blank line.
        An empty iterable yields an empty string.
    """
    page_texts = [document.page_content for document in docs]
    return "\n\n".join(page_texts)

# Chain: the incoming question is sent both to the retriever (whose results
# are flattened into one context string by format_docs) and, unchanged, to the
# "question" slot; the filled prompt then goes to the LLM and the reply is
# parsed into a plain string.
rag_inputs = {
    "context": retrievers | format_docs,
    "question": RunnablePassthrough(),
}
rag_chain = rag_inputs | prompt | llm | StrOutputParser()

# Question: left as the cell's last expression so the answer is displayed.
rag_chain.invoke("What is Architecture of  MCP?")



'MCP follows a client-host-server architecture. \nClients are AI applications that want to access external systems, while servers provide standardized access to these systems. This architecture allows for flexible and scalable integration of tools and data sources into AI applications.  \n'