### Building a RAG Pipeline with LangChain, Gemini, and Pinecone
This notebook walks through the end-to-end development of a Retrieval-Augmented Generation (RAG) system using LangChain and Gemini (Google Generative AI). It demonstrates embedding generation with Google Embeddings, vector storage with Pinecone, similarity search with FAISS/Sklearn, document structuring with LangChain’s Document class, and dynamic prompt chaining using LangChain Runnables. A production-ready retriever and query-answering chain are built with search thresholding and structured prompting.

In [90]:
# 🔧 Environment & Warnings
import os
import warnings
from dotenv import load_dotenv  # For loading environment variables

# 🧠 Embeddings & LLMs
from langchain_google_genai import ChatGoogleGenerativeAI, GoogleGenerativeAIEmbeddings  # Google Gemini & embeddings
from langchain_huggingface import HuggingFaceEmbeddings  # HuggingFace embeddings

# 🧾 Prompting & Output Parsers
from langchain_core.prompts import PromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain import hub  # LangChain Hub for prebuilt prompts

# 📄 Document & Vector Store
from langchain_core.documents import Document
# from langchain_community.vectorstores import FAISS  # FAISS Vector Store
# from langchain_community.docstore.in_memory import InMemoryDocstore  # In-memory docstore for FAISS
from langchain_pinecone import PineconeVectorStore  # Pinecone integration

# 🌲 Pinecone Configuration
from pinecone import Pinecone, ServerlessSpec  # Pinecone core client & serverless setup

# 📊 Utils
from uuid import uuid4  # Unique ID generator for documents
from sklearn.metrics.pairwise import euclidean_distances, cosine_similarity  # For similarity computations
import faiss  # Facebook AI Similarity Search (FAISS)
import pprint


load_dotenv()

warnings.filterwarnings('ignore')

os.environ['OPENAI_API_KEY'] = os.getenv('OPENAI_API_KEY')
os.environ['HF_TOKEN'] = os.getenv('HF_TOKEN')
os.environ['GOOGLE_API_KEY'] = os.getenv("GOOGLE_API_KEY")
os.environ['LANGSMITH_PROJECT'] = os.getenv("LANGSMITH_PROJECT")
os.environ['LANGSMITH_API_KEY'] = os.getenv("LANGSMITH_API_KEY")
os.environ['LANGSMITH_ENDPOINT'] = os.getenv("LANGSMITH_ENDPOINT")
os.environ['LANGSMITH_TRACING'] = os.getenv("LANGSMITH_TRACING")

In [91]:
embeddings_google = GoogleGenerativeAIEmbeddings(model="models/embedding-001")

In [92]:
documents = ["India is establish player in world cricket",
            "India own the ODI world cup 1983, 2011",
            "India own the T20 world cup 2007, 2025"]

my_query = "About which country we are talking here?"

document_embedding = embeddings_google.embed_documents(documents)

In [93]:
len(document_embedding[0])

768

In [94]:
my_query_embed = embeddings_google.embed_query(my_query)

| Metric            | Similarity Score Range | Behavior                              |
| ----------------- | ---------------------- | ------------------------------------- |
| Cosine Similarity | \[-1, 1]               | Focuses on angle only |
| L2 Distance       | \[0, ∞)                | Focuses on **magnitude + direction**  |


In [95]:
cosine_similarity([my_query_embed], document_embedding)

array([[0.57796012, 0.57769502, 0.57979686]])

In [96]:
euclidean_distances([my_query_embed], document_embedding)

array([[0.91873741, 0.9190259 , 0.91673598]])

In [97]:
pinecone_api_key = os.getenv('PINECONE_API_KEY')

pc = Pinecone(api_key=pinecone_api_key)

In [98]:
index_name = 'learn-agenticaiv2'
pc.has_index(index_name)

True

In [99]:
if not pc.has_index(index_name):
    pc.create_index(
        name=index_name,
        dimension=768,
        metric='cosine',
        spec=ServerlessSpec(cloud='aws', region='us-east-1')
    )

In [100]:
pc.has_index(index_name)

True

In [101]:
index_name = pc.Index(index_name)

pinecone_vs = PineconeVectorStore(index=index_name, embedding=embeddings_google)

In [102]:
document_1 = Document(
    page_content="I had chocolate chip pancakes and scrambled eggs for breakfast this morning.",
    metadata={"source": "tweet"},#additional info
)

document_2 = Document(
    page_content="The weather forecast for tomorrow is cloudy and overcast, with a high of 62 degrees.",
    metadata={"source": "news"},
)

document_3 = Document(
    page_content="Building an exciting new project with LangChain - come check it out!",
    metadata={"source": "tweet"},
)

document_4 = Document(
    page_content="Robbers broke into the city bank and stole $1 million in cash.",
    metadata={"source": "news"},
)

document_5 = Document(
    page_content="Wow! That was an amazing movie. I can't wait to see it again.",
    metadata={"source": "tweet"},
)

document_6 = Document(
    page_content="Is the new iPhone worth the price? Read this review to find out.",
    metadata={"source": "website"},
)

document_7 = Document(
    page_content="The top 10 soccer players in the world right now.",
    metadata={"source": "website"},
)

document_8 = Document(
    page_content="LangGraph is the best framework for building stateful, agentic applications!",
    metadata={"source": "tweet"},
)

document_9 = Document(
    page_content="The stock market is down 500 points today due to fears of a recession.",
    metadata={"source": "news"},
)

document_10 = Document(
    page_content="I have a bad feeling I am going to get deleted :(",
    metadata={"source": "tweet"},
)

documents = [
    document_1,
    document_2,
    document_3,
    document_4,
    document_5,
    document_6,
    document_7,
    document_8,
    document_9,
    document_10,
]
documents

[Document(metadata={'source': 'tweet'}, page_content='I had chocolate chip pancakes and scrambled eggs for breakfast this morning.'),
 Document(metadata={'source': 'news'}, page_content='The weather forecast for tomorrow is cloudy and overcast, with a high of 62 degrees.'),
 Document(metadata={'source': 'tweet'}, page_content='Building an exciting new project with LangChain - come check it out!'),
 Document(metadata={'source': 'news'}, page_content='Robbers broke into the city bank and stole $1 million in cash.'),
 Document(metadata={'source': 'tweet'}, page_content="Wow! That was an amazing movie. I can't wait to see it again."),
 Document(metadata={'source': 'website'}, page_content='Is the new iPhone worth the price? Read this review to find out.'),
 Document(metadata={'source': 'website'}, page_content='The top 10 soccer players in the world right now.'),
 Document(metadata={'source': 'tweet'}, page_content='LangGraph is the best framework for building stateful, agentic application

In [103]:
len(documents)

10

In [104]:
for _ in range(len(documents)):
    print(_)
    print(str(uuid4()))

0
0c0a5ebf-8e5d-4da5-aef2-f7a53e974012
1
3292c73c-f91d-42ea-92ba-43a04a1e7194
2
b301a809-64d3-4c16-850e-5f38a09d7f0d
3
fdf35423-d6b3-4c60-a7f9-1e120de6689b
4
5f67c457-1538-4aff-b801-8086f7c41952
5
f969cdef-5ce7-4a43-a283-e3d52b6f2ade
6
5ac2dce9-4fc9-4e93-bbd1-69d8c50caafa
7
54a3d14e-bfb6-4ebe-99b1-dad2f243681f
8
97e52599-7413-46d5-ab87-9cf0e5755c16
9
d88c4e9d-2554-42f3-b6e7-21e6608da53e


In [105]:
uuids = [str(uuid4()) for _ in range(len(documents))]
uuids

['b8854bf4-aa6e-43d4-8782-751b80ddc6bc',
 '2a912983-4c43-4afb-823f-8392315d7096',
 '62d6401f-70f7-456b-b4d9-cb0fa82d3c33',
 '1c338dcf-14b8-4ee3-8d04-8d7b52f72a25',
 '1ac042e2-432a-49ca-be65-03511442ae69',
 'd415b4c5-b591-4bda-aad6-4cb11e870877',
 'a8366c53-e311-443f-8748-ad6f4dacc2ea',
 '488835d4-6337-4915-ae5b-099b0c65d9a1',
 'd6e1bb50-ac73-4f68-93a2-782fff0b6d44',
 'be3becd0-9984-4ab5-8729-b42e7d0b186e']

In [106]:
pinecone_vs.add_documents(documents=documents, id=uuids)

['76734e21-da3b-43ea-bee2-90cbc263c534',
 '3e89364e-29b6-4ed0-b9f6-a43df2fbb2cd',
 '076b8862-6b0b-411a-b373-d2894d905d03',
 'c4560632-968b-4171-8d16-21a18f448b79',
 '0baa387d-5944-40e7-b4f0-7b6075ced66b',
 '4b36f29f-5450-4086-a628-69be55a38ddd',
 'a2506975-672c-4c25-982a-d17d4501ae01',
 '6ce39a13-7929-4e62-a2d2-56cb5ebfd5ec',
 '5e0487a9-2c33-4b0a-9f19-6eeadf15613d',
 '4f019c92-cc0a-45d6-8c47-5834c8239c8f']

In [107]:
results = pinecone_vs.similarity_search("what langchain provides to us?", k=3)
results

[Document(id='687e1025-e918-4c79-a242-be4edf4d6652', metadata={'source': 'tweet'}, page_content='Building an exciting new project with LangChain - come check it out!'),
 Document(id='33e18021-5768-43a7-87be-eebf7d506c07', metadata={'source': 'tweet'}, page_content='Building an exciting new project with LangChain - come check it out!'),
 Document(id='282c720e-31e8-443a-9111-8b65eba9ccc6', metadata={'source': 'tweet'}, page_content='LangGraph is the best framework for building stateful, agentic applications!')]

In [108]:
retriever = pinecone_vs.as_retriever(search_type='similarity_score_threshold',
                                     search_kwargs={'score_threshold': 0.7})

In [109]:
retriever.invoke("what is langchain?")

[Document(id='33e18021-5768-43a7-87be-eebf7d506c07', metadata={'source': 'tweet'}, page_content='Building an exciting new project with LangChain - come check it out!'),
 Document(id='687e1025-e918-4c79-a242-be4edf4d6652', metadata={'source': 'tweet'}, page_content='Building an exciting new project with LangChain - come check it out!'),
 Document(id='282c720e-31e8-443a-9111-8b65eba9ccc6', metadata={'source': 'tweet'}, page_content='LangGraph is the best framework for building stateful, agentic applications!'),
 Document(id='04c2afbf-92a1-4058-878f-9d5e24793f7c', metadata={'source': 'tweet'}, page_content='LangGraph is the best framework for building stateful, agentic applications!')]

In [110]:
model = ChatGoogleGenerativeAI(model='gemini-1.5-flash')

In [111]:
prompt = hub.pull('rlm/rag-prompt')

pprint.pprint(prompt.messages)

[HumanMessagePromptTemplate(prompt=PromptTemplate(input_variables=['context', 'question'], input_types={}, partial_variables={}, template="You are an assistant for question-answering tasks. Use the following pieces of retrieved context to answer the question. If you don't know the answer, just say that you don't know. Use three sentences maximum and keep the answer concise.\nQuestion: {question} \nContext: {context} \nAnswer:"), additional_kwargs={})]


In [112]:
prompt=PromptTemplate(
    template="""You are an assistant for question-answering tasks. Use the following pieces of retrieved context to answer the question. If you don't know the answer, just say that you don't know. Use three sentences maximum and keep the answer concise.\nQuestion: {question} \nContext: {context} \nAnswer:""",
    input_variables=['context', 'question']
)
prompt

PromptTemplate(input_variables=['context', 'question'], input_types={}, partial_variables={}, template="You are an assistant for question-answering tasks. Use the following pieces of retrieved context to answer the question. If you don't know the answer, just say that you don't know. Use three sentences maximum and keep the answer concise.\nQuestion: {question} \nContext: {context} \nAnswer:")

In [113]:
prompt.invoke({'question':'what is langchain', 'context':'langchain is very super framework for LLM.'})

StringPromptValue(text="You are an assistant for question-answering tasks. Use the following pieces of retrieved context to answer the question. If you don't know the answer, just say that you don't know. Use three sentences maximum and keep the answer concise.\nQuestion: what is langchain \nContext: langchain is very super framework for LLM. \nAnswer:")

In [114]:
def format_docs(docs):
    return '\n\n'.join(doc.page_content for doc in docs)

In [115]:
rag_chain = (
    {"context": retriever | format_docs, "question": RunnablePassthrough()}
    | prompt
    | model
    | StrOutputParser()
)

In [116]:
rag_chain.invoke("what you know?")

"I don't know.  The provided text expresses anxiety about deletion and advertises a LangChain project, but offers no other information."