In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the read-only Kaggle input tree and print the full path of every file found.
for folder, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        full_path = os.path.join(folder, name)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [2]:
!pip install -q langchain-google-genai
!pip install --upgrade -q langchain-google-genai
!pip show langchain-google-genai
!pip install -q google-generativeai
!pip install -q python-dotenv
! pip install -q langchain_community tiktoken langchainhub chromadb langchain langsmith

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m41.7/41.7 kB[0m [31m1.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m413.0/413.0 kB[0m [31m10.8 MB/s[0m eta [36m0:00:00[0m
[?25h[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
langchain 0.3.12 requires async-timeout<5.0.0,>=4.0.0; python_version < "3.11", but you have async-timeout 5.0.1 which is incompatible.[0m[31m
[0mName: langchain-google-genai
Version: 2.0.9
Summary: An integration package connecting Google's genai package and LangChain
Home-page: https://github.com/langchain-ai/langchain-google
Author: 
Author-email: 
License: MIT
Location: /usr/local/lib/python3.10/dist-packages
Requires: filetype, google-generativeai, langchain-core, pydantic
Required-by: 
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [

In [3]:
from kaggle_secrets import UserSecretsClient
# Read credentials from Kaggle's secret store and export them as environment
# variables for the rest of the notebook. A single client serves both lookups.
secrets_client = UserSecretsClient()
os.environ['GOOGLE_API_KEY'] = secrets_client.get_secret('GOOGLE_API_KEY')
# The Kaggle secret is named LANGSMITH_API_KEY but is exported as LANGCHAIN_API_KEY.
os.environ['LANGCHAIN_API_KEY'] = secrets_client.get_secret('LANGSMITH_API_KEY')
# Turn on LangSmith tracing and choose the project the runs are logged under.
os.environ["LANGCHAIN_TRACING_V2"] = "true"
os.environ["LANGCHAIN_PROJECT"] = "pr-prickly-example-65"

bs4 (BeautifulSoup): Helps parse and extract specific parts of a webpage.
langchain.hub: Provides access to reusable components in LangChain.
RecursiveCharacterTextSplitter: Splits large text into smaller chunks so it's easier to work with.
WebBaseLoader: Loads data from a webpage.
Chroma: A vector database for storing and retrieving embeddings (numerical representations of text).
StrOutputParser: Converts outputs into plain text.
RunnablePassthrough: Passes data through a pipeline without modification.
ChatGoogleGenerativeAI & GoogleGenerativeAIEmbeddings: Handle Google's generative chat model and text embeddings (this notebook uses Google's models rather than OpenAI's).

In [4]:
import os
from kaggle_secrets import UserSecretsClient
from langchain_google_genai import ChatGoogleGenerativeAI, GoogleGenerativeAIEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import WebBaseLoader
from langchain_community.vectorstores import Chroma
from langchain_core.prompts import PromptTemplate
from langchain_core.runnables import RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser
from langchain_core.tracers import LangChainTracer

Loading Documents from a Website:

WebBaseLoader fetches content from the given blog post URL:
👉 Lilian Weng’s blog on AI Agents
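bs_kwargs can tell BeautifulSoup to extract only certain parts of the webpage (note: the loader cell in this notebook does not pass bs_kwargs, so the whole page text is loaded), for example: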
post-content: The main body of the blog.
post-title: The title of the blog.
post-header: Any headers at the top of the blog.
docs = loader.load()

This actually downloads and extracts the relevant text from the blog post.
The result (docs) is a list of LangChain Document objects, one per URL loaded; splitting into chunks happens in the next step.

In [5]:
# Fetch the blog post that serves as the knowledge source for the RAG pipeline.
blog_url = "https://lilianweng.github.io/posts/2023-06-23-agent/"
loader = WebBaseLoader(web_paths=(blog_url,))
docs = loader.load()

Step 1: Splitting the Text for Better Processing

🔹 Why do this?
Long documents can be hard for language models to handle all at once, so we break them into smaller chunks.

chunk_size=1000: Each chunk contains at most 1000 characters.
chunk_overlap=200: Consecutive chunks share up to 200 characters of overlap to keep context intact.
split_documents(docs): Actually splits the loaded blog text into these chunks.
👉 Now we have smaller, manageable pieces of text that can be efficiently searched and retrieved.


In [6]:
# Cut the loaded documents into overlapping pieces sized for embedding and retrieval.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_overlap=200,
    chunk_size=1000,
)
splits = text_splitter.split_documents(docs)

Creating a Vector Database

🔹 Why do this?
Raw text is hard to search directly, so we convert it into numerical representations (embeddings).

GoogleGenerativeAIEmbeddings(): Uses Google's embedding model (models/embedding-001) to generate vector representations of each text chunk.
Chroma.from_documents(...): Stores these embeddings in a Chroma vector database.
👉 Now we have a searchable database, where each chunk of text is represented as a mathematical vector.

retriever = vectorstore.as_retriever()
🔹 What does this do?

Converts the vector database into a retriever, meaning we can now search for relevant text chunks by providing a query.
👉 Now, if we ask a question, the retriever will find the most relevant pieces of text from our stored embeddings.

In [7]:
# Embed every chunk, index the vectors in Chroma, and expose the index as a retriever.
embedding_model = GoogleGenerativeAIEmbeddings(
    model="models/embedding-001",
)
vectorstore = Chroma.from_documents(
    embedding=embedding_model,
    documents=splits,
)
retriever = vectorstore.as_retriever()

In [8]:
from langchain.prompts import PromptTemplate
#### RETRIEVAL and GENERATION ####

# Prompt: the retrieved text fills {context} and the user's query fills {question}.
prompt = PromptTemplate(
    template=(
        "You are an AI assistant. Use the following context to answer the question:\n\n"
        "Context:\n{context}\n\n"
        "Question: {question}\n\n"
        "Answer:"
    ),
    input_variables=["context", "question"],
)


In [9]:
# Tracer instance (not referenced by any of the cells shown in this notebook).
tracer = LangChainTracer()

# Chat model used for generation; temperature 0 is set explicitly.
llm = ChatGoogleGenerativeAI(
    temperature=0,
    model="gemini-pro",
)

# Post-processing
def format_docs(docs) -> str:
    """Concatenate the ``page_content`` of each document into one string.

    Args:
        docs: Iterable of objects exposing a ``page_content`` attribute.

    Returns:
        The contents joined in order, with a blank line between documents
        (an empty string when ``docs`` is empty).
    """
    page_texts = [document.page_content for document in docs]
    separator = "\n\n"
    return separator.join(page_texts)

# Chain: build the prompt inputs, render the prompt, call the model, and
# parse the model output into a plain string.
chain_inputs = {
    # Retrieved documents are flattened to a single text block for {context}.
    "context": retriever | format_docs,
    # The incoming question is forwarded unchanged for {question}.
    "question": RunnablePassthrough(),
}
rag_chain = chain_inputs | prompt | llm | StrOutputParser()

In [10]:
from langsmith import traceable

@traceable  # This enables LangSmith tracking
def run_rag(question: str = "What is Task Decomposition?") -> str:
    """Run the RAG chain on a question and return the generated answer.

    Args:
        question: Natural-language question passed to ``rag_chain``. Defaults
            to the notebook's original example question, so calling
            ``run_rag()`` with no arguments behaves exactly as before.

    Returns:
        The output of ``rag_chain.invoke`` (the chain ends in a string
        output parser).
    """
    return rag_chain.invoke(question)


response = run_rag()
print(response)

Task decomposition is the process of breaking down a complex task into smaller, more manageable steps. This can be done by the LLM with simple prompting, by using task-specific instructions, or with human inputs.


In [11]:
import langsmith

# Sanity check: report the LangSmith settings currently present in the environment.
# The API key itself is never printed, only whether it is set.
langsmith_status = (
    ("LangSmith tracing is enabled:", os.getenv("LANGCHAIN_TRACING_V2")),
    ("LangSmith project:", os.getenv("LANGCHAIN_PROJECT")),
    ("LangSmith API Key set:", bool(os.getenv("LANGCHAIN_API_KEY"))),
)
for label, value in langsmith_status:
    print(label, value)


LangSmith tracing is enabled: true
LangSmith project: pr-prickly-example-65
LangSmith API Key set: True
