# RAG Common Setup

Configuration shared by all RAG examples is defined here.

## Packages

In [5]:
! pip install -qU jupyter-contrib-nbextensions pickleshare

! pip install -qU langchain_aws langchain_community tiktoken langchain chromadb langchain-chroma
 

## LangSmith

Setting the following LangSmith environment variables allows the use of [LangSmith tracing](https://smith.langchain.com/). To use this you need a LangSmith API key (this requires creating a free account). This is optional.

In [4]:
import getpass
import os

def _set_env(key: str):
    if key not in os.environ:
        os.environ[key] = getpass.getpass(f"{key}:")


In [3]:
import os

try:
    enable_langsmith
except NameError:
    enable_langsmith = input('Enabled Langsmith tracing? (y/n): ').lower().strip() == 'y'
    %store enable_langsmith

if (enable_langsmith=='y'):
    os.environ['LANGCHAIN_TRACING_V2'] = 'true'
    os.environ['LANGCHAIN_ENDPOINT'] = 'https://api.smith.langchain.com'
    _set_env("LANGCHAIN_API_KEY")
else:
    os.environ['LANGCHAIN_TRACING_V2'] = 'false'

## AWS Credentials

In [8]:
import os
# Prompt (in this order) for each AWS credential that is not already present
# in the environment.
for _credential_name in ("AWS_ACCESS_KEY_ID", "AWS_SECRET_ACCESS_KEY", "AWS_SESSION_TOKEN"):
    _set_env(_credential_name)
del _credential_name  # do not leave the loop variable in the notebook namespace

# The region is always overwritten, even if AWS_REGION was already set.
os.environ.update(AWS_REGION='us-west-2')

## LLMs

In [9]:
from langchain_aws import BedrockEmbeddings, ChatBedrock
import os

# Embedding model, constructed with the library defaults (no explicit
# credentials or region are passed here).
embeddings = BedrockEmbeddings()

# Chat model. Credentials and region are read from the environment variables
# set in the AWS Credentials section; a missing variable raises KeyError.
llm = ChatBedrock(
    model_id="anthropic.claude-3-5-sonnet-20241022-v2:0",
    model_kwargs=dict(temperature=0),
    aws_access_key_id=os.environ["AWS_ACCESS_KEY_ID"],
    aws_secret_access_key=os.environ["AWS_SECRET_ACCESS_KEY"],
    aws_session_token=os.environ["AWS_SESSION_TOKEN"],
    region_name=os.environ["AWS_REGION"],
)

## Shared functions

Declare functions to load data from a blog, split it, then load the split data into a vectorstore.

In [10]:
from langchain_community.document_loaders import WebBaseLoader
from langchain_core.documents import Document
from typing import Iterator
import bs4

def load_sample_data() -> Iterator[Document]:
    """Fetch the sample blog post so it can later be stored in a vectorstore.

    The page is loaded with ``WebBaseLoader`` and parsing is restricted to
    elements whose CSS class is ``post-content``, ``post-title`` or
    ``post-header``.

    Returns:
        The documents produced by ``WebBaseLoader.load()``.
    """
    wanted_classes = ("post-content", "post-title", "post-header")
    html_filter = bs4.SoupStrainer(class_=wanted_classes)

    blog_loader = WebBaseLoader(
        web_paths=("https://lilianweng.github.io/posts/2023-06-23-agent/",),
        bs_kwargs={"parse_only": html_filter},
    )
    return blog_loader.load()

USER_AGENT environment variable not set, consider setting it to identify your requests.


In [11]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
from typing import List

def split_sample_data(docs:Iterator[Document], chunk_size=300, chunk_overlap=50) -> List[Document]:
    """Split documents into smaller chunks.

    Args:
        docs: Documents to split.
        chunk_size: Passed to ``RecursiveCharacterTextSplitter.from_tiktoken_encoder``
            (presumably measured in tiktoken tokens — confirm against the library docs).
        chunk_overlap: Overlap between consecutive chunks, passed to the same factory.

    Returns:
        The chunks returned by the splitter's ``split_documents``.
    """
    splitter_options = {"chunk_size": chunk_size, "chunk_overlap": chunk_overlap}
    splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(**splitter_options)
    return splitter.split_documents(docs)

In [7]:
from langchain_chroma import Chroma
from langchain_core.vectorstores import VectorStoreRetriever
from uuid import uuid4

def seed_sample_data(documents:List[Document], k=1) -> VectorStoreRetriever: 
    """Store `documents` in a Chroma collection and return a retriever over it.

    The collection is named ``rag_techniques``, uses the module-level
    ``embeddings`` object and is persisted under ``./chroma_db``. Every
    document is added under a freshly generated random UUID.
    NOTE(review): because the IDs are new on every call, repeated calls
    presumably add the same documents again to the persisted collection — confirm.

    Args:
        documents: Documents to add to the collection.
        k: Value passed as ``search_kwargs["k"]`` to the retriever.

    Returns:
        The retriever created by ``as_retriever``.
    """
    store = Chroma(
        persist_directory="./chroma_db",
        collection_name="rag_techniques",
        embedding_function=embeddings,
    )

    # One random identifier per document.
    document_count = len(documents)
    document_ids = [str(uuid4()) for _ in range(document_count)]
    store.add_documents(documents=documents, ids=document_ids)

    return store.as_retriever(search_kwargs=dict(k=k))

[Tiktoken](https://github.com/openai/tiktoken/blob/main/README.md) is a fast [BPE](https://en.wikipedia.org/wiki/Byte_pair_encoding) open-source tokenizer by OpenAI. Given a text string (e.g., `tiktoken is great!`) and an encoding (e.g., `cl100k_base`), a tokenizer can split the text string into a list of tokens (e.g., [`t`, `ik`, `token`, `is`, `great`, `!`]). Splitting text strings into tokens is useful because GPT models see text in the form of tokens. Knowing how many tokens are in a text string can tell you (a) whether the string is too long for a text model to process and (b) how much an OpenAI API call costs (as usage is priced by token).

In [8]:
import tiktoken

def num_tokens_from_string(string: str, encoding_name: str) -> int:
    """Count the tokens `string` encodes to under the named tiktoken encoding.

    Args:
        string: Text to tokenize.
        encoding_name: Name passed to ``tiktoken.get_encoding`` (e.g. ``cl100k_base``).

    Returns:
        Length of the token list produced by the encoding.
    """
    tokens = tiktoken.get_encoding(encoding_name).encode(string)
    return len(tokens)

In [None]:
def write_results(filename, result, results_dir="../Z - results"):
    """Write results to disk for later analysis.

    Args:
        filename: Name of the file to create inside `results_dir`; an existing
            file with the same name is overwritten.
        result: Text to write.
        results_dir: Directory that receives the file. It is created if it
            does not exist. Defaults to the shared ``../Z - results`` folder.
    """
    os.makedirs(results_dir, exist_ok=True)

    target_path = os.path.join(results_dir, filename)

    # Explicit UTF-8 so non-ASCII text is written identically on every
    # platform instead of depending on the locale's default encoding.
    with open(target_path, "w", encoding="utf-8") as f:
        f.write(result)