# Creating a Knowledge Base (the index)

install dependencies

In [38]:
# !pip install langchain langchain-openai langchain-community faiss-cpu tiktoken

import libraries

In [64]:
from langchain_openai import OpenAI
from langchain_community.document_loaders import TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_openai import OpenAIEmbeddings
from langchain_community.vectorstores import FAISS
from langchain_core.vectorstores import VectorStoreRetriever
import os

Set API Keys

In [65]:
os.environ["OPENAI_API_KEY"] = "sk-"

## Text Loaders

In [66]:
loader = TextLoader("./transcript.txt")
documents = loader.load()

## Text Splitters

In [67]:
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size = 200,
    chunk_overlap=0,
    length_function=len,
)

docs = text_splitter.split_documents(documents)

In [69]:
docs = text_splitter.split_documents(documents)

## Create Embeddings

In [71]:
embedding_model = OpenAIEmbeddings()
index = FAISS.from_documents(docs, embedding_model)

## Query Documents

In [72]:
queries = [
    "What is RAG?",
    "What are the limitations in retrieval?"
    "What are the benefits of using RAG?"
    "What is a RAG pipeline?"
    "What happens if I ask an irrelevant question to my RAG pipeline?"
]

query = queries[0]

## Naive Similary Search in Vector Store

In [76]:
similar_documents = index.similarity_search(query)

In [77]:
similar_documents

[Document(metadata={'source': './transcript.txt'}, page_content="In this video, I am going to give a general overview of RAG, that's Retrieval Augmented Generation."),
 Document(metadata={'source': './transcript.txt'}, page_content='I will also go over some of the use cases for implementing a RAG pipeline into chatting with your documents.\nSo you can think of RAG as an advanced search engine for searching through your documents.'),
 Document(metadata={'source': './transcript.txt'}, page_content='This is your indexing component of your rag pipeline.\nThis is your retriever component of your rag pipeline.\nAnd then you have your generator.\nThese are the three components to your rag pipeline.'),
 Document(metadata={'source': './transcript.txt'}, page_content="And a RAG pipeline allows you to customize what pieces of those documents are provided into the LLM so that you can reduce your your token count that you're providing and reduce your cost.")]

Show all documents and scores

In [78]:
docs_and_scores = index.similarity_search_with_score(query)

In [79]:
docs_and_scores

[(Document(metadata={'source': './transcript.txt'}, page_content="In this video, I am going to give a general overview of RAG, that's Retrieval Augmented Generation."),
  0.2460782),
 (Document(metadata={'source': './transcript.txt'}, page_content='I will also go over some of the use cases for implementing a RAG pipeline into chatting with your documents.\nSo you can think of RAG as an advanced search engine for searching through your documents.'),
  0.30089223),
 (Document(metadata={'source': './transcript.txt'}, page_content='This is your indexing component of your rag pipeline.\nThis is your retriever component of your rag pipeline.\nAnd then you have your generator.\nThese are the three components to your rag pipeline.'),
  0.36235282),
 (Document(metadata={'source': './transcript.txt'}, page_content="And a RAG pipeline allows you to customize what pieces of those documents are provided into the LLM so that you can reduce your your token count that you're providing and reduce your 

## Save Index

In [80]:
index.save_local("faiss_index")

## Load Index

In [81]:
saved_index = FAISS.load_local("faiss_index", embedding_model, allow_dangerous_deserialization=True)

## Build a Retriever

In [82]:
retriever = saved_index.as_retriever()

In [83]:
chunks = retriever.invoke(query)

In [89]:
from langchain.chains import RetrievalQA

In [90]:
rag_chain = RetrievalQA.from_chain_type(llm=OpenAI(), chain_type="stuff", retriever=retriever)

In [106]:
queries = [
    "What is RAG?",
    "What are the limitations in retrieval?"
    "What are the benefits of using RAG?"
    "What is a RAG pipeline?"
    "What happens if I ask an irrelevant question to my RAG pipeline?"
]

query = queries[1]

In [107]:
answer = rag_chain.invoke(query)

In [108]:
answer['result']

' \n\nThe limitations in retrieval are that it can be time-consuming and may not always provide accurate or relevant answers. The benefits of using RAG include the ability to implement more advanced components into the retrieval process, scalability, and the ability to limit the documents being searched. A RAG pipeline is a method of using RAG (Retrieval-Augmented Generation) to search through documents and provide factual answers. If you ask an irrelevant question to your RAG pipeline, it may not provide an accurate or relevant answer.'