In [None]:
# A typical RAG application has four main components:
# 1. Document Loader
# 2. Text Splitter
# 3. Vector Database
# 4. Retriever

### Document Loaders

Some popular document loaders are:
1. Text Document Loader
2. PDF Document Loader
3. CSV Document Loader
4. Directory Document Loader
5. Web-Based Document Loader (`WebBaseLoader`)

In [None]:
from langchain_community.document_loaders.text import TextLoader

# Text document loader: point it at a single file, then load that file's contents.
text_loader = TextLoader(file_path='file.txt')
text_docs = text_loader.load()

# A bare final expression makes the notebook render the loaded result.
text_docs

In [None]:
from langchain_community.document_loaders.pdf import PyPDFLoader

# PDF loader: reads the PDF at the given path and loads its contents.
pdf_loader = PyPDFLoader(file_path="Atomic habits ( PDFDrive ).pdf")

pdf_docs = pdf_loader.load()

# Show a count plus a short preview instead of the whole list: rendering every
# loaded document of a full book floods the cell output and bloats the notebook file.
print(f"Loaded {len(pdf_docs)} documents from the PDF")
pdf_docs[:3]

In [None]:
from langchain_community.document_loaders.csv_loader import CSVLoader

# CSV loader: reads the CSV file at the given path and loads its contents.
csv_loader = CSVLoader(file_path= 'diabetes.csv')

csv_docs = csv_loader.load()

# Show a count plus a short preview instead of the whole list, so the cell
# output stays readable however many rows the CSV contains.
print(f"Loaded {len(csv_docs)} documents from the CSV")
csv_docs[:3]

In [None]:
from langchain_community.document_loaders import DirectoryLoader
from langchain_community.document_loaders.text import TextLoader

# Directory loader: applies TextLoader to every file in `text_files` that
# matches the `*.txt` pattern, with progress reporting switched on.
directory_loader = DirectoryLoader(
    path='text_files',
    glob='*.txt',
    loader_cls=TextLoader,
    show_progress=True,
)
directory_docs = directory_loader.load()

# Render everything that was loaded from the directory.
directory_docs

In [None]:
from langchain_community.document_loaders import WebBaseLoader

# Web loader: fetches the page behind the URL and loads its content.
webbase_loader = WebBaseLoader(
    web_path="https://www.google.com/search?q=what+is+machine+learning&sourceid=chrome&ie=UTF-8",
)
web_docs = webbase_loader.load()

# Render what was loaded from the page.
web_docs

### Text Splitters
Types of text splitters:
1. Length-based text splitting (Character Text Splitter)
2. Text-structure-based text splitting (Recursive Character Text Splitter)
3. Document-structure-based text splitting (language-aware formats such as Python, JS, Markdown, etc.)
4. Semantic-meaning-based text splitting (still in the experimental stage)

In [None]:
from langchain.text_splitter import CharacterTextSplitter
from langchain_community.document_loaders.text import TextLoader

# Load the sample text inside this cell so the splitter demo has its own input.
text_loader = TextLoader(file_path='file.txt')
text_docs = text_loader.load()

# Character text splitter: length-based splitting on a single separator (a space),
# configured with a chunk size of 100 and an overlap of 10.
char_splitter = CharacterTextSplitter(separator=' ', chunk_size=100, chunk_overlap=10)
char_split_docs = char_splitter.split_documents(text_docs)

# Render the resulting chunks.
char_split_docs

In [None]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders.text import TextLoader

# Load the sample text inside this cell so the splitter demo has its own input.
text_loader = TextLoader(file_path='file.txt')
text_docs = text_loader.load()

# Recursive character text splitter: same chunk size (100) and overlap (10) as
# the character splitter demo, but no explicit separator is passed here.
rec_char_splitter = RecursiveCharacterTextSplitter(chunk_size=100, chunk_overlap=10)
rec_char_split_docs = rec_char_splitter.split_documents(text_docs)

# Render the resulting chunks.
rec_char_split_docs

In [None]:
from langchain.text_splitter import RecursiveCharacterTextSplitter, Language

# Small Python snippet used as the input for the language-aware splitter.
PYTHON_CODE = """
def hello_world():
    print("Hello, World!")

# Call the function
hello_world()
"""

# Document-structure-based splitting: `from_language` builds a splitter for the
# chosen language; swap `Language.PYTHON` for another member (JS, Markdown, ...)
# to match the document being split.
python_splitter = RecursiveCharacterTextSplitter.from_language(
    language=Language.PYTHON,
    chunk_size=50,
    chunk_overlap=0,
)

# `create_documents` takes raw strings (not loaded documents), hence the list wrapper.
python_docs = python_splitter.create_documents([PYTHON_CODE])
python_docs

### Vector Stores or Databases

Popular vector databases:
1. FAISS
2. Chroma DB
3. Pinecone
4. Weaviate

In [None]:
# First define one embedding model; the vector store cells below reuse `embeddings`.
from langchain_google_genai import GoogleGenerativeAIEmbeddings
from dotenv import load_dotenv

# Load environment variables from a local .env file
# (presumably where the Google API key lives - confirm for your setup).
load_dotenv()

embeddings = GoogleGenerativeAIEmbeddings(
    model="models/gemini-embedding-001"
)

# One input text in, one embedding vector out.
vector = embeddings.embed_documents(["Hello My name is abhijeet"])

# Show the vector's dimensionality and its first few components instead of
# dumping every float, which would bury the rest of the notebook.
len(vector[0]), vector[0][:5]

In [None]:
!pip install onnxruntime

In [None]:
from langchain_chroma import Chroma
from langchain_core.documents import Document

# Chroma DB: a collection persisted on disk under `chroma_db`, embedded with the
# `embeddings` model defined in an earlier cell.
vector_store = Chroma(
    embedding_function=embeddings,
    persist_directory='chroma_db',
    collection_name='random_data'
)

# Small sample corpus with deliberately mixed topics so similarity search has
# something to discriminate between.
docs = [
    Document(
        page_content= "Apple macbook is capable of running GTA 5 game smoothly",
        metadata= {'source': 'Apple Inc.'}
    ),
    Document(
        page_content="Atoms are the most basic thing of every element. Every objects are made up of Atoms.",
        metadata= {'source': 'chemistry knowledge'}
    ),
    Document(
        page_content= "India is a beautiful country with diversified cultures.",
        metadata= {'source': 'story about India'}
    ),
    Document(
        page_content = "Ram brought Apple from shopping mall but it was bitter in taste.",
        metadata= {'source': 'Lifestyle'}
    ),
    Document(
        page_content= "NVIDIA launches it's new GPU which is 10x faster then the previous GPUs.",
        metadata = {'source': 'Tech Thing'}
    ),
    Document(
        page_content= "Radio active elements emits radition and decay after sometime.",
        metadata= {'source': 'Radiation'}
    ),
    Document(
        page_content= "Spices of India are very popular for it's unique taste and rich in fibers.",
        metadata = {'source': 'Taste of India'}
    )
]

# The collection is persisted, so adding without ids would store a fresh copy of
# every document each time this cell is re-run. Stable, position-based ids make
# a re-run overwrite the same entries instead.
# NOTE: copies already stored by earlier runs without ids are not removed by this.
doc_ids = [f"random-data-{position}" for position in range(len(docs))]
vector_store.add_documents(docs, ids=doc_ids)

# Only a cell's last expression is rendered automatically, so the stored
# contents are shown explicitly (`display` is provided by the notebook kernel).
display(vector_store.get(include=['documents', 'metadatas']))

# Two nearest documents to the query.
vector_store.similarity_search(query= "I bought a new gaming laptop", k=2)

In [None]:
import faiss
from langchain_community.vectorstores import FAISS
from langchain_community.docstore.in_memory import InMemoryDocstore

# FAISS vector store
# The flat L2 index needs the embedding dimensionality up front, so embed one
# probe string and take the length of the resulting vector.
probe_vector = embeddings.embed_query('Hello World!')
index = faiss.IndexFlatL2(len(probe_vector))

# Start from an empty in-memory docstore and an empty index-to-id mapping.
vector_store = FAISS(
    embedding_function=embeddings,
    docstore=InMemoryDocstore(),
    index=index,
    index_to_docstore_id={},
)

# (text, source) pairs for the sample corpus; expanded into Document objects below.
sample_texts = [
    ("Apple macbook is capable of running GTA 5 game smoothly", 'Apple Inc.'),
    ("Atoms are the most basic thing of every element. Every objects are made up of Atoms.", 'chemistry knowledge'),
    ("India is a beautiful country with diversified cultures.", 'story about India'),
    ("Ram brought Apple from shopping mall but it was bitter in taste.", 'Lifestyle'),
    ("NVIDIA launches it's new GPU which is 10x faster then the previous GPUs.", 'Tech Thing'),
    ("Radio active elements emits radition and decay after sometime.", 'Radiation'),
    ("Spices of India are very popular for it's unique taste and rich in fibers.", 'Taste of India'),
]
docs = [
    Document(page_content=text, metadata={'source': source})
    for text, source in sample_texts
]

vector_store.add_documents(documents=docs)

# Two nearest documents to the query.
vector_store.similarity_search("I bought a new gaming laptop", k=2)

In [None]:
from dotenv import load_dotenv
import os
from pinecone import Pinecone
from pinecone import ServerlessSpec
from langchain_pinecone import PineconeVectorStore
from langchain_core.documents import Document

# Load environment variables from a local .env file before reading the API key.
load_dotenv()

pinecone_api_key = os.environ.get("PINECONE_API_KEY")
pc = Pinecone(api_key=pinecone_api_key)

index_name = "langchain-test-index"  # change if desired

# Create the serverless index only if it does not exist yet.
# NOTE: `dimension` must equal the length of the vectors produced by
# `embeddings`; 3072 is assumed to match the embedding model configured
# earlier in the notebook - confirm if you change the model.
if not pc.has_index(index_name):
    pc.create_index(
        name=index_name,
        dimension=3072,
        metric="cosine",
        spec=ServerlessSpec(cloud="aws", region="us-east-1"),
    )

index = pc.Index(index_name)

# Pinecone Vector Store
pinecone_vector_store = PineconeVectorStore(
    index = index,
    embedding= embeddings
)

# Same sample corpus as the other vector store demos.
docs = [
    Document(
        page_content= "Apple macbook is capable of running GTA 5 game smoothly",
        metadata= {'source': 'Apple Inc.'}
    ),
    Document(
        page_content="Atoms are the most basic thing of every element. Every objects are made up of Atoms.",
        metadata= {'source': 'chemistry knowledge'}
    ),
    Document(
        page_content= "India is a beautiful country with diversified cultures.",
        metadata= {'source': 'story about India'}
    ),
    Document(
        page_content = "Ram brought Apple from shopping mall but it was bitter in taste.",
        metadata= {'source': 'Lifestyle'}
    ),
    Document(
        page_content= "NVIDIA launches it's new GPU which is 10x faster then the previous GPUs.",
        metadata = {'source': 'Tech Thing'}
    ),
    Document(
        page_content= "Radio active elements emits radition and decay after sometime.",
        metadata= {'source': 'Radiation'}
    ),
    Document(
        page_content= "Spices of India are very popular for it's unique taste and rich in fibers.",
        metadata = {'source': 'Taste of India'}
    )
]

# The index lives in a remote service and outlives the kernel, so adding
# without ids would insert a new copy of every document on each re-run.
# Stable, position-based ids make a re-run overwrite the same vectors instead.
doc_ids = [f"sample-doc-{position}" for position in range(len(docs))]
pinecone_vector_store.add_documents(docs, ids=doc_ids)

In [None]:
# Query the Pinecone-backed store for the two closest matches to a sentence
# that is unrelated to most of the sample corpus.
pinecone_vector_store.similarity_search(
    'Acid and base reaction gives us neutral solution.',
    k=2,
)

### Retrievers
1. Vector Store Retriever
2. Maximal Marginal Relevance (MMR) Retriever
3. Multi-Query Retriever (MQR)
4. Contextual Compression Retriever (CCR)

In [None]:
# Vector retriever: wraps the Pinecone store and asks for the top 3 matches per query.
retriever = pinecone_vector_store.as_retriever(
    search_kwargs={'k': 3},
)
response = retriever.invoke('India is a good country.')

# Render the retrieved documents.
response

In [None]:
# MMR retriever: same store, but using the 'mmr' search type.
# `lambda_mult` ranges from 0 to 1: 0 = fully diverse results,
# 1 = behaves like the normal similarity retriever.
mmr_retriever = pinecone_vector_store.as_retriever(
    search_type='mmr',
    search_kwargs={'k': 2, 'lambda_mult': 1},
)
response = mmr_retriever.invoke('What is acid reaction.')

# Render the retrieved documents.
response

In [None]:
from langchain.retrievers.multi_query import MultiQueryRetriever
from langchain_google_genai import ChatGoogleGenerativeAI

# Chat model handed to the multi-query retriever; the contextual compression
# cell further down reuses `gemini_model` too.
# NOTE(review): confirm 'gemini-1.5-flash' is still served by the API for your account.
gemini_model = ChatGoogleGenerativeAI(model='gemini-1.5-flash')

# Multi Query Retriever: built from the LLM plus a plain top-3 retriever over
# the Pinecone store.
top3_retriever = pinecone_vector_store.as_retriever(search_kwargs={'k': 3})
mqr_retriever = MultiQueryRetriever.from_llm(
    llm=gemini_model,
    retriever=top3_retriever,
)
response = mqr_retriever.invoke('Yesterday I bought pomogranate from nearest fruit shop.')

# Render the retrieved documents.
response

In [None]:
from langchain.retrievers.contextual_compression import ContextualCompressionRetriever
from langchain.retrievers.document_compressors.chain_extract import LLMChainExtractor

# Context Compression Retriever: pairs a top-2 retriever over the Pinecone store
# with an LLM-based extractor built from `gemini_model` (defined in the
# multi-query retriever cell).
ccr_retriever = ContextualCompressionRetriever(
    base_retriever=pinecone_vector_store.as_retriever(
        search_kwargs={'k': 2},
    ),
    base_compressor=LLMChainExtractor.from_llm(gemini_model),
)
response = ccr_retriever.invoke('I bought Apple Iphone')

# Render the compressed results.
response