In [1]:
!pip -q install langchain huggingface_hub openai chromadb tiktoken faiss-cpu
!pip -q install sentence_transformers
!pip -q install -U FlagEmbedding

In [2]:
import os

# Keep a key that is already exported in the environment. Only fall back to the
# blank placeholder when none is set, instead of overwriting a real key with "".
os.environ.setdefault("OPENAI_API_KEY", "")

In [3]:

from langchain.vectorstores import FAISS

from langchain.schema import Document
from langchain.vectorstores import Chroma

## Text Splitting & Docloader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.storage import InMemoryStore
from langchain.document_loaders import TextLoader

from langchain.embeddings import OpenAIEmbeddings


## BGE Embeddings

In [4]:
import torch
from langchain.embeddings import HuggingFaceBgeEmbeddings

model_name = "BAAI/bge-small-en-v1.5"
encode_kwargs = {'normalize_embeddings': True} # set True to compute cosine similarity

# Use the GPU when one is available; otherwise fall back to the CPU so the
# notebook still runs on machines without CUDA.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

bge_embeddings = HuggingFaceBgeEmbeddings(
    model_name=model_name,
    model_kwargs={'device': device},
    encode_kwargs=encode_kwargs
)

## Data prep


In [5]:
# Source files to index; add further TextLoader entries to grow the corpus.
loaders = [TextLoader('tamil.txt')]

# Flatten the per-loader results into one list of documents.
docs = [document for loader in loaders for document in loader.load()]

In [6]:
# How many Document objects the loaders returned
len(docs)

In [7]:
# Chunk the loaded documents: chunk_size=1000 with a chunk_overlap of 200.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_overlap=200,
    chunk_size=1000,
)
texts = text_splitter.split_documents(docs)

In [8]:
# Helper function for printing docs

def pretty_print_docs(docs):
    """Print every document's text under a numbered heading.

    Each entry is rendered as ``Document <n>:`` followed by a blank line and the
    object's ``page_content``; consecutive entries are separated by a line of
    100 dashes. Nothing is returned.
    """
    divider = f"\n{'-' * 100}\n"
    sections = []
    for position, document in enumerate(docs, start=1):
        sections.append(f"Document {position}:\n\n" + document.page_content)
    print(divider.join(sections))

In [9]:
# Index the chunks in FAISS with the BGE embeddings and expose the index as a
# retriever. (Pass OpenAIEmbeddings() instead to embed with OpenAI.)
vectorstore = FAISS.from_documents(texts, bge_embeddings)
retriever = vectorstore.as_retriever()

# Keep the hits under their own name: rebinding `docs` here would overwrite the
# loaded corpus, so re-running the splitting cell afterwards would chunk the
# search results instead of the source documents.
retrieved_docs = retriever.get_relevant_documents("What is tamil?")
# let's look at the retrieved documents
pretty_print_docs(retrieved_docs)

## Adding contextual compression with an LLMChainExtractor

Now let's wrap our base retriever with a ContextualCompressionRetriever. We'll add an LLMChainExtractor, which will iterate over the initially returned documents and extract from each only the content that is relevant to the query.

In [10]:
from langchain.llms import OpenAI
from langchain.retrievers import ContextualCompressionRetriever
from langchain.retrievers.document_compressors import LLMChainExtractor

# LLM (temperature 0) that the extractor will call.
llm = OpenAI(temperature=0)

# Build the LLMChainExtractor compressor on top of that LLM.
compressor = LLMChainExtractor.from_llm(llm)

# Combine the FAISS retriever built earlier with the compressor.
compression_retriever = ContextualCompressionRetriever(
    base_retriever=retriever,
    base_compressor=compressor,
)

In [11]:
# Display the prompt attached to the extractor's LLM chain
compressor.llm_chain.prompt

In [12]:
# Run the query through the compression retriever and print what it returns
compressed_docs = compression_retriever.get_relevant_documents("What is tamil?")
pretty_print_docs(compressed_docs)

## More built-in compressors: filters

### LLMChainFilter

Uses an LLM chain to decide which of the retrieved documents are shown to the final LLM. This decision could also be made by a model fine-tuned for the task.

For each document the chain answers "YES" (we show it) or "NO" (we don't show it).

In [13]:
from langchain.retrievers.document_compressors import LLMChainFilter

# Build the LLMChainFilter from the same `llm` object created for the extractor
_filter = LLMChainFilter.from_llm(llm)


In [14]:
# Display the prompt attached to the filter's LLM chain
_filter.llm_chain.prompt

In [15]:
# Rebuild the compression retriever with the LLM filter as its compressor,
# keeping the same base retriever.
compression_retriever = ContextualCompressionRetriever(
    base_retriever=retriever,
    base_compressor=_filter,
)

# NOTE(review): this query has no trailing "?" unlike the "What is tamil?" used
# in the earlier cells - confirm the difference is intentional.
compressed_docs = compression_retriever.get_relevant_documents("What is tamil")
pretty_print_docs(compressed_docs)

### EmbeddingsFilter
Use an embedding model to keep only the results that are closest to the query and filter out the rest

In [16]:
from langchain.embeddings import OpenAIEmbeddings
from langchain.retrievers.document_compressors import EmbeddingsFilter

embeddings = OpenAIEmbeddings()

# Embedding-based filter configured with a similarity threshold of 0.76.
embeddings_filter = EmbeddingsFilter(similarity_threshold=0.76, embeddings=embeddings)
compression_retriever = ContextualCompressionRetriever(
    base_retriever=retriever,
    base_compressor=embeddings_filter,
)

# NOTE(review): this query has no trailing "?" unlike the "What is tamil?" used
# in the earlier cells - confirm the difference is intentional.
compressed_docs = compression_retriever.get_relevant_documents("What is tamil")
pretty_print_docs(compressed_docs)

## Pipelines


### Stringing compressors and document transformers together

DocumentCompressorPipeline allows us to string things together.

BaseDocumentTransformers - can do transformations on the docs, e.g. split the text into smaller pieces

EmbeddingsRedundantFilter - filters out documents that are redundant (near-duplicates of one another, judged by their embeddings) after a split or transformation



In [17]:
from langchain.document_transformers import EmbeddingsRedundantFilter
from langchain.retrievers.document_compressors import DocumentCompressorPipeline
from langchain.text_splitter import CharacterTextSplitter

# Stage 1: re-split the retrieved text on ". " with chunk_size=300 and no overlap.
splitter = CharacterTextSplitter(separator=". ", chunk_size=300, chunk_overlap=0)

# Stages 2 and 3: both embedding-based filters share the `embeddings` object.
redundant_filter = EmbeddingsRedundantFilter(embeddings=embeddings)
relevant_filter = EmbeddingsFilter(similarity_threshold=0.76, embeddings=embeddings)

# Assemble the stages; they are listed in the order split -> redundant -> relevant.
pipeline_compressor = DocumentCompressorPipeline(
    transformers=[
        splitter,
        redundant_filter,
        relevant_filter,
    ],
)

In [18]:
# Use the whole pipeline as the compressor on top of the same base retriever.
compression_retriever = ContextualCompressionRetriever(
    base_retriever=retriever,
    base_compressor=pipeline_compressor,
)

# NOTE(review): this query has no trailing "?" unlike the "What is tamil?" used
# in the earlier cells - confirm the difference is intentional.
compressed_docs = compression_retriever.get_relevant_documents("What is tamil")
pretty_print_docs(compressed_docs)

In [19]:
### different pipeline
# Same stages as the previous pipeline, with the LLMChainExtractor (`compressor`)
# inserted right after the splitter. This rebinds `pipeline_compressor`.

## making the pipeline
pipeline_compressor = DocumentCompressorPipeline(
    transformers=[splitter, compressor, redundant_filter, relevant_filter]
)

In [20]:
# Rebuild the retriever so it picks up the current `pipeline_compressor`.
compression_retriever = ContextualCompressionRetriever(
    base_retriever=retriever,
    base_compressor=pipeline_compressor,
)

# NOTE(review): this query has no trailing "?" unlike the "What is tamil?" used
# in the earlier cells - confirm the difference is intentional.
compressed_docs = compression_retriever.get_relevant_documents("What is tamil")
pretty_print_docs(compressed_docs)

## Example Pipelines

**Example 1** - filter, rewrite, check with embeddings

**Example 2** - retrieve multiple sources [ensemble], filter, rewrite

**Example 3** - retrieve, split, check splits with embeddings, filter, rewrite