# Contextual_Compressor

* we have a base retriever to get a bunch of different pieces of information \
  but some real useful and information may be in the middle of particular document that is being returned \
  for example we want to disgard hello, how are you that kind of stuff \
  then we have compressor or filters to go through that information to extract out only what is the most useful to answer the question

# Load

## Load Package

In [14]:
import os
from dotenv import load_dotenv

from langchain.schema import Document 
from langchain.document_loaders import TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import FAISS
from langchain.vectorstores import Chroma
from langchain.storage import InMemoryStore
from langchain.embeddings import OpenAIEmbeddings

from langchain.llms import OpenAI

from langchain.retrievers import ContextualCompressionRetriever
from langchain.retrievers.document_compressors import LLMChainExtractor
from langchain.retrievers.document_compressors import LLMChainFilter


In [2]:
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")
os.environ["OPENAI_API_KEY"] = OPENAI_API_KEY

# Document Loader

In [6]:
loaders = [
    TextLoader("Amazon_Transcript.txt")
]

document = []
for i_document in loaders:
    document.extend(i_document.load())

document

[Document(page_content="Hello, and welcome to our Q4 2016 financial results conference call\nJoining us today to answer your questions is Brian Olsavsky, our CFO\nAs you listen to today's conference call, we encourage you to have our press release in front of you, which includes our financial results as well as metrics and commentary on the quarter\nPlease note, unless otherwise stated, all comparisons in this call will be against our results for the comparable period of 2015. Our comments and responses to your questions reflect management's views as of today, February 2, 2017 only and will include forward-looking statements\nActual results may differ materially\nAdditional information about factors that could potentially impact our financial results is included in today's press release and our filings with the SEC, including our most recent Annual Report on Form 10-K and subsequent filings\nDuring this call, we may discuss certain non-GAAP financial measures\nIn our press release, sli

# Text Splitter

In [7]:
doc_splitter = RecursiveCharacterTextSplitter(chunk_size = 200, 
                                               chunk_overlap =200,
                                               length_function  = len)

doc_split = doc_splitter.split_documents(document)

# Embeddings And Vector Stores

In [8]:
embedding = OpenAIEmbeddings()

xb = FAISS.from_documents(doc_split, embedding)

# Retriver

In [10]:
retriever = xb.as_retriever(search_type="similarity_score_threshold", 
                            search_kwargs={"score_threshold": 0.5,
                                           "k": 5})

relevant_doc = retriever.get_relevant_documents("what is the dividend of the amazon?")

relevant_doc

[Document(page_content='A replay will be available on our Investor Relations website at least through the end of the quarter\nWe appreciate your interest in Amazon\ncom and look forward to talking to you again next quarter', metadata={'source': 'Amazon_Transcript.txt'}),
 Document(page_content="includes our first-party retail and our FBA sales and it grew nearly 40% over 2016. So we're very pleased with those results and happy with the fundamentals of the business from that perspective", metadata={'source': 'Amazon_Transcript.txt'}),
 Document(page_content="So Prime membership and selection continues to drive growth and you'll see that in our unit growth numbers\nHi, Mark\nThis is Darin\nOn the customer count, no absolute number to give this quarter", metadata={'source': 'Amazon_Transcript.txt'}),
 Document(page_content='we have a very strong and trusted venue for Chinese customers to access international brands there as we continue to focus on great offerings through the AmazonGlobal 

# LLMChain Extractor

* append a prompt to trim off the irrelevant context \
  given the question and context, extract any part of the context that is relevant to answer the question ...

In [24]:
# make the compressor
llm = OpenAI(temperature=0)
extractor = LLMChainExtractor.from_llm(llm)

print(extractor.llm_chain.prompt)


input_variables=['context', 'question'] output_parser=NoOutputParser() template='Given the following question and context, extract any part of the context *AS IS* that is relevant to answer the question. If none of the context is relevant return NO_OUTPUT. \n\nRemember, *DO NOT* edit the extracted parts of the context.\n\n> Question: {question}\n> Context:\n>>>\n{context}\n>>>\nExtracted relevant parts:'


In [27]:
compression_retriever = ContextualCompressionRetriever(base_compressor = extractor,
                                                       base_retriever=retriever)

compression_doc = compression_retriever.get_relevant_documents("what is the product of the amazon?")
compression_doc



[Document(page_content='FREE Two-Day Shipping, Same-Day, Next-Day, Prime', metadata={'source': 'Amazon_Transcript.txt'}),
 Document(page_content='FREE Two-Day Shipping, Same-Day, Next-Day, Prime Now', metadata={'source': 'Amazon_Transcript.txt'}),
 Document(page_content='AmazonGlobal Store, which offers great brands', metadata={'source': 'Amazon_Transcript.txt'}),
 Document(page_content='AmazonGlobal Store', metadata={'source': 'Amazon_Transcript.txt'})]

# LLMChain Filter

* append a prompt to use yes/no to filter the relevant context

In [25]:
llm = OpenAI(temperature=0)
filter = LLMChainFilter.from_llm(llm)

filter.llm_chain.prompt

PromptTemplate(input_variables=['context', 'question'], output_parser=BooleanOutputParser(), template="Given the following question and context, return YES if the context is relevant to the question and NO if it isn't.\n\n> Question: {question}\n> Context:\n>>>\n{context}\n>>>\n> Relevant (YES / NO):")

In [29]:
compression_retriever = ContextualCompressionRetriever(base_compressor = filter,
                                                       base_retriever=retriever)

compression_doc = compression_retriever.get_relevant_documents("what is the profits of the amazon?")
compression_doc



[Document(page_content="includes our first-party retail and our FBA sales and it grew nearly 40% over 2016. So we're very pleased with those results and happy with the fundamentals of the business from that perspective", metadata={'source': 'Amazon_Transcript.txt'}),
 Document(page_content="units, the amount going through our fulfillment centers and which essentially includes our first-party retail and our FBA sales and it grew nearly 40% over 2016. So we're very pleased with those", metadata={'source': 'Amazon_Transcript.txt'})]