# Contextual compression

In [4]:
from pinecone import Pinecone
import os
from langchain.retrievers.contextual_compression import ContextualCompressionRetriever
from langchain.retrievers.document_compressors.chain_extract import LLMChainExtractor
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain_pinecone import PineconeVectorStore

# Create the embedding model and the chat LLM.
# setdefault keeps a key that is already exported in the environment instead of
# overwriting it with the placeholder; replace the placeholder if none is set.
os.environ.setdefault("OPENAI_API_KEY", "{YOUR_OPENAI_KEY}")
embedding = OpenAIEmbeddings()
llm = ChatOpenAI(model="gpt-4o-mini")

# Connect to the existing Pinecone index.
# The Pinecone client only needs credentials; the embedding model is supplied
# to PineconeVectorStore below, not to the client.
pc = Pinecone(api_key=os.environ.get("PINECONE_API_KEY", "{YOUR_PINECONE_APIKEY}"))
index = pc.Index("terry-wiki")

vectordb = PineconeVectorStore(index=index, embedding=embedding)

# Wrap the vector-store retriever with an LLM-based extractor compressor.
# The number of documents fetched from the index is configured on the base
# retriever via search_kwargs; passing k to invoke() is not reliably forwarded.
compressor = LLMChainExtractor.from_llm(llm)
compression_retriever = ContextualCompressionRetriever(
    base_compressor=compressor,
    base_retriever=vectordb.as_retriever(search_kwargs={"k": 2}),
)

query = "Where is the locaction for vacation?"
docs = compression_retriever.invoke(query)
for doc in docs:
    print(doc)
    print("\n")

# Compression with LLMChainFilter

In [7]:
from pinecone import Pinecone
import os
from langchain.retrievers.contextual_compression import ContextualCompressionRetriever
from langchain.retrievers.document_compressors.chain_filter import LLMChainFilter
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain_pinecone import PineconeVectorStore

# Create the embedding model and the chat LLM.
# setdefault keeps a key that is already exported in the environment instead of
# overwriting it with the placeholder; replace the placeholder if none is set.
os.environ.setdefault("OPENAI_API_KEY", "{YOUR_OPENAI_KEY}")
embedding = OpenAIEmbeddings()
llm = ChatOpenAI(model="gpt-4o-mini")

# Connect to the existing Pinecone index.
# The Pinecone client only needs credentials; the embedding model is supplied
# to PineconeVectorStore below, not to the client.
pc = Pinecone(api_key=os.environ.get("PINECONE_API_KEY", "{YOUR_PINECONE_APIKEY}"))
index = pc.Index("terry-wiki")
vectordb = PineconeVectorStore(index=index, embedding=embedding)

# Named llm_filter (not `filter`) so the builtin filter() is not shadowed for
# the rest of the notebook.
llm_filter = LLMChainFilter.from_llm(llm)

# ContextualCompressionRetriever has no `k` option of its own; the number of
# documents fetched from the index is set on the base retriever.
compression_retriever = ContextualCompressionRetriever(
    base_compressor=llm_filter,
    base_retriever=vectordb.as_retriever(search_kwargs={"k": 2}),
)

query = "Where is the locaction for vacation?"
docs = compression_retriever.invoke(query)
for doc in docs:
    print(doc)
    print("\n")

page_content='Austria (, ;  ), officially the Republic of Austria ( ), is a country in Central Europe. Around Austria there are the countries of Germany, Czech Republic, Slovakia, Hungary, Slovenia, Italy, Switzerland, and Liechtenstein. Currently, the chancellor is Karl Nehammer The previous chancellor was Alexander Schallenberg (2021). Austria has been a member-state of the United Nations since 1955 the European Union since 1995 and OPEC since 2019.

The people in Austria speak German, a few also speak Hungarian, Slovenian and Croatian. The capital of Austria is Vienna (Wien).

Austria is more than a thousand years old. Its history can be followed to the ninth century. At that time the first people moved to the land now known as Austria. The name "Ostarrichi" is first written in an official document from 996. Since then this word has developed into the Modern German word Österreich, which literally means "East Empire."

Politics 
Austria is a democratic republic. It is a neutral stat

# Compression with pipeline

In [8]:
from pinecone import Pinecone
import os
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain_pinecone import PineconeVectorStore
from langchain.retrievers.contextual_compression import ContextualCompressionRetriever
from langchain.retrievers.document_compressors.chain_filter import LLMChainFilter
from langchain.retrievers.document_compressors.chain_extract import LLMChainExtractor
from langchain.retrievers.document_compressors.base import DocumentCompressorPipeline
from langchain_community.document_transformers.embeddings_redundant_filter import EmbeddingsRedundantFilter

# Create the chat LLM and the embedding model.
# setdefault makes this cell runnable on its own without overwriting a key that
# is already set (by the environment or by an earlier cell).
os.environ.setdefault("OPENAI_API_KEY", "{YOUR_OPENAI_KEY}")
llm = ChatOpenAI(model="gpt-4o-mini")
embedding = OpenAIEmbeddings()

# Connect to the existing Pinecone index.
# The Pinecone client only needs credentials; the embedding model is supplied
# to PineconeVectorStore below, not to the client.
pc = Pinecone(api_key=os.environ.get("PINECONE_API_KEY", "{YOUR_PINECONE_APIKEY}"))
index = pc.Index("terry-wiki")
vectordb = PineconeVectorStore(index=index, embedding=embedding)

# Individual pipeline stages.
llm_filter = LLMChainFilter.from_llm(llm)
llm_extractor = LLMChainExtractor.from_llm(llm)
redundant_filter = EmbeddingsRedundantFilter(embeddings=embedding)

# Stages run in list order: redundancy filter, then extractor, then LLM filter.
pipeline_compressor = DocumentCompressorPipeline(
    transformers=[redundant_filter, llm_extractor, llm_filter]
)
# Observations with query = "Where is the cuba? and nearest country by the Cuba?" and k=3:
#   - with redundant_filter: only a single result about Cuba is returned.
#   - without redundant_filter: three results about Cuba are returned, all the same record.
# To compare, build the pipeline with the extractor only:
#   pipeline_compressor = DocumentCompressorPipeline(transformers=[llm_extractor])

# ContextualCompressionRetriever has no `k` option of its own; the number of
# documents fetched from the index is set on the base retriever.
compression_retriever = ContextualCompressionRetriever(
    base_compressor=pipeline_compressor,
    base_retriever=vectordb.as_retriever(search_kwargs={"k": 10}),
)

# Alternative query to try: "Where is the best place for summer vacation?"
query = "Where is the cuba? and nearest country by the Cuba?"
# invoke() replaces the deprecated get_relevant_documents() and matches the
# other cells in this notebook.
docs = compression_retriever.invoke(query)
for doc in docs:
    print(doc)
    print("\n")

page_content='Cuba is an island country in the Caribbean Sea. The country is made up of the big island of Cuba, the Isla de la Juventud island (Isle of Youth), and many smaller islands. Havana is the capital of Cuba. It is near the United States, Mexico, Haiti, Jamaica and the Bahamas.' metadata={'chunk': 2.0, 'source': 'https://simple.wikipedia.org/wiki/Cuba', 'title': 'Cuba', 'wiki-id': '178'}




In [11]:
from langchain.chains.qa_with_sources.retrieval import RetrievalQAWithSourcesChain

# Question answering with source attribution over the plain (uncompressed)
# vector-store retriever, using the "stuff" chain type.
# Relies on `llm`, `vectordb` and `query` defined in the previous cell.
qa_with_sources = RetrievalQAWithSourcesChain.from_chain_type(
    retriever=vectordb.as_retriever(),
    chain_type="stuff",
    llm=llm,
)

# The trailing expression is left bare so the notebook displays the result.
qa_with_sources.invoke({"question": query})


{'question': 'Where is the cuba? and nearest country by the Cuba?',
 'answer': 'Cuba is an island country located in the Caribbean Sea. It consists of the main island of Cuba, the Isla de la Juventud (Isle of Youth), and numerous smaller islands. The nearest countries to Cuba include the United States, Mexico, Haiti, Jamaica, and the Bahamas.\n\n',
 'sources': 'https://simple.wikipedia.org/wiki/Cuba'}