# Multi vector retrieval (and inverse HyDE)

In [None]:
%pip install -qU langchain
%pip install -qU langchain-community
%pip install -qU langchain-text-splitters
%pip install -qU langchain_openai


### Imports

In [2]:
import uuid

from langchain.globals import set_debug
from langchain.output_parsers.openai_functions import JsonKeyOutputFunctionsParser
from langchain.retrievers.multi_vector import MultiVectorRetriever
from langchain.schema.runnable import RunnablePassthrough
from langchain.storage import InMemoryByteStore
from langchain.storage import LocalFileStore
from langchain_community.document_loaders import TextLoader
from langchain_community.vectorstores import Chroma
from langchain_core.documents import Document
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
from langchain_openai import ChatOpenAI
from langchain_openai import OpenAIEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter

### Config

In [3]:
# Embedding model shared by both Chroma collections created in this notebook.
embedding_model = OpenAIEmbeddings()

# Chat model used by the summarization, question-generation and RAG chains.
model_name = "gpt-3.5-turbo-0125"

# Collection for the multi-vector index (summaries, questions and chunks).
collection_name = "taylor-swift"

# Collection that is filled with the chunks only.
collection_name_basic = "taylor-swift-basic"

### Load Texts and split them into chunks

Because of copyright issues, I cannot provide the lyrics I used for this notebook. However, you can use any lyrics you want. Just make sure to use this format:

```text
Title: The title of the text
[Verse 1]
Verse 1

[Chorus]
Chorus

[Verse 2]
Verse 2
```

and so on...

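Put each text in a separate file in the `data/lyrics/` folder (or any folder you want, really) and load them using the code below. If you use a different folder or different file names, adjust the paths passed to `TextLoader` accordingly.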

In [4]:
# One lyrics file per song.
lyric_paths = [
    "../data/lyrics/example_song.txt",
    "../data/lyrics/anti_hero.txt",
    "../data/lyrics/bejewled.txt",
    "../data/lyrics/lavender_haze.txt",
    "../data/lyrics/maroon.txt",
    "../data/lyrics/snow_on_the_beach.txt",
]

# Every file is read as UTF-8 text.
loaders = [TextLoader(path, encoding="utf-8") for path in lyric_paths]

# Load all songs in file order, then split them with a chunk size of 3000.
song_docs = [song for loader in loaders for song in loader.load()]
text_splitter = RecursiveCharacterTextSplitter(chunk_size=3000)
docs = text_splitter.split_documents(song_docs)

### Build chain for summarization and summarize texts

In [5]:
# Prompt that asks for a summary of a single chunk.
summary_prompt = ChatPromptTemplate.from_template(
    "Summarize the following document. Do not include the title. Do not mention the Document.\n\n{doc}"
)

# Chat model for summarization; failed requests are not retried (max_retries=0).
summary_llm = ChatOpenAI(model=model_name, max_retries=0)

# chunk -> {"doc": chunk text} -> prompt -> chat model -> plain string
summery_chain = (
    {"doc": lambda chunk: chunk.page_content}
    | summary_prompt
    | summary_llm
    | StrOutputParser()
)

# Summarize every chunk, with at most 5 requests running concurrently.
summaries = summery_chain.batch(docs, {"max_concurrency": 5})

for number, summary in enumerate(summaries, start=1):
    print(f"Document {number}:\n{summary}")

Document 1:
The lyrics describe the emotional journey of dealing with painful memories from a past relationship. The protagonist reflects on the highs and lows of the relationship, the heartbreak, and the struggle to let go of the pain. Despite the lingering memories, they find strength in moving forward and starting a new chapter in their life.


### Build chain for hypothetical questions

In [6]:
# JSON schema of the function arguments: an object with a required list of strings.
questions_schema = {
    "type": "object",
    "properties": {
        "questions": {
            "type": "array",
            "items": {"type": "string"},
        },
    },
    "required": ["questions"],
}

# Function definition offered to the chat model.
functions = [
    {
        "name": "hypothetical_questions",
        "description": "Generate hypothetical questions",
        "parameters": questions_schema,
    }
]

# Prompt that asks for three questions a listener might ask about one chunk.
question_prompt = ChatPromptTemplate.from_template(
    """Generate a list of exactly 3 hypothetical questions that a person, 
        who seeks emotional guidence would ask that could be answered by this song's lyrics and or meaning. 
        Do not mention the song or the lyrics in these questions.
        Do not add any counter to these questions.:\n\n{doc}"""
)

# Chat model bound to the function above, so its reply arrives as function-call arguments.
question_llm = ChatOpenAI(max_retries=0, model=model_name).bind(
    functions=functions, function_call={"name": "hypothetical_questions"}
)

# chunk -> {"doc": chunk text} -> prompt -> chat model -> value of the "questions" key
chain = (
    {"doc": lambda chunk: chunk.page_content}
    | question_prompt
    | question_llm
    | JsonKeyOutputFunctionsParser(key_name="questions")
)

# Generate questions for every chunk, with at most 5 requests running concurrently.
hypothetical_questions = chain.batch(docs, {"max_concurrency": 5})

for number, questions in enumerate(hypothetical_questions, start=1):
    print(f"Document {number}:\n{questions}")


Document 1:
['What can I do to cope with the lingering memories of a past love that brings me pain?', 'How can I find closure and let go of emotions that keep pulling me back to the past?', 'Is there a way for me to turn the pain of past experiences into strength for a better future?']


### Create collection and init retriever

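The retriever is empty at the start. In the following cells it is filled with the summaries, the hypothetical questions, and the chunks of the texts, all of which are embedded and indexed in the vector store. At query time, the retriever finds the indexed entries that are most relevant to the query and uses the `doc_id` stored in their metadata to fetch and return the original texts from the document store.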

In [7]:
# Vector store for the embedded summaries, questions and chunks; persisted in ./chroma_db.
db = Chroma(
    collection_name=collection_name,
    embedding_function=embedding_model,
    persist_directory="./chroma_db",
)

# The storage layer for the parent documents; persisted in ./filestore.
store = LocalFileStore("./filestore")

# Derive each parent id from the chunk's position and text instead of uuid4().
# Both stores above persist between runs, so random ids would register the same
# chunk under a brand-new id on every "Restart & Run All": ./filestore would keep
# growing and one song could be returned several times under different ids.
# With stable ids a re-run overwrites the same parent entry, and repeated hits
# for one chunk share a single id.
doc_ids = [
    str(uuid.uuid5(uuid.NAMESPACE_OID, f"{position}:{chunk.page_content}"))
    for position, chunk in enumerate(docs)
]

# Metadata key that links an indexed entry to its parent document.
id_key = "doc_id"

retriever = MultiVectorRetriever(
    vectorstore=db,
    byte_store=store,
    id_key=id_key,
)

### Add documents and summaries to the retriever

In [8]:
# Wrap every summary in a Document that carries the id of the chunk it summarizes.
summary_docs = []
for position, summary_text in enumerate(summaries):
    summary_docs.append(
        Document(page_content=summary_text, metadata={id_key: doc_ids[position]})
    )

# Index the summaries, then store the chunks themselves under the same ids.
retriever.vectorstore.add_documents(summary_docs)
parent_pairs = list(zip(doc_ids, docs))
retriever.docstore.mset(parent_pairs)

### Add Questions to the retriever

In [9]:
# One Document per generated question, tagged with the id of the chunk it was generated from.
question_docs = [
    Document(page_content=question, metadata={id_key: doc_ids[position]})
    for position, question_list in enumerate(hypothetical_questions)
    for question in question_list
]
retriever.vectorstore.add_documents(question_docs)

['4608458a-b0d4-48fe-be27-54620baaddf0',
 '3fdeaf87-f702-46b4-bb81-f4a05ae917cb',
 '197ea4d3-0c23-4499-82de-3657135e62e7']

### Add original documents to the retriever

Add the id of each full document to the metadata of its chunk, because the chunks themselves are embedded next and must point back to the stored document.

In [10]:
# Tag every chunk with its own id so that a hit on the chunk text resolves to the stored document.
for position, chunk in enumerate(docs):
    chunk.metadata.update({id_key: doc_ids[position]})

retriever.vectorstore.add_documents(docs)

# Second collection in the same Chroma directory, built from the chunks alone.
basicDb = Chroma.from_documents(
    docs,
    embedding_model,
    collection_name=collection_name_basic,
    persist_directory="./chroma_db",
)

## Tests

In [11]:
# Test queries: keep exactly one `query = ...` line active and leave the others commented out.
# The trailing comment on a line presumably names the song the query is expected to retrieve — verify against your own lyric files.
# query = "Song about importance of self-worth and independence in a relationship." # bejewled
# query = "What can i do to make things right?" # bejewled
# query = "I am the one at fault." # anti hero
# query = "Everybody expects too mutch of me. I'm tired of it. I need to be free. What should I do?" # bejewled
# query = "One day we are dancing and being happy, the next day we are fighting and crying. What is wrong with us?" # maroon
# query = "I feel like my mind is hazy. I can't think straight. What should I do?" # lavender haze
query = "Someone splashed wine on my t-shirt. Should i confront this person?" # maroon
# query = "Can i get free tickets to the concert?"
# query = "I unexpectedly found a beatiful stone on the beach. Shoud I keep it?"


#### Direct Query for testing

In [12]:
# Search the vector store directly, without going through the retriever.
sub_docs = db.similarity_search(query)

# Show the text of the first returned entry.
print(sub_docs[0].page_content)

Title: Stained Memories

[Verse 1]
Walking down the street, feeling fine
Until a splash of wine brings back a sign
Of a love that was once so sweet
But turned bitter, now memories repeat

[Chorus]
Stained memories, they won't fade away
Like the wine on my shirt, they're here to stay
A rollercoaster of emotions, highs and lows
But now it's time to let it go

[Verse 2]
We started off so strong, love in bloom
But slowly it all came crashing down, impending doom
Words turned into weapons, hearts left scarred
Now just a stain on my shirt, a reminder so hard

[Chorus]
Stained memories, they won't fade away
Like the wine on my shirt, they're here to stay
A rollercoaster of emotions, highs and lows
But now it's time to let it go

[Bridge]
I try to wash away the stain
But the memories still remain
I'll learn to move on, find a new start
And let go of the pain that's torn me apart

[Chorus]
Stained memories, they won't fade away
Like the wine on my shirt, they're here to stay
A rollercoaster of 

In [13]:
# Run the same query through the multi-vector retriever.
retrieved_docs = retriever.invoke(query)

# Show the text of the first returned document.
print(retrieved_docs[0].page_content)

Title: Stained Memories

[Verse 1]
Walking down the street, feeling fine
Until a splash of wine brings back a sign
Of a love that was once so sweet
But turned bitter, now memories repeat

[Chorus]
Stained memories, they won't fade away
Like the wine on my shirt, they're here to stay
A rollercoaster of emotions, highs and lows
But now it's time to let it go

[Verse 2]
We started off so strong, love in bloom
But slowly it all came crashing down, impending doom
Words turned into weapons, hearts left scarred
Now just a stain on my shirt, a reminder so hard

[Chorus]
Stained memories, they won't fade away
Like the wine on my shirt, they're here to stay
A rollercoaster of emotions, highs and lows
But now it's time to let it go

[Bridge]
I try to wash away the stain
But the memories still remain
I'll learn to move on, find a new start
And let go of the pain that's torn me apart

[Chorus]
Stained memories, they won't fade away
Like the wine on my shirt, they're here to stay
A rollercoaster of 

### RAG

In [14]:
# Turn on LangChain's global debug mode so the chain run below logs each step's input and output.
# `set_debug` is imported together with the other imports in the imports cell.
set_debug(True)

In [15]:
template = """You are Taylor Swift. 
A person, who seeks emotional guidence asks for your help. 
Tell this person exactly what he or she needs to do to resolve his/her issues. 
Do mention your song's title and that listening to it will help the person.
Use a passage from the song to support your advice.
Answer the Question only using the context you are provided with.:

{context}

[Question]: 
{question}
"""
prompt = ChatPromptTemplate.from_template(template)

# Same `model=` keyword as the other ChatOpenAI instances in this notebook.
model = ChatOpenAI(model=model_name)


def format_docs(documents):
    """Join the text of the retrieved documents into one context string.

    Without this step the prompt's {context} slot would be filled with the
    string form of a list of Document objects instead of the plain lyrics.

    Args:
        documents: Iterable of objects exposing a `page_content` string.

    Returns:
        The page contents separated by blank lines; "" for an empty input.
    """
    return "\n\n".join(doc.page_content for doc in documents)


# question -> {"context": retrieved lyrics as text, "question": question} -> prompt -> model -> string
chain = (
    {"context": retriever | format_docs, "question": RunnablePassthrough()}
    | prompt
    | model
    | StrOutputParser()
)

chain.invoke(query)

[32;1m[1;3m[chain/start][0m [1m[chain:RunnableSequence] Entering Chain run with input:
[0m{
  "input": "Someone splashed wine on my t-shirt. Should i confront this person?"
}
[32;1m[1;3m[chain/start][0m [1m[chain:RunnableSequence > chain:RunnableParallel<context,question>] Entering Chain run with input:
[0m{
  "input": "Someone splashed wine on my t-shirt. Should i confront this person?"
}
[32;1m[1;3m[chain/start][0m [1m[chain:RunnableSequence > chain:RunnableParallel<context,question> > chain:RunnablePassthrough] Entering Chain run with input:
[0m{
  "input": "Someone splashed wine on my t-shirt. Should i confront this person?"
}
[36;1m[1;3m[chain/end][0m [1m[chain:RunnableSequence > chain:RunnableParallel<context,question> > chain:RunnablePassthrough] [0ms] Exiting Chain run with output:
[0m{
  "output": "Someone splashed wine on my t-shirt. Should i confront this person?"
}
[36;1m[1;3m[chain/end][0m [1m[chain:RunnableSequence > chain:RunnableParallel<context,

'As Taylor Swift, I would advise you not to confront the person who splashed wine on your t-shirt. Instead, I would recommend listening to my song "Stained Memories" to help you deal with the emotions and memories associated with the incident. In the song, I talk about letting go of painful memories and moving forward. One passage from the song that might resonate with you is "I\'ll wear this stain as a badge of strength, a reminder of what I\'ve overcome at length." This line can serve as a reminder that you can overcome this situation and grow stronger from it. So, listen to the song, reflect on its message, and focus on letting go of the negative emotions tied to the stained memory.'