In [12]:
import uuid
import streamlit as st
import pandas as pd
import numpy as np
import qdrant_client as qc
import qdrant_client.http.models as qmodels
from torch import cuda
from qdrant_client.http.models import *
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
from langchain.embeddings.huggingface import HuggingFaceEmbeddings
from langchain_groq import ChatGroq
from langchain.vectorstores import Qdrant
from langchain.schema import SystemMessage, HumanMessage, AIMessage
from langchain.prompts import HumanMessagePromptTemplate, ChatPromptTemplate 
from langchain.chains.conversation.memory import ConversationBufferWindowMemory 
from langchain.chains import ConversationChain 

In [13]:
# Sentence-embedding model used both for the song lyrics and for user queries.

embed_model_id = 'sentence-transformers/all-MiniLM-L6-v2'

# Run on the currently selected CUDA device when one is available, else on CPU.
if cuda.is_available():
    device = f'cuda:{cuda.current_device()}'
else:
    device = 'cpu'

# The same device is handed to both model construction and encoding.
model_device_kwargs = {'device': device}
encode_device_kwargs = {'device': device}

embed_model = HuggingFaceEmbeddings(
    model_name=embed_model_id,
    model_kwargs=model_device_kwargs,
    encode_kwargs=encode_device_kwargs,
)

In [14]:
# Set to False to (re)create both collections below; recreating a collection
# replaces any existing collection of the same name.
initialized = True

client = qc.QdrantClient(
    "https://22947c02-0f88-4954-9d59-e8fe9117b2d1.us-east4-0.gcp.cloud.qdrant.io",
    api_key=st.secrets['QDRANT_API_KEY'],
)
collection_name = 'Taylor_Song_DataBase_full_lyrics'
grade_collection_name = 'Grades_collection'

if not initialized:
    # Lyrics collection: one 384-dimensional vector per song.
    lyrics_vectors_config = VectorParams(size=384, distance=Distance.COSINE, on_disk=True)
    collection = client.recreate_collection(
        collection_name=collection_name,
        vectors_config=lyrics_vectors_config,
        on_disk_payload=True,
    )

    # Grades collection: one 10-dimensional vector of grades per song.
    grade_vectors_config = VectorParams(size=10, distance=Distance.COSINE, on_disk=True)
    grade_collection = client.recreate_collection(
        collection_name=grade_collection_name,
        vectors_config=grade_vectors_config,
        on_disk_payload=True,
    )


In [15]:
# Load the graded songs and drop newline characters from every lyrics string
# (newlines are removed outright, not replaced by a space).
ts_lyrics = pd.read_csv('data/cleaned_data/rag_dataset.csv').assign(
    lyrics=lambda frame: frame['lyrics'].map(lambda text: text.replace('\n', ''))
)
ts_lyrics.head()

Unnamed: 0,song_name,album,happy_sad,relationship,feelings_of_self,glass_half_full,stages,tempo,seriousness,future_prospects,feelings_of_male,togetherness,lyrics
0,cold as you,Taylor Swift,-10,-8,-1,-3,-3,-3,-3,-3,-1,-1,You have a way of coming easily to me And when...
1,i'm only me when i'm with you,Taylor Swift,9,10,3,3,1,2,2,2,3,3,Friday night beneath the stars In a field behi...
2,invisible,Taylor Swift,-1,-4,0,-2,1,0,0,0,-1,-3,She can't see the way your eyes Light up when ...
3,mary's song,Taylor Swift,5,12,0,2,1,2,3,3,3,3,She said I was seven and you were nine I looke...
4,our song,Taylor Swift,5,6,2,2,1,0,1,1,3,1,I was ridin' shotgun with my hair undone In th...


In [16]:
# Embed every song's lyrics; vectors[i] is the embedding of row i of ts_lyrics.
lyrics_texts = ts_lyrics['lyrics'].tolist()
vectors = embed_model.embed_documents(lyrics_texts)

# Add to song collection

In [17]:
if not initialized:
    # Upload one point per song to the lyrics collection: the lyrics embedding
    # is the vector, and the lyrics text plus the per-song grades are the payload.
    # The payload uses 'page_content' / 'metadata' keys, which is what the
    # similarity-search cell reads back via `doc.metadata[...]`.
    for i in range(len(ts_lyrics)):
        song = ts_lyrics.iloc[i]

        if (i + 1) % 30 == 0:
            print(f'Processing song number {i+1}')
            # len() of the lyrics string counts characters, not verses.
            print(f'The number of characters is {len(song["lyrics"])}')

        # Deterministic id derived from the collection name and row position.
        # A random uuid4 would give the same song a new id on every run, so
        # re-running this cell would add duplicates instead of overwriting.
        point_id = str(uuid.uuid5(uuid.NAMESPACE_URL, f'{collection_name}/{i}'))

        # NOTE(review): 'feeling_of_self' and 'future_prospect' differ from the
        # column names ('feelings_of_self', 'future_prospects'); kept as-is so
        # existing readers of the payload are not broken.
        song_payload = {
            'page_content': song['lyrics'],
            'metadata': {
                'song_name': f"{song['song_name']}",
                'album': song['album'],
                'happy_sad': song['happy_sad'].item(),
                'relationship': song['relationship'].item(),
                'feeling_of_self': song['feelings_of_self'].item(),
                'glass_half_full': song['glass_half_full'].item(),
                'stages': song['stages'].item(),
                'tempo': song['tempo'].item(),
                'seriousness': song['seriousness'].item(),
                'future_prospect': song['future_prospects'].item(),
                'feelings_of_male': song['feelings_of_male'].item(),
                'togetherness': song['togetherness'].item(),
            },
        }

        client.upsert(
            collection_name=collection_name,
            points=Batch(
                ids=[point_id],
                vectors=[vectors[i]],
                payloads=[song_payload],
            ),
        )

Processing song number 30
The number of verses is 1710
Processing song number 60
The number of verses is 1738
Processing song number 90
The number of verses is 3472
Processing song number 120
The number of verses is 1697
Processing song number 150
The number of verses is 1406
Processing song number 180
The number of verses is 1471


# Add to grades collection

In [18]:
if not initialized:
    for i in range(0, len(ts_lyrics)):
        song = ts_lyrics.iloc[i]
        grades = [[ int(grade) for grade in song[2:12]]]

        grade_payload = []
        grade_ids = []

        grade_ids.append(str(uuid.uuid4()))

        grade_payload.append({
            'song_grade': grades, 
            'metadata':{
                'song_name': f"{song['song_name']}",
                'album': song['album'],
                'happy_sad': song['happy_sad'].item(),
                'relationship': song['relationship'].item(),
                'feeling_of_self': song['feelings_of_self'].item(),
                'glass_half_full': song['glass_half_full'].item(),
                'stages': song['stages'].item(),
                'tempo': song['tempo'].item(),
                'seriousness': song['seriousness'].item(),
                'future_prospect': song['future_prospects'].item(),
                'feelings_of_male': song['feelings_of_male'].item(),
                'togetherness': song['togetherness'].item(),}
        })


        client.upsert(
                    collection_name=grade_collection_name,
                    points=Batch(
                        ids=grade_ids,
                        vectors=grades,
                        payloads=grade_payload
                    )
                )

[[-10, -8, -1, -3, -3, -3, -3, -3, -1, -1]]


[[9, 10, 3, 3, 1, 2, 2, 2, 3, 3]]
[[-1, -4, 0, -2, 1, 0, 0, 0, -1, -3]]
[[5, 12, 0, 2, 1, 2, 3, 3, 3, 3]]
[[5, 6, 2, 2, 1, 0, 1, 1, 3, 1]]
[[-5, -2, -2, -3, -2, 2, 0, 0, 0, -2]]
[[-4, -8, -2, -3, -1, 2, -2, -2, -2, -2]]
[[4, -9, 3, 1, -3, 3, -3, -3, 0, -3]]
[[2, -2, 1, 0, 1, 0, 0, 0, 0, -2]]
[[-5, -6, 0, -2, -3, 0, -2, -3, 2, -3]]
[[3, 1, 0, 1, 0, 2, 0, 0, 0, 1]]
[[-3, -4, 1, -1, -3, 0, 0, 0, -1, -3]]
[[-7, -6, -3, -3, -1, 0, -2, 0, -2, -2]]
[[-1, -2, 0, -1, 0, 0, -1, 0, 0, -1]]
[[2, -2, 0, 1, 1, 0, 0, 0, 0, -2]]
[[-7, 1, -3, -3, -3, 2, -1, -1, 0, 3]]
[[7, 0, 3, 1, 3, 0, -1, 0, 0, 1]]
[[-1, -3, 0, -2, 1, 0, -1, 0, 0, -2]]
[[7, 7, 3, 3, 1, 0, 1, 2, 2, 2]]
[[2, -4, 1, 0, 1, 0, -1, -1, -1, -1]]
[[-4, -8, -1, -2, -3, 2, -3, -2, -2, -1]]
[[8, 0, 3, 3, 2, 0, 0, 1, 0, -1]]
[[5, -10, 2, -1, 2, 2, -2, -3, -3, -2]]
[[4, 0, 0, 2, 2, 0, 0, 0, 0, 0]]
[[8, 10, 1, 2, 3, 2, 3, 3, 3, 1]]
[[-7, -3, -2, -2, -3, 0, -2, 0, 0, -1]]
[[-4, -7, -1, -1, -2, 0, 0, -3, -1, -3]]
[[-7, -7, -1, -3, -3, 0, -2, -2, -1

# Problem

* The length of the vectors in the database is 384 + 10 = 394 (384 from the lyrics embedding model, 10 from the grade statistics).
* The length of the embedded human query is 384 (the query only goes through the embedding model, so it carries no information about the statistics of the text).

This creates a disparity between the dimension of the stored vectors and that of the query vector.

In [39]:
# Vector store over the lyrics collection; queries are embedded with embed_model.
song_db = Qdrant(
    client=client,
    collection_name=collection_name,
    embeddings=embed_model,
)

# Example free-text user input describing a mood.
query = """I feel incredibly sad and depressed. One of my close family member just pass away. I wasn't able to say my final good bye to them because I was so far away. I don't know if I can recover from this"""

# Names of the five songs whose lyrics are closest to the query.
closest_songs = song_db.similarity_search(query, k=5)
[doc.metadata['song_name'] for doc in closest_songs]

['anti hero',
 'anti hero',
 'tis the damn season',
 'tis the damn season',
 'closure']

In [40]:
# Number of past exchanges the windowed conversation memory keeps.
conv_mem_length = st.slider(
    "Memory Length:",
    min_value=1,
    max_value=10,
    value=5,
    help="Adjust the conversational memory length for the chatbot. This will affect the context of the conversation.",
)
memory = ConversationBufferWindowMemory(k=conv_mem_length)

# Groq-hosted chat model, queried at temperature 0.
groq_client = ChatGroq(
    temperature=0,
    model_name='mixtral-8x7b-32768',
    api_key=st.secrets["GROQ_API_KEY"],
)

# Chain that combines the chat model with the windowed memory.
conversation = ConversationChain(llm=groq_client, memory=memory)

In [41]:
# Prompt for the grading step: a fixed system message describing eight criteria
# (each scored from -3 to 3, with Tempo pinned to 0 because the input is not a
# song), followed by the user's text substituted into the "{text}" placeholder.
# The system message asks the model to wrap its comma-separated scores between
# two "========" marker lines; the parsing cell further down relies on that.
prompt_template = ChatPromptTemplate.from_messages(
                [
                    SystemMessage(
                        content = ("""
                                   You are an AI assistant that has to detect the score for each criteria from the user's input. The scores are explained below:
                                   Criteria 1: Feelings of self
                                   -3 - Feels fully responsible for problems
                                   -2 - Feels partial responsibility for problems 
                                   -1 - Hints at self-deprecation 
                                   0  - No feelings mentioned/ambiguous feelings 
                                   1  - Overall positive with serious insecurities 
                                   2  - Overall positive with some reservations
                                   3  - Secure and trusting in life circumstances 

                                   Criteria 2: Glass half full
                                   -3 - All imagery is depressing 
                                   -2 - Nearly all depressing imagery  
                                   -1 - Majority depressing imagery
                                   0  - Equal amounts of happy and sad imagery  
                                   1  - Majority positive imagery
                                   2  - Nearly all positive imagery
                                   3  - All imagery is positive 

                                   Criteria 3: Stages of depression
                                   -3 - Anger / Depression
                                   -2 - Bargaining
                                   -1 - Denial
                                   0  - Acceptance. If you don't know what to give, just give this score
                                   1  - Passively wanting to be happy 
                                   2  - Actively working for her happiness 
                                   3  - Actively working for her own and others' happiness

                                   Criteria 4: Tempo
                                   0 - No tempo, this is not a song

                                   Criteria 5: Seriousness
                                   -3 - Cataclysmic past offenses 
                                   -2 - Some past hurt feelings
                                   -1 - Unspecified relationship endings
                                   0  - Not discussed/Pining
                                   1  - Puppy love/One night stand 
                                   2  - Some real world things to discuss
                                   3  - Discussion of marriage/equally serious topics

                                   Criteria 6: Future prospects
                                   -3 - Permanent end to communication 
                                   -2 - Significant decrease in contact 
                                   -1 - Possible decrease in contact 
                                   0  - No discussion of future/Ambiguous 
                                   1  - Casual or potential future plans  
                                   2  - Some set future plans
                                   3  - Marriage/Bound for life 

                                   Criteria 7: Feelings of males
                                   -3 - He tells all his friends he hates her
                                   -2 - He makes a face when her name is mentioned but doesn't publicly hate on her 
                                   -1 - He doesn't want to date but likes her as a friend
                                   0  - No information/Ambiguous. If you're not sure, also give this score
                                   1  - He expressed casual interest in a relationship
                                   2  - They are dating but not that seriously (she hasn't met his parents)
                                   3  - Public declaration of love/commitment

                                   Criteria 8: Togetherness
                                   -3 - Barriers to joint actions 
                                   -2 - No joint actions 
                                   -1 - More things apart than together 
                                   0  - Equal amounts of time together and apart
                                   1  - More things together than apart 
                                   2  - They do everything together
                                   3  - No identity as an individual 


                                   This is your only goal. Don't try to do anything else.
                                   If the user input is not clear, you have to ask the user to provide more details. 
                                   Like explaining what he/she is feeling or provide a specific episode that is related to the user mood.
                                   If the user ask you something else, or ask for a clarification, you have just to explain what is your goal.

                                   You should return:
                                   - Before giving the score, starts with the following string "========"
                                   - The score of 8 criteria. Give the score as a list of 8 numbers corresponding to each score, seperated by a comma. No explanation needed. Remember, the scores need to be a number between -3 and 3, no other symbols are allowed.
                                   - After giving the score, ends with the following string "========"

                        """)
                    ),
                    HumanMessagePromptTemplate.from_template("{text}")
                ]
            )

# Fill the "{text}" placeholder with the user's query, producing the message list
# (system message followed by the human message).
prompt = prompt_template.format_messages(text=query)
# Run the message list through the conversation chain. The result is indexed with
# 'response' by the parsing cell below.
# NOTE(review): the chain is given a list of messages rather than a plain string;
# confirm this is the intended way to call it.
message = conversation.invoke(prompt)

In [53]:
# Display the full result returned by conversation.invoke(...) for inspection.
message

  HumanMessage(content="I feel incredibly sad and depressed. One of my close family member just pass away. I wasn't able to say my final good bye to them because I was so far away. I don't know if I can recover from this")],
 'history': '',

In [43]:
# Extract the eight criterion scores from the model's reply.
# The prompt asks for the scores to be wrapped in "========" (eight '=') markers.
# Splitting on seven '=' matches any marker of seven or more characters; any '='
# left over on a token is stripped before conversion, so int() does not fail on
# a reply that follows the prompt exactly.
response_sections = message['response'].split('=======')
if len(response_sections) < 2:
    raise ValueError("Model response does not contain the '========' score markers")

query_grades = [
    int(token.strip().strip('=').strip())
    for token in response_sections[1].split(',')
]

# The following cells assume exactly one score per criterion.
if len(query_grades) != 8:
    raise ValueError(f"Expected 8 scores in the model response, got {len(query_grades)}")

In [44]:
# Prepend two aggregate scores so the query has the same 10-value layout as the
# stored grade vectors: the sum of the first four criteria, then the sum of the
# last four, followed by the eight individual scores.
# Guarded on the length so that re-running this cell does not prepend the
# aggregates a second time.
if len(query_grades) == 8:
    first_four_total = sum(query_grades[:4])
    last_four_total = sum(query_grades[4:])
    query_grades[:0] = [first_four_total, last_four_total]
query_grades

[-5, -9, -2, -3, 0, 0, -3, -3, 0, -3]

In [52]:
# Find the three songs whose grade vectors are closest to the query's grades.
res = client.search(
    collection_name=grade_collection_name,
    query_vector=query_grades,
    limit=3,
)

# One song name per line. An empty result prints an empty line; no hit is
# indexed directly, so nothing raises when the search returns nothing.
print("\n".join(song.payload['metadata']['song_name'] for song in res))

champagne problems
bye bye baby
i almost do


In [58]:
# Sanity check: dimensionality of a single embedding produced by embed_model.
sample_embedding = embed_model.embed_documents(['something in the wind'])[0]
len(sample_embedding)

384