In [3]:
from haystack.document_stores import InMemoryDocumentStore
from haystack.nodes import PromptTemplate,AnswerParser,BM25Retriever,PromptNode
from haystack.pipelines import Pipeline
from haystack import Document
import kaggle
import pandas as pd
import os
from dotenv import load_dotenv

# Load the OpenAI API key from the environment: the parent directory's .env
# file is tried first, then python-dotenv's default .env lookup.
load_dotenv("../.env")
load_dotenv()
MY_API_KEY = os.getenv("OPENAI_API_KEY")

# In-memory document store; use_bm25=True enables BM25 retrieval, which is
# used here because no GPU is available
document_store = InMemoryDocumentStore(use_bm25=True)

# Download the lyrics dataset from Kaggle only when the CSVs we need are not
# already on disk, so re-running the notebook does not re-download and unzip
# the whole archive every time
kaggle_dataset_id = "deepshah16/song-lyrics-dataset"
bts_csv_path = "../csv/BTS.csv"
lady_gaga_csv_path = "../csv/LadyGaga.csv"
if not (os.path.exists(bts_csv_path) and os.path.exists(lady_gaga_csv_path)):
    kaggle.api.authenticate()
    kaggle.api.dataset_download_files(kaggle_dataset_id, path="../", unzip=True)

# We chose two artists for starters
df_bts = pd.read_csv(bts_csv_path)
df_lady_gaga = pd.read_csv(lady_gaga_csv_path)

# ignore_index=True gives the combined frame a fresh 0..n-1 index instead of
# repeating each source file's own row labels
df_lyrics = pd.concat([df_bts, df_lady_gaga], axis=0, ignore_index=True)
df_lyrics.head()


Unnamed: 0.1,Unnamed: 0,Artist,Title,Album,Year,Date,Lyric
0,0,BTS (방탄소년단),Dynamite,BE,2020.0,2020-08-21,jungkook 'cause i i i'm in the stars tonight s...
1,1,BTS (방탄소년단),FAKE LOVE,LOVE YOURSELF 轉 ‘Tear’,2018.0,2018-05-18,방탄소년단의 fake love 가사 v jungkook 널 위해서라면 난 슬퍼도...
2,2,BTS (방탄소년단),MIC Drop (Steve Aoki Remix),,2017.0,2017-11-24,mic drop steve aoki remix 의해 방탄소년단 가사 jhope ...
3,3,BTS (방탄소년단),전하지 못한 진심 (The Truth Untold),LOVE YOURSELF 轉 ‘Tear’,2018.0,2018-05-18,방탄소년단의 전하지 못한 진심 가사 v 외로움이 가득히 피어있는 이 garden...
4,4,BTS (방탄소년단),봄날 (Spring Day),You Never Walk Alone,2017.0,2017-02-13,rm 보고 싶다 이렇게 말하니까 더 보고 싶다 너희 사진을 보고 있어도 보고 싶다 ...


In [4]:
# Turn the first column into a 1-based "id" that is unique across the combined
# frame: relabel it, rebuild the row index from 0, then number rows from 1
renamed_columns = ["id", *df_lyrics.columns[1:]]
df_lyrics.columns = renamed_columns
df_lyrics = df_lyrics.reset_index(drop=True)
df_lyrics["id"] = df_lyrics.index + 1

df_lyrics.head()

Unnamed: 0,id,Artist,Title,Album,Year,Date,Lyric
0,1,BTS (방탄소년단),Dynamite,BE,2020.0,2020-08-21,jungkook 'cause i i i'm in the stars tonight s...
1,2,BTS (방탄소년단),FAKE LOVE,LOVE YOURSELF 轉 ‘Tear’,2018.0,2018-05-18,방탄소년단의 fake love 가사 v jungkook 널 위해서라면 난 슬퍼도...
2,3,BTS (방탄소년단),MIC Drop (Steve Aoki Remix),,2017.0,2017-11-24,mic drop steve aoki remix 의해 방탄소년단 가사 jhope ...
3,4,BTS (방탄소년단),전하지 못한 진심 (The Truth Untold),LOVE YOURSELF 轉 ‘Tear’,2018.0,2018-05-18,방탄소년단의 전하지 못한 진심 가사 v 외로움이 가득히 피어있는 이 garden...
4,5,BTS (방탄소년단),봄날 (Spring Day),You Never Walk Alone,2017.0,2017-02-13,rm 보고 싶다 이렇게 말하니까 더 보고 싶다 너희 사진을 보고 있어도 보고 싶다 ...


In [5]:
# The document store reads each document's text from a "content" field,
# so the lyrics column is renamed accordingly
df_lyrics = df_lyrics.rename(columns={"Lyric": "content"})

# One dictionary per song (column name -> value), written to the store in bulk
lyric_records = df_lyrics.to_dict("records")
document_store.write_documents(lyric_records)

Updating BM25 representation...: 100%|██████████| 680/680 [00:00<00:00, 14419.25 docs/s]


In [6]:
# Prompt text for the answer-generation step. {join(documents)} and {query}
# are placeholders filled in by the PromptTemplate at run time.
# NOTE(review): {join(documents)} presumably inserts only each document's
# lyric text, so artist/title metadata may not reach the model - confirm
# against the Haystack PromptTemplate documentation.
rag_prompt_text = """Synthesize a brief answer from the following text for the given question.
            Provide a clear and concise response related to music lyrics and the artists provided.
            Your answer should be in your own words and be no longer than 50 words.
            \n\n Music Lyrics: {join(documents)} \n\n Question: {query} \n\n Answer:"""

# AnswerParser is attached so the pipeline result exposes an "answers" list
rag_prompt = PromptTemplate(prompt=rag_prompt_text, output_parser=AnswerParser())

In [7]:
# BM25 retriever over the lyrics store; only the top 2 matches are kept per query
retriever = BM25Retriever(
    document_store=document_store,
    top_k=2,
)

# Prompt node that sends the RAG prompt to gpt-3.5-turbo (streaming disabled)
pn = PromptNode(
    "gpt-3.5-turbo",
    api_key=MY_API_KEY,
    model_kwargs={"stream": False},
    default_prompt_template=rag_prompt,
)


In [8]:
# Wire the two stages together: the query goes to the retriever, and the
# retriever's output is passed on to the prompt node
pipe = Pipeline()
pipe.add_node(
    component=retriever,
    name="retriever",
    inputs=["Query"],
)
pipe.add_node(
    component=pn,
    name="prompt_node",
    inputs=["retriever"],
)

Some test runs. The pipeline does not seem to surface BTS songs, probably because their lyrics are largely in Korean while the queries are in English.

In [9]:
# Ask about the theme of being born and show the first parsed answer
question = "What songs talk about being born and who is the artist?"
output = pipe.run(query=question)

first_answer = output["answers"][0]
print(first_answer.answer)


The song "Born This Way" by Lady Gaga talks about being born and embracing one's true self.


In [14]:
# Ask about the theme of hormones and show the first parsed answer
question = "Which songs talk about hormones and who is the artist?"
output = pipe.run(query=question)

first_answer = output["answers"][0]
print(first_answer.answer)

There is no mention of any songs specifically about hormones in the given text. Therefore, we cannot determine which songs talk about hormones or the artist behind them.


In [12]:
# Ask about the theme of self-esteem and show the first parsed answer
question = "Which songs talk about self-esteem and who is the artist?"
output = pipe.run(query=question)

first_answer = output["answers"][0]
print(first_answer.answer)

The songs that talk about self-esteem are "Born This Way" and "This One's for You." The artist is Lady Gaga.
