In [1]:
import pandas as pd
from langchain.llms import OpenAI
# from langchain.document_loaders.csv_loader import CSVLoader
from langchain.document_loaders import DataFrameLoader
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import Chroma

In [11]:
import os
from dotenv import load_dotenv, find_dotenv
_ = load_dotenv(find_dotenv()) # read local .env file

In [2]:
tmp = pd.read_csv("lyrics_wf.csv")
tmp.head()

Unnamed: 0,URL,Lyrics,Title,Album Title
0,https://mojim.com/cny100019x82x1.htm,\n王菲\n\n\n\n\n\n\n归途有风\n\n\n电影 万里归途 主题曲\n作词：唐恬...,归途有风,归途有风
1,https://mojim.com/cny100019x81x1.htm,\n王菲\n\n\n\n\n\n\n如愿\n\n\n作词：唐恬\n作曲：钱雷\n你是 遥遥的...,如愿,如愿
2,https://mojim.com/cny100019x79x1.htm,\n王菲\n\n\n\n\n\n\n偶遇\n\n\n电影 邪不压正 宣传曲\n作词：林珺帆\...,偶遇,电影 邪不压正
3,https://mojim.com/cny100019x77x1.htm,\n王菲\n\n\n\n\n\n\n无问西东\n\n\n作词：彭青\n作曲：彭青\n编曲：彭...,无问西东,电影《无问西东》推广曲
4,https://mojim.com/cny100019x75x1.htm,\n王菲\n\n\n\n\n\n\n你在终点等我\n\n\n作词：姚若龙\n作曲：陈小霞\n...,你在终点等我,电影《从你的全世界路过》片尾曲


## Text Cleaning

In [3]:
import re
# define the regular expression pattern to match the timestamps
pattern = r"\[\d{2}:\d{2}\.\d{2}\]"

# use the re.sub() function to remove the timestamps from the text
tmp['Lyrics'] = tmp['Lyrics'].apply(lambda x: re.sub(pattern, "", x))

1. One row per line
- notes: not a sentence, only return word phrases

In [4]:
tmp.replace(r"\n{2,}", "\n", regex=True, inplace=True)
new = tmp.assign(lyrics = tmp['Lyrics'].str.split("\n")).explode('lyrics')

2. Specify minimal chunk size, merge the subsequent chunks when too small

In [5]:
min_chunk_size = 100
new_df = pd.DataFrame(columns=["Title", "Album", "URL","Chunk"])
for index, row in tmp.iterrows():
    # split the text in the current row into chunks while respecting the newline characters
    chunks = []
    for line in row["Lyrics"].split("\n"):
        if len(line) >= min_chunk_size:
            chunks.append(line)
        else:
            merged_chunk = line
            for next_line in row["Lyrics"][len(line)+1:].split("\n"):
                if len(merged_chunk) + len(next_line) + 1 < min_chunk_size:
                    merged_chunk += "\n" + next_line
                else:
                    chunks.append(merged_chunk)
                    merged_chunk = next_line
            chunks.append(merged_chunk)
    # create a new dataframe with the resulting chunks and metadata
    chunk_df = pd.DataFrame({
        "Title": [row["Title"]] * len(chunks),
        "URL": [row["URL"]] * len(chunks),
        "Album": [row["Album Title"]] * len(chunks),
        "Chunk": chunks
    })
    new_df = pd.concat([new_df, chunk_df], ignore_index=True)

# display the resulting dataframe
print(new_df)


       Title Album                                    URL  \
0       归途有风  归途有风   https://mojim.com/cny100019x82x1.htm   
1       归途有风  归途有风   https://mojim.com/cny100019x82x1.htm   
2       归途有风  归途有风   https://mojim.com/cny100019x82x1.htm   
3       归途有风  归途有风   https://mojim.com/cny100019x82x1.htm   
4       归途有风  归途有风   https://mojim.com/cny100019x82x1.htm   
...      ...   ...                                    ...   
104659   致青春    暂存  https://mojim.com/cny100019x58x15.htm   
104660   致青春    暂存  https://mojim.com/cny100019x58x15.htm   
104661   致青春    暂存  https://mojim.com/cny100019x58x15.htm   
104662   致青春    暂存  https://mojim.com/cny100019x58x15.htm   
104663   致青春    暂存  https://mojim.com/cny100019x58x15.htm   

                                                    Chunk  
0       \n王菲\n归途有风\n电影 万里归途 主题曲\n作词：唐恬@勇士音乐\n作曲：钱雷@勇士音...  
1       才可明白 为何而来\n要 放开过 勿放的手\n要 千山万水 懂得泪流\n要风起 要别离 要万...  
2       让它告诉我\n抉择多难 都已做过\n不问得失 无悔对错\n让月光 带我回家\n让来路 带我回...  
3       更多更详尽歌词 在 ※ Mojim.c

## Retrieval

In [6]:
# loader = CSVLoader(file_path="lyrics_wf.csv", source_column="Lyrics")
# loader = DataFrameLoader(new, page_content_column='lyrics')
loader = DataFrameLoader(new_df, page_content_column='Chunk')
data = loader.load()

In [13]:
COHERE_API_KEY = os.getenv('COHERE_API_KEY')

In [16]:
COHERE_API_KEY

In [15]:
# Split documents into smaller chunks
# text_splitter = CharacterTextSplitter(chunk_size=500, chunk_overlap=0)
# texts = text_splitter.split_documents(data)

# Initialize OpenAI embeddings
# embeddings_model = OpenAIEmbeddings()

from langchain.embeddings.cohere import CohereEmbeddings

embeddings_model = CohereEmbeddings(model="embed-english-v2.0", cohere_api_key=COHERE_API_KEY)
vectorstore = Chroma.from_documents(data, embeddings_model, persist_directory="./chroma_db")
vectorstore.persist()

ValidationError: 1 validation error for CohereEmbeddings
__root__
  Did not find cohere_api_key, please add an environment variable `COHERE_API_KEY` which contains it, or pass  `cohere_api_key` as a named parameter. (type=value_error)

In [92]:
from langchain.chains import ConversationalRetrievalChain
from langchain.memory import ConversationBufferMemory
memory = ConversationBufferMemory(memory_key="chat_history", return_messages=True)

In [93]:
qa = ConversationalRetrievalChain.from_llm(OpenAI(temperature=0), vectorstore.as_retriever(), memory=memory)

In [94]:
retriever=vectorstore.as_retriever()
lyrics = retriever.get_relevant_documents(
    query="今天好开心"
)

In [96]:
print(lyrics[0].page_content)

~而我将爱你所爱的人间~
～愿你所愿的笑颜~
~你的手我蹒跚在牵~
~请带~我去~明天~～
~如果说你曾苦过我的甜~
～我愿活成你~的愿~
~愿不枉啊愿勇往啊~
~这盛世每~一天～～～


In [98]:
lyrics = retriever.get_relevant_documents(
    query="爱上了不该爱的人，又离不开"
)
print(lyrics[0].page_content)

无妄的多情 不忘也多余 不如不散不聚
相爱无痕迹 相忘得无忧无虑
伤害无痕迹 痛苦得无忧无虑



In [None]:
lyrics = retriever.get_relevant_documents(
    query="爱上了不该爱的人，又离不开"
)
print(lyrics[0].page_content)

In [102]:
lyrics[0].metadata['Title']

'偶遇'

# Spotify

get spotify link for the result

In [108]:
import spotipy
from spotipy.oauth2 import SpotifyClientCredentials
import os

# set up the Spotify API client
client_id = os.getenv("SPOTIPY_CLIENT_ID")
client_secret = os.getenv("SPOTIPY_CLIENT_SECRET")
client_credentials_manager = SpotifyClientCredentials(client_id=client_id, client_secret=client_secret)
sp = spotipy.Spotify(client_credentials_manager=client_credentials_manager)

def get_spotify_link(lyrics):
    song_name = lyrics[0].metadata['Title']
    singer_name = "王菲"
    results = sp.search(q=song_name + " " + singer_name, type="track")

    # extract the Spotify link to the first result
    if len(results["tracks"]["items"]) > 0:
        song_link = results["tracks"]["items"][0]["external_urls"]["spotify"]
        return song_link
    else:
        return "No results found for song name: " + song_name


# Put it together

In [111]:
def chatbot_results(query):
    lyrics = retriever.get_relevant_documents(query=query)
    link = get_spotify_link(lyrics)
    return(lyrics[0].page_content, link)

In [112]:
# chatbot_results("爱上了不该爱的人，又离不开")
chatbot_results("旋转木马好玩吗")

('~山河无恙～~烟火寻常～\n~可是你如愿的眺望～\n~孩子们啊～~安睡梦乡～\n~像你深爱的~那样～～～\n～◆☆◇~而我将\n梦你所梦的团圆～\n愿你所愿的永~远～\n走你所走的长~路~\n~这样的爱~你啊～',
 'https://open.spotify.com/track/4x9retP0JqKa35zZZhNhNS')