In [41]:
import pandas as pd
from langchain.llms import OpenAI
# from langchain.document_loaders.csv_loader import CSVLoader
from langchain.document_loaders import DataFrameLoader
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import Chroma

In [32]:
tmp = pd.read_csv("lyrics_wf.csv")
tmp.head()

Unnamed: 0,URL,Lyrics,Title,Album Title
0,https://mojim.com/cny100019x82x1.htm,\n王菲\n\n\n\n\n\n\n归途有风\n\n\n电影 万里归途 主题曲\n作词：唐恬...,归途有风,归途有风
1,https://mojim.com/cny100019x81x1.htm,\n王菲\n\n\n\n\n\n\n如愿\n\n\n作词：唐恬\n作曲：钱雷\n你是 遥遥的...,如愿,如愿
2,https://mojim.com/cny100019x79x1.htm,\n王菲\n\n\n\n\n\n\n偶遇\n\n\n电影 邪不压正 宣传曲\n作词：林珺帆\...,偶遇,电影 邪不压正
3,https://mojim.com/cny100019x77x1.htm,\n王菲\n\n\n\n\n\n\n无问西东\n\n\n作词：彭青\n作曲：彭青\n编曲：彭...,无问西东,电影《无问西东》推广曲
4,https://mojim.com/cny100019x75x1.htm,\n王菲\n\n\n\n\n\n\n你在终点等我\n\n\n作词：姚若龙\n作曲：陈小霞\n...,你在终点等我,电影《从你的全世界路过》片尾曲


## Text Cleaning

In [82]:
import re
# define the regular expression pattern to match the timestamps
pattern = r"\[\d{2}:\d{2}\.\d{2}\]"

# use the re.sub() function to remove the timestamps from the text
tmp['Lyrics'] = tmp['Lyrics'].apply(lambda x: re.sub(pattern, "", x))

One row per line

In [84]:
tmp.replace(r"\n{2,}", "\n", regex=True, inplace=True)
tmp.head()
new = tmp.assign(lyrics = tmp['Lyrics'].str.split("\n")).explode('lyrics')
new.head()


Unnamed: 0,URL,Lyrics,Title,Album Title,lyrics
0,https://mojim.com/cny100019x82x1.htm,\n王菲\n归途有风\n电影 万里归途 主题曲\n作词：唐恬@勇士音乐\n作曲：钱雷@勇士音...,归途有风,归途有风,
0,https://mojim.com/cny100019x82x1.htm,\n王菲\n归途有风\n电影 万里归途 主题曲\n作词：唐恬@勇士音乐\n作曲：钱雷@勇士音...,归途有风,归途有风,王菲
0,https://mojim.com/cny100019x82x1.htm,\n王菲\n归途有风\n电影 万里归途 主题曲\n作词：唐恬@勇士音乐\n作曲：钱雷@勇士音...,归途有风,归途有风,归途有风
0,https://mojim.com/cny100019x82x1.htm,\n王菲\n归途有风\n电影 万里归途 主题曲\n作词：唐恬@勇士音乐\n作曲：钱雷@勇士音...,归途有风,归途有风,电影 万里归途 主题曲
0,https://mojim.com/cny100019x82x1.htm,\n王菲\n归途有风\n电影 万里归途 主题曲\n作词：唐恬@勇士音乐\n作曲：钱雷@勇士音...,归途有风,归途有风,作词：唐恬@勇士音乐


In [87]:
min_chunk_size = 100
new_df = pd.DataFrame(columns=["Title", "Album", "URL","Chunk"])
for index, row in tmp.head().iterrows():
    # split the text in the current row into chunks while respecting the newline characters
    chunks = []
    for line in row["Lyrics"].split("\n"):
        if len(line) >= min_chunk_size:
            chunks.append(line)
        else:
            merged_chunk = line
            for next_line in row["Lyrics"][len(line)+1:].split("\n"):
                if len(merged_chunk) + len(next_line) + 1 < min_chunk_size:
                    merged_chunk += "\n" + next_line
                else:
                    chunks.append(merged_chunk)
                    merged_chunk = next_line
            chunks.append(merged_chunk)
    # create a new dataframe with the resulting chunks and metadata
    chunk_df = pd.DataFrame({
        "Title": [row["Title"]] * len(chunks),
        "URL": [row["URL"]] * len(chunks),
        "Album": [row["Album Title"]] * len(chunks),
        "Chunk": chunks
    })
    new_df = pd.concat([new_df, chunk_df], ignore_index=True)

# display the resulting dataframe
print(new_df)


       Title            Album                                   URL  \
0       归途有风             归途有风  https://mojim.com/cny100019x82x1.htm   
1       归途有风             归途有风  https://mojim.com/cny100019x82x1.htm   
2       归途有风             归途有风  https://mojim.com/cny100019x82x1.htm   
3       归途有风             归途有风  https://mojim.com/cny100019x82x1.htm   
4       归途有风             归途有风  https://mojim.com/cny100019x82x1.htm   
...      ...              ...                                   ...   
2993  你在终点等我  电影《从你的全世界路过》片尾曲  https://mojim.com/cny100019x75x1.htm   
2994  你在终点等我  电影《从你的全世界路过》片尾曲  https://mojim.com/cny100019x75x1.htm   
2995  你在终点等我  电影《从你的全世界路过》片尾曲  https://mojim.com/cny100019x75x1.htm   
2996  你在终点等我  电影《从你的全世界路过》片尾曲  https://mojim.com/cny100019x75x1.htm   
2997  你在终点等我  电影《从你的全世界路过》片尾曲  https://mojim.com/cny100019x75x1.htm   

                                                  Chunk  
0     \n王菲\n归途有风\n电影 万里归途 主题曲\n作词：唐恬@勇士音乐\n作曲：钱雷@勇士音...  
1     才可明白 为何而来\n要 放开过 勿放的手\n要 

## Retrieval

In [90]:
# loader = CSVLoader(file_path="lyrics_wf.csv", source_column="Lyrics")
# loader = DataFrameLoader(new, page_content_column='lyrics')
loader = DataFrameLoader(new_df, page_content_column='Chunk')
data = loader.load()

In [91]:
# Split documents into smaller chunks
# text_splitter = CharacterTextSplitter(chunk_size=500, chunk_overlap=0)
# texts = text_splitter.split_documents(data)

# Initialize OpenAI embeddings
embeddings_model = OpenAIEmbeddings()
vectorstore = Chroma.from_documents(data, embeddings_model, persist_directory="./chroma_db")
vectorstore.persist()

Retrying langchain.embeddings.openai.embed_with_retry.<locals>._embed_with_retry in 4.0 seconds as it raised Timeout: Request timed out: HTTPSConnectionPool(host='api.openai.com', port=443): Read timed out. (read timeout=600).


In [92]:
from langchain.chains import ConversationalRetrievalChain
from langchain.memory import ConversationBufferMemory
memory = ConversationBufferMemory(memory_key="chat_history", return_messages=True)

In [93]:
qa = ConversationalRetrievalChain.from_llm(OpenAI(temperature=0), vectorstore.as_retriever(), memory=memory)

In [94]:
retriever=vectorstore.as_retriever()
lyrics = retriever.get_relevant_documents(
    query="今天好开心"
)

In [96]:
print(lyrics[0].page_content)

~而我将爱你所爱的人间~
～愿你所愿的笑颜~
~你的手我蹒跚在牵~
~请带~我去~明天~～
~如果说你曾苦过我的甜~
～我愿活成你~的愿~
~愿不枉啊愿勇往啊~
~这盛世每~一天～～～


In [98]:
lyrics = retriever.get_relevant_documents(
    query="爱上了不该爱的人，又离不开"
)
print(lyrics[0].page_content)

无妄的多情 不忘也多余 不如不散不聚
相爱无痕迹 相忘得无忧无虑
伤害无痕迹 痛苦得无忧无虑

