In [21]:
from langchain.vectorstores import FAISS
from langchain.embeddings import SentenceTransformerEmbeddings
from langchain.chains import RetrievalQAWithSourcesChain
from langchain.text_splitter import RecursiveCharacterTextSplitter

import pandas as pd

In [22]:
# Read the song-lyrics CSV into a DataFrame and preview its first two rows.
data_path = "all_songs_output.csv"
vid_data = pd.read_csv(filepath_or_buffer=data_path)
vid_data.head(n=2)

Unnamed: 0,song_id,album,artist,title,release_date,lyrics
0,1,Battle of New Orleans,Johnny Horton,The Battle Of New Orleans,1959-04-01,In 1814 we took a little trip Along with Colo...
1,2,The Bobby Darin Story,Bobby Darin,Dream Lover,1959-03-05,Every night I hope and pray A dream lover wil...


In [23]:
# Report the table dimensions as (row count, column count).
print((len(vid_data.index), len(vid_data.columns)))

(35206, 6)


Drop songs whose lyrics are missing or too long (8000 characters or more)

In [24]:
# Keep only rows whose lyrics are a real string shorter than the cap.
# Rows whose lyrics are not a string (presumably NaN for empty CSV cells) are dropped too.
max_len_lyrics = 8000


def _lyrics_length(value):
    """Return the character count of a lyrics string, or -1 for any non-string value."""
    return len(value) if isinstance(value, str) else -1


lyrics_lengths = vid_data['lyrics'].apply(_lyrics_length)
vid_data = vid_data[(lyrics_lengths >= 0) & (lyrics_lengths < max_len_lyrics)]

# Re-measure after filtering. The comparison uses the same `max_len_lyrics`
# as the filter (not a second hard-coded 8000), so changing the cap in one
# place can never make the check and the printed message disagree.
remaining_too_long = int((vid_data['lyrics'].apply(_lyrics_length) >= max_len_lyrics).sum())
if remaining_too_long == 0:
    print(f"You successfully deleted all songs/rows with lyrics that exceeded {max_len_lyrics} strings.")

You successfully deleted all songs/rows with lyrics that exceeded 8000 strings.


In [25]:
# Sentence-embedding model wrapper; later cells pass it to FAISS when
# building and reloading the vector index.
embedding_model_name = "sentence-transformers/all-MiniLM-L6-v2"
embedding = SentenceTransformerEmbeddings(model_name=embedding_model_name)

In [26]:
# Flatten the DataFrame into a list holding one {column: value} dict per row.
vid_dict_array = vid_data.to_dict('records')

In [27]:
# Character-based splitter for lyric text. With add_start_index=True each
# chunk's metadata also carries the offset where it starts in the source text.
# NOTE(review): lyrics are capped below 8000 characters by the filter cell, so
# chunk_size=10000 presumably leaves every song as a single chunk — confirm
# that one-document-per-song is the intended granularity.
splitter_options = {
    'chunk_size': 10000,
    'chunk_overlap': 20,
    'length_function': len,
    'add_start_index': True,
}
descrip_split = RecursiveCharacterTextSplitter(**splitter_options)

# Smoke-test the splitter on the first song, tagging its chunks with the title.
first_song = vid_dict_array[0]
descrip_split.create_documents(
    [first_song['lyrics']],
    metadatas=[{'source': first_song['title']}],
)


[Document(page_content="In 1814 we took a little trip Along with Colonel Jackson down the mighty Mississip We took a little bacon and we took a little beans And we caught the bloody British in a town near New Orleans   We fired our guns and the British kept a-comin' There wasn't nigh as many as there was a while ago We fired once more and they begin to runnin' On down the Mississippi to the Gulf of Mexico   We looked down a river (Hut-two) And we see'd the British come (Three-four) And there must have been a hundred of'em (Hut-two) Beatin' on the drums (Three-four) They stepped so high (Hut-two) And they made their bugles ring (Three-four) We stood beside our cotton bales (Hut-two) And didn't say a thing (Two-three-four)   We fired our guns and the British kept a-comin' There wasn't nigh as many as there was a while ago We fired once more and they begin to runnin' On down the Mississippi to the Gulf of Mexico   Old Hickory said we could take 'em by surprise (One-hut, two-three-four) If

In [28]:
# Build the list of documents to index: each song's lyrics are split with
# `descrip_split`, and every resulting chunk is tagged with a (title, artist)
# tuple under the 'source' metadata key.
documents = []

for vid in vid_dict_array:
    description = vid['lyrics']

    # Non-string lyrics (presumably float NaN for songs without lyrics) used to
    # be coerced with str(), which indexed the literal text "nan" as if it
    # were a song. Skip them instead so the index only contains real lyrics.
    if not isinstance(description, str):
        print(f"Skipping invalid description: {description}")
        continue

    source = (vid['title'], vid['artist'])
    documents.extend(
        descrip_split.create_documents([description], metadatas=[{'source': source}])
    )

In [29]:
# Embed every document chunk into a FAISS index, then persist that index to
# a local folder so it can be reloaded without re-embedding.
# Author's timing note: about 12 minutes for a run over ~6000 songs.
# NOTE(review): the loaded CSV reports 35206 rows before filtering — confirm
# whether the song count / timing in this note is still current.
vector_db = FAISS.from_documents(documents, embedding)
vector_db.save_local(folder_path='vector_db_lyrics_all_songs')

In [35]:
# Reload the index from the folder the build cell saved it to, and serve
# retrieval from the reloaded store. Previously this cell loaded a different
# folder ("vector_db_lyrics") and then ignored the result, building the
# retriever from the in-memory `vector_db` instead.
# SECURITY: FAISS.load_local unpickles the stored docstore, which can execute
# arbitrary code. allow_dangerous_deserialization=True is acceptable only
# because this folder is written by this notebook; never use it on index
# files from an untrusted source.
load_db = FAISS.load_local(
    folder_path="vector_db_lyrics_all_songs",
    embeddings=embedding,
    allow_dangerous_deserialization=True,
)
vec_retriever = load_db.as_retriever()

In [73]:
# Spot-check retrieval: fetch the three chunks most similar to a sample phrase.
sample_query = "I walk in mighty Mississippi"
vector_db.similarity_search(sample_query, k=3)


[Document(page_content="fight The Battle of New Orleans tonight  They're kissin' in drive-ins, you too, Baltimore Rockin' at dances, land and sea and shore Then it's off to Dee-troit, they all know the score So, a-baby, oh baby, what are we waitin' for?  Oh, 'cause summertime is kissin' time USA  FADE So press your tasty", metadata={'source': "Kissin' Time", 'start_index': 896}),
 Document(page_content="They're smoochin' all over, even in St. Loo So uh-baby get ready, I'm a-kissin' you  Oh, baby 'cause summertime is kissin' time USA So treat me right, a-don't-a make-a me fight The Battle of New Orleans tonight  They're kissin' on beaches, sea to shining sea Smoochin' on benches, near the Christmas", metadata={'source': "Kissin' Time", 'start_index': 301}),
 Document(page_content="fired once more and they begin to runnin' On down the Mississippi to the Gulf of Mexico   We looked down a river (Hut-two) And we see'd the British come (Three-four) And there must have been a hundred of'em (H