In [37]:
from langchain.vectorstores import FAISS
from langchain.embeddings import SentenceTransformerEmbeddings
from langchain.chains import RetrievalQAWithSourcesChain
from langchain.text_splitter import RecursiveCharacterTextSplitter

import pandas as pd

In [21]:
# Load the processed song dataset from a CSV given by a relative path.
# NOTE: despite the `vid_` prefix, later cells read the `lyrics` and
# `song_title` fields of these rows, i.e. each row is a song.
# NOTE(review): the preview below shows an `Unnamed: 0` column, which looks
# like an index that was written into the CSV -- confirm whether
# `index_col=0` would be appropriate here.
data_path = "all_songs_data_processed.csv"
vid_data = pd.read_csv(data_path)
# Show the first two rows as a quick sanity check of the parsed columns.
vid_data.head(2)

Unnamed: 0,album,album_url,artist,featured_artists,lyrics,media,rank,release_date,song_title,song_url,writers,year_float,verbs,nouns,adverbs,corpus,word_counts,unique_word_counts
0,Battle of New Orleans,https://genius.com/albums/Johnny-horton/Battle...,Johnny Horton,[],In 1814 we took a little trip Along with Colo...,[{'native_uri': 'spotify:track:0dwpdcQkeZqpuoA...,1,1959-04-01,The Battle Of New Orleans,https://genius.com/Johnny-horton-the-battle-of...,"[{'api_path': '/artists/561913', 'header_image...",1959.0,take take take catch fire keep be be fire begi...,trip bacon bean town gun comin while river of'...,along as ago once more so as ago once more the...,take little trip Colonel Jackson mighty Missis...,435,155
1,That’s All,https://genius.com/albums/Bobby-darin/That-s-all,Bobby Darin,[],"Oh the shark, babe Has such teeth, dear And he...",[{'native_uri': 'spotify:track:3E5ndyOfO6vFDEI...,2,,Mack The Knife,https://genius.com/Bobby-darin-mack-the-knife-...,"[{'api_path': '/artists/218851', 'header_image...",1959.0,have show have keep know bite billow spread be...,shark babe tooth jackknife babe sight shark to...,pearly just so never never now just just down ...,oh shark babe tooth dear show pearly white jac...,224,145


In [22]:
# (rows, columns) of the loaded frame.
vid_data.shape

(6292, 18)

In [30]:
# Character length of every lyric; entries that are not strings count as 0.
lyric_lengths = vid_data.lyrics.map(
    lambda text: len(text) if isinstance(text, str) else 0
)
# Number of songs whose lyrics are at least 8200 characters long.
int((lyric_lengths >= 8200).sum())


2

In [33]:
# Embedding object shared by the cells below: it is passed to
# FAISS.from_documents when the index is built and to FAISS.load_local when
# the index is read back.
embedding = SentenceTransformerEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


In [35]:
# One dict per DataFrame row (column name -> value); the cells below index
# these dicts by 'lyrics' and 'song_title'.
vid_dict_array = vid_data.to_dict(orient='records')


In [55]:
# Keep only the first 100 records for everything that follows.
# NOTE(review): this overwrites the full list in place; to get all records
# back, the cell that builds `vid_dict_array` from `vid_data` must be re-run.
# The limit of 100 is a hard-coded value -- confirm it is intentional.
vid_dict_array = vid_dict_array[:100]

In [56]:
# Splitter reused by the indexing loop further down: chunks of at most 300
# characters (measured with len), no overlap, and each chunk's start offset
# recorded in its metadata.
descrip_split = RecursiveCharacterTextSplitter(
    chunk_size=300,
    chunk_overlap=0,
    length_function=len,
    add_start_index=True,
)

# Preview the chunking on a single record (index 2), tagging every chunk
# with the song title as its 'source'.
sample_song = vid_dict_array[2]
descrip_split.create_documents(
    [sample_song['lyrics']],
    metadatas=[{'source': sample_song['song_title']}],
)


[Document(page_content="Over and over I tried to prove my love to you Over and over What more can I do  Over and over My friend says I'm a fool But over and over I'll be a fool for you  Cause you got personality Walk (with personality) Talk (with personality) Smile (with personality) Charm (with personality) Love (with", metadata={'source': 'Personality', 'start_index': 0}),
 Document(page_content="personality) And plus you've got A great big heart  So over and over Oh, I'll be a fool for you Now over and over What more can I do  Cause you got personality Walk (with personality) Talk (with personality) Smile (with personality) Charm (with personality) Love (with personality) And plus you've", metadata={'source': 'Personality', 'start_index': 297}),
 Document(page_content="got A great big heart  So over and over Whoa, I'll be a fool for you Now over and over What more can I do  Over and over I said that I love you Over and over, honey Now it's the truth  Over and over They still say I'm

In [68]:
# Build the list of chunked documents that will be embedded and indexed.
# Each song's lyrics are split with `descrip_split`; every resulting chunk
# carries the song title as its 'source' metadata.
documents = []

for vid in vid_dict_array:
    title = vid['song_title']
    description = vid['lyrics']

    # Only real text can be chunked. A non-string value here (pandas loads
    # missing CSV cells as float NaN) must be skipped: converting it with
    # str() would index a meaningless "nan" chunk under this song's title.
    if not isinstance(description, str):
        print(f"Skipping invalid description: {description}")
        continue

    documents.extend(
        descrip_split.create_documents([description], metadatas=[{'source': title}])
    )

In [69]:
# Embed every chunk in `documents` with `embedding` and build a FAISS index
# from them.
vector_db = FAISS.from_documents(documents=documents,embedding=embedding)
# Persist the index to the local folder 'vector_db_lyrics' so it can be
# reloaded with FAISS.load_local.
vector_db.save_local('vector_db_lyrics')

In [70]:
# Reload the index saved to 'vector_db_lyrics', using the same embedding
# object that was used to build it.
# SECURITY: the saved index includes a pickled docstore, so loading it can
# execute arbitrary code. `allow_dangerous_deserialization=True` is an
# explicit opt-in; only use it on index folders produced by this notebook,
# never on files from an untrusted source.
load_db = FAISS.load_local(
    folder_path="vector_db_lyrics",
    embeddings=embedding,
    allow_dangerous_deserialization=True,
)

# Build the retriever from the reloaded index (not the in-memory one) so the
# persisted copy is what actually gets exercised downstream.
vec_retriever = load_db.as_retriever()


In [73]:
# Query the in-memory index and return the 3 chunks closest to the query text.
vector_db.similarity_search("I walk in mighty Mississippi", k=3)


[Document(page_content="fight The Battle of New Orleans tonight  They're kissin' in drive-ins, you too, Baltimore Rockin' at dances, land and sea and shore Then it's off to Dee-troit, they all know the score So, a-baby, oh baby, what are we waitin' for?  Oh, 'cause summertime is kissin' time USA  FADE So press your tasty", metadata={'source': "Kissin' Time", 'start_index': 896}),
 Document(page_content="They're smoochin' all over, even in St. Loo So uh-baby get ready, I'm a-kissin' you  Oh, baby 'cause summertime is kissin' time USA So treat me right, a-don't-a make-a me fight The Battle of New Orleans tonight  They're kissin' on beaches, sea to shining sea Smoochin' on benches, near the Christmas", metadata={'source': "Kissin' Time", 'start_index': 301}),
 Document(page_content="fired once more and they begin to runnin' On down the Mississippi to the Gulf of Mexico   We looked down a river (Hut-two) And we see'd the British come (Three-four) And there must have been a hundred of'em (H