In [3]:
!pip3 install -U sentence-transformers
!pip3 install lyricsgenius



In [4]:
# Imports and setup
import pandas as pd
import numpy as np
import torch
import requests
import lyricsgenius
import dotenv 
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd
import logging
from langdetect import detect
import re
import nltk
import pickle
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

nltk.download('stopwords')
nltk.download('punkt')



[nltk_data] Downloading package stopwords to /Users/anita/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /Users/anita/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [5]:
# Load the fine-tuned BERT model
model = SentenceTransformer('all-MiniLM-L6-v2')

### Utility Functions
mean pooling, preprocessing lyrics, and encoding lyrics.

In [6]:
# Utility functions
# Expanding Contractions
CONTRACTION_MAP = {
    "can't": "cannot",
    "won't": "will not",
    "n't": " not",
    "'re": " are",
    "'s": " is",
    "'m": " am",
    "'ll": " will",
    "'d": " would",
    "'ve": " have",
    "o'clock": "of the clock",
    "ma'am": "madam",
    "let's": "let us"
}

def expand_contractions(lyrics, contraction_mapping=CONTRACTION_MAP):
    contractions_pattern = re.compile('({})'.format('|'.join(contraction_mapping.keys())), flags=re.IGNORECASE|re.DOTALL)
    def expand_match(contraction):
        match = contraction.group(0)
        first_char = match[0]
        expanded_contraction = contraction_mapping.get(match.lower() if match.lower() in contraction_mapping else match)
        return first_char + expanded_contraction[1:]
    expanded_lyrics = contractions_pattern.sub(expand_match, lyrics)
    return expanded_lyrics

def preprocess_lyrics(lyrics):
    lyrics = expand_contractions(lyrics)  # Make sure to use 'lyrics' not 'text'
    lyrics = re.sub(r'\[(.*?)\]', '', lyrics)  # remove [Verse], [Chorus] tags
    lyrics = lyrics.lower()  # lowercase
    tokens = word_tokenize(lyrics)  # tokenize
    return ' '.join(tokens)  # Return the processed lyrics as a single string

#### Preprocess and Encode Lyrics

In [7]:
logging.basicConfig(level=logging.INFO)

logging.info("Starting to read CSV file")
df = pd.read_csv('music_data.csv')  
logging.info(f"Finished reading CSV file. DataFrame shape: {df.shape}")

logging.info("Starting to preprocess lyrics")
df['preprocessed_lyrics'] = df['lyrics'].apply(preprocess_lyrics)
logging.info("Finished preprocessing lyrics")

INFO:root:Starting to read CSV file


INFO:root:Finished reading CSV file. DataFrame shape: (3744, 5)
INFO:root:Starting to preprocess lyrics
INFO:root:Finished preprocessing lyrics


In [8]:
df[['lyrics', 'preprocessed_lyrics']].head(10)

Unnamed: 0,lyrics,preprocessed_lyrics
0,[Verse 1]\nThought I'd end up with Sean\nBut h...,thought i'would end up with sean but he wasnno...
1,"[Verse 1]\nYeah, breakfast at Tiffany's and bo...","yeah , breakfast at tiffany'is and bottles of ..."
2,"[Chorus]\nYou, you love it how I move you\nYou...","you , you love it how i move you you love it h..."
3,[Intro: Ariana Grande & Nicki Minaj]\nI've bee...,i'have been here all night ( ariana ) i'have b...
4,"[Intro]\nRight now, I'm in a state of mind\nI ...","right now , i'am in a state of mind i wan na b..."
5,[Intro]\n​​lacigam gnihtemos od oT\n​​thgin la...,​​lacigam gnihtemos od ot ​​thgin laiceps ruoy...
6,[Verse 1]\nYou got me some type of way (Hmm)\n...,you got me some type of way ( hmm ) ainnot use...
7,[Verse 1]\nHeaven sent you to me\nI'm just hop...,heaven sent you to me i'am just hopin ’ i donn...
8,[Intro]\nHmm\n\n[Verse 1]\nYou might think I'm...,hmm you might think i'am crazy the way i'have ...
9,"[Verse 1]\nStep up, the two of us, nobody know...","step up , the two of us , nobody knows us get ..."


In [9]:
logging.info("Starting to get embeddings")
df['embeddings'] = list(model.encode(df['lyrics'].tolist(), show_progress_bar=True))
logging.info("Finished getting embeddings")

INFO:root:Starting to get embeddings


Batches:   0%|          | 0/117 [00:00<?, ?it/s]

INFO:root:Finished getting embeddings


In [10]:
df[['artist', 'preprocessed_lyrics', 'embeddings']].head(10)

Unnamed: 0,artist,preprocessed_lyrics,embeddings
0,Ariana Grande,thought i'would end up with sean but he wasnno...,"[-0.07796759, -0.047370087, 0.06382991, -0.046..."
1,Ariana Grande,"yeah , breakfast at tiffany'is and bottles of ...","[-0.09182351, 0.038527973, 0.051597822, -0.053..."
2,Ariana Grande,"you , you love it how i move you you love it h...","[-0.035600673, -0.045947228, 0.06202742, -0.04..."
3,Ariana Grande,i'have been here all night ( ariana ) i'have b...,"[-0.04670759, -0.03384533, 0.02231615, -0.0330..."
4,Ariana Grande,"right now , i'am in a state of mind i wan na b...","[-0.09957284, -0.02273628, 0.07063174, -0.0655..."
5,Ariana Grande,​​lacigam gnihtemos od ot ​​thgin laiceps ruoy...,"[-0.044333555, -0.046792664, 0.071543686, -0.0..."
6,Ariana Grande,you got me some type of way ( hmm ) ainnot use...,"[-0.058477826, -0.049745448, 0.07734181, -0.04..."
7,Ariana Grande,heaven sent you to me i'am just hopin ’ i donn...,"[-0.060370278, -0.05460217, 0.071599156, -0.04..."
8,Ariana Grande,hmm you might think i'am crazy the way i'have ...,"[-0.09731813, -0.04326297, 0.026763892, -0.005..."
9,Ariana Grande,"step up , the two of us , nobody knows us get ...","[-0.10925642, -0.04081988, 0.06588498, 0.00217..."


In [11]:
def search_lyrics_sentence_transformer(query, df, model):
    # Encode the query to get the embedding
    query_embedding = model.encode([query])[0]  
    # Calculate cosine similarities between the query and all lyrics embeddings
    similarities = cosine_similarity([query_embedding], df['embeddings'].tolist()).flatten()
    # Get indices of the songs with the highest similarity scores
    top_indices = similarities.argsort()[-5:][::-1]
    return df.iloc[top_indices]



In [23]:
query = "so it's gonna be forever or it's gonna go down in flames"
results = search_lyrics_sentence_transformer(query, df, model)
print(results[['artist', 'title']])

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

            artist                     title
732       Coldplay              Up in Flames
3415  Taylor Swift               Blank Space
818       Coldplay              Ring of Fire
990          Drake                 Fireworks
2225    Katy Perry  Not the End of the World
