In [1]:
!pip3 install -U sentence-transformers
!pip3 install lyricsgenius



In [2]:
# Imports and setup
import pandas as pd
import numpy as np
import torch
import requests
import lyricsgenius
import dotenv 
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd
import logging
from langdetect import detect
import re
import nltk
import pickle
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

nltk.download('stopwords')
nltk.download('punkt')



[nltk_data] Downloading package stopwords to /Users/anita/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /Users/anita/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [3]:
# Load the fine-tuned BERT model
model = SentenceTransformer('all-MiniLM-L6-v2')

### Clean code 

In [4]:
df = pd.read_csv('music_data.csv')  
print("Data loaded successfully!")
print("Sample data:")
df

Data loaded successfully!
Sample data:


Unnamed: 0,artist,title,id,release_date,lyrics
0,Ariana Grande,"​thank u, next",4063065,2018-11-03,[Verse 1]\nThought I'd end up with Sean\nBut h...
1,Ariana Grande,7 rings,4067762,2019-01-18,"[Verse 1]\nYeah, breakfast at Tiffany's and bo..."
2,Ariana Grande,​God is a woman,3681280,2018-07-13,"[Chorus]\nYou, you love it how I move you\nYou..."
3,Ariana Grande,Side To Side,2457495,2016-05-20,[Intro: Ariana Grande & Nicki Minaj]\nI've bee...
4,Ariana Grande,​​no tears left to cry,3649172,2018-04-20,"[Intro]\nRight now, I'm in a state of mind\nI ..."
...,...,...,...,...,...
3739,Taylor Swift,"Taylor Swift - no body, no crime (Traducción a...",6274728,2020-12-11,[Intro: HAIM]\nÉl lo hizo\nÉl lo hizo\n\n[Vers...
3740,Taylor Swift,Welcome Back Grunwald,6226864,,Turn WYCD on\nYou're\nOn your Grunwald\nBack f...
3741,Taylor Swift,Tolerate it (Polskie Tłumaczenie),6315848,2020-12-11,[Zwrotka 1]\nSiedzę i patrzę jak czytasz z gło...
3742,Taylor Swift,Find you,6209316,,Trying just like they say\nJust taking the ste...


In [5]:
null_values = df.isnull().sum()
print(null_values)

artist            0
title             0
id                0
release_date    993
lyrics            0
dtype: int64


In [6]:
#Remove null values
df= df.dropna()

In [7]:
duplicate_rows = df.duplicated().sum()
print(duplicate_rows)

0


In [8]:
from langdetect import detect

def is_english(row):
    for column in row:
        try:
            if detect(str(column)) != 'en':
                return False
        except:
            pass
    return True

mask = df.apply(is_english, axis=1)
df = df[mask]

In [9]:
print("Filtered and cleaned data:")
df.head()

Filtered and cleaned data:


Unnamed: 0,artist,title,id,release_date,lyrics
517,Charlie Puth,We Don’t Talk Anymore,2388373,2016-05-24,"[Chorus: Charlie Puth]\nWe don't talk anymore,..."
518,Charlie Puth,How Long,3236636,2017-10-05,"[Intro]\nAlright\nOoh, yeah\n\n[Verse 1]\nI'll..."
519,Charlie Puth,Marvin Gaye,706298,2015-02-10,[Intro: Charlie Puth]\nLet's Marvin Gaye and g...
520,Charlie Puth,One Call Away,2276633,2015-08-20,[Chorus]\nI'm only one call away\nI'll be ther...
522,Charlie Puth,The Way I Am,3598344,2018-05-03,"[Verse 1]\nYeah, maybe I'ma get a little anxio..."


### Preprocessing 

In [10]:
# Utility functions
# Expanding Contractions
CONTRACTION_MAP = {
    "can't": "cannot",
    "won't": "will not",
    "n't": " not",
    "'re": " are",
    "'s": " is",
    "'m": " am",
    "'ll": " will",
    "'d": " would",
    "'ve": " have",
    "o'clock": "of the clock",
    "ma'am": "madam",
    "let's": "let us"
}

def expand_contractions(lyrics, contraction_mapping=CONTRACTION_MAP):
    contractions_pattern = re.compile('({})'.format('|'.join(contraction_mapping.keys())), flags=re.IGNORECASE|re.DOTALL)
    def expand_match(contraction):
        match = contraction.group(0)
        first_char = match[0]
        expanded_contraction = contraction_mapping.get(match.lower() if match.lower() in contraction_mapping else match)
        return first_char + expanded_contraction[1:]
    expanded_lyrics = contractions_pattern.sub(expand_match, lyrics)
    return expanded_lyrics

def preprocess_lyrics(lyrics):
    lyrics = expand_contractions(lyrics)  # Make sure to use 'lyrics' not 'text'
    lyrics = re.sub(r'\[(.*?)\]', '', lyrics)  # remove [Verse], [Chorus] tags
    lyrics = lyrics.lower()  # lowercase
    tokens = word_tokenize(lyrics)  # tokenize
    return ' '.join(tokens)  # Return the processed lyrics as a single string

#### Encode Lyrics

In [11]:
logging.basicConfig(level=logging.INFO)
logging.info("Starting to preprocess lyrics")
df['preprocessed_lyrics'] = df['lyrics'].apply(preprocess_lyrics)
logging.info("Finished preprocessing lyrics")

INFO:root:Starting to preprocess lyrics
INFO:root:Finished preprocessing lyrics


In [12]:
df[['lyrics', 'preprocessed_lyrics']].head(10)

Unnamed: 0,lyrics,preprocessed_lyrics
517,"[Chorus: Charlie Puth]\nWe don't talk anymore,...","we donnot talk anymore , we donnot talk anymor..."
518,"[Intro]\nAlright\nOoh, yeah\n\n[Verse 1]\nI'll...","alright ooh , yeah i'will admit , i was wrong ..."
519,[Intro: Charlie Puth]\nLet's Marvin Gaye and g...,let us marvin gaye and get it on you got the h...
520,[Chorus]\nI'm only one call away\nI'll be ther...,i'am only one call away i'will be there to sav...
522,"[Verse 1]\nYeah, maybe I'ma get a little anxio...","yeah , maybe i'ama get a little anxious maybe ..."
523,"[Verse 1: Charlie Puth]\nNo, this is not goodb...","no , this is not goodbye i swear that i'am gon..."
526,[Verse 1]\nIt started with a kiss\nOn your mam...,it started with a kiss on your mama'is couch 2...
528,[Chorus]\nI warned myself that I shouldn't pla...,i warned myself that i shouldnnot play with fi...
531,"[Verse 1]\nHe's such a nice boy, so well-manne...","he'is such a nice boy , so well-mannered he'is..."
533,"[Chorus]\nDon't run from me, river\nDon't run ...","donnot run from me , river donnot run from me ..."


In [13]:
logging.info("Starting to get embeddings")
df['embeddings'] = list(model.encode(df['lyrics'].tolist(), show_progress_bar=True))
logging.info("Finished getting embeddings")

INFO:root:Starting to get embeddings


Batches:   0%|          | 0/13 [00:00<?, ?it/s]

INFO:root:Finished getting embeddings


In [14]:
df[['artist', 'preprocessed_lyrics', 'embeddings']].head(10)

Unnamed: 0,artist,preprocessed_lyrics,embeddings
517,Charlie Puth,"we donnot talk anymore , we donnot talk anymor...","[-0.06933179, -0.05280156, 0.096682414, -0.032..."
518,Charlie Puth,"alright ooh , yeah i'will admit , i was wrong ...","[-0.08532193, -0.07560885, 0.11776075, 0.01836..."
519,Charlie Puth,let us marvin gaye and get it on you got the h...,"[-0.12914756, -0.023826854, 0.03429907, -0.005..."
520,Charlie Puth,i'am only one call away i'will be there to sav...,"[-0.08759198, -0.020269403, -0.0022983477, 0.0..."
522,Charlie Puth,"yeah , maybe i'ama get a little anxious maybe ...","[-0.023337308, -0.021497937, 0.1249926, -0.014..."
523,Charlie Puth,"no , this is not goodbye i swear that i'am gon...","[-0.020676365, -0.031019937, 0.101447135, 0.03..."
526,Charlie Puth,it started with a kiss on your mama'is couch 2...,"[-0.07356925, -0.06716579, 0.0592213, 0.053288..."
528,Charlie Puth,i warned myself that i shouldnnot play with fi...,"[0.014884363, 0.018558545, 0.037044585, 0.0125..."
531,Charlie Puth,"he'is such a nice boy , so well-mannered he'is...","[-0.035937082, 0.032252643, 0.039746188, 0.013..."
533,Charlie Puth,"donnot run from me , river donnot run from me ...","[-0.019787563, -0.02365343, 0.11032831, -0.043..."


In [15]:
def search_lyrics_sentence_transformer(query, df, model):
    # Encode the query to get the embedding
    query_embedding = model.encode([query])[0]  
    # Calculate cosine similarities between the query and all lyrics embeddings
    similarities = cosine_similarity([query_embedding], df['embeddings'].tolist()).flatten()
    # Get indices of the songs with the highest similarity scores
    top_indices = similarities.argsort()[-5:][::-1]
    return df.iloc[top_indices]



### Search

In [18]:
query = "you are somebody that i don't know but you're takin' shots at me like it's patrón"
results = search_lyrics_sentence_transformer(query, df, model)
print(results[['artist', 'title', 'release_date']])

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

            artist                               title release_date
3412  Taylor Swift               You Need To Calm Down   2019-06-14
603   Charlie Puth       Haters Follow Me Like Twitter   2011-01-01
3428  Taylor Swift                            Gorgeous   2017-10-20
3628  Taylor Swift   The Making Of A Song - ‘Gorgeous’   2017-11-01
2335    Katy Perry  Hot N Cold (Innerpartysystem Main)   2008-11-24
