In [1]:
# Install necessary libraries
!pip install pandas nltk
!pip install ipywidgets
!pip install spotipy
!pip install langdetect
!pip install sentence-transformers
!pip install spacy

# Importing libraries
import spacy
import re
from difflib import SequenceMatcher
import nltk
from nltk.tokenize import word_tokenize
from spacy.lang.en.stop_words import STOP_WORDS
import logging
from nltk.stem import PorterStemmer
from sentence_transformers import SentenceTransformer

import pandas as pd
from IPython.display import display, Markdown
import ipywidgets as widgets

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

# Load the BERT model
model = SentenceTransformer('paraphrase-MiniLM-L6-v2')

print("Setup complete. Ready to load data.")



[nltk_data] Downloading package punkt to /Users/anita/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /Users/anita/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/anita/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /Users/anita/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


Setup complete. Ready to load data.


##### Clean Data

In [2]:
logging.basicConfig(level=logging.INFO)

In [3]:
# Load data from a CSV file
df = pd.read_csv('/Users/anita/Documents/NLPSongs-/music_data.csv')
print("Data loaded successfully!")
print("Sample data:")
df # Display the first few rows of the dataset to verify it's loaded correctly

Data loaded successfully!
Sample data:


Unnamed: 0,artist,title,id,release_date,lyrics
0,Ariana Grande,"​thank u, next",4063065,2018-11-03,[Verse 1]\nThought I'd end up with Sean\nBut h...
1,Ariana Grande,7 rings,4067762,2019-01-18,"[Verse 1]\nYeah, breakfast at Tiffany's and bo..."
2,Ariana Grande,​God is a woman,3681280,2018-07-13,"[Chorus]\nYou, you love it how I move you\nYou..."
3,Ariana Grande,Side To Side,2457495,2016-05-20,[Intro: Ariana Grande & Nicki Minaj]\nI've bee...
4,Ariana Grande,​​no tears left to cry,3649172,2018-04-20,"[Intro]\nRight now, I'm in a state of mind\nI ..."
...,...,...,...,...,...
3739,Taylor Swift,"Taylor Swift - no body, no crime (Traducción a...",6274728,2020-12-11,[Intro: HAIM]\nÉl lo hizo\nÉl lo hizo\n\n[Vers...
3740,Taylor Swift,Welcome Back Grunwald,6226864,,Turn WYCD on\nYou're\nOn your Grunwald\nBack f...
3741,Taylor Swift,Tolerate it (Polskie Tłumaczenie),6315848,2020-12-11,[Zwrotka 1]\nSiedzę i patrzę jak czytasz z gło...
3742,Taylor Swift,Find you,6209316,,Trying just like they say\nJust taking the ste...


In [4]:
null_values = df.isnull().sum()
print(null_values)

artist            0
title             0
id                0
release_date    993
lyrics            0
dtype: int64


In [5]:
#Remove null values
df= df.dropna()

In [6]:
duplicate_rows = df.duplicated().sum()
print(duplicate_rows)

0


In [7]:
from langdetect import detect

def is_english(row):
    for column in row:
        try:
            if detect(str(column)) != 'en':
                return False
        except:
            pass
    return True

mask = df.apply(is_english, axis=1)
df = df[mask]

In [8]:
print("Filtered and cleaned data:")
print(df.head())

Filtered and cleaned data:
           artist                  title       id release_date  \
517  Charlie Puth  We Don’t Talk Anymore  2388373   2016-05-24   
518  Charlie Puth               How Long  3236636   2017-10-05   
519  Charlie Puth            Marvin Gaye   706298   2015-02-10   
520  Charlie Puth          One Call Away  2276633   2015-08-20   
522  Charlie Puth           The Way I Am  3598344   2018-05-03   

                                                lyrics  
517  [Chorus: Charlie Puth]\nWe don't talk anymore,...  
518  [Intro]\nAlright\nOoh, yeah\n\n[Verse 1]\nI'll...  
519  [Intro: Charlie Puth]\nLet's Marvin Gaye and g...  
520  [Chorus]\nI'm only one call away\nI'll be ther...  
522  [Verse 1]\nYeah, maybe I'ma get a little anxio...  


#### Preprocess

In [9]:
# Load English tokenizer, tagger, parser, NER, and word vectors
nlp = spacy.load("en_core_web_sm")

CONTRACTION_MAP = {
    "can't": "cannot",
    "won't": "will not",
    "n't": " not",
    "'re": " are",
    "'s": " is",
    "'m": " am",
    "'ll": " will",
    "'d": " would",
    "'ve": " have",
    "o'clock": "of the clock",
    "ma'am": "madam",
    "let's": "let us"
}

def expand_contractions(text, contraction_map):
    # Regular expression for finding contractions
    contractions_pattern = re.compile('({})'.format('|'.join(contraction_map.keys())), 
                                      flags=re.IGNORECASE|re.DOTALL)
    def expand_match(contraction):
        match = contraction.group(0)
        first_char = match[0]
        expanded_contraction = contraction_map.get(match.lower() if match.lower() in contraction_map else match)
        if expanded_contraction is None:
            return
        expanded_contraction = first_char+expanded_contraction[1:]
        return expanded_contraction
    
    expanded_text = contractions_pattern.sub(expand_match, text)
    return expanded_text

In [10]:
def preprocess_text(text):
    # Expand contractions
    text = expand_contractions(text, CONTRACTION_MAP)
    # Convert to lowercase
    text = text.lower()
    # Perform tokenization and lemmatization using spaCy
    doc = nlp(text)
    # Lemmatization and filtering out stop words and non-alphabetic characters
    lemmatized_tokens = [token.lemma_ for token in doc if token.is_alpha and not token.is_stop]
    return ' '.join(lemmatized_tokens)


# Apply preprocessing to the lyrics column
df['processed_lyrics'] = df['lyrics'].apply(preprocess_text)
print("Data preprocessing complete. Here's a preview:")
print(df[['lyrics', 'processed_lyrics']].head())

Data preprocessing complete. Here's a preview:
                                                lyrics  \
517  [Chorus: Charlie Puth]\nWe don't talk anymore,...   
518  [Intro]\nAlright\nOoh, yeah\n\n[Verse 1]\nI'll...   
519  [Intro: Charlie Puth]\nLet's Marvin Gaye and g...   
520  [Chorus]\nI'm only one call away\nI'll be ther...   
522  [Verse 1]\nYeah, maybe I'ma get a little anxio...   

                                      processed_lyrics  
517  chorus charlie puth donnot talk anymore donnot...  
518  intro alright ooh yeah verse admit wrong girl ...  
519  intro charlie puth let marvin gaye get healing...  
520  chorus away save day superman get away verse b...  
522  verse yeah maybe little anxious maybe little s...  


In [12]:
# Function to find the best match for the lyrics snippet in the dataset
def find_best_match(lyrics_input, df):
    def similarity(a, b):
        return SequenceMatcher(None, a, b).ratio()
    
    processed_input = preprocess_text(lyrics_input)
    df['similarity'] = df['processed_lyrics'].apply(lambda x: similarity(processed_input, x))
    best_match = df.loc[df['similarity'].idxmax()]
    print(f"Best match: {best_match['title']} by {best_match['artist']} with similarity score: {best_match['similarity']}")
    if best_match['similarity'] > 0.5:  # Threshold for similarity
        return best_match
    else:
        return None

#### Spotify Connection

In [13]:
# Import the Spotipy library
import spotipy
from spotipy.oauth2 import SpotifyClientCredentials

# Initialize Spotipy with Spotify Developer Credentials
client_id = '1075d2afec3d414ca02ec7002f1d5576'  # Replace with your actual Spotify Client ID
client_secret = 'bb1fd201730b484983d952ecbd344b33'  # Replace with your actual Spotify Client Secret

credentials = SpotifyClientCredentials(client_id=client_id, client_secret=client_secret)
spotify = spotipy.Spotify(client_credentials_manager=credentials)

In [14]:
def search_spotify(song_title, artist):
    query = f"track:{song_title} artist:{artist}"
    results = spotify.search(q=query, type='track', limit=1)
    if results['tracks']['items']:
        track = results['tracks']['items'][0]
        print(f"Found on Spotify: {track['name']} by {track['artists'][0]['name']}")
        return {
            'name': track['name'],
            'artist': track['artists'][0]['name'],
            'album': track['album']['name'],
            'preview_url': track['preview_url'],
            'spotify_url': track['external_urls']['spotify']
        }
    else:
        return "No results found on Spotify."

In [15]:
# Function to retrieve and search Spotify based on the best match
def retrieve_song_and_search_spotify(lyrics_input):
    match = find_best_match(lyrics_input, df)
    if match is not None:
        print(f"Attempting to retrieve: {match['title']} by {match['artist']}")
        spotify_info = search_spotify(match['title'], match['artist'])
        display_spotify_result(spotify_info)
    else:
        display(Markdown("**No matching song found.**"))

In [16]:
from IPython.display import display, HTML, Audio

def display_spotify_result(spotify_info):
    if isinstance(spotify_info, dict):
        html_content = f"<h3>{spotify_info['name']} by {spotify_info['artist']}</h3>"
        html_content += f"<p><strong>Album:</strong> {spotify_info['album']}</p>"
        html_content += f"<a href='{spotify_info['spotify_url']}' target='_blank'>Listen on Spotify</a>"
        if spotify_info['preview_url']:
            display(HTML(html_content))
            display(Audio(spotify_info['preview_url'], autoplay=False))
        else:
            display(HTML(html_content))
    else:
        print(spotify_info)

#### Search

In [17]:
import ipywidgets as widgets
from IPython.display import display

# Input for lyrics
lyrics_input = widgets.Text(
    value='',
    placeholder='Type lyrics here...',
    description='Lyrics:',
    disabled=False
)

# Button to trigger the search
search_button = widgets.Button(
    description='Search',
    button_style='info',  # 'success', 'info', 'warning', 'danger' or ''
    icon='search'  # FontAwesome icon name (without 'fa-')
)

output = widgets.Output()

def on_button_clicked(b):
    with output:
        output.clear_output()
        retrieve_song_and_search_spotify(lyrics_input.value)

search_button.on_click(on_button_clicked)

display(lyrics_input, search_button, output)


Text(value='', description='Lyrics:', placeholder='Type lyrics here...')

Button(button_style='info', description='Search', icon='search', style=ButtonStyle())

Output()