In [1]:
import pandas as pd
import numpy as np
import re

from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity


In [3]:
df = pd.read_csv("../data/lyrics.csv")

# keep required columns only
df = df[['artist', 'song', 'text']]

# remove empty rows
df.dropna(inplace=True)

df.head()


Unnamed: 0,artist,song,text
0,ABBA,Ahe's My Kind Of Girl,"Look at her face, it's a wonderful face \r\nA..."
1,ABBA,"Andante, Andante","Take it easy with me, please \r\nTouch me gen..."
2,ABBA,As Good As New,I'll never know why I had to go \r\nWhy I had...
3,ABBA,Bang,Making somebody happy is a question of give an...
4,ABBA,Bang-A-Boomerang,Making somebody happy is a question of give an...


In [4]:
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def preprocess(text):
    text = text.lower()
    text = re.sub(r'[^a-z ]', '', text)
    words = text.split()
    words = [lemmatizer.lemmatize(w) for w in words if w not in stop_words]
    return " ".join(words)

df['clean_lyrics'] = df['text'].apply(preprocess)
df[['song', 'clean_lyrics']].head()


Unnamed: 0,song,clean_lyrics
0,Ahe's My Kind Of Girl,look face wonderful face mean something specia...
1,"Andante, Andante",take easy please touch gently like summer even...
2,As Good As New,ill never know go put lousy rotten show boy to...
3,Bang,making somebody happy question give take learn...
4,Bang-A-Boomerang,making somebody happy question give take learn...


In [5]:
vectorizer = TfidfVectorizer(
    max_features=5000,
    ngram_range=(1, 2)
)

X = vectorizer.fit_transform(df['clean_lyrics'])
X.shape


(57650, 5000)

In [6]:
def predict_song(snippet):
    snippet = preprocess(snippet)
    snippet_vector = vectorizer.transform([snippet])
    similarity_scores = cosine_similarity(snippet_vector, X)
    best_index = similarity_scores.argmax()
    return df.iloc[best_index][['song', 'artist']]


In [7]:
snippet = "hello darkness my old friend"
result = predict_song(snippet)

print("Predicted Song :", result['song'])
print("Predicted Artist :", result['artist'])


Predicted Song : Hello My Old Friend
Predicted Artist : Electric Light Orchestra


In [8]:
correct = 0
tests = 50

for _ in range(tests):
    sample = df.sample(1)
    snippet = sample['text'].values[0][:120]
    prediction = predict_song(snippet)
    if prediction['song'] == sample['song'].values[0]:
        correct += 1

print("Accuracy:", correct / tests)


Accuracy: 0.38
