In [3]:
import pandas as pd
import numpy as np
import random
import re

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [4]:
# Fix the stdlib RNG so the synthetic dataset is identical on every run.
random.seed(42)

# Pool of artist names; each generated song is assigned one at random.
artists = [
    "Arijit Singh",
    "Taylor Swift",
    "Ed Sheeran",
    "Adele",
    "Coldplay",
    "Imagine Dragons",
    "Shreya Ghoshal",
    "Atif Aslam",
    "The Weeknd",
    "Billie Eilish",
]

# Vocabulary the synthetic lyrics are drawn from (30 words, order matters
# because random.choice indexes into this list).
themes = [
    "love", "heartbreak", "dreams", "night", "stars",
    "pain", "hope", "fire", "memories", "freedom",
    "lonely", "together", "forever", "lost", "found",
    "desire", "light", "dark", "sky", "rain",
    "tears", "smile", "kiss", "touch", "time",
    "road", "home", "away", "truth", "lie",
]

def generate_lyrics(song_id):
    """Return a synthetic lyric string for one song.

    The string is 60 words drawn (with replacement) from ``themes`` using the
    global ``random`` state, followed by a ``song<song_id>`` marker token, all
    joined by single spaces.
    """
    picked = [random.choice(themes) for _ in range(60)]
    # Trailing marker makes every lyric string distinct from all the others.
    picked.append(f"song{song_id}")
    return " ".join(picked)

# One [title, artist, lyrics] record per synthetic song. Within each record the
# artist is drawn before the lyrics are generated, which keeps the sequence of
# draws from the seeded RNG in a fixed order.
data = [
    [f"Song {i}", random.choice(artists), generate_lyrics(i)]
    for i in range(10000)
]

df = pd.DataFrame(data, columns=["song_title", "artist", "lyrics"])

print("Dataset recreated successfully")
print("Rows:", len(df))
df.head()

Dataset recreated successfully
Rows: 10000


Unnamed: 0,song_title,artist,lyrics
0,Song 0,Taylor Swift,love touch memories fire fire stars touch nigh...
1,Song 1,Atif Aslam,dark night lie forever dreams dark freedom hom...
2,Song 2,Imagine Dragons,home time time heartbreak fire home heartbreak...
3,Song 3,Billie Eilish,found light memories dark away love smile touc...
4,Song 4,Coldplay,fire heartbreak fire truth sky dreams dreams t...


In [5]:
# Stop-word setup: fetch the NLTK stop-word corpus and load the English list.
import nltk
from nltk.corpus import stopwords

# The corpus must be downloaded before stopwords.words() is called below.
nltk.download('stopwords')

# Held as a set because preprocess_text tests membership once per word.
stop_words = set(stopwords.words('english'))

def preprocess_text(text):
    """Normalise a lyric string for vectorisation.

    Lower-cases ``text``, deletes every character that is not a lowercase
    letter, digit or whitespace, splits on whitespace, drops tokens found in
    ``stop_words`` and returns the remaining tokens joined by single spaces.
    """
    lowered = text.lower()
    # Characters are removed, not replaced by a space, so "don't" becomes "dont".
    stripped = re.sub(r'[^a-z0-9\s]', '', lowered)
    kept = (token for token in stripped.split() if token not in stop_words)
    return " ".join(kept)

# Add the normalised lyrics as a new column; the raw "lyrics" column is kept.
df["clean_lyrics"] = df["lyrics"].map(preprocess_text)

# Show raw and cleaned text side by side for a quick visual check.
df.loc[:, ["lyrics", "clean_lyrics"]].head()

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


Unnamed: 0,lyrics,clean_lyrics
0,love touch memories fire fire stars touch nigh...,love touch memories fire fire stars touch nigh...
1,dark night lie forever dreams dark freedom hom...,dark night lie forever dreams dark freedom hom...
2,home time time heartbreak fire home heartbreak...,home time time heartbreak fire home heartbreak...
3,found light memories dark away love smile touc...,found light memories dark away love smile touc...
4,fire heartbreak fire truth sky dreams dreams t...,fire heartbreak fire truth sky dreams dreams t...


In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF model over the cleaned lyrics. max_features=5000 caps the learned
# vocabulary at 5000 terms (the printed shape below has 5000 columns).
# NOTE(review): generate_lyrics appends a distinct `song<id>` token to each of
# the 10000 songs, so this cap presumably discards many of those per-song
# tokens — confirm the truncation is intended.
vectorizer = TfidfVectorizer(max_features=5000)

# Document-term matrix with one row per song, in the same row order as df;
# predict_song and evaluate_top_k_accuracy compare query vectors against it.
tfidf_matrix = vectorizer.fit_transform(df["clean_lyrics"])

# Sanity check: expect (number of songs, vocabulary size).
print("TF-IDF matrix shape:", tfidf_matrix.shape)

TF-IDF matrix shape: (10000, 5000)


In [7]:
from sklearn.metrics.pairwise import cosine_similarity

def predict_song(lyric_snippet):
    """Return the indexed song whose lyrics are most similar to a snippet.

    The snippet is normalised with ``preprocess_text``, projected into the
    fitted TF-IDF space and compared to every row of ``tfidf_matrix`` by
    cosine similarity.

    Parameters
    ----------
    lyric_snippet : str
        Free-text lyric fragment to look up.

    Returns
    -------
    dict
        ``{"song_title": ..., "artist": ...}`` for the best-matching row of
        ``df``. Both values are ``None`` when the snippet has no similarity to
        any indexed song (for example it is empty, or none of its words are in
        the fitted vocabulary).
    """
    cleaned_input = preprocess_text(lyric_snippet)
    input_vector = vectorizer.transform([cleaned_input])

    # Single query row -> 1-D array holding one similarity per indexed song.
    similarity_scores = cosine_similarity(input_vector, tfidf_matrix)[0]
    best_index = int(similarity_scores.argmax())

    # argmax returns index 0 when every score is tied at zero; without this
    # guard an unmatched query would be reported as a match for the first song.
    if similarity_scores[best_index] <= 0:
        return {"song_title": None, "artist": None}

    # Fetch the row once instead of once per field.
    best_row = df.iloc[best_index]
    return {
        "song_title": best_row["song_title"],
        "artist": best_row["artist"]
    }

In [8]:
# Smoke test: look up a short snippet built from theme words.
test_snippet = "love night dreams stars hope"
result = predict_song(test_snippet)

# Print each predicted field next to its label.
for heading, field in (("Predicted Song:", "song_title"), ("Predicted Artist:", "artist")):
    print(heading, result[field])

Predicted Song: Song 9011
Predicted Artist: Ed Sheeran


In [9]:
def evaluate_top_k_accuracy(k=5, samples=200, seed=None):
    """Estimate how often a song is retrieved among its own top-k matches.

    ``samples`` distinct rows of ``df`` are drawn at random. For each one, its
    full ``lyrics`` text is cleaned, vectorised and compared to every row of
    ``tfidf_matrix``; the draw counts as a hit when the song's title appears
    among the titles of the ``k`` highest-scoring rows.

    Note that the queries are the complete lyrics of songs that are already in
    the index, so this measures self-retrieval, not how well partial or unseen
    snippets are matched.

    Parameters
    ----------
    k : int, default 5
        Number of top-scoring rows to inspect. Must be at least 1.
    samples : int, default 200
        Number of distinct rows to evaluate. Must be at least 1 and no larger
        than ``len(df)``.
    seed : int or None, default None
        When given, rows are drawn from ``np.random.default_rng(seed)`` so the
        result is reproducible. When ``None`` the global NumPy random state is
        used, as before.

    Returns
    -------
    float
        Fraction of sampled songs that were hits, in ``[0, 1]``.

    Raises
    ------
    ValueError
        If ``k`` or ``samples`` is less than 1, or (raised by NumPy) if
        ``samples`` exceeds the number of rows.
    """
    # With k == 0 the slice [-k:] becomes [0:], i.e. every row, which would
    # silently report 100% accuracy; negative k is mis-sliced the same way.
    if k < 1:
        raise ValueError("k must be a positive integer")
    # samples == 0 would otherwise end in a division by zero.
    if samples < 1:
        raise ValueError("samples must be a positive integer")

    rng = np.random if seed is None else np.random.default_rng(seed)
    sampled_rows = rng.choice(len(df), samples, replace=False)

    # Extract the title column once rather than on every iteration.
    titles = df["song_title"].to_numpy()

    correct = 0
    for idx in sampled_rows:
        cleaned = preprocess_text(df.iloc[idx]["lyrics"])

        input_vec = vectorizer.transform([cleaned])
        similarities = cosine_similarity(input_vec, tfidf_matrix)[0]

        # argsort is ascending, so the last k positions are the best matches.
        top_k_indices = similarities.argsort()[-k:]

        if titles[idx] in titles[top_k_indices]:
            correct += 1

    return correct / samples

In [10]:
# Default evaluation (k=5, 200 sampled songs), reported as a percentage.
accuracy = evaluate_top_k_accuracy()
print("Top-5 Accuracy:", round(accuracy * 100, 2), "%")

Top-5 Accuracy: 100.0 %
