In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import torch.nn as nn
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import jaccard_score
from itertools import combinations
import time
import os
import inflect 
import re
import string
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

#nltk.download('punkt')
#nltk.download('stopwords')
#nltk.download('wordnet')

In [None]:
df = pd.read_csv("../data/cleaned_spotify_data_with_artist_id.csv")

In [4]:
# Setting options: all columns 

pd.set_option('display.max_columns', None)
df.head(3)

Unnamed: 0,acousticness,artists,danceability,duration_ms,energy,explicit,id,instrumentalness,key,liveness,loudness,mode,name,popularity,release_date,speechiness,tempo,valence,year,duration_sec,month,artist_id
0,0.995,Carl Woitschach,0.708,158648,0.195,0,6KbQ3uYMLKb5jDxLF7wYDD,0.563,10,0.151,-12.428,1,Singende Bataillone 1. Teil,0,1928,0.051,118.469,0.779,1928,158.648,,4580
1,0.994,"Robert Schumann, Vladimir Horowitz",0.379,282133,0.013,0,6KuQTIu1KoTTkLXKrwlLPV,0.901,8,0.076,-28.454,1,"Fantasiestücke, Op. 111: Più tosto lento",0,1928,0.046,83.972,0.077,1928,282.133,,25082
2,0.604,Seweryn Goszczyński,0.749,104300,0.22,0,6L63VW0PibdM1HDSBoqnoM,0.0,5,0.119,-19.924,0,Chapter 1.18 - Zamek kaniowski,0,1928,0.929,107.177,0.88,1928,104.3,,26283


In [5]:
# Identify duplicate songs

duplicates = df[df.duplicated(subset=['name', 'artists'])]
duplicates

Unnamed: 0,acousticness,artists,danceability,duration_ms,energy,explicit,id,instrumentalness,key,liveness,loudness,mode,name,popularity,release_date,speechiness,tempo,valence,year,duration_sec,month,artist_id
70,0.974,"Frédéric Chopin, Vladimir Horowitz",0.188,236493,0.089,0,6tD8hbtg9YvauF84fl15sd,0.896,0,0.124,-25.936,1,"Andante spianato in E-Flat Major, Op. 22",3,1928,0.033,59.946,0.164,1928,236.493,,10004
83,0.992,"Frédéric Chopin, Vladimir Horowitz",0.301,785427,0.082,0,71FaVeFy9ZOiQRY4yOijey,0.852,1,0.087,-23.282,1,"Polonaise-Fantaisie in A-Flat Major, Op. 61",0,1928,0.038,137.296,0.055,1928,785.427,,10004
91,0.991,"Frédéric Chopin, Vladimir Horowitz",0.245,340973,0.058,0,74rczPEWk2w3Fl8eENpzuf,0.882,5,0.070,-24.980,0,"Nocturne F Minor, Op. 55, No. 1",0,1928,0.033,74.004,0.105,1928,340.973,,10004
469,0.977,Charlie Chaplin,0.530,193824,0.209,0,4qmz6OTEv1FMZPtlE4TCWJ,0.604,10,0.115,-10.901,0,"The Lone Prospector, Big Jim & Black Larsen",1,1942,0.053,109.746,0.519,1942,193.824,,5010
609,0.983,"Sergei Rachmaninoff, William Kapell, Fritz Reiner",0.635,61467,0.012,0,2qYaSlK0698oGi7Z9yrixw,0.955,0,0.107,-27.297,1,"Rhapsody on a Theme of Paganini, Op.43: Variat...",0,1945,0.052,83.046,0.610,1945,61.467,,26252
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
169878,0.574,Porte Diferente,0.811,228466,0.580,0,2OU13apOMaGALVlSb2PzPi,0.000,6,0.075,-4.776,0,Por Tu Culpa,64,2020-02-26,0.065,79.952,0.659,2020,228.466,2.0,23476
169886,0.024,Lil Uzi Vert,0.775,234627,0.720,1,1Xd0PWZV3LCEGzc7fkjTeu,0.000,11,0.114,-5.353,0,P2,66,2020-03-13,0.193,155.086,0.490,2020,234.627,3.0,17694
169897,0.022,Future,0.854,201907,0.599,1,05TpY5Fov3Hgfp8V1KILZ8,0.000,1,0.173,-7.406,0,Tycoon,66,2020-05-15,0.355,157.031,0.849,2020,201.907,5.0,10041
169902,0.023,"Trey Songz, Summer Walker",0.619,194576,0.719,1,5QZ11AHm7xiytOGXGlxQi5,0.000,0,0.084,-4.111,1,Back Home (feat. Summer Walker),69,2020-04-29,0.157,86.036,0.351,2020,194.576,4.0,30611


In [6]:
# Drop duplicates and save new CSV file
df = df.drop_duplicates(subset=['name', 'artists'])
df.to_csv('cleaned_no_duplicates.csv', index=False)

In [7]:
# Features to be included: acousticness danceability energy explicit instrumentalness liveness tempo year
columns_to_extract = [
   'artists','name', 'acousticness', 'danceability',
    'explicit', 'instrumentalness', 'liveness',
    'tempo', 'energy','year'
]

new_df = df[columns_to_extract].copy()

### Natural Language Pre-Processing

In [8]:
def preprocess(text):
    # Convert text to lowercase
    text = text.lower()
    
    # Remove punctuation
    text = re.sub(f"[{re.escape(string.punctuation)}]", "", text)

    # Replace numeric tokens with words
    text = re.sub(r'\b\d+\b', lambda match: p.number_to_words(match.group()), text)
    
    # Tokenize words
    tokens = word_tokenize(text)
    
    # Load stopwords
    stop_words = set(stopwords.words("english"))
    
    # Initialize lemmatizer
    lemmatizer = WordNetLemmatizer()
    
    # Lemmatize and filter tokens
    tokens = [lemmatizer.lemmatize(word) for word in tokens if word.isalpha() and word not in stop_words]
    
    return tokens

In [9]:
# Apply the preprocessing function to the column 'name'

p = inflect.engine()

new_df['name_tokens'] = new_df['name'].apply(preprocess)

new_df['name_tokens']

0                         [singende, bataillone, one, teil]
1         [fantasiestücke, op, one, hundred, eleven, più...
2         [chapter, one, hundred, eighteen, zamek, kanio...
3             [bebamos, junto, instrumental, remasterizado]
4                    [polonaisefantaisie, aflat, major, op]
                                ...                        
169903                                  [ojos, de, maniaco]
169904                        [skechers, feat, tyga, remix]
169905                     [sweeter, feat, terrace, martin]
169906                                        [would, know]
169907                                              [found]
Name: name_tokens, Length: 156608, dtype: object

In [10]:
# Join tokens back into strings

new_df['name_cleaned'] = new_df['name_tokens'].apply(lambda tokens: ' '.join(tokens))

# Initialize TF-IDF Vectorizer

tfidf = TfidfVectorizer(max_features=100)

# Fit and transform

name_embeddings = tfidf.fit_transform(new_df['name_cleaned'])

# Convert to array if needed

name_embeddings = name_embeddings.toarray()

In [11]:
# Apply the preprocessing function to the column 'artists'

p = inflect.engine()

new_df['artists_tokens'] = new_df['artists'].apply(preprocess)

new_df['artists_tokens']

0                             [carl, woitschach]
1         [robert, schumann, vladimir, horowitz]
2                         [seweryn, goszczyński]
3                            [francisco, canaro]
4         [frédéric, chopin, vladimir, horowitz]
                           ...                  
169903                [legado, seven, junior, h]
169904                        [dripreport, tyga]
169905           [leon, bridge, terrace, martin]
169906                        [kygo, oh, wonder]
169907               [cash, cash, andy, grammer]
Name: artists_tokens, Length: 156608, dtype: object

In [12]:
# Join tokens back into strings

new_df['artists_cleaned'] = new_df['artists_tokens'].apply(lambda tokens: ' '.join(tokens))

# Initialize TF-IDF Vectorizer

tfidf = TfidfVectorizer(max_features=100)

# Fit and transform

artists_embeddings = tfidf.fit_transform(new_df['artists_cleaned'])

# Convert to array if needed

artists_embeddings = artists_embeddings.toarray()

### Song Embeddings

In [13]:
# Standardize numeric features
song_features = new_df[['acousticness', 'danceability',
                        'explicit', 'instrumentalness', 'liveness',
                        'tempo', 'energy', 'year']]

scaler = StandardScaler()

other_features_scaled = scaler.fit_transform(song_features)

In [14]:
X = np.hstack((artists_embeddings,name_embeddings,other_features_scaled))

In [15]:
import torch
import torch.nn as nn
import torch.nn.functional as F

# Convert data to tensors
X_tensor = torch.tensor(X, dtype=torch.float32)

# Define a neural network
class SongEmbedder(nn.Module):
    def __init__(self, input_dim, embedding_dim=64):
        super(SongEmbedder, self).__init__()
        self.fc1 = nn.Linear(input_dim, 128)
        self.fc2 = nn.Linear(128, embedding_dim)  # Final embedding layer

    def forward(self, x):
        x = F.relu(self.fc1(x))
        x = self.fc2(x)
        return x  # Song embedding

# Initialize model
input_dim = X.shape[1]
model = SongEmbedder(input_dim=input_dim, embedding_dim=64)

# Forward pass to get embeddings
with torch.no_grad():
    song_embeddings = model(X_tensor)  # Shape: [num_songs, 64]

In [16]:
song_embeddings.shape

torch.Size([156608, 64])

### Similarity Fucntion

In [17]:
# Song of interest (1-based index)
query_index = 7748

# Convert to 0-based index
actual_index = query_index - 1

# Compute cosine similarity matrix
similarities = cosine_similarity([song_embeddings[actual_index]], song_embeddings)[0]

# Get indices of top 10 most similar songs (excluding the song itself)
top_indices = similarities.argsort()[::-1][1:11]

# Print results
query_song = new_df.iloc[actual_index]
print(f"Songs similar to: {query_song['name']} by {query_song['artists']}\n")

for i, idx in enumerate(top_indices, start=1):
    song = new_df.iloc[idx]
    print(f"{i}. {song['name']} by {song['artists']} (Similarity Score: {similarities[idx]:.3f})")

Songs similar to: What Do I Know? by Ed Sheeran

1. We Know by Lin-Manuel Miranda, Daveed Diggs, Leslie Odom Jr., Okieriete Onaodowan (Similarity Score: 0.976)
2. Didn't Cha Know by Erykah Badu (Similarity Score: 0.974)
3. LoveStoned / I Think She Knows Interlude by Justin Timberlake (Similarity Score: 0.973)
4. Stella Brown by Jelani Aryeh (Similarity Score: 0.969)
5. Text Me by DPR LIVE (Similarity Score: 0.968)
6. Know Your Worth by Khalid, Disclosure (Similarity Score: 0.967)
7. California by Anthony Russo (Similarity Score: 0.967)
8. I Know Alone by HAIM (Similarity Score: 0.966)
9. Sleepless - Radio Edit by CAZZETTE, The High (Similarity Score: 0.966)
10. Coldest Winter by Kanye West (Similarity Score: 0.965)


### Precision@K Evaluation

We evaluate how many of the top 10 recommended songs are relevant by checking if their cosine similarity with the query song exceeds 0.95. This high threshold ensures only strongly similar songs are considered as relevant. Here, 100 songs are randomly selected as queries and calculated the average Precision@10 score to assess the system’s recommendation quality.

In [18]:
def precision_at_k(song_index, song_embeddings, k=10, threshold=0.95):
    # Compute cosine similarity matrix to all songs
    similarities = cosine_similarity([song_embeddings[song_index]], song_embeddings)[0]
    
    # Obtain indices of top k+1 similar songs, excluding the song itself
    top_k_indices = similarities.argsort()[::-1][1:k+1]
    
    # Count how many of the top k are relevant and above threshold of 0.95 
    relevant_count = sum(similarities[i] >= threshold for i in top_k_indices)
    
    return relevant_count / k

# Repeat for multiple random songs 
import random

# Try for 100 random songs 
num_samples = 100 
k = 10
threshold = 0.95
random_indices = random.sample(range(len(song_embeddings)), num_samples)

# Compute Precision@10 on average
precision_scores = [
    precision_at_k(i, song_embeddings, k=k, threshold=threshold)
    for i in random_indices
]

average_precision_at_10 = sum(precision_scores) / len(precision_scores)
print(f"Average Precision@{k} across 100 random songs: {average_precision_at_10:.3f}")

Average Precision@10 across 100 random songs: 0.993


The Precision@10 score of 0.993 shows that almost all top 10 recommended songs are highly similar to the query songs. 

Next, we increase the number of randomly selected query songs from 100 to 1000 to enhance the statistical confidence of the average Precision@10 score.

In [19]:
def precision_at_k(song_index, song_embeddings, k=10, threshold=0.95):
    # Compute cosine similarity matrix to all songs
    similarities = cosine_similarity([song_embeddings[song_index]], song_embeddings)[0]
    
    # Obtain indices of top k+1 similar songs, excluding the song itself
    top_k_indices = similarities.argsort()[::-1][1:k+1]
    
    # Count how many of the top k are relevant and above threshold of 0.95 
    relevant_count = sum(similarities[i] >= threshold for i in top_k_indices)
    
    return relevant_count / k

# Repeat for multiple random songs 
import random

# Try for 1000 random songs 
num_samples = 1000 
k = 10
threshold = 0.95
random_indices = random.sample(range(len(song_embeddings)), num_samples)

# Compute Precision@10 on average
precision_scores = [
    precision_at_k(i, song_embeddings, k=k, threshold=threshold)
    for i in random_indices
]

average_precision_at_10 = sum(precision_scores) / len(precision_scores)
print(f"Average Precision@{k} across 1000 random songs: {average_precision_at_10:.3f}")

Average Precision@10 across 1000 random songs: 0.980


The Precision@10 score of 0.980 suggests that the system maintains high recommendation quality even with broader evaluation. Therefore, these results indicate strong performance and robustness of the song recommendation model. 