In [1]:
!pip install -r requirements.txt



In [2]:
import pandas as pd
import ast
import numpy as np
from ast import literal_eval
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import NearestNeighbors
from sklearn.decomposition import PCA
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import OneHotEncoder
from scipy.sparse import hstack, csr_matrix
import matplotlib.pyplot as plt
from sklearn.metrics.pairwise import cosine_similarity

In [9]:
df = pd.read_csv('data/cleaned_spotify_tracks.csv')

# Convert list-like strings to real lists
df['track_genre'] = df['track_genre'].apply(
    lambda x: ast.literal_eval(x) if isinstance(x, str) else x
)

df = df.drop_duplicates(subset=['track_name', 'primary_artist'])
df = df.reset_index(drop=True)


In [10]:
df.head()

Unnamed: 0,track_id,artists,track_name,track_genre,explicit,popularity,danceability,energy,key,loudness,...,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature,primary_artist,duration_sec,duration_min
0,0000vdREvCVMxbQTkS888c,['Rill'],Lolly,[german],1,44,0.91,0.374,8,-9.844,...,0.0757,0.00301,0.154,0.432,104.042,160725,4,Rill,160.725,2.67875
1,000CC8EParg64OmTxVnZ0p,['Glee Cast'],It's All Coming Back To Me Now (Glee Cast Vers...,[club],0,47,0.269,0.516,0,-7.361,...,0.406,0.0,0.117,0.341,178.174,322933,4,Glee Cast,322.933,5.382217
2,000Iz0K615UepwSJ5z2RE5,"['Paul Kalkbrenner', 'Pig&Dan']",Böxig Leise - Pig & Dan Remix,[minimal-techno],0,22,0.686,0.56,5,-13.264,...,0.00114,0.181,0.111,0.108,119.997,515360,4,Paul Kalkbrenner,515.36,8.589333
3,000RDCYioLteXcutOjeweY,['Jordan Sandhu'],Teeje Week,[hip-hop],0,62,0.679,0.77,0,-3.537,...,0.0583,0.0,0.0825,0.839,161.721,190203,4,Jordan Sandhu,190.203,3.17005
4,000qpdoc97IMTBvF8gwcpy,['Paul Kalkbrenner'],Tief,[minimal-techno],0,19,0.519,0.431,6,-13.606,...,0.000964,0.72,0.0916,0.234,129.971,331240,4,Paul Kalkbrenner,331.24,5.520667


In [11]:
required = ['track_id', 'track_name', 'primary_artist',
            'danceability', 'energy', 'loudness', 'acousticness',
            'instrumentalness', 'liveness', 'valence', 'tempo']


audio_features = [
    'danceability', 'energy', 'loudness', 'speechiness', 'acousticness',
    'instrumentalness', 'liveness', 'valence', 'tempo'
]

# Standardize audio features
scaler = StandardScaler()
scaled_audio = scaler.fit_transform(df[audio_features])
scaled_audio = csr_matrix(scaled_audio)

In [17]:
# Convert multi-label genres into a single primary genre
def pick_first(x):
    if isinstance(x, (list, tuple)) and len(x) > 0:
        return x[0]
    return x

# Create a new column that picks the first genre from the list (if multiple)
df['genre_single'] = df['track_genre'].apply(pick_first)

# Numeric & categorical columns
numeric_cols = ['danceability', 'energy', 'valence', 'tempo', 'loudness']
cat_cols = ['genre_single', 'primary_artist']

# Numeric matrix
num_matrix = df[numeric_cols].values

# One-hot encode all genres & artists
ohe = OneHotEncoder(handle_unknown='ignore', sparse_output=True)
cat_matrix = ohe.fit_transform(df[cat_cols])

# Final sparse feature matrix
X = hstack([num_matrix, cat_matrix], format='csr')

In [19]:
def recommend(song_title, artist_name, df, X, top_k=10):
    # Find the exact song’s index using both title and artist
    idx_list = df.index[
        (df['track_name'] == song_title) &
        (df['primary_artist'] == artist_name)
    ].tolist()

    if len(idx_list) == 0:
        return None

    idx = idx_list[0]

    # Compute similarity of this one song vs all songs
    sim_scores = cosine_similarity(X[idx], X).flatten()

    # Sort by similarity (skip itself)
    top_indices = sim_scores.argsort()[::-1][1:top_k+1]

    return df[['track_name', 'primary_artist', 'genre_single']].iloc[top_indices]


In [None]:
# Test the recommendation function
print("Recommendations for 'Blinding Lights' by 'All Time Low':")
print(recommend("Blinding Lights", "All Time Low", df, X, top_k=5))

print("\nRecommendations for 'Levitating' by 'Dua Lipa':")
print(recommend("Levitating", "Dua Lipa", df, X, top_k=5))





Recommendations for 'Blinding Lights' by 'All Time Low':
                    track_name    primary_artist genre_single
3520   Dear Maria, Count Me In      All Time Low         punk
47720             Sleepwalking      All Time Low         punk
53777          Pacarku Siluman  Stand Here Alone         punk
7688              The Violence      Rise Against         punk
24759      10 Tahun di Barisan   Over Distortion         punk

Recommendations for 'Levitating' by 'Dua Lipa':
                             track_name   primary_artist genre_single
54901                          Physical         Dua Lipa        dance
3109   Physical - Erika de Casier Remix         Dua Lipa        dance
198                      Break My Heart         Dua Lipa        dance
32419                 Super Freaky Girl      Nicki Minaj        dance
40836                           Pump It  Black Eyed Peas        dance
