In [1]:
# Standard library
import json
from collections import Counter
from typing import Any

# Data manipulation
import numpy as np
import pandas as pd

# Machine learning
from sklearn.cluster import KMeans
from sklearn.impute import SimpleImputer
from sklearn.neighbors import NearestNeighbors
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

In [2]:
# Read the song documents from disk; the file holds a JSON array of dicts.
with open('songs.json', mode='r', encoding='utf-8') as f:
    songs = json.loads(f.read())

# Sanity check: how many records were loaded and what one record looks like.
print('Total songs:', len(songs))
print('First song:', songs[0])

Total songs: 14476
First song: {'_id': {'$oid': '696bdc61eaeecc7f4fb9955e'}, 'track_id': '675', 'artist': 'Fanatic', 'title': 'Enterprizin', 'genre': ['Hip-Hop'], 'audio_feature': {'acousticness': 0.8129947339, 'danceability': 0.2560540436, 'energy': 0.5513755529, 'instrumentalness': 1.6808e-06, 'liveness': 0.3418534428, 'speechiness': 0.0646946268, 'tempo': 66.043, 'valence': 0.6977514338}, 's3_url': 's3://music-bucket/tracks/675.mp3'}


In [3]:
# Preview the first 4 raw song records, each followed by a 40-dash divider.
for i, song in enumerate(songs[:4], start=1):
    print(f"Row {i}:", song, "-" * 40, sep="\n")

Row 1:
{'_id': {'$oid': '696bdc61eaeecc7f4fb9955e'}, 'track_id': '675', 'artist': 'Fanatic', 'title': 'Enterprizin', 'genre': ['Hip-Hop'], 'audio_feature': {'acousticness': 0.8129947339, 'danceability': 0.2560540436, 'energy': 0.5513755529, 'instrumentalness': 1.6808e-06, 'liveness': 0.3418534428, 'speechiness': 0.0646946268, 'tempo': 66.043, 'valence': 0.6977514338}, 's3_url': 's3://music-bucket/tracks/675.mp3'}
----------------------------------------
Row 2:
{'_id': {'$oid': '696bdc61eaeecc7f4fb9955f'}, 'track_id': '691', 'artist': 'Flying Luttenbachers', 'title': 'Into the Vastness of Stupidity', 'genre': ['Rock'], 'audio_feature': {'acousticness': 0.1144486056, 'danceability': 0.34362392, 'energy': 0.7073603325, 'instrumentalness': 0.6995872654, 'liveness': 0.0994395174, 'speechiness': 0.0838322046, 'tempo': 128.063, 'valence': 0.0860083717}, 's3_url': 's3://music-bucket/tracks/691.mp3'}
----------------------------------------
Row 3:
{'_id': {'$oid': '696bdc61eaeecc7f4fb99560'}, '

In [4]:
# Flatten the nested song dicts into a table; nested keys become dotted
# column names such as 'audio_feature.tempo' and '_id.$oid'.
data = pd.json_normalize(songs, sep='.')

# Display the first 10 rows
data.iloc[:10]

Unnamed: 0,track_id,artist,title,genre,s3_url,_id.$oid,audio_feature.acousticness,audio_feature.danceability,audio_feature.energy,audio_feature.instrumentalness,audio_feature.liveness,audio_feature.speechiness,audio_feature.tempo,audio_feature.valence
0,675,Fanatic,Enterprizin,[Hip-Hop],s3://music-bucket/tracks/675.mp3,696bdc61eaeecc7f4fb9955e,0.812995,0.256054,0.551376,2e-06,0.341853,0.064695,66.043,0.697751
1,691,Flying Luttenbachers,Into the Vastness of Stupidity,[Rock],s3://music-bucket/tracks/691.mp3,696bdc61eaeecc7f4fb9955f,0.114449,0.343624,0.70736,0.699587,0.09944,0.083832,128.063,0.086008
2,829,Here Comes A Big Black Cloud!!,The Fly Pt. II,"[Rock, Loud-Rock, Psych-Rock, Indie-Rock]",s3://music-bucket/tracks/829.mp3,696bdc61eaeecc7f4fb99560,0.860667,0.116754,0.725228,0.930792,0.081767,0.095631,156.887,0.039407
3,1090,Mahjongg,silver series (MDG remix),[Electronic],s3://music-bucket/tracks/1090.mp3,696bdc61eaeecc7f4fb99561,0.024185,0.52831,0.617582,0.939297,0.109367,0.096772,80.775,0.961668
4,1159,Mors Ontologica,Lovesick,[Rock],s3://music-bucket/tracks/1159.mp3,696bdc61eaeecc7f4fb99562,0.348834,0.40499,0.990642,0.931408,0.081932,0.117265,126.169,0.221862
5,1436,Pleasurehorse,Pearly Gates pt2,[Electronic],s3://music-bucket/tracks/1436.mp3,696bdc61eaeecc7f4fb99563,0.051319,0.4117,0.983239,0.941505,0.162668,0.631843,241.549,0.659187
6,2069,caUSE co-MOTION,stop standing still,[Punk],s3://music-bucket/tracks/2069.mp3,696bdc61eaeecc7f4fb99564,0.913924,0.431269,0.703222,0.968096,0.207805,0.046243,63.913,0.441134
7,3606,Emma Peel,,[Indie-Rock],s3://music-bucket/tracks/3606.mp3,696bdc61eaeecc7f4fb99565,0.990874,0.599643,0.757628,0.042152,0.047686,0.085064,141.089,0.515923
8,7762,junior85,01 raymondscott.mp3,[Electronic],s3://music-bucket/tracks/7762.mp3,696bdc61eaeecc7f4fb99566,0.723667,0.400863,0.736841,0.920959,0.316492,0.075762,130.38,0.888154
9,9993,50 Foot Wave,Petal,"[Rock, Indie-Rock]",s3://music-bucket/tracks/9993.mp3,696bdc61eaeecc7f4fb99567,0.073917,0.162641,0.938101,0.787355,0.116743,0.051659,161.253,0.36743


In [5]:
# One-hot encode the 'genre' list column into separate 'genre_<name>' columns

# Ensure every value in 'genre' is a list (replace None / missing with empty list)
data['genre'] = data['genre'].apply(lambda x: x if isinstance(x, list) else [])

# Collect all unique genres (sorted so the column order is deterministic)
all_genres = sorted({g for genre_list in data['genre'] for g in genre_list})

# Build every indicator column up front and attach them with a single concat.
# Inserting the columns one at a time (there are more than a hundred genres)
# fragments the DataFrame and makes pandas emit a PerformanceWarning.
genre_sets = [set(genre_list) for genre_list in data['genre']]
genre_one_hot = pd.DataFrame(
    {f"genre_{g}": [int(g in genres) for genres in genre_sets] for g in all_genres},
    index=data.index,
)

# Drop any indicator columns left over from a previous run of this cell so that
# re-running it does not create duplicate column names.
data = pd.concat(
    [data.drop(columns=genre_one_hot.columns, errors='ignore'), genre_one_hot],
    axis=1,
)

# The original 'genre' list column is kept on purpose: the recommendation
# functions further down return it alongside each recommended song.

# Show first 10 rows with new one-hot columns
data.head(10)

  data[col_name] = data['genre'].apply(lambda genre_list: int(g in genre_list))


Unnamed: 0,track_id,artist,title,genre,s3_url,_id.$oid,audio_feature.acousticness,audio_feature.danceability,audio_feature.energy,audio_feature.instrumentalness,...,genre_Soundtrack,genre_Space-Rock,genre_Spanish,genre_Surf,genre_Synth Pop,genre_Techno,genre_Thrash,genre_Trip-Hop,genre_Western Swing,genre_Wonky
0,675,Fanatic,Enterprizin,[Hip-Hop],s3://music-bucket/tracks/675.mp3,696bdc61eaeecc7f4fb9955e,0.812995,0.256054,0.551376,2e-06,...,0,0,0,0,0,0,0,0,0,0
1,691,Flying Luttenbachers,Into the Vastness of Stupidity,[Rock],s3://music-bucket/tracks/691.mp3,696bdc61eaeecc7f4fb9955f,0.114449,0.343624,0.70736,0.699587,...,0,0,0,0,0,0,0,0,0,0
2,829,Here Comes A Big Black Cloud!!,The Fly Pt. II,"[Rock, Loud-Rock, Psych-Rock, Indie-Rock]",s3://music-bucket/tracks/829.mp3,696bdc61eaeecc7f4fb99560,0.860667,0.116754,0.725228,0.930792,...,0,0,0,0,0,0,0,0,0,0
3,1090,Mahjongg,silver series (MDG remix),[Electronic],s3://music-bucket/tracks/1090.mp3,696bdc61eaeecc7f4fb99561,0.024185,0.52831,0.617582,0.939297,...,0,0,0,0,0,0,0,0,0,0
4,1159,Mors Ontologica,Lovesick,[Rock],s3://music-bucket/tracks/1159.mp3,696bdc61eaeecc7f4fb99562,0.348834,0.40499,0.990642,0.931408,...,0,0,0,0,0,0,0,0,0,0
5,1436,Pleasurehorse,Pearly Gates pt2,[Electronic],s3://music-bucket/tracks/1436.mp3,696bdc61eaeecc7f4fb99563,0.051319,0.4117,0.983239,0.941505,...,0,0,0,0,0,0,0,0,0,0
6,2069,caUSE co-MOTION,stop standing still,[Punk],s3://music-bucket/tracks/2069.mp3,696bdc61eaeecc7f4fb99564,0.913924,0.431269,0.703222,0.968096,...,0,0,0,0,0,0,0,0,0,0
7,3606,Emma Peel,,[Indie-Rock],s3://music-bucket/tracks/3606.mp3,696bdc61eaeecc7f4fb99565,0.990874,0.599643,0.757628,0.042152,...,0,0,0,0,0,0,0,0,0,0
8,7762,junior85,01 raymondscott.mp3,[Electronic],s3://music-bucket/tracks/7762.mp3,696bdc61eaeecc7f4fb99566,0.723667,0.400863,0.736841,0.920959,...,0,0,0,0,0,0,0,0,0,0
9,9993,50 Foot Wave,Petal,"[Rock, Indie-Rock]",s3://music-bucket/tracks/9993.mp3,696bdc61eaeecc7f4fb99567,0.073917,0.162641,0.938101,0.787355,...,0,0,0,0,0,0,0,0,0,0


In [6]:
# Use 'track_id' as the row label; the column is moved into the index.
data = data.set_index(keys='track_id', drop=True)

In [7]:
# Number of columns after one-hot encoding and re-indexing
data.shape[1]

136

In [8]:
# ── STEP 1: Data Validation ───────────────────────────────────────────────────
# Cleans `data` and reports on it: drops unused columns, lists null counts,
# makes sure the audio-feature columns are numeric, and removes duplicate rows.
print("=" * 50)
print("STEP 1: DATA VALIDATION")
print("=" * 50)

# ── 1.1. Drop irrelevant columns ───────────────────────────────────
print("\n1.1. Checking for irrelevant columns...")
columns_to_drop = []
# Note: We preserve '_id.$oid' column for recommendations, only drop '_id' if it exists separately
if '_id' in data.columns and '_id.$oid' not in data.columns:
    columns_to_drop.append('_id')
if 's3_url' in data.columns:
    columns_to_drop.append('s3_url')

if columns_to_drop:
    print(f"   Dropping columns: {columns_to_drop}")
    data = data.drop(columns=columns_to_drop)
    print(f"   ✅ Dropped {len(columns_to_drop)} column(s)")
else:
    print("   ✅ No irrelevant columns found")

# ── 1.2. Null Value Check ───────────────────────────────────────────────────────
# Nulls are only reported here; nothing is filled or dropped in this cell.
print("\n1.2. Checking for null values...")
null_counts = data.isnull().sum()
null_cols = null_counts[null_counts > 0]

if null_cols.empty:
    print("   ✅ No null values found in any column.")
else:
    print(f"   ⚠️  Found null values in {len(null_cols)} column(s):")
    print(null_cols.to_string())

print(f"\n   Total nulls across entire dataset: {data.isnull().sum().sum()}")

# ── 1.3. Ensure numerical feature columns are numeric dtype ────────────────────
print("\n1.3. Ensuring numerical columns are numeric dtype...")
audio_feature_cols = [col for col in data.columns if col.startswith('audio_feature.')]
for col in audio_feature_cols:
    # Any numeric dtype (not just float64/int64) is already usable as-is.
    if pd.api.types.is_numeric_dtype(data[col]):
        continue

    nulls_before = data[col].isna().sum()
    try:
        data[col] = pd.to_numeric(data[col], errors='coerce')
    except (TypeError, ValueError) as exc:
        # Catch only conversion errors and show the reason; a bare `except:`
        # would also swallow KeyboardInterrupt / SystemExit.
        print(f"   ⚠️  Could not convert {col} to numeric: {exc}")
        continue

    print(f"   Converted {col} to numeric")
    # errors='coerce' turns unparseable values into NaN without complaint,
    # so report how many values were lost that way.
    coerced = data[col].isna().sum() - nulls_before
    if coerced > 0:
        print(f"   ⚠️  {coerced} unparseable value(s) in {col} became NaN")

# ── 1.4. Duplicate Row Check ────────────────────────────────────────────────────
print("\n1.4. Checking for duplicate rows...")
# Identify columns that contain lists (unhashable types); duplicated() cannot
# hash them, so they are left out of the comparison. Only the first 100
# non-null values of each column are inspected.
list_columns = []
for col in data.columns:
    sample_values = data[col].dropna().head(100)
    if len(sample_values) > 0 and any(isinstance(val, list) for val in sample_values):
        list_columns.append(col)

columns_to_check = [col for col in data.columns if col not in list_columns]

if list_columns:
    print(f"   ⚠️  Excluding {len(list_columns)} list column(s) from duplicate check: {list_columns}")

if len(columns_to_check) > 0:
    num_duplicates = data[columns_to_check].duplicated().sum()

    if num_duplicates == 0:
        print("   ✅ No duplicate rows found.")
    else:
        print(f"   ⚠️  Found {num_duplicates} duplicate row(s). Removing them...")
        data = data.drop_duplicates(subset=columns_to_check)
        print(f"   ✅ Duplicates removed. Dataset now has {len(data)} rows.")
else:
    print("   ⚠️  All columns contain lists. Skipping duplicate check.")

# ── 1.5. Summary ────────────────────────────────────────────────────────────────
print("\n" + "=" * 50)
print("DATASET SUMMARY AFTER VALIDATION")
print("=" * 50)
print(f"Rows    : {data.shape[0]}")
print(f"Columns : {data.shape[1]}")
print(f"Memory  : {data.memory_usage(deep=True).sum() / 1024:.1f} KB")

STEP 1: DATA VALIDATION

1.1. Checking for irrelevant columns...
   Dropping columns: ['s3_url']
   ✅ Dropped 1 column(s)

1.2. Checking for null values...
   ⚠️  Found null values in 5 column(s):
artist                          4
title                         417
audio_feature.danceability     26
audio_feature.speechiness     203
audio_feature.valence          28

   Total nulls across entire dataset: 678

1.3. Ensuring numerical columns are numeric dtype...

1.4. Checking for duplicate rows...
   ⚠️  Excluding 1 list column(s) from duplicate check: ['genre']
   ✅ No duplicate rows found.

DATASET SUMMARY AFTER VALIDATION
Rows    : 14476
Columns : 135
Memory  : 20294.4 KB


In [9]:
# ── STEP 2: Feature Selection ───────────────────────────────────────────────────
print("=" * 50, "STEP 2: FEATURE SELECTION", "=" * 50, sep="\n")

# Names that must never be used as model features: identifiers, free text,
# the raw genre list, a cluster label from an earlier run of this cell,
# and every one-hot genre column.
exclude_cols = ['track_id', 'artist', 'title', 'genre', 'cluster', '_id', 's3_url']
genre_cols = [col for col in data.columns if col.startswith('genre_')]
exclude_cols += genre_cols

# Start from all numeric columns, then remove the excluded ones
all_numeric = data.select_dtypes(include=[np.number])
X = all_numeric.drop(columns=[col for col in all_numeric.columns if col in exclude_cols])

print(f"Selected {X.shape[1]} numerical audio feature columns")
print(f"Feature columns: {list(X.columns)}")
print(f"Shape: {X.shape}")

# ── KMeans Clustering (for cluster-based recommendations) ───────────────────────
print("\n" + "=" * 50, "CLUSTERING WITH KMEANS", "=" * 50, sep="\n")

# KMeans rejects NaN input, so missing values are median-imputed first and the
# features standardised before the 40-cluster model is fitted.
cluster_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
    ('kmeans', KMeans(n_clusters=40, random_state=42)),
])

# Fit on the full feature table, then label every row with its cluster
data['cluster'] = cluster_pipeline.fit(X).predict(X)

print(f"✅ Clustering complete. Created {data['cluster'].nunique()} clusters")
print(f"Cluster distribution:\n{data['cluster'].value_counts().sort_index().head(10)}")

# Inspect the first 10 rows with cluster labels
data.loc[:, ['artist', 'title', 'cluster']].head(10)

STEP 2: FEATURE SELECTION
Selected 8 numerical audio feature columns
Feature columns: ['audio_feature.acousticness', 'audio_feature.danceability', 'audio_feature.energy', 'audio_feature.instrumentalness', 'audio_feature.liveness', 'audio_feature.speechiness', 'audio_feature.tempo', 'audio_feature.valence']
Shape: (14476, 8)

CLUSTERING WITH KMEANS


✅ Clustering complete. Created 40 clusters
Cluster distribution:
cluster
0    664
1    582
2    259
3    447
4    236
5    487
6    474
7    336
8     86
9    569
Name: count, dtype: int64


  data['cluster'] = cluster_pipeline.predict(X)


Unnamed: 0_level_0,artist,title,cluster
track_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
675,Fanatic,Enterprizin,11
691,Flying Luttenbachers,Into the Vastness of Stupidity,20
829,Here Comes A Big Black Cloud!!,The Fly Pt. II,3
1090,Mahjongg,silver series (MDG remix),0
1159,Mors Ontologica,Lovesick,27
1436,Pleasurehorse,Pearly Gates pt2,12
2069,caUSE co-MOTION,stop standing still,25
3606,Emma Peel,,2
7762,junior85,01 raymondscott.mp3,25
9993,50 Foot Wave,Petal,20


In [10]:
# ── STEP 3: Feature Scaling ────────────────────────────────────────────────────
print("=" * 50, "STEP 3: FEATURE SCALING", "=" * 50, sep="\n")

# Fill missing audio-feature values with the column median before scaling
imputer = SimpleImputer(strategy='median').fit(X)
X_imputed = imputer.transform(X)

# Standardise every feature to zero mean and unit variance
scaler = StandardScaler().fit(X_imputed)
X_scaled = scaler.transform(X_imputed)

print("✅ Features scaled successfully")
print(f"Original shape: {X.shape}")
print(f"Scaled shape: {X_scaled.shape}")
print("Scaled feature statistics:")
print(f"  Mean: {X_scaled.mean(axis=0)[:5]}... (should be ~0)")
print(f"  Std: {X_scaled.std(axis=0)[:5]}... (should be ~1)")


STEP 3: FEATURE SCALING
✅ Features scaled successfully
Original shape: (14476, 8)
Scaled shape: (14476, 8)
Scaled feature statistics:
  Mean: [6.28277633e-17 2.35604112e-16 3.14138817e-17 8.63881746e-17
 9.22782774e-17]... (should be ~0)
  Std: [1. 1. 1. 1. 1.]... (should be ~1)


In [11]:
# ── STEP 4: Train KNN Model ─────────────────────────────────────────────────────
print("=" * 50, "STEP 4: TRAIN KNN MODEL", "=" * 50, sep="\n")

# Cosine-distance neighbour index over the scaled features. n_neighbors=50 is
# only the default; the recommendation functions pass their own n_neighbors
# on every query and then filter the hits by cluster.
knn_model = NearestNeighbors(
    n_neighbors=50,
    algorithm='auto',
    metric='cosine',
).fit(X_scaled)

print("✅ KNN model trained successfully")
print("Model parameters:")
print(f"  Metric: {knn_model.metric}")
print(f"  Algorithm: {knn_model.algorithm}")
print(f"  Number of neighbors: {knn_model.n_neighbors}")
print(f"  Data shape: {X_scaled.shape}")


STEP 4: TRAIN KNN MODEL
✅ KNN model trained successfully
Model parameters:
  Metric: cosine
  Algorithm: auto
  Number of neighbors: 50
  Data shape: (14476, 8)


In [12]:
# ── STEP 5: Recommendation Function ─────────────────────────────────────────────
print("=" * 50)
print("STEP 5: RECOMMENDATION FUNCTION")
print("=" * 50)

def recommend_songs(song_title, n=5):
    """
    Recommend songs based on a given song title using KNN within the same cluster.

    The first row whose title matches ``song_title`` (case-insensitively) is the
    query. Neighbours come from ``knn_model`` in order of increasing cosine
    distance and are kept only when they carry the query's cluster label. The
    search starts with the 100 nearest songs and is widened until ``n`` matches
    are found or every song has been examined, so a large cluster is never
    reported as empty just because its members are not among the first 100.

    Parameters:
    -----------
    song_title : str
        Title of the song to find recommendations for
    n : int, default=5
        Number of recommendations to return (must be >= 1)

    Returns:
    --------
    pd.DataFrame
        At most ``n`` rows, most similar first, with columns:
        - $oid: MongoDB ObjectId (None when '_id.$oid' is not a column of `data`)
        - title: Song title
        - artist: Artist name
        - genre: Genre list
        - similarity_score: Cosine similarity (1 - cosine distance)

    Raises:
    -------
    ValueError
        If ``n`` is smaller than 1, if the song is not found in the dataset,
        or if no other song shares the query song's cluster
    """
    if n < 1:
        raise ValueError(f"n must be a positive integer, got {n}")

    # Work with row positions throughout: X_scaled is a plain array aligned
    # with the rows of `data`, and positions stay unambiguous even if the
    # track_id index were to contain duplicates.
    title_matches = (data['title'].str.lower() == song_title.lower()).fillna(False)
    match_positions = np.flatnonzero(title_matches.to_numpy(dtype=bool))

    if len(match_positions) == 0:
        raise ValueError(f"Song '{song_title}' not found in the dataset")

    # If multiple songs share the title, use the first one
    song_position = int(match_positions[0])

    cluster_labels = data['cluster'].to_numpy()
    song_cluster = cluster_labels[song_position]
    query_vector = X_scaled[song_position:song_position + 1]

    # Widen the neighbour window until enough same-cluster songs are found.
    total_songs = len(X_scaled)
    window = min(100, total_songs)
    while True:
        distances, indices = knn_model.kneighbors(query_vector, n_neighbors=window)
        picks = [
            (int(pos), 1 - dist)  # cosine distance -> cosine similarity
            for pos, dist in zip(indices[0], distances[0])
            if pos != song_position and cluster_labels[pos] == song_cluster
        ][:n]
        if len(picks) >= n or window >= total_songs:
            break
        window = min(window * 4, total_songs)

    # Every song has been examined at this point, so the cluster really holds
    # no song other than the query.
    if len(picks) == 0:
        raise ValueError(f"No songs found in the same cluster as '{song_title}'")

    # Assemble the result table in neighbour order
    picked_rows = data.iloc[[pos for pos, _ in picks]]
    if '_id.$oid' in data.columns:
        oid_values = picked_rows['_id.$oid'].tolist()
    else:
        oid_values = [None] * len(picks)

    result_data = [
        {
            '$oid': oid_value,
            'title': title,
            'artist': artist,
            'genre': genre,
            'similarity_score': similarity_score,
        }
        for oid_value, title, artist, genre, (_, similarity_score) in zip(
            oid_values,
            picked_rows['title'].tolist(),
            picked_rows['artist'].tolist(),
            picked_rows['genre'].tolist(),
            picks,
        )
    ]

    return pd.DataFrame(result_data)

print("✅ Recommendation function created")


STEP 5: RECOMMENDATION FUNCTION
✅ Recommendation function created


In [13]:
# ── STEP 6: Advanced Recommendation Features ────────────────────────────────────
print("=" * 50)
print("STEP 6: ADVANCED RECOMMENDATION FEATURES")
print("=" * 50)

def _nearest_in_cluster(query_vector, target_cluster, excluded_positions, n):
    """Return up to ``n`` ``(row_position, similarity)`` pairs, most similar first.

    Neighbours of ``query_vector`` are taken from ``knn_model`` and kept only
    when their cluster label equals ``target_cluster`` and their row position
    is not in ``excluded_positions``. The search starts with the 100 nearest
    songs and is widened until ``n`` matches are found or every song has been
    examined.
    """
    cluster_labels = data['cluster'].to_numpy()
    total_songs = len(X_scaled)
    window = min(100, total_songs)
    while True:
        distances, indices = knn_model.kneighbors(query_vector, n_neighbors=window)
        picks = [
            (int(pos), 1 - dist)  # cosine distance -> cosine similarity
            for pos, dist in zip(indices[0], distances[0])
            if int(pos) not in excluded_positions and cluster_labels[pos] == target_cluster
        ][:n]
        if len(picks) >= n or window >= total_songs:
            return picks
        window = min(window * 4, total_songs)


def _build_recommendation_frame(picks):
    """Turn ``(row_position, similarity)`` pairs into the recommendation table."""
    picked_rows = data.iloc[[pos for pos, _ in picks]]
    if '_id.$oid' in data.columns:
        oid_values = picked_rows['_id.$oid'].tolist()
    else:
        oid_values = [None] * len(picks)

    result_data = [
        {
            '$oid': oid_value,
            'title': title,
            'artist': artist,
            'genre': genre,
            'similarity_score': similarity_score,
        }
        for oid_value, title, artist, genre, (_, similarity_score) in zip(
            oid_values,
            picked_rows['title'].tolist(),
            picked_rows['artist'].tolist(),
            picked_rows['genre'].tolist(),
            picks,
        )
    ]
    return pd.DataFrame(result_data)


def recommend_by_track_id(track_id, n=5):
    """
    Recommend songs based on a track_id.

    Parameters:
    -----------
    track_id : str
        Track ID of the song to find recommendations for
    n : int, default=5
        Number of recommendations to return (must be >= 1)

    Returns:
    --------
    pd.DataFrame
        Up to ``n`` recommendations from the same cluster, most similar first,
        with $oid, title, artist, genre, similarity_score

    Raises:
    -------
    ValueError
        If ``n`` is smaller than 1, if the track ID is not in the dataset,
        or if no other song shares the track's cluster
    """
    if n < 1:
        raise ValueError(f"n must be a positive integer, got {n}")

    # Positional lookup keeps X_scaled aligned with `data` and stays
    # well-defined even if the index held a duplicate track ID.
    match_positions = np.flatnonzero(np.asarray(data.index == track_id))
    if len(match_positions) == 0:
        raise ValueError(f"Track ID '{track_id}' not found in the dataset")

    song_position = int(match_positions[0])
    song_cluster = data['cluster'].to_numpy()[song_position]

    picks = _nearest_in_cluster(
        X_scaled[song_position:song_position + 1],
        song_cluster,
        {song_position},
        n,
    )

    if len(picks) == 0:
        raise ValueError(f"No songs found in the same cluster as track ID '{track_id}'")

    return _build_recommendation_frame(picks)


def recommend_from_multiple_songs(song_titles, n=5):
    """
    Recommend songs based on multiple input songs by averaging their feature vectors.

    The target cluster is the most common cluster among the input songs (the
    first one encountered wins a tie). Input songs are never recommended.

    Parameters:
    -----------
    song_titles : list of str
        List of song titles to find recommendations for; a single title given
        as a plain string is treated as a one-element list
    n : int, default=5
        Number of recommendations to return (must be >= 1)

    Returns:
    --------
    pd.DataFrame
        Up to ``n`` recommendations from the target cluster, most similar
        first, with $oid, title, artist, genre, similarity_score

    Raises:
    -------
    ValueError
        If ``song_titles`` is empty, ``n`` is smaller than 1, a title is not
        in the dataset, or the target cluster holds no other songs
    """
    if not song_titles:
        raise ValueError("song_titles list cannot be empty")
    if n < 1:
        raise ValueError(f"n must be a positive integer, got {n}")

    # A bare string would otherwise be iterated character by character.
    if isinstance(song_titles, str):
        song_titles = [song_titles]

    cluster_labels = data['cluster'].to_numpy()
    lowered_titles = data['title'].str.lower()

    # Resolve every title to the row position of its first match
    song_positions = []
    song_clusters = []
    for song_title in song_titles:
        title_matches = (lowered_titles == song_title.lower()).fillna(False)
        match_positions = np.flatnonzero(title_matches.to_numpy(dtype=bool))

        if len(match_positions) == 0:
            raise ValueError(f"Song '{song_title}' not found in the dataset")

        song_position = int(match_positions[0])
        song_positions.append(song_position)
        song_clusters.append(cluster_labels[song_position])

    # Use the most common cluster (or first if tie)
    target_cluster = Counter(song_clusters).most_common(1)[0][0]

    # Average the feature vectors into a single query point
    averaged_vector = X_scaled[song_positions].mean(axis=0).reshape(1, -1)

    picks = _nearest_in_cluster(averaged_vector, target_cluster, set(song_positions), n)

    if len(picks) == 0:
        raise ValueError(f"No songs found in cluster {target_cluster} matching the input songs")

    return _build_recommendation_frame(picks)

print("✅ Advanced recommendation functions created")


STEP 6: ADVANCED RECOMMENDATION FEATURES
✅ Advanced recommendation functions created


In [14]:
# ── STEP 7: Testing ──────────────────────────────────────────────────────────────
print("=" * 50, "STEP 7: TESTING RECOMMENDATION SYSTEM", "=" * 50, sep="\n")

# Smoke test: request 10 recommendations for the first song that has a title.
# Any failure is printed with its traceback instead of stopping the notebook.
try:
    sample_song = data['title'].dropna().iloc[0]
    print(f"\nTesting recommendation for: '{sample_song}'")
    print("=" * 50)

    recommendations = recommend_songs(sample_song, n=10)
    print(f"\n✅ Found {len(recommendations)} recommendations:")
    print(f"\n{recommendations.to_string(index=False)}")

except Exception as err:
    print(f"❌ Error during testing: {err}")
    import traceback
    traceback.print_exc()


STEP 7: TESTING RECOMMENDATION SYSTEM

Testing recommendation for: 'Enterprizin'

✅ Found 10 recommendations:

                    $oid                                                        title                      artist                               genre  similarity_score
696bde55eaeecc7f4fb9b080              The Minimum feat. Tiara Wiles {prod. 6th Sense}             The Kid Daytona                           [Hip-Hop]          0.977304
696bdfc0eaeecc7f4fb9cd5b                                         I Wanna Be Forgotten                 Monk Parker                  [Rock, Indie-Rock]          0.896313
696bdeaaeaeecc7f4fb9b0c2                                                       Euro's                   D'r Sjaak             [Pop, Disco, Synth Pop]          0.847076
696bdef8eaeecc7f4fb9baaf                                                    Occidente                        Rein                         [Pop, Rock]          0.844137
696bdf99eaeecc7f4fb9cbd6                         