In [1]:
# ================================
# 1. Import Necessary Libraries
# ================================

import pandas as pd
import numpy as np
import random
import ast
from sklearn.metrics import ndcg_score
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import normalize
from scipy.sparse import csr_matrix, hstack
import seaborn as sns
import matplotlib.pyplot as plt
import warnings

# Suppress warnings for cleaner output.
# Note: 'ignore' with no category silences every warning for the rest of the
# session, including deprecation notices raised by the libraries imported above.
warnings.filterwarnings('ignore')

# Set display options for better readability:
# show every column of wide frames, but cap printed rows at 10.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 10)

# ================================
# 2. Define Data Directory and File Paths
# ================================

# Paths below are built by plain string concatenation, so DATA_DIR must end
# with a path separator.
DATA_DIR = 'data/'

# One TSV file per table / feature family.
INFORMATION_FILE = DATA_DIR + 'id_information_mmsr.tsv'
GENRES_FILE = DATA_DIR + 'id_genres_mmsr.tsv'
LYRICS_TFIDF_FILE = DATA_DIR + 'id_lyrics_tf-idf_mmsr.tsv'
LYRICS_BERT_FILE = DATA_DIR + 'id_lyrics_bert_mmsr.tsv'
MFCC_BOW_FILE = DATA_DIR + 'id_mfcc_bow_mmsr.tsv'
SPECTRAL_CONTRAST_FILE = DATA_DIR + 'id_blf_spectralcontrast_mmsr.tsv'
VGG19_FILE = DATA_DIR + 'id_vgg19_mmsr.tsv'
RESNET_FILE = DATA_DIR + 'id_resnet_mmsr.tsv'
TAGS_FILE = DATA_DIR + 'id_tags_dict.tsv'
METADATA_FILE = DATA_DIR + 'id_metadata_mmsr.tsv'

# ================================
# 3. Load Datasets
# ================================

def load_dataframe(file_path, sep='\t', header='infer', names=None):
    """
    Utility function to load a delimited text file (TSV by default) into a pandas DataFrame.

    Parameters:
    - file_path (str): Path of the file to read.
    - sep (str): Field separator, forwarded to pd.read_csv.
    - header: Header specification, forwarded to pd.read_csv.
    - names (list, optional): Explicit column names, forwarded to pd.read_csv.

    Returns:
    - df (pd.DataFrame): The loaded table. Its shape is printed as a side effect.

    Raises:
    - SystemExit: With exit code 1 (after printing an error message) when
      file_path does not exist.
    """
    try:
        df = pd.read_csv(file_path, sep=sep, header=header, names=names)
    except FileNotFoundError:
        print(f"Error: File '{file_path}' not found.")
        # Raise SystemExit directly instead of calling exit(): exit() is the
        # interactive helper installed by `site` / IPython and is not guaranteed
        # to stop execution inside a notebook kernel, which would let this
        # function return None and fail later with a confusing error.
        raise SystemExit(1) from None

    print(f"Loaded DataFrame from '{file_path}' with shape: {df.shape}")
    return df

# --- Catalog-level tables (used as loaded) ---
information_df = load_dataframe(INFORMATION_FILE)
genres_df = load_dataframe(GENRES_FILE)

metadata_df = load_dataframe(METADATA_FILE)
print("\nMetadata DataFrame Columns:")
print(metadata_df.columns.tolist())

# The tags file is read with header=None and explicit column names, so every
# line of the file (including a header line, if the file has one) becomes a row.
tags_df = load_dataframe(TAGS_FILE, header=None, names=['id', 'tags_str'])

# --- Feature tables ---
# For each table: remember the original (unprefixed) feature column names, then
# prefix every column except 'id' so the tables can be merged into one frame
# without name clashes. Later cells rely on the unprefixed name lists.

# Lyrics TF-IDF
lyrics_tfidf_df = load_dataframe(LYRICS_TFIDF_FILE)
tfidf_cols = [name for name in lyrics_tfidf_df.columns if name != 'id']
lyrics_tfidf_df.columns = [
    name if name == 'id' else f"tfidf_{name}" for name in lyrics_tfidf_df.columns
]
print("Renamed TF-IDF feature columns to prevent conflicts.")

# Lyrics BERT embeddings
bert_df = load_dataframe(LYRICS_BERT_FILE)
bert_feature_columns = [name for name in bert_df.columns if name != 'id']
bert_df.columns = [
    name if name == 'id' else f"bert_{name}" for name in bert_df.columns
]
print("Renamed BERT feature columns to prevent conflicts.")

# MFCC Bag-of-Words
mfcc_bow_df = load_dataframe(MFCC_BOW_FILE)
mfcc_bow_columns = [name for name in mfcc_bow_df.columns if name != 'id']
mfcc_bow_df.columns = [
    name if name == 'id' else f"mfcc_{name}" for name in mfcc_bow_df.columns
]
print("Renamed MFCC Bag-of-Words feature columns to prevent conflicts.")

# Spectral Contrast
spectral_contrast_df = load_dataframe(SPECTRAL_CONTRAST_FILE)
spectral_contrast_columns = [name for name in spectral_contrast_df.columns if name != 'id']
spectral_contrast_df.columns = [
    name if name == 'id' else f"spectral_{name}" for name in spectral_contrast_df.columns
]
print("Renamed Spectral Contrast feature columns to prevent conflicts.")

# VGG19
vgg19_df = load_dataframe(VGG19_FILE)
vgg19_feature_columns = [name for name in vgg19_df.columns if name != 'id']
vgg19_df.columns = [
    name if name == 'id' else f"vgg19_{name}" for name in vgg19_df.columns
]
print("Renamed VGG19 feature columns to prevent conflicts.")

# ResNet
resnet_df = load_dataframe(RESNET_FILE)
resnet_feature_columns = [name for name in resnet_df.columns if name != 'id']
resnet_df.columns = [
    name if name == 'id' else f"resnet_{name}" for name in resnet_df.columns
]
print("Renamed ResNet feature columns to prevent conflicts.")

# ================================
# 4. Merge and Preprocess DataFrames
# ================================

# Merge Information and Metadata.
# Left join: every track of information_df is kept; tracks without a metadata
# row get NaN in 'popularity'. (Selecting ['id', 'popularity'] already raises
# KeyError if metadata_df lacks either column.)
catalog_df = pd.merge(information_df, metadata_df[['id', 'popularity']], on='id', how='left')
print(f"\nMerged catalog_df shape after adding Metadata: {catalog_df.shape}")

# Verify 'popularity' Column
if 'popularity' in catalog_df.columns:
    print("\n'popularity' column successfully added to catalog_df.")
    print(catalog_df[['id', 'artist', 'song', 'popularity']].head())
else:
    print("Error: 'popularity' column is still missing after merging Metadata.")
    # Raise SystemExit directly: exit() is an interactive helper and is not
    # guaranteed to halt the cell when run inside a notebook kernel.
    raise SystemExit(1)

# Handle Missing 'popularity' Values
missing_popularity = catalog_df['popularity'].isnull().sum()
print(f"\nNumber of tracks with missing 'popularity': {missing_popularity}")

if missing_popularity > 0:
    # Drop tracks with missing 'popularity' and renumber the rows from 0
    initial_size = len(catalog_df)
    catalog_df = catalog_df.dropna(subset=['popularity']).reset_index(drop=True)
    final_size = len(catalog_df)
    dropped = initial_size - final_size
    print(f"Dropped {dropped} tracks due to missing 'popularity' values.")
else:
    print("No missing 'popularity' values found.")

# Function to parse genres from string to list
def parse_genres(genre_str):
    """
    Splits a comma-separated genre string into a list of whitespace-trimmed pieces.

    Parameters:
    - genre_str (str or missing value): Raw genre field of one track.

    Returns:
    - list: The trimmed pieces in their original order, or [] when the value is missing.

    NOTE(review): the split is purely on ','. If the genres file stores Python
    list literals (e.g. "['rock', 'pop']"), brackets and quotes would remain in
    the pieces - confirm against the data file.
    """
    return [] if pd.isnull(genre_str) else [piece.strip() for piece in genre_str.split(',')]

# Apply parsing to 'genre' column.
# This overwrites genres_df['genre'] in place (strings become lists), so
# re-running this statement without reloading genres_df would pass
# already-parsed lists back into parse_genres, which expects strings.
genres_df['genre'] = genres_df['genre'].apply(parse_genres)

# Merge Information and Genres to update catalog_df.
# Left join: tracks without a row in genres_df get NaN in 'genre'.
catalog_df = pd.merge(catalog_df, genres_df, on='id', how='left')
print(f"Merged catalog_df shape after merging Genres: {catalog_df.shape}")

# Handle missing genres by assigning an empty list
# (anything that is not already a list, e.g. the NaN from the left join, becomes []).
catalog_df['genre'] = catalog_df['genre'].apply(lambda x: x if isinstance(x, list) else [])

# Function to get the top genre from the genre list
def get_top_genre(genres_list):
    """
    Picks the first entry of a genre list as the track's top genre.

    Parameters:
    - genres_list (list): Genres of one track; may be empty.

    Returns:
    - The first element of genres_list, or None when the list is empty or falsy.
    """
    # "Top" simply means "listed first"; change the selection rule here if needed.
    return genres_list[0] if genres_list else None

# Apply the function to determine the top genre
# (None for tracks whose genre list is empty).
catalog_df['top_genre'] = catalog_df['genre'].apply(get_top_genre)

# Display sample data
print("\nSample of catalog_df:")
print(catalog_df[['id', 'artist', 'song', 'top_genre']].head())

# Merge Tags into catalog_df
def parse_tags(tag_str):
    """
    Extracts the tag names from the string form of a Python dict literal.

    Parameters:
    - tag_str (str or missing value): Raw tags field of one track.

    Returns:
    - list: The keys of the parsed dict in their original order. An empty list is
      returned when the value is missing, cannot be parsed as a literal, or
      parses to something other than a dict. The dict values are discarded.
    """
    if pd.isnull(tag_str):
        return []

    try:
        parsed = ast.literal_eval(tag_str)
    except (ValueError, SyntaxError):
        # Not a valid Python literal: treat the track as untagged.
        return []

    return list(parsed.keys()) if isinstance(parsed, dict) else []

# Turn each raw tag string into a list of tag names (see parse_tags).
tags_df['tags'] = tags_df['tags_str'].apply(parse_tags)

# Left join: tracks without a row in tags_df get NaN in 'tags'; rows of tags_df
# whose id matches no catalog track are ignored.
catalog_df = pd.merge(catalog_df, tags_df[['id', 'tags']], on='id', how='left')
print(f"Merged catalog_df shape after merging Tags: {catalog_df.shape}")

# Handle missing tags by assigning an empty list
# (anything that is not already a list, e.g. the NaN from the left join, becomes []).
catalog_df['tags'] = catalog_df['tags'].apply(lambda x: x if isinstance(x, list) else [])

# Display sample data to verify
print("\nSample of catalog_df after merging tags:")
print(catalog_df[['id', 'artist', 'song', 'top_genre', 'tags']].head())

Loaded DataFrame from 'data/id_information_mmsr.tsv' with shape: (5148, 4)
Loaded DataFrame from 'data/id_genres_mmsr.tsv' with shape: (5148, 2)
Loaded DataFrame from 'data/id_metadata_mmsr.tsv' with shape: (5148, 11)

Metadata DataFrame Columns:
['id', 'spotify_id', 'popularity', 'release', 'danceability', 'energy', 'key', 'mode', 'valence', 'tempo', 'duration_ms']
Loaded DataFrame from 'data/id_tags_dict.tsv' with shape: (5149, 2)
Loaded DataFrame from 'data/id_lyrics_tf-idf_mmsr.tsv' with shape: (5148, 1001)
Renamed TF-IDF feature columns to prevent conflicts.
Loaded DataFrame from 'data/id_lyrics_bert_mmsr.tsv' with shape: (5148, 769)
Renamed BERT feature columns to prevent conflicts.
Loaded DataFrame from 'data/id_mfcc_bow_mmsr.tsv' with shape: (5148, 501)
Renamed MFCC Bag-of-Words feature columns to prevent conflicts.
Loaded DataFrame from 'data/id_blf_spectralcontrast_mmsr.tsv' with shape: (5148, 801)
Renamed Spectral Contrast feature columns to prevent conflicts.
Loaded DataFra

In [2]:
# ================================
# 5. Process and Integrate TF-IDF Features
# ================================

# The merge below joins on 'id', so that is the only column that must exist.
if 'id' not in lyrics_tfidf_df.columns:
    print("Error: Unexpected columns in Lyrics TF-IDF DataFrame.")
    # Raise SystemExit directly: exit() is an interactive helper and is not
    # guaranteed to halt the cell when run inside a notebook kernel.
    raise SystemExit(1)
print("\nProcessing Lyrics TF-IDF DataFrame...")

# Identify TF-IDF feature columns (exclude 'id')
tfidf_feature_columns = [col for col in lyrics_tfidf_df.columns if col != 'id']

# Make the cell re-runnable: drop TF-IDF columns left over from an earlier run
# so the merge cannot create suffixed duplicates (*_x / *_y).
stale_tfidf_columns = catalog_df.columns.intersection(tfidf_feature_columns)
if len(stale_tfidf_columns) > 0:
    catalog_df = catalog_df.drop(columns=stale_tfidf_columns)

# Merge TF-IDF with catalog_df (left join keeps every catalog track)
catalog_df = pd.merge(catalog_df, lyrics_tfidf_df, on='id', how='left')
print(f"Merged catalog_df shape after adding TF-IDF features: {catalog_df.shape}")
print(f"Number of TF-IDF feature columns: {len(tfidf_feature_columns)}")

# Check for missing TF-IDF columns
missing_tfidf_columns = [col for col in tfidf_feature_columns if col not in catalog_df.columns]
if missing_tfidf_columns:
    print(f"Missing TF-IDF columns in catalog_df: {missing_tfidf_columns[:10]}")
else:
    print("All TF-IDF columns are present in catalog_df.")

# Check for missing values in TF-IDF features
missing_tfidf = catalog_df[tfidf_feature_columns].isnull().sum().sum()
print(f"Total missing TF-IDF values: {missing_tfidf}")

if missing_tfidf > 0:
    # Drop tracks with missing TF-IDF features and renumber the rows from 0
    initial_size = len(catalog_df)
    catalog_df = catalog_df.dropna(subset=tfidf_feature_columns).reset_index(drop=True)
    final_size = len(catalog_df)
    dropped = initial_size - final_size
    print(f"Dropped {dropped} tracks due to missing TF-IDF features.")
else:
    print("No missing TF-IDF values found.")

# Extract TF-IDF features.
# NOTE: rows follow catalog_df as it is at this point. The later feature
# sections may drop rows from catalog_df; this matrix is not rebuilt when
# that happens.
tfidf_features = catalog_df[tfidf_feature_columns].values

# Normalize TF-IDF features to unit L2 length per row
tfidf_matrix = normalize(tfidf_features, norm='l2')

# Convert to sparse matrix for efficiency
tfidf_matrix = csr_matrix(tfidf_matrix)
print(f"\nTF-IDF Matrix Shape: {tfidf_matrix.shape}")

# ================================
# 6. Process and Integrate BERT Features
# ================================

print("\nProcessing BERT Embeddings...")

# Read the feature column names straight from bert_df, whose columns already
# carry the "bert_" prefix from the loading step. Building the list by
# prefixing the previous value of bert_feature_columns made this cell fail on a
# second run (the names turned into "bert_bert_*").
bert_feature_columns = [col for col in bert_df.columns if col != 'id']

# Make the cell re-runnable: drop BERT columns left over from an earlier run
# so the merge cannot create suffixed duplicates (*_x / *_y).
stale_bert_columns = catalog_df.columns.intersection(bert_feature_columns)
if len(stale_bert_columns) > 0:
    catalog_df = catalog_df.drop(columns=stale_bert_columns)

# Merge BERT embeddings with catalog_df (left join keeps every catalog track)
catalog_df = pd.merge(catalog_df, bert_df, on='id', how='left')
print(f"Merged catalog_df shape after adding BERT embeddings: {catalog_df.shape}")

# Handle missing BERT embeddings by dropping tracks with missing BERT features
missing_bert = catalog_df[bert_feature_columns].isnull().sum().sum()
if missing_bert > 0:
    initial_size = len(catalog_df)
    catalog_df = catalog_df.dropna(subset=bert_feature_columns).reset_index(drop=True)
    final_size = len(catalog_df)
    dropped = initial_size - final_size
    print(f"Dropped {dropped} tracks due to missing BERT embeddings.")
else:
    print("No missing BERT embeddings found.")

# Extract BERT features.
# NOTE: rows follow catalog_df as it is at this point; matrices extracted in
# other sections are not rebuilt if rows are dropped here or later.
bert_features = catalog_df[bert_feature_columns].values

# Stored in CSR form so all feature matrices share one container type
bert_matrix = csr_matrix(bert_features)
print(f"BERT Matrix Shape: {bert_matrix.shape}")

# ================================
# 7. Process and Integrate Audio Features (MFCC and Spectral Contrast)
# ================================

print("\nProcessing Audio Features (MFCC and Spectral Contrast)...")

# Prefixed column names as they appear in mfcc_bow_df / spectral_contrast_df
# (mfcc_bow_columns and spectral_contrast_columns hold the unprefixed names).
# Built once and reused below.
mfcc_feature_columns = [f"mfcc_{col}" for col in mfcc_bow_columns]
spectral_feature_columns = [f"spectral_{col}" for col in spectral_contrast_columns]
audio_feature_columns = mfcc_feature_columns + spectral_feature_columns

# Make the cell re-runnable: drop audio columns left over from an earlier run
# so the merges cannot create suffixed duplicates (*_x / *_y).
stale_audio_columns = catalog_df.columns.intersection(audio_feature_columns)
if len(stale_audio_columns) > 0:
    catalog_df = catalog_df.drop(columns=stale_audio_columns)

# Merge MFCC and Spectral Contrast with catalog_df (left joins keep every catalog track)
catalog_df = pd.merge(catalog_df, mfcc_bow_df, on='id', how='left')
catalog_df = pd.merge(catalog_df, spectral_contrast_df, on='id', how='left')
print(f"Merged catalog_df shape after adding Audio features: {catalog_df.shape}")

# Handle missing Audio features by dropping tracks with missing audio features
missing_audio = catalog_df[audio_feature_columns].isnull().sum().sum()
print(f"Total missing Audio feature values: {missing_audio}")

if missing_audio > 0:
    initial_size = len(catalog_df)
    catalog_df = catalog_df.dropna(subset=audio_feature_columns).reset_index(drop=True)
    final_size = len(catalog_df)
    dropped = initial_size - final_size
    print(f"Dropped {dropped} tracks due to missing Audio features.")
else:
    print("No missing Audio features found.")

# NOTE: rows of both matrices follow catalog_df as it is at this point;
# matrices extracted in other sections are not rebuilt if rows are dropped here
# or later.

# Extract MFCC features
mfcc_bow_features = catalog_df[mfcc_feature_columns].values
mfcc_bow_matrix = csr_matrix(mfcc_bow_features)
print(f"\nMFCC Bag-of-Words Matrix Shape: {mfcc_bow_matrix.shape}")

# Extract Spectral Contrast features
spectral_contrast_features = catalog_df[spectral_feature_columns].values
spectral_contrast_matrix = csr_matrix(spectral_contrast_features)
print(f"Spectral Contrast Matrix Shape: {spectral_contrast_matrix.shape}")

# ================================
# 8. Process and Integrate VGG19 Features
# ================================

print("\nProcessing VGG19 Features...")

# Read the feature column names straight from vgg19_df, whose columns already
# carry the "vgg19_" prefix from the loading step. Building the list by
# prefixing the previous value of vgg19_feature_columns made this cell fail on
# a second run (the names turned into "vgg19_vgg19_*").
vgg19_feature_columns = [col for col in vgg19_df.columns if col != 'id']

# Make the cell re-runnable: drop VGG19 columns left over from an earlier run
# so the merge cannot create suffixed duplicates (*_x / *_y).
stale_vgg19_columns = catalog_df.columns.intersection(vgg19_feature_columns)
if len(stale_vgg19_columns) > 0:
    catalog_df = catalog_df.drop(columns=stale_vgg19_columns)

# Merge VGG19 with catalog_df (left join keeps every catalog track)
catalog_df = pd.merge(catalog_df, vgg19_df, on='id', how='left')
print(f"Merged catalog_df shape after adding VGG19 features: {catalog_df.shape}")

# Handle missing VGG19 features by dropping tracks with missing VGG19 features
missing_vgg19 = catalog_df[vgg19_feature_columns].isnull().sum().sum()
if missing_vgg19 > 0:
    initial_size = len(catalog_df)
    catalog_df = catalog_df.dropna(subset=vgg19_feature_columns).reset_index(drop=True)
    final_size = len(catalog_df)
    dropped = initial_size - final_size
    print(f"Dropped {dropped} tracks due to missing VGG19 features.")
else:
    print("No missing VGG19 features found.")

# Extract VGG19 features.
# NOTE: rows follow catalog_df as it is at this point; matrices extracted in
# other sections are not rebuilt if rows are dropped here or later.
vgg19_features = catalog_df[vgg19_feature_columns].values
vgg19_matrix = csr_matrix(vgg19_features)
print(f"VGG19 Matrix Shape: {vgg19_matrix.shape}")

# ================================
# 9. Process and Integrate ResNet Features
# ================================

print("\nProcessing ResNet Features...")

# Read the feature column names straight from resnet_df, whose columns already
# carry the "resnet_" prefix from the loading step. Building the list by
# prefixing the previous value of resnet_feature_columns made this cell fail on
# a second run (the names turned into "resnet_resnet_*").
resnet_feature_columns = [col for col in resnet_df.columns if col != 'id']

# Make the cell re-runnable: drop ResNet columns left over from an earlier run
# so the merge cannot create suffixed duplicates (*_x / *_y).
stale_resnet_columns = catalog_df.columns.intersection(resnet_feature_columns)
if len(stale_resnet_columns) > 0:
    catalog_df = catalog_df.drop(columns=stale_resnet_columns)

# Merge ResNet with catalog_df (left join keeps every catalog track)
catalog_df = pd.merge(catalog_df, resnet_df, on='id', how='left')
print(f"Merged catalog_df shape after adding ResNet features: {catalog_df.shape}")

# Handle missing ResNet features by dropping tracks with missing ResNet features
missing_resnet = catalog_df[resnet_feature_columns].isnull().sum().sum()
if missing_resnet > 0:
    initial_size = len(catalog_df)
    catalog_df = catalog_df.dropna(subset=resnet_feature_columns).reset_index(drop=True)
    final_size = len(catalog_df)
    dropped = initial_size - final_size
    print(f"Dropped {dropped} tracks due to missing ResNet features.")
else:
    print("No missing ResNet features found.")

# Extract ResNet features.
# NOTE: rows follow catalog_df as it is at this point; matrices extracted in
# earlier sections are not rebuilt if rows were dropped here.
resnet_features = catalog_df[resnet_feature_columns].values
resnet_matrix = csr_matrix(resnet_features)
print(f"ResNet Matrix Shape: {resnet_matrix.shape}")


Processing Lyrics TF-IDF DataFrame...
Merged catalog_df shape after adding TF-IDF features: (5148, 1008)
Number of TF-IDF feature columns: 1000
All TF-IDF columns are present in catalog_df.
Total missing TF-IDF values: 0
No missing TF-IDF values found.

TF-IDF Matrix Shape: (5148, 1000)

Processing BERT Embeddings...
Merged catalog_df shape after adding BERT embeddings: (5148, 1776)
No missing BERT embeddings found.
BERT Matrix Shape: (5148, 768)

Processing Audio Features (MFCC and Spectral Contrast)...
Merged catalog_df shape after adding Audio features: (5148, 3076)
Total missing Audio feature values: 0
No missing Audio features found.

MFCC Bag-of-Words Matrix Shape: (5148, 500)
Spectral Contrast Matrix Shape: (5148, 800)

Processing VGG19 Features...
Merged catalog_df shape after adding VGG19 features: (5148, 11268)
No missing VGG19 features found.
VGG19 Matrix Shape: (5148, 8192)

Processing ResNet Features...
Merged catalog_df shape after adding ResNet features: (5148, 15364)
N

In [3]:
# ================================
# 10. Define Retrieval Functions
# ================================

def random_retrieval(query_track_id, id_to_index=None, feature_matrix=None, track_ids=None, catalog_df=None, N=10):
    """
    Baseline retriever: draws up to N catalog tracks at random, never the query itself.

    The signature mirrors the feature-based retrieval functions so the baseline
    can be called the same way; only query_track_id, catalog_df and N are used.

    Parameters:
    - query_track_id (str): The ID of the query track.
    - id_to_index (dict, optional): Unused.
    - feature_matrix (csr_matrix, optional): Unused.
    - track_ids (list, optional): Unused.
    - catalog_df (pd.DataFrame): The catalog to sample from; needs an 'id' column.
    - N (int): Number of tracks to retrieve (capped at the number of candidates).

    Returns:
    - retrieved_tracks (pd.DataFrame): The sampled catalog rows with a fresh 0..k-1 index.

    Raises:
    - ValueError: If catalog_df is not provided.
    """
    if catalog_df is None:
        raise ValueError("catalog_df must be provided for Random Retrieval.")

    # Every track except the query is a candidate.
    is_other_track = catalog_df['id'] != query_track_id
    pool = catalog_df[is_other_track]

    # Never ask for more rows than the pool holds (sampling is without replacement).
    how_many = min(N, len(pool))

    # The pandas seed is drawn from Python's `random` module.
    seed = random.randint(0, 1000000)
    picked = pool.sample(n=how_many, replace=False, random_state=seed)

    return picked.reset_index(drop=True)

# Define and Initialize tfidf_retrieval function
def tfidf_retrieval(query_track_id, id_to_index, feature_matrix, track_ids, catalog_df, N=10):
    """
    Retrieves up to N tracks most similar to the query track based on TF-IDF cosine similarity.

    Parameters:
    - query_track_id (str): The ID of the query track.
    - id_to_index (dict): Mapping from track ID to row index in feature_matrix.
    - feature_matrix (csr_matrix): TF-IDF feature matrix, one row per track.
    - track_ids (list): Track IDs, indexed like the rows of feature_matrix.
    - catalog_df (pd.DataFrame): Track metadata; must contain the columns
      'id', 'artist', 'song' and 'album_name'.
    - N (int): Maximum number of tracks to retrieve; N <= 0 retrieves nothing.

    Returns:
    - retrieved_tracks_df (pd.DataFrame): Retrieved tracks, most similar first,
      with columns 'id', 'similarity', 'artist', 'song', 'album_name'. The query
      track is never included. A completely empty DataFrame is returned when
      query_track_id is not a key of id_to_index.
    """
    if query_track_id not in id_to_index:
        return pd.DataFrame()

    query_index = id_to_index[query_track_id]
    query_vector = feature_matrix[query_index]

    # Cosine similarity of the query row against every row
    similarities = cosine_similarity(query_vector, feature_matrix).flatten()

    # Push the query's self-similarity to the bottom of the ranking
    similarities[query_index] = -1

    # Rank best-first, then remove the query by position. The sentinel alone is
    # not enough: with N >= number of tracks (or a candidate tied at -1) the
    # query would otherwise be returned, and the former `[-N:]` slice returned
    # the whole catalog for N == 0.
    ranked_indices = similarities.argsort()[::-1]
    ranked_indices = ranked_indices[ranked_indices != query_index]
    top_indices = ranked_indices[:max(N, 0)]

    retrieved_ids = [track_ids[i] for i in top_indices]
    retrieved_scores = [similarities[i] for i in top_indices]

    retrieved_tracks_df = pd.DataFrame({
        'id': retrieved_ids,
        'similarity': retrieved_scores
    })

    # Attach display metadata; the left join keeps the ranking order
    retrieved_tracks_df = pd.merge(retrieved_tracks_df, catalog_df[['id', 'artist', 'song', 'album_name']], on='id', how='left')

    return retrieved_tracks_df

def bert_retrieval(query_track_id, id_to_index, feature_matrix, track_ids, catalog_df, N=10):
    """
    Retrieves up to N tracks most similar to the query track based on BERT cosine similarity.

    Parameters:
    - query_track_id (str): The ID of the query track.
    - id_to_index (dict): Mapping from track ID to row index in feature_matrix.
    - feature_matrix (csr_matrix): BERT feature matrix, one row per track.
    - track_ids (list): Track IDs, indexed like the rows of feature_matrix.
    - catalog_df (pd.DataFrame): Track metadata; must contain the columns
      'id', 'artist', 'song' and 'album_name'.
    - N (int): Maximum number of tracks to retrieve; N <= 0 retrieves nothing.

    Returns:
    - retrieved_tracks_df (pd.DataFrame): Retrieved tracks, most similar first,
      with columns 'id', 'similarity', 'artist', 'song', 'album_name'. The query
      track is never included. A completely empty DataFrame is returned when
      query_track_id is not a key of id_to_index.
    """
    if query_track_id not in id_to_index:
        return pd.DataFrame()

    query_index = id_to_index[query_track_id]
    query_vector = feature_matrix[query_index]

    # Cosine similarity of the query row against every row
    similarities = cosine_similarity(query_vector, feature_matrix).flatten()

    # Push the query's self-similarity to the bottom of the ranking
    similarities[query_index] = -1

    # Rank best-first, then remove the query by position. The sentinel alone is
    # not enough: with N >= number of tracks (or a candidate tied at -1) the
    # query would otherwise be returned, and the former `[-N:]` slice returned
    # the whole catalog for N == 0.
    ranked_indices = similarities.argsort()[::-1]
    ranked_indices = ranked_indices[ranked_indices != query_index]
    top_indices = ranked_indices[:max(N, 0)]

    retrieved_ids = [track_ids[i] for i in top_indices]
    retrieved_scores = [similarities[i] for i in top_indices]

    retrieved_tracks_df = pd.DataFrame({
        'id': retrieved_ids,
        'similarity': retrieved_scores
    })

    # Attach display metadata; the left join keeps the ranking order
    retrieved_tracks_df = pd.merge(retrieved_tracks_df, catalog_df[['id', 'artist', 'song', 'album_name']], on='id', how='left')

    return retrieved_tracks_df

def mfcc_retrieval(query_track_id, id_to_index, feature_matrix, track_ids, catalog_df, N=10):
    """
    Retrieves up to N tracks closest to the query track based on MFCC Euclidean distance.

    Parameters:
    - query_track_id (str): The ID of the query track.
    - id_to_index (dict): Mapping from track ID to row index in feature_matrix.
    - feature_matrix (csr_matrix): MFCC Bag-of-Words feature matrix, one row per track.
    - track_ids (list): Track IDs, indexed like the rows of feature_matrix.
    - catalog_df (pd.DataFrame): Track metadata; must contain the columns
      'id', 'artist', 'song' and 'album_name'.
    - N (int): Maximum number of tracks to retrieve; N <= 0 retrieves nothing.

    Returns:
    - retrieved_tracks_df (pd.DataFrame): Retrieved tracks, closest first, with
      columns 'id', 'distance', 'artist', 'song', 'album_name'. The query track
      is never included. A completely empty DataFrame is returned when
      query_track_id is not a key of id_to_index.
    """
    if query_track_id not in id_to_index:
        return pd.DataFrame()

    query_index = id_to_index[query_track_id]
    query_vector = feature_matrix[query_index].toarray()

    # Row-wise Euclidean distance to the query.
    # NOTE: subtracting a dense row from the sparse matrix produces a dense
    # result, so this step materialises a full (tracks x features) array.
    distances = np.linalg.norm(feature_matrix - query_vector, axis=1)

    # Push the query itself to the end of the ranking
    distances[query_index] = np.inf

    # Rank closest-first, then remove the query by position: the inf sentinel
    # alone still let the query through when N >= number of tracks. Clamping N
    # keeps a negative N from slicing off the wrong end.
    ranked_indices = distances.argsort()
    ranked_indices = ranked_indices[ranked_indices != query_index]
    top_indices = ranked_indices[:max(N, 0)]

    retrieved_ids = [track_ids[i] for i in top_indices]
    retrieved_distances = distances[top_indices]

    retrieved_tracks_df = pd.DataFrame({
        'id': retrieved_ids,
        'distance': retrieved_distances
    })

    # Attach display metadata; the left join keeps the ranking order
    retrieved_tracks_df = pd.merge(retrieved_tracks_df, catalog_df[['id', 'artist', 'song', 'album_name']], on='id', how='left')

    return retrieved_tracks_df

def spectral_contrast_retrieval(query_track_id, id_to_index, feature_matrix, track_ids, catalog_df, N=10):
    """
    Retrieves up to N tracks most similar to the query track based on Spectral Contrast cosine similarity.

    Parameters:
    - query_track_id (str): The ID of the query track.
    - id_to_index (dict): Mapping from track ID to row index in feature_matrix.
    - feature_matrix (csr_matrix): Spectral Contrast feature matrix, one row per track.
    - track_ids (list): Track IDs, indexed like the rows of feature_matrix.
    - catalog_df (pd.DataFrame): Track metadata; must contain the columns
      'id', 'artist', 'song' and 'album_name'.
    - N (int): Maximum number of tracks to retrieve; N <= 0 retrieves nothing.

    Returns:
    - retrieved_tracks_df (pd.DataFrame): Retrieved tracks, most similar first,
      with columns 'id', 'similarity', 'artist', 'song', 'album_name'. The query
      track is never included. A completely empty DataFrame is returned when
      query_track_id is not a key of id_to_index.
    """
    if query_track_id not in id_to_index:
        return pd.DataFrame()

    query_index = id_to_index[query_track_id]
    query_vector = feature_matrix[query_index]

    # Cosine similarity of the query row against every row
    similarities = cosine_similarity(query_vector, feature_matrix).flatten()

    # Push the query's self-similarity to the bottom of the ranking
    similarities[query_index] = -1

    # Rank best-first, then remove the query by position. The sentinel alone is
    # not enough: with N >= number of tracks (or a candidate tied at -1) the
    # query would otherwise be returned, and the former `[-N:]` slice returned
    # the whole catalog for N == 0.
    ranked_indices = similarities.argsort()[::-1]
    ranked_indices = ranked_indices[ranked_indices != query_index]
    top_indices = ranked_indices[:max(N, 0)]

    retrieved_ids = [track_ids[i] for i in top_indices]
    retrieved_scores = [similarities[i] for i in top_indices]

    retrieved_tracks_df = pd.DataFrame({
        'id': retrieved_ids,
        'similarity': retrieved_scores
    })

    # Attach display metadata; the left join keeps the ranking order
    retrieved_tracks_df = pd.merge(retrieved_tracks_df, catalog_df[['id', 'artist', 'song', 'album_name']], on='id', how='left')

    return retrieved_tracks_df

def vgg19_retrieval(query_track_id, id_to_index, feature_matrix, track_ids, catalog_df, N=10):
    """
    Retrieves up to N tracks most similar to the query track based on VGG19 cosine similarity.

    Parameters:
    - query_track_id (str): The ID of the query track.
    - id_to_index (dict): Mapping from track ID to row index in feature_matrix.
    - feature_matrix (csr_matrix): VGG19 feature matrix, one row per track.
    - track_ids (list): Track IDs, indexed like the rows of feature_matrix.
    - catalog_df (pd.DataFrame): Track metadata; must contain the columns
      'id', 'artist', 'song' and 'album_name'.
    - N (int): Maximum number of tracks to retrieve; N <= 0 retrieves nothing.

    Returns:
    - retrieved_tracks_df (pd.DataFrame): Retrieved tracks, most similar first,
      with columns 'id', 'similarity', 'artist', 'song', 'album_name'. The query
      track is never included. A completely empty DataFrame is returned when
      query_track_id is not a key of id_to_index.
    """
    if query_track_id not in id_to_index:
        return pd.DataFrame()

    query_index = id_to_index[query_track_id]
    query_vector = feature_matrix[query_index]

    # Cosine similarity of the query row against every row
    similarities = cosine_similarity(query_vector, feature_matrix).flatten()

    # Push the query's self-similarity to the bottom of the ranking
    similarities[query_index] = -1

    # Rank best-first, then remove the query by position. The sentinel alone is
    # not enough: with N >= number of tracks (or a candidate tied at -1) the
    # query would otherwise be returned, and the former `[-N:]` slice returned
    # the whole catalog for N == 0.
    ranked_indices = similarities.argsort()[::-1]
    ranked_indices = ranked_indices[ranked_indices != query_index]
    top_indices = ranked_indices[:max(N, 0)]

    retrieved_ids = [track_ids[i] for i in top_indices]
    retrieved_scores = [similarities[i] for i in top_indices]

    retrieved_tracks_df = pd.DataFrame({
        'id': retrieved_ids,
        'similarity': retrieved_scores
    })

    # Attach display metadata; the left join keeps the ranking order
    retrieved_tracks_df = pd.merge(retrieved_tracks_df, catalog_df[['id', 'artist', 'song', 'album_name']], on='id', how='left')

    return retrieved_tracks_df

def resnet_retrieval(query_track_id, id_to_index, feature_matrix, track_ids, catalog_df, N=10):
    """
    Retrieves up to N tracks closest to the query track based on ResNet Euclidean distance.

    Parameters:
    - query_track_id (str): The ID of the query track.
    - id_to_index (dict): Mapping from track ID to row index in feature_matrix.
    - feature_matrix (csr_matrix): ResNet feature matrix, one row per track.
    - track_ids (list): Track IDs, indexed like the rows of feature_matrix.
    - catalog_df (pd.DataFrame): Track metadata; must contain the columns
      'id', 'artist', 'song' and 'album_name'.
    - N (int): Maximum number of tracks to retrieve; N <= 0 retrieves nothing.

    Returns:
    - retrieved_tracks_df (pd.DataFrame): Retrieved tracks, closest first, with
      columns 'id', 'distance', 'artist', 'song', 'album_name'. The query track
      is never included. A completely empty DataFrame is returned when
      query_track_id is not a key of id_to_index.
    """
    if query_track_id not in id_to_index:
        return pd.DataFrame()

    query_index = id_to_index[query_track_id]
    query_vector = feature_matrix[query_index].toarray()

    # Row-wise Euclidean distance to the query.
    # NOTE: subtracting a dense row from the sparse matrix produces a dense
    # result, so this step materialises a full (tracks x features) array.
    distances = np.linalg.norm(feature_matrix - query_vector, axis=1)

    # Push the query itself to the end of the ranking
    distances[query_index] = np.inf

    # Rank closest-first, then remove the query by position: the inf sentinel
    # alone still let the query through when N >= number of tracks. Clamping N
    # keeps a negative N from slicing off the wrong end.
    ranked_indices = distances.argsort()
    ranked_indices = ranked_indices[ranked_indices != query_index]
    top_indices = ranked_indices[:max(N, 0)]

    retrieved_ids = [track_ids[i] for i in top_indices]
    retrieved_distances = distances[top_indices]

    retrieved_tracks_df = pd.DataFrame({
        'id': retrieved_ids,
        'distance': retrieved_distances
    })

    # Attach display metadata; the left join keeps the ranking order
    retrieved_tracks_df = pd.merge(retrieved_tracks_df, catalog_df[['id', 'artist', 'song', 'album_name']], on='id', how='left')

    return retrieved_tracks_df

# ================================
# 11. Process and Integrate Tag Features
# ================================

def build_tag_vocabulary(catalog_df):
    """
    Assigns a column index to every distinct tag that occurs in the catalog.

    Parameters:
    - catalog_df (pd.DataFrame): DataFrame whose 'tags' column holds one
      iterable of tags per track.

    Returns:
    - tag_to_index (dict): Mapping from tag to index; indices follow the sorted
      order of the tags, starting at 0.
    """
    unique_tags = set()
    for track_tags in catalog_df['tags']:
        unique_tags.update(track_tags)

    ordered_tags = sorted(unique_tags)
    return dict(zip(ordered_tags, range(len(ordered_tags))))

def vectorize_tags(catalog_df, tag_to_index):
    """
    Builds an L2-normalised sparse track-by-tag occurrence matrix.

    Parameters:
    - catalog_df (pd.DataFrame): DataFrame whose 'tags' column holds one
      iterable of tags per track.
    - tag_to_index (dict): Mapping from tag to column index.

    Returns:
    - tag_matrix (csr_matrix): Matrix with one row per catalog row and one
      column per vocabulary entry. Tags missing from tag_to_index are skipped.
    """
    # One (row, column) coordinate per known tag occurrence
    coordinates = [
        (row_number, tag_to_index[tag])
        for row_number, track_tags in enumerate(catalog_df['tags'])
        for tag in track_tags
        if tag in tag_to_index
    ]
    row_indices = [row_number for row_number, _ in coordinates]
    col_indices = [column for _, column in coordinates]

    # Each occurrence contributes 1 (no per-tag weights are used)
    ones = [1] * len(coordinates)

    matrix_shape = (len(catalog_df), len(tag_to_index))
    occurrence_matrix = csr_matrix((ones, (row_indices, col_indices)), shape=matrix_shape)

    return normalize(occurrence_matrix, norm='l2')

# Building Tag Vocabulary: one index per distinct tag found in catalog_df['tags']
tag_to_index = build_tag_vocabulary(catalog_df)
print(f"Total unique tags: {len(tag_to_index)}")

# Vectorizing Tags: one row per catalog_df row (as it is at this point),
# one column per vocabulary entry
tag_matrix = vectorize_tags(catalog_df, tag_to_index)
print(f"Tag Matrix Shape: {tag_matrix.shape}")

def tag_based_retrieval(query_track_id, id_to_index, feature_matrix, track_ids, catalog_df, N=10):
    """
    Retrieves up to N tracks most similar to the query track based on tag cosine similarity.

    Parameters:
    - query_track_id (str): The ID of the query track.
    - id_to_index (dict): Mapping from track ID to row index in feature_matrix.
    - feature_matrix (csr_matrix): Tag feature matrix, one row per track.
    - track_ids (list): Track IDs, indexed like the rows of feature_matrix.
    - catalog_df (pd.DataFrame): Track metadata; must contain the columns
      'id', 'artist', 'song' and 'album_name'.
    - N (int): Maximum number of tracks to retrieve; N <= 0 retrieves nothing.

    Returns:
    - retrieved_tracks_df (pd.DataFrame): Retrieved tracks, most similar first,
      with columns 'id', 'similarity', 'artist', 'song', 'album_name'. The query
      track is never included. A completely empty DataFrame is returned when
      query_track_id is not a key of id_to_index.
    """
    if query_track_id not in id_to_index:
        return pd.DataFrame()

    query_index = id_to_index[query_track_id]
    query_vector = feature_matrix[query_index]

    # Cosine similarity of the query row against every row
    similarities = cosine_similarity(query_vector, feature_matrix).flatten()

    # Push the query's self-similarity to the bottom of the ranking
    similarities[query_index] = -1

    # Rank best-first, then remove the query by position. The sentinel alone is
    # not enough: with N >= number of tracks (or a candidate tied at -1) the
    # query would otherwise be returned, and the former `[-N:]` slice returned
    # the whole catalog for N == 0.
    ranked_indices = similarities.argsort()[::-1]
    ranked_indices = ranked_indices[ranked_indices != query_index]
    top_indices = ranked_indices[:max(N, 0)]

    retrieved_ids = [track_ids[i] for i in top_indices]
    retrieved_scores = [similarities[i] for i in top_indices]

    retrieved_tracks_df = pd.DataFrame({
        'id': retrieved_ids,
        'similarity': retrieved_scores
    })

    # Attach display metadata; the left join keeps the ranking order
    retrieved_tracks_df = pd.merge(retrieved_tracks_df, catalog_df[['id', 'artist', 'song', 'album_name']], on='id', how='left')

    return retrieved_tracks_df

# ================================
# 12. Define Fusion Retrievals
# ================================

def early_fusion_retrieval(query_track_id, id_to_index, feature_matrix, track_ids, catalog_df, N=10):
    """
    Retrieves up to N tracks using Early Fusion: cosine similarity over an
    already-combined feature matrix (e.g. TF-IDF and BERT features side by side).

    Parameters:
    - query_track_id (str): The ID of the query track.
    - id_to_index (dict): Mapping from track ID to row index in feature_matrix.
    - feature_matrix (csr_matrix): Combined feature matrix, one row per track.
      The combination itself is done by the caller.
    - track_ids (list): Track IDs, indexed like the rows of feature_matrix.
    - catalog_df (pd.DataFrame): Track metadata; must contain the columns
      'id', 'artist', 'song' and 'album_name'.
    - N (int): Maximum number of tracks to retrieve; N <= 0 retrieves nothing.

    Returns:
    - retrieved_tracks_df (pd.DataFrame): Retrieved tracks, most similar first,
      with columns 'id', 'similarity', 'artist', 'song', 'album_name'. The query
      track is never included. A completely empty DataFrame is returned when
      query_track_id is not a key of id_to_index.
    """
    if query_track_id not in id_to_index:
        return pd.DataFrame()

    query_index = id_to_index[query_track_id]
    query_vector = feature_matrix[query_index]

    # Cosine similarity of the query row against every row
    similarities = cosine_similarity(query_vector, feature_matrix).flatten()

    # Push the query's self-similarity to the bottom of the ranking
    similarities[query_index] = -1

    # Rank best-first, then remove the query by position. The sentinel alone is
    # not enough: with N >= number of tracks (or a candidate tied at -1) the
    # query would otherwise be returned, and the former `[-N:]` slice returned
    # the whole catalog for N == 0.
    ranked_indices = similarities.argsort()[::-1]
    ranked_indices = ranked_indices[ranked_indices != query_index]
    top_indices = ranked_indices[:max(N, 0)]

    retrieved_ids = [track_ids[i] for i in top_indices]
    retrieved_scores = [similarities[i] for i in top_indices]

    retrieved_tracks_df = pd.DataFrame({
        'id': retrieved_ids,
        'similarity': retrieved_scores
    })

    # Attach display metadata; the left join keeps the ranking order
    retrieved_tracks_df = pd.merge(retrieved_tracks_df, catalog_df[['id', 'artist', 'song', 'album_name']], on='id', how='left')

    return retrieved_tracks_df

def late_fusion_retrieval(query_track_id, id_to_index, feature_matrix1, feature_matrix2, track_ids, catalog_df, N=10, weight1=0.5, weight2=0.5):
    """
    Retrieves N tracks using Late Fusion: cosine similarities are computed
    separately on two feature matrices and combined as a weighted sum.

    Parameters:
    - query_track_id (str): The ID of the query track.
    - id_to_index (dict): Mapping from track ID to row index in feature_matrix1 and feature_matrix2.
    - feature_matrix1 (csr_matrix): First feature matrix (MFCC in this notebook).
    - feature_matrix2 (csr_matrix): Second feature matrix (VGG19 in this notebook).
    - track_ids (list): List of all track IDs, positionally aligned with the rows of both matrices.
    - catalog_df (pd.DataFrame): DataFrame containing track metadata ('id', 'artist', 'song', 'album_name').
    - N (int): Number of tracks to retrieve.
    - weight1 (float): Weight for the similarity computed on feature_matrix1.
    - weight2 (float): Weight for the similarity computed on feature_matrix2.

    Returns:
    - retrieved_tracks_df (pd.DataFrame): Retrieved tracks ordered by decreasing
      score, with columns 'id', 'aggregated_similarity', 'artist', 'song',
      'album_name'. Empty if the query ID is unknown or N is not positive.
      The query track itself is never part of the result.
    """
    # N <= 0 must yield nothing; `argsort()[-N:]` returned everything for N == 0.
    if query_track_id not in id_to_index or N <= 0:
        return pd.DataFrame()

    query_index = id_to_index[query_track_id]

    # Per-modality cosine similarity between the query and every track
    similarities1 = cosine_similarity(feature_matrix1[query_index], feature_matrix1).flatten()
    similarities2 = cosine_similarity(feature_matrix2[query_index], feature_matrix2).flatten()

    # Weighted sum of the two similarity vectors
    aggregated_similarities = weight1 * similarities1 + weight2 * similarities2

    # Exclude the query AFTER aggregation. Masking each modality with -1 first
    # gave the query a score of -(weight1 + weight2), which depends on the
    # weights and is not guaranteed to be the lowest score.
    aggregated_similarities[query_index] = -np.inf

    # Best N first
    top_indices = aggregated_similarities.argsort()[::-1][:N]
    # Only relevant when N >= catalogue size: never hand back the query itself
    top_indices = top_indices[top_indices != query_index]

    retrieved_tracks_df = pd.DataFrame({
        'id': [track_ids[i] for i in top_indices],
        'aggregated_similarity': aggregated_similarities[top_indices]
    })

    # Left merge keeps the ranking order of the retrieved tracks
    retrieved_tracks_df = pd.merge(
        retrieved_tracks_df,
        catalog_df[['id', 'artist', 'song', 'album_name']],
        on='id',
        how='left'
    )

    return retrieved_tracks_df

# ================================
# 13. Define Wrapper Functions for Hybrid Systems
# ================================

def tag_based_retrieval_wrapper(query_track_id, id_to_index, feature_matrix, track_ids, catalog_df, N=10):
    """
    Thin adapter around `tag_based_retrieval`.

    Forwards all six arguments positionally and unchanged, and returns whatever
    `tag_based_retrieval` returns. It exists so the tag-based system can be
    registered in `retrieval_systems` next to the other wrappers.

    Parameters are presumably the same as for the other retrieval functions
    (query ID, ID-to-row mapping, feature matrix, list of track IDs, catalog
    DataFrame, number of tracks) -- verify against `tag_based_retrieval`.
    """
    return tag_based_retrieval(query_track_id, id_to_index, feature_matrix, track_ids, catalog_df, N)

def early_fusion_retrieval_wrapper(query_track_id, id_to_index, feature_matrix, track_ids, catalog_df, N=10):
    """
    Adapter that exposes `early_fusion_retrieval` under the common retrieval
    signature used by the evaluation code.

    Every argument is passed through unchanged; `feature_matrix` is expected to
    be the already combined (early-fused) matrix.
    """
    retrieved = early_fusion_retrieval(
        query_track_id=query_track_id,
        id_to_index=id_to_index,
        feature_matrix=feature_matrix,
        track_ids=track_ids,
        catalog_df=catalog_df,
        N=N,
    )
    return retrieved

def late_fusion_retrieval_wrapper(query_track_id, id_to_index, feature_matrix, track_ids, catalog_df, N=10, weight1=0.5, weight2=0.5):
    """
    Wrapper function to perform Late Fusion Retrieval combining MFCC and VGG19.

    The two matrices are looked up in the module-level `feature_matrices`
    dictionary under the keys 'MFCC Retrieval' and 'VGG19 Retrieval' at call
    time, so that dictionary must be defined before this function is called.

    Parameters:
    - query_track_id (str): The ID of the query track.
    - id_to_index (dict): Mapping from track ID to row index.
    - feature_matrix: Ignored. Accepted only so the signature matches the other
      retrieval functions called by the evaluation code.
    - track_ids (list): List of all track IDs.
    - catalog_df (pd.DataFrame): DataFrame containing track metadata.
    - N (int): Number of tracks to retrieve.
    - weight1 (float): Weight of the MFCC similarity (default 0.5).
    - weight2 (float): Weight of the VGG19 similarity (default 0.5).

    Returns:
    - pd.DataFrame: Result of `late_fusion_retrieval`.
    """
    feature_matrix1 = feature_matrices['MFCC Retrieval']
    feature_matrix2 = feature_matrices['VGG19 Retrieval']
    return late_fusion_retrieval(
        query_track_id,
        id_to_index,
        feature_matrix1,
        feature_matrix2,
        track_ids,
        catalog_df,
        N=N,
        weight1=weight1,  # Defaults keep the previous equal weighting
        weight2=weight2
    )

# ================================
# 14. Initialize Feature Matrices and Retrieval Systems
# ================================

# Initialize feature matrices for retrieval functions that require them
feature_matrices = {
    'TF-IDF Retrieval': tfidf_matrix,
    'BERT Retrieval': bert_matrix,
    'MFCC Retrieval': mfcc_bow_matrix,
    'Spectral Contrast Retrieval': spectral_contrast_matrix,
    'VGG19 Retrieval': vgg19_matrix,
    'ResNet Retrieval': resnet_matrix,
    'Tag-Based Retrieval': tag_matrix,
    # Early fusion: column-wise concatenation of the TF-IDF and BERT features.
    # Registered here because 'Early Fusion TF-IDF+BERT Retrieval' is listed in
    # retrieval_systems below; without this entry a lookup by system name
    # returns None and retrieval fails on indexing.
    'Early Fusion TF-IDF+BERT Retrieval': hstack([tfidf_matrix, bert_matrix]).tocsr(),
    # 'Late Fusion MFCC+VGG19 Retrieval' does not require a separate feature matrix as it uses existing ones
}

# Define all retrieval systems (name -> callable with the common retrieval signature)
retrieval_systems = {
    'TF-IDF Retrieval': tfidf_retrieval,
    'Random Retrieval': random_retrieval,
    'BERT Retrieval': bert_retrieval,
    'MFCC Retrieval': mfcc_retrieval,
    'Spectral Contrast Retrieval': spectral_contrast_retrieval,
    'VGG19 Retrieval': vgg19_retrieval,
    'ResNet Retrieval': resnet_retrieval,
    'Tag-Based Retrieval': tag_based_retrieval_wrapper,
    'Early Fusion TF-IDF+BERT Retrieval': early_fusion_retrieval_wrapper,
    'Late Fusion MFCC+VGG19 Retrieval': late_fusion_retrieval_wrapper
}

Total unique tags: 43845
Tag Matrix Shape: (5148, 43845)


In [4]:
# ================================
# 15. Define Evaluation Metrics
# ================================

def precision_at_k(retrieved_ids, relevant_ids, k=10):
    """
    Computes Precision@k.

    The number of distinct relevant IDs among the first k retrieved IDs is
    divided by k (not by the number of IDs actually retrieved), so a result
    list shorter than k is penalised.

    Parameters:
    - retrieved_ids (list): List of retrieved track IDs, best first.
    - relevant_ids (list): List of relevant track IDs.
    - k (int): Number of top tracks to consider.

    Returns:
    - float: Precision@k value; 0.0 when k is not positive.
    """
    # Guard against division by zero (and meaningless negative cut-offs)
    if k <= 0:
        return 0.0

    top_k = set(retrieved_ids[:k])
    hits = len(top_k.intersection(relevant_ids))
    return hits / k

def recall_at_k(retrieved_ids, relevant_ids, k=10):
    """
    Computes Recall@k: the share of distinct relevant IDs that show up among
    the first k retrieved IDs.

    Parameters:
    - retrieved_ids (list): List of retrieved track IDs, best first.
    - relevant_ids (list): List of relevant track IDs.
    - k (int): Number of top tracks to consider.

    Returns:
    - float: Recall@k value (0 when there are no relevant IDs).
    """
    relevant_pool = set(relevant_ids)
    if not relevant_pool:
        # Nothing can be recalled when nothing is relevant
        return 0

    found = sum(1 for candidate in set(retrieved_ids[:k]) if candidate in relevant_pool)
    return found / len(relevant_pool)

def ndcg_at_k(retrieved_ids, relevant_ids, k=10):
    """
    Computes NDCG@k with binary relevance.

    A retrieved track at 0-based position i contributes 1 / log2(i + 2) when it
    is relevant. The ideal DCG assumes that min(#distinct relevant, k) relevant
    tracks occupy the top positions.

    Parameters:
    - retrieved_ids (list): List of retrieved track IDs, best first.
    - relevant_ids (list): List of relevant track IDs.
    - k (int): Number of top tracks to consider.

    Returns:
    - float: NDCG@k value; 0.0 when the ideal DCG is zero.
    """
    # Set gives O(1) membership tests and removes duplicate relevant IDs, which
    # previously inflated the ideal DCG (and therefore deflated the score).
    relevant_set = set(relevant_ids)

    dcg = sum(
        1.0 / np.log2(position + 2)
        for position, track_id in enumerate(retrieved_ids[:k])
        if track_id in relevant_set
    )

    # Ideal DCG
    ideal_relevant = min(len(relevant_set), k)
    ideal_dcg = sum(1.0 / np.log2(position + 2) for position in range(ideal_relevant))

    return dcg / ideal_dcg if ideal_dcg > 0 else 0.0

def mrr_metric(retrieved_ids, relevant_ids):
    """
    Computes the reciprocal rank for a single query: 1 / rank of the first
    relevant track in the retrieved list (ranks start at 1).

    Averaging this value over queries gives the Mean Reciprocal Rank (MRR).

    Parameters:
    - retrieved_ids (list): List of retrieved track IDs, best first.
    - relevant_ids (list): List of relevant track IDs.

    Returns:
    - float: Reciprocal rank, or 0.0 when no retrieved track is relevant.
    """
    reciprocal_ranks = (
        1.0 / position
        for position, candidate in enumerate(retrieved_ids, start=1)
        if candidate in relevant_ids
    )
    # The generator is lazy, so only the first hit is ever evaluated
    return next(reciprocal_ranks, 0.0)

def compute_cov_at_n(all_retrieved_ids, catalog_df, N=10):
    """
    Computes Coverage@N: Percentage of songs that appear in at least one retrieval list.

    The denominator is the number of rows of catalog_df.

    Parameters:
    - all_retrieved_ids (list of lists): Each sublist contains retrieved IDs for a query.
    - catalog_df (pd.DataFrame): DataFrame containing all tracks with 'id'.
    - N (int): Number of retrievals per query.

    Returns:
    - float: Coverage percentage; 0.0 for an empty catalogue.
    """
    total_tracks = len(catalog_df)
    # An empty catalogue cannot be covered; avoids ZeroDivisionError
    if total_tracks == 0:
        return 0.0

    # Distinct IDs over the top N of every retrieval list
    unique_retrieved = {
        track_id
        for sublist in all_retrieved_ids
        for track_id in sublist[:N]
    }
    return (len(unique_retrieved) / total_tracks) * 100

def compute_div_at_n(all_retrieved_tags, N=10):
    """
    Computes Diversity@N: average, over queries, of the number of distinct tags
    found in the first N entries of each query's tag list.

    NOTE(review): the cut-off is applied to the tag list itself, not to the
    retrieved songs. With a flattened per-query tag list the score is therefore
    capped at N and depends on tag order -- confirm this is the intended
    definition of Diversity@N.

    Parameters:
    - all_retrieved_tags (list of lists): Each sublist contains tags of retrieved songs for a query.
    - N (int): Number of leading entries of each sublist to consider.

    Returns:
    - float: Average diversity (0 when there are no queries).
    """
    unique_counts = [len(set(query_tags[:N])) for query_tags in all_retrieved_tags]
    if not unique_counts:
        return 0
    return np.mean(unique_counts)

def compute_avg_pop_at_n(all_retrieved_popularity, N=10):
    """
    Computes AvgPop@N: the mean popularity of the first N retrieved songs per
    query, averaged over all queries.

    Parameters:
    - all_retrieved_popularity (list of lists): Each sublist contains popularity scores of retrieved songs for a query.
    - N (int): Number of retrievals per query.

    Returns:
    - float: Average popularity (0 when there are no queries).
    """
    # One mean per query, then the mean of those means
    per_query_means = [np.mean(query_pops[:N]) for query_pops in all_retrieved_popularity]
    if not per_query_means:
        return 0
    return np.mean(per_query_means)

def compute_genre_diversity(all_retrieved_genres, N=10):
    """
    Computes Genre Diversity@N: average, over queries, of the number of
    distinct genres found in the first N entries of each query's genre list.

    NOTE(review): as with tag diversity, the cut-off truncates the genre list
    rather than the list of songs, so the score is capped at N -- confirm this
    is intended.

    Parameters:
    - all_retrieved_genres (list of lists): Each sublist contains genres of retrieved songs for a query.
    - N (int): Number of leading entries of each sublist to consider.

    Returns:
    - float: Average genre diversity (0 when there are no queries).
    """
    def _distinct_in_window(genre_list):
        # Number of different genres within the first N entries
        return len(set(genre_list[:N]))

    distinct_counts = list(map(_distinct_in_window, all_retrieved_genres))
    return np.mean(distinct_counts) if distinct_counts else 0

def compute_popularity_diversity(all_retrieved_popularity, N=10):
    """
    Computes Popularity Diversity@N: the variance (np.var) of the popularity
    scores of the first N retrieved songs per query, averaged over all queries.

    Parameters:
    - all_retrieved_popularity (list of lists): Each sublist contains popularity scores of retrieved songs for a query.
    - N (int): Number of retrievals per query.

    Returns:
    - float: Average popularity diversity (0 when there are no queries).
    """
    per_query_variance = []
    for query_pops in all_retrieved_popularity:
        window = query_pops[:N]
        # Fewer than two scores: spread is counted as 0
        per_query_variance.append(np.var(window) if len(window) > 1 else 0)

    if not per_query_variance:
        return 0
    return np.mean(per_query_variance)

In [5]:
# ================================
# 16. Define Evaluation Function
# ================================

def evaluate_retrieval_system(
    catalog_df,
    track_ids,
    id_to_index,
    retrieval_function,
    feature_matrix=None,
    N=10
):
    """
    Evaluates a retrieval system, computing both accuracy and beyond-accuracy metrics.

    Every row of catalog_df is used once as a query. A track is relevant to a
    query when it has the same 'top_genre'; the query itself is not counted as
    relevant. Queries without a genre, and queries for which the retrieval
    function returns an empty DataFrame, are skipped.

    Parameters:
    - catalog_df (pd.DataFrame): DataFrame containing all tracks with 'id', 'tags', 'popularity', 'top_genre', and 'genre'.
    - track_ids (list): List of track IDs.
    - id_to_index (dict): Mapping from track ID to index.
    - retrieval_function (function): The specific retrieval function for the IR system.
      Called with keyword arguments query_track_id, id_to_index, feature_matrix,
      track_ids, catalog_df and N; must return a DataFrame with an 'id' column.
    - feature_matrix (csr_matrix, optional): Feature matrix used by the retrieval function
      (passed through unchanged, including None).
    - N (int): Number of tracks to retrieve per query.

    Returns:
    - metrics (dict): Dictionary containing all evaluation metrics. Keys carry
      the actual cut-off, e.g. 'Precision@10' for N=10.
    """
    precisions = []
    recalls = []
    ndcgs = []
    mrrs = []

    all_retrieved_ids = []
    all_retrieved_tags = []
    all_retrieved_genres = []
    all_retrieved_popularity = []

    total_queries = len(catalog_df)
    processed_queries = 0

    # Loop-invariant work: group the track IDs by genre once instead of
    # scanning the whole catalogue for every query.
    ids_by_genre = {}
    for track_id, genre in zip(catalog_df['id'], catalog_df['top_genre']):
        ids_by_genre.setdefault(genre, []).append(track_id)

    for _, query_track in catalog_df.iterrows():
        query_id = query_track['id']
        query_genre = query_track['top_genre']

        # Skip queries without a usable genre. NaN is truthy, so it needs an
        # explicit check in addition to None / empty string.
        if not query_genre or (isinstance(query_genre, float) and np.isnan(query_genre)):
            continue

        # Perform retrieval (feature_matrix may legitimately be None)
        retrieved_tracks = retrieval_function(
            query_track_id=query_id,
            id_to_index=id_to_index,
            feature_matrix=feature_matrix,
            track_ids=track_ids,
            catalog_df=catalog_df,
            N=N
        )

        if retrieved_tracks.empty:
            continue

        retrieved_ids = retrieved_tracks['id'].tolist()

        # Extract tags, genres, and popularity of the retrieved tracks.
        # Rows come back in catalogue order, not in ranking order.
        retrieved_subset = catalog_df[catalog_df['id'].isin(retrieved_ids)]
        retrieved_tags = retrieved_subset['tags'].tolist()
        retrieved_genres = retrieved_subset['genre'].tolist()
        retrieved_popularity = retrieved_subset['popularity'].tolist()

        # Relevant = same top genre, excluding the query itself: the query is
        # not a valid retrieval result, so it must not count towards the
        # recall denominator or the ideal DCG.
        relevant_ids = [
            track_id
            for track_id in ids_by_genre.get(query_genre, [])
            if track_id != query_id
        ]

        # Compute accuracy metrics
        precisions.append(precision_at_k(retrieved_ids, relevant_ids, k=N))
        recalls.append(recall_at_k(retrieved_ids, relevant_ids, k=N))
        ndcgs.append(ndcg_at_k(retrieved_ids, relevant_ids, k=N))
        mrrs.append(mrr_metric(retrieved_ids, relevant_ids))

        # Collect data for beyond-accuracy metrics
        all_retrieved_ids.append(retrieved_ids)
        # Flatten the per-song tag and genre collections into one list per query
        flattened_tags = [tag for sublist in retrieved_tags for tag in sublist]
        flattened_genres = [genre for sublist in retrieved_genres for genre in sublist]
        all_retrieved_tags.append(flattened_tags)
        all_retrieved_genres.append(flattened_genres)
        all_retrieved_popularity.append(retrieved_popularity)

        processed_queries += 1
        if processed_queries % 500 == 0:
            print(f"Processed {processed_queries}/{total_queries} queries")

    # Aggregate accuracy metrics; labels follow the actual cut-off N
    accuracy_metrics = {
        f'Precision@{N}': np.mean(precisions) if precisions else 0,
        f'Recall@{N}': np.mean(recalls) if recalls else 0,
        f'NDCG@{N}': np.mean(ndcgs) if ndcgs else 0,
        'MRR': np.mean(mrrs) if mrrs else 0
    }

    # Compute beyond-accuracy metrics
    coverage = compute_cov_at_n(all_retrieved_ids, catalog_df, N)
    tag_diversity = compute_div_at_n(all_retrieved_tags, N)
    genre_diversity = compute_genre_diversity(all_retrieved_genres, N)
    popularity_diversity = compute_popularity_diversity(all_retrieved_popularity, N)
    avg_popularity = compute_avg_pop_at_n(all_retrieved_popularity, N)

    metrics = {
        **accuracy_metrics,
        f'Coverage@{N}': coverage,
        f'Tag Diversity@{N}': tag_diversity,
        f'Genre Diversity@{N}': genre_diversity,
        f'Popularity Diversity@{N}': popularity_diversity,
        f'AvgPop@{N}': avg_popularity
    }

    return metrics

# ================================
# 17. Initialize Feature Matrices and Retrieval Systems
# ================================

# Initialize feature matrices for retrieval functions that require them
# NOTE: `feature_matrices` and `retrieval_systems` are also assigned in
# section 14; the assignments below rebind both names.
# Maps a retrieval-system name to the feature matrix that system ranks on.
feature_matrices = {
    'TF-IDF Retrieval': tfidf_matrix,
    'BERT Retrieval': bert_matrix,
    'MFCC Retrieval': mfcc_bow_matrix,
    'Spectral Contrast Retrieval': spectral_contrast_matrix,
    'VGG19 Retrieval': vgg19_matrix,
    'ResNet Retrieval': resnet_matrix,
    'Tag-Based Retrieval': tag_matrix,  # New
    'Early Fusion TF-IDF+BERT Retrieval': hstack([tfidf_matrix, bert_matrix]).tocsr(),  # New: TF-IDF and BERT columns stacked side by side, stored as CSR
    # 'Late Fusion MFCC+VGG19 Retrieval' does not require a separate feature matrix as it uses existing ones
}

# Define all retrieval systems (name -> retrieval callable).
# The evaluation loop iterates this dict, so insertion order is evaluation order.
retrieval_systems = {
    'Random Retrieval': random_retrieval,
    'Tag-Based Retrieval': tag_based_retrieval_wrapper,  # New
    'Early Fusion TF-IDF+BERT Retrieval': early_fusion_retrieval_wrapper,  # New
    'Late Fusion MFCC+VGG19 Retrieval': late_fusion_retrieval_wrapper,  # New
    'TF-IDF Retrieval': tfidf_retrieval,
    'BERT Retrieval': bert_retrieval,
    'MFCC Retrieval': mfcc_retrieval,
    'Spectral Contrast Retrieval': spectral_contrast_retrieval,
    'VGG19 Retrieval': vgg19_retrieval,
    'ResNet Retrieval': resnet_retrieval
}

# ================================
# 18. Prepare Track IDs and Index Mapping
# ================================

# Track IDs in catalogue row order.
# NOTE(review): assumes the rows of every feature matrix follow the same order
# as catalog_df -- confirm where the matrices are built.
track_ids = catalog_df['id'].tolist()
# ID -> position in track_ids; if an ID occurs more than once the last position wins.
id_to_index = {track_id: idx for idx, track_id in enumerate(track_ids)}

# ================================
# 19. Initialize a Dictionary to Store Results
# ================================

# Filled by the evaluation loop: system name -> metrics dict
results = {}

In [6]:
# ================================
# 20. Evaluate All Retrieval Systems
# ================================

print("\nStarting Evaluation of Retrieval Systems...\n")

# Systems that are evaluated without a feature matrix: random retrieval needs
# none, and late fusion looks up its two matrices inside its wrapper.
systems_without_matrix = {'Random Retrieval', 'Late Fusion MFCC+VGG19 Retrieval'}

for name, func in retrieval_systems.items():
    print(f"Evaluating {name}...")

    if name in systems_without_matrix:
        feature_matrix = None
    else:
        # Every other system (early fusion included) ranks on the matrix
        # registered under its own name; report and skip it when missing
        # instead of failing inside the retrieval function.
        feature_matrix = feature_matrices.get(name)
        if feature_matrix is None:
            print(f"Error: Feature matrix for '{name}' not found.")
            continue

    metrics = evaluate_retrieval_system(
        catalog_df=catalog_df,
        track_ids=track_ids,
        id_to_index=id_to_index,
        retrieval_function=func,
        feature_matrix=feature_matrix,
        N=10
    )
    results[name] = metrics
    print(f"Metrics for {name}:")
    for metric, value in metrics.items():
        print(f"  {metric}: {value:.4f}")
    print("-" * 50)

# ================================
# 21. Compile and View Results
# ================================

# Convert results to DataFrame for better visualization
# (one row per retrieval system, one column per metric)
results_df = pd.DataFrame(results).T
print("\nFinal Evaluation Results:")
print(results_df)


Starting Evaluation of Retrieval Systems...

Evaluating Random Retrieval...
Processed 500/5148 queries
Processed 1000/5148 queries
Processed 1500/5148 queries
Processed 2000/5148 queries
Processed 2500/5148 queries
Processed 3000/5148 queries
Processed 3500/5148 queries
Processed 4000/5148 queries
Processed 4500/5148 queries
Processed 5000/5148 queries
Metrics for Random Retrieval:
  Precision@10: 0.0399
  Recall@10: 0.0020
  NDCG@10: 0.0398
  MRR: 0.0943
  Coverage@10: 100.0000
  Tag Diversity@10: 9.9600
  Genre Diversity@10: 9.8419
  Popularity Diversity@10: 187.7583
  AvgPop@10: 34.9948
--------------------------------------------------
Evaluating Tag-Based Retrieval...
Processed 500/5148 queries
Processed 1000/5148 queries
Processed 1500/5148 queries
Processed 2000/5148 queries
Processed 2500/5148 queries
Processed 3000/5148 queries
Processed 3500/5148 queries
Processed 4000/5148 queries
Processed 4500/5148 queries
Processed 5000/5148 queries
Metrics for Tag-Based Retrieval:
  Pre

In [15]:
# ================================
# 22. Visualization of Results
# ================================

import os
import matplotlib.pyplot as plt
import seaborn as sns

# Configure Seaborn aesthetics
sns.set_style("whitegrid")
sns.set_context("talk")

# Plotting frame: the system name (index of results_df) becomes a regular
# column so it can be used as hue. Built here so the cell does not depend on a
# variable left over from an earlier kernel session.
results_reset = results_df.rename_axis('Retrieval_System').reset_index()

# Define plot configurations: each entry is one accuracy-vs-beyond-accuracy scatter plot
plot_configs = [
    {
        'x': 'Coverage@10',
        'y': 'NDCG@10',
        'title': 'NDCG@10 vs Coverage@10',
        'xlabel': 'Coverage@10 (%)',
        'ylabel': 'NDCG@10',
        'filename': 'ndcg_vs_coverage.png'
    },
    {
        'x': 'Tag Diversity@10',
        'y': 'NDCG@10',
        'title': 'NDCG@10 vs Tag Diversity@10',
        'xlabel': 'Tag Diversity@10',
        'ylabel': 'NDCG@10',
        'filename': 'ndcg_vs_tag_diversity.png'
    },
    {
        'x': 'Genre Diversity@10',
        'y': 'NDCG@10',
        'title': 'NDCG@10 vs Genre Diversity@10',
        'xlabel': 'Genre Diversity@10',
        'ylabel': 'NDCG@10',
        'filename': 'ndcg_vs_genre_diversity.png'
    },
    {
        'x': 'Popularity Diversity@10',
        'y': 'NDCG@10',
        'title': 'NDCG@10 vs Popularity Diversity@10',
        'xlabel': 'Popularity Diversity@10',
        'ylabel': 'NDCG@10',
        'filename': 'ndcg_vs_popularity_diversity.png'
    }
]

# Ensure the 'plots' directory exists
os.makedirs('plots', exist_ok=True)

# Generate and save each plot
for config in plot_configs:
    # Explicit figure/axes objects instead of the implicit pyplot state
    fig, ax = plt.subplots(figsize=(10, 6))
    sns.scatterplot(
        data=results_reset,
        x=config['x'],
        y=config['y'],
        hue='Retrieval_System',
        s=100,
        palette='deep',
        ax=ax
    )
    ax.set_title(config['title'], fontsize=16)
    ax.set_xlabel(config['xlabel'], fontsize=14)
    ax.set_ylabel(config['ylabel'], fontsize=14)
    # Legend is placed outside the axes so it does not cover any points
    ax.legend(title='IR System', fontsize=12, title_fontsize=14, bbox_to_anchor=(1.05, 1), loc='upper left')
    fig.tight_layout()
    fig.savefig(os.path.join('plots', config['filename']), dpi=300, bbox_inches='tight')
    # Close the figure so looping does not accumulate open figures
    plt.close(fig)
