# Song Recommendation System

## Initialization

In [4]:
import pandas as pd
import numpy as np
import os
from sklearn.neighbors import NearestNeighbors
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
from mlxtend.frequent_patterns import apriori, association_rules
from sklearn.preprocessing import MultiLabelBinarizer
from scipy.sparse import csr_matrix
from mlxtend.frequent_patterns import fpgrowth
from collections import Counter
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

In [6]:
# Locations of the two input CSV files (relative to the working directory)
file_path_songs = "Data/Music Info.csv"
file_path_users = "Data/User Listening History.csv"

# Read the song information and the per-user listening history
df_songs = pd.read_csv(file_path_songs)
df_users = pd.read_csv(file_path_users)

# Preview the first rows of the listening history (track_id, user_id, playcount)
print(df_users.head())

             track_id                                   user_id  playcount
0  TRIRLYL128F42539D1  b80344d063b5ccb3212f76538f3d9e43d87dca9e          1
1  TRFUPBA128F934F7E1  b80344d063b5ccb3212f76538f3d9e43d87dca9e          1
2  TRLQPQJ128F42AA94F  b80344d063b5ccb3212f76538f3d9e43d87dca9e          1
3  TRTUCUY128F92E1D24  b80344d063b5ccb3212f76538f3d9e43d87dca9e          1
4  TRHDDQG12903CB53EE  b80344d063b5ccb3212f76538f3d9e43d87dca9e          1


## Data Inspection

In [None]:
# Feature columns of df_songs to include in the correlation analysis
selected_columns = [
    'duration_ms', 'danceability', 'energy', 'key',
    'loudness', 'mode', 'speechiness', 'acousticness',
    'instrumentalness', 'liveness', 'valence',
    'tempo', 'time_signature',
]

# Restrict the song table to those features and compute their pairwise correlations
music_data = df_songs[selected_columns]
correlation_matrix = music_data.corr()

# Draw the correlation matrix as an annotated heatmap (two-decimal cell labels)
fig, ax = plt.subplots(figsize=(12, 8))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt='.2f', ax=ax)
ax.set_title('Correlation Matrix of Music Features')
plt.show()

In [None]:
# Visualise missing values: each heatmap row is one sampled row of df_songs and
# each heatmap column is one of its attributes; cells where isnull() is True
# are drawn in the contrasting colour.
# The sample size is capped at the table length so that sample() cannot raise
# "Cannot take a larger sample than population" on a smaller dataset.
null_sample_size = min(10000, len(df_songs))

plt.figure(figsize=(10, 10))
sns.heatmap(
    df_songs.sample(n=null_sample_size, random_state=1).isnull(),
    cmap='viridis',
    cbar=False,
    yticklabels=False,
)
plt.title('Missing Values per Column (random sample of songs)')
plt.show()

I believe the missing-value heatmap above explains why we could use a neural network to predict the genre for those observations where that attribute is missing.

## Clustering: Cluster the user base

We may need to cluster the users first; otherwise we end up with too many items (songs) when one-hot encoding the baskets later. We could try clustering on different parameters and see which choice minimizes the number of items in each basket.

## Frequent Itemsets: Market basket

In [None]:
# A user is kept only if their basket holds strictly MORE than this many tracks
# (a basket of exactly 50 tracks is dropped).
MIN_BASKET_SIZE = 50

# One row per user: collect all of that user's track_ids into a list and call
# the resulting column "basket". Built as a chain so no frame is mutated in place.
df_basket = (
    df_users.groupby('user_id')['track_id']
    .apply(list)
    .reset_index()
    .rename(columns={'track_id': 'basket'})
)

# Keep only the baskets that are large enough; the original row labels are retained.
df_basket = df_basket[df_basket['basket'].apply(len) > MIN_BASKET_SIZE]

print(df_basket)

In [None]:
# Count how many listening-history rows reference each track (most frequent
# first) and attach the song information for every track found in df_songs.
track_freq = (
    df_users['track_id']
    .value_counts()
    .rename_axis('track_id')
    .reset_index(name='frequency')
    .merge(df_songs, on='track_id')
)

# The 15 most frequent tracks that have song information
top_15_tracks = track_freq.head(15)

# Bar chart of those tracks, labelled by track name
fig, ax = plt.subplots(figsize=(10, 6))
ax.bar(top_15_tracks['name'], top_15_tracks['frequency'])
ax.set_xlabel('Track Name')
ax.set_ylabel('Frequency')
ax.set_title('Top 15 Most Heard Songs')
ax.tick_params(axis='x', labelrotation=45)  # tilt the names so they do not overlap
fig.tight_layout()
plt.show()

In [None]:
# Step 1: Draw a fixed-size, reproducible sample of user baskets.
# A fixed count (rather than a fraction) keeps the runtime predictable. The
# count is capped at the number of available baskets so sample() cannot raise.
sample_size = 10  # Adjust sample size based on available memory and time
df_basket_sample = df_basket.sample(n=min(sample_size, len(df_basket)), random_state=42)
n_baskets = len(df_basket_sample)

# Step 2: One-hot encode the baskets (one row per sampled basket, one column
# per distinct track_id). The encoder emits a SciPy sparse matrix, which is
# wrapped in a DataFrame with sparse columns to keep memory usage low.
mlb = MultiLabelBinarizer(sparse_output=True)
one_hot_sparse = mlb.fit_transform(df_basket_sample['basket'])
one_hot = pd.DataFrame.sparse.from_spmatrix(one_hot_sparse, columns=mlb.classes_)

# Step 3: Compute frequent itemsets using FP-Growth.
# The support threshold must correspond to at least two baskets. If it is
# below 1 / n_baskets, every item that occurs once is "frequent", and so is
# every subset of every basket. Baskets hold more than 50 tracks each, so that
# is over 2**50 itemsets and the computation cannot finish.
support_level = 0.01
min_meaningful_support = min(1.0, 2 / max(n_baskets, 1))
if support_level < min_meaningful_support:
    print(
        f"support_level {support_level:.2%} is below two baskets out of {n_baskets}; "
        f"using {min_meaningful_support:.2%} instead."
    )
    support_level = min_meaningful_support
frequent_itemsets = fpgrowth(one_hot, min_support=support_level, use_colnames=True)

# Step 4: Generate association rules.
# association_rules() rejects an empty itemset table, so fall back to an empty
# rules frame when nothing reached the support threshold.
confidence_level = 0.1
if frequent_itemsets.empty:
    rules = pd.DataFrame()
else:
    rules = association_rules(frequent_itemsets, metric="confidence", min_threshold=confidence_level)

# Step 5: Display results
print(f"Frequent Itemsets with support >= {support_level:.2%}:\n{frequent_itemsets}")
print(f"\nAssociation Rules with confidence >= {confidence_level:.2%}:\n{rules}")