### Imports

In [5]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from scipy import sparse
from surprise import Dataset, Reader, SVD
from collections import Counter

In [9]:
# Read the raw user-song interaction log from disk; the bare name on the
# last line makes the notebook render a preview of the frame.
df = pd.read_csv(filepath_or_buffer="training_data/data.csv")
df

Unnamed: 0,user_id,song_id,listen_count,timestamp,interaction,genres,hashtag
0,16,170,3,2024-04-20 22:54:02.414892,1,['Blues'],['#LostAndFound']
1,23,78,1,2024-08-07 12:14:02.414906,1,['House'],"['#Relatable', '#LifeFeels', '#OnMyMind', '#Ju..."
2,17,377,3,2024-02-04 07:28:02.414911,-1,"['Classical', 'Alternative']","['#FeelingsMatter', '#OnMyMind', '#Relatable',..."
3,18,267,1,2024-09-28 10:37:02.414914,-1,"['Disco', 'Metal']","['#Emotional', '#MusicMood', '#Heartbreak', '#..."
4,91,445,1,2024-12-22 23:11:02.414918,-1,"['Reggae', 'Blues', 'World']","['#SelfReflection', '#LifeFeels', '#DeepThough..."
...,...,...,...,...,...,...,...
4486,5,239,2,2024-05-03 08:46:02.430842,-1,"['R&B', 'Dancehall', 'Blues', 'Opera', 'Trance...","['#BrokenHeart', '#LateNightVibes']"
4487,92,473,0,2024-07-11 08:53:02.430848,-1,['Folk'],"['#Dreamy', '#LostInThought']"
4488,17,248,1,2024-07-16 04:17:02.430852,1,['Funk'],['#LifeFeels']
4489,36,206,1,2024-03-16 14:59:02.430855,1,"['Folk', 'Acoustic', 'Rock', 'Ambient']",['#StoryOfMyLife']


### Preprocessing

In [15]:
import ast 
from collections import Counter

# The "genres" column holds list literals as text; parse each cell into a real list.
genres = df["genres"].map(ast.literal_eval)

# Count how many rows mention each genre (a row with several genres counts once per genre).
genre_freq = Counter(label for labels in genres for label in labels)

# Two-column frame (genre, count) in first-seen order, ready for plotting.
genre_freq_df = pd.DataFrame(list(genre_freq.items()), columns=['genre', 'count'])

# Bar chart of the genre frequencies (currently disabled):
# plt.title("Genres and Songs Mapped")
# sns.barplot(x='genre',y='count', data=genre_freq_df.sort_values(by="count", ascending=False))
# plt.xticks(rotation=90)


### Model

In [18]:
# Keep only the columns the similarity model consumes; user_id, timestamp
# and hashtag are left out of this view.
cosine_df = df.loc[:, ['song_id', 'listen_count', 'interaction', 'genres']]
cosine_df

Unnamed: 0,song_id,listen_count,interaction,genres
0,170,3,1,['Blues']
1,78,1,1,['House']
2,377,3,-1,"['Classical', 'Alternative']"
3,267,1,-1,"['Disco', 'Metal']"
4,445,1,-1,"['Reggae', 'Blues', 'World']"
...,...,...,...,...
4486,239,2,-1,"['R&B', 'Dancehall', 'Blues', 'Opera', 'Trance..."
4487,473,0,-1,['Folk']
4488,248,1,1,['Funk']
4489,206,1,1,"['Folk', 'Acoustic', 'Rock', 'Ambient']"


In [153]:
from sklearn.preprocessing import MultiLabelBinarizer, MinMaxScaler
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import pandas as pd
import ast
from collections import Counter


class Recommender:
    """Content-based song recommender using cosine similarity over per-song features.

    The input frame holds one row per logged interaction. On construction it
    is collapsed to one row per ``song_id`` and converted into a feature
    matrix made of one-hot genres, negative/positive interaction counts and
    the summed listen count. Each group is min-max scaled and then multiplied
    by its weight.

    Attributes set by ``__init__``:
        df: aggregated per-song frame (``genres`` parsed into lists, plus the
            ``interaction_stats`` and ``adjusted_interaction`` columns).
        feature_matrix: 2-D array with one row per song, in ``df`` row order.
        song_ids: array of song ids aligned with the rows of ``feature_matrix``.
    """

    # Default (genre, interaction, listen-count) weights; kept at class level
    # so _prepare_features also works on an instance that skipped __init__.
    _weights = (0.4, 0.4, 0.2)

    def __init__(self, df, genre_weight=0.4, interaction_weight=0.4, listen_weight=0.2):
        """Aggregate ``df`` per song and build the feature matrix.

        Args:
            df: frame with ``song_id``, ``listen_count``, ``interaction`` and
                ``genres`` columns, one row per interaction.
            genre_weight: multiplier applied to the scaled genre columns.
            interaction_weight: multiplier applied to the scaled interaction counts.
            listen_weight: multiplier applied to the scaled listen count.
        """
        self._weights = (genre_weight, interaction_weight, listen_weight)
        # groupby/agg returns a new frame, so the caller's df is never mutated
        # and no defensive copy is needed.
        self.df = self._aggregate_song_data(df)
        self._prepare_features()

    def _safe_literal_eval(self, value):
        """Turn a genres cell into a collection of labels without raising.

        * ``None`` / NaN           -> ``[]`` (song without genre information)
        * non-string values        -> returned unchanged (assumed already parsed)
        * text of a list/tuple/set -> the parsed collection
        * text of a quoted string  -> ``[that string]``
        * anything else            -> ``[value]`` (the raw text as one label)
        """
        if value is None or (isinstance(value, float) and np.isnan(value)):
            return []
        if not isinstance(value, str):
            return value
        try:
            parsed = ast.literal_eval(value)
        except (ValueError, SyntaxError, TypeError):
            # Plain names such as "R&B" are not Python literals; keep the text itself.
            return [value]
        if isinstance(parsed, (list, tuple, set)):
            return parsed
        if isinstance(parsed, str):
            # A bare string would otherwise be iterated character by character.
            return [parsed]
        # Numbers, None, dicts, ...: fall back to the raw text as a single label.
        return [value]

    def _aggregate_song_data(self, df):
        """Collapse the interaction log to one row per song.

        Returns a new frame with ``song_id``, the summed ``listen_count``, the
        list of raw ``interaction`` values, the first ``genres`` value seen for
        the song, and ``interaction_stats`` (a dict with the number of ``1``
        values, the number of ``-1`` values and the total count).
        """
        aggregated = df.groupby('song_id').agg({
            'listen_count': 'sum',
            'interaction': lambda x: list(x),
            'genres': 'first',  # genres should be the same for each song
        }).reset_index()

        aggregated['interaction_stats'] = aggregated['interaction'].apply(
            lambda x: {
                'positive': sum(1 for i in x if i == 1),
                'negative': sum(1 for i in x if i == -1),
                'total': len(x)
            }
        )
        return aggregated

    def _prepare_features(self):
        """Build ``self.feature_matrix`` and ``self.song_ids`` from ``self.df``.

        Column layout of the feature matrix: one column per genre, then the
        count of non-positive interactions, the count of positive
        interactions, and finally the listen count.
        """
        self.df["genres"] = self.df["genres"].apply(self._safe_literal_eval)

        # One-hot encode the genre collections (one column per distinct genre).
        genre_matrix = MultiLabelBinarizer().fit_transform(self.df["genres"])

        # Every interaction that is not exactly 1 is treated as negative (-1).
        adjusted = self.df['interaction'].apply(
            lambda values: [1 if v == 1 else -1 for v in values]
        )
        self.df['adjusted_interaction'] = adjusted

        # Always emit both columns (negative, positive) so the feature width
        # does not depend on which polarities happen to occur in the data.
        interaction_counts = np.column_stack([
            adjusted.apply(lambda values: values.count(-1)).to_numpy(),
            adjusted.apply(lambda values: values.count(1)).to_numpy(),
        ])

        # fill the missing listen count values with 0
        self.df['listen_count'] = self.df['listen_count'].fillna(0)

        # Min-max scale each feature group independently (default range 0 to 1).
        genre_scaled = MinMaxScaler().fit_transform(genre_matrix)
        interaction_scaled = MinMaxScaler().fit_transform(interaction_counts)
        listen_count_scaled = MinMaxScaler().fit_transform(self.df[['listen_count']].to_numpy())

        # ensure all feature matrices have the same number of rows
        assert genre_scaled.shape[0] == interaction_scaled.shape[0] == listen_count_scaled.shape[0], "row mismatch!"

        genre_weight, interaction_weight, listen_weight = self._weights
        self.feature_matrix = np.hstack([
            genre_scaled * genre_weight,
            interaction_scaled * interaction_weight,
            listen_count_scaled * listen_weight,
        ])

        # store song IDs for reference
        self.song_ids = self.df["song_id"].values

    def get_similar_songs(self, target_song_id, top_n=5):
        """Return the ``top_n`` songs most similar to ``target_song_id``.

        Args:
            target_song_id: id of the query song; must be present in ``song_ids``.
            top_n: maximum number of neighbours to return (non-negative).

        Returns:
            ``(song_ids, scores)``: two aligned arrays ordered from most to
            least similar. The query song itself is never included.

        Raises:
            ValueError: if ``top_n`` is negative or the song id is unknown.
        """
        if top_n < 0:
            # A negative slice bound would silently return the wrong rows.
            raise ValueError(f"top_n must be non-negative, got {top_n}.")

        target_idx = np.where(self.song_ids == target_song_id)[0]
        if len(target_idx) == 0:
            raise ValueError(f" The song: {target_song_id} not found in dataset.")
        target_idx = target_idx[0]

        # Similarity of the query row against every row (itself included).
        target_features = self.feature_matrix[target_idx].reshape(1, -1)
        similarities = cosine_similarity(target_features, self.feature_matrix)[0]

        # Rank every row except the query itself, best first.
        candidate_indices = np.delete(np.arange(len(similarities)), target_idx)
        ranking = np.argsort(similarities[candidate_indices])[::-1][:top_n]
        actual_indices = candidate_indices[ranking]

        result_songs = self.song_ids[actual_indices]
        result_scores = similarities[actual_indices]

        return result_songs, result_scores

# Fit the recommender on the model-input columns.
recommender = Recommender(cosine_df)

# Ask for the five nearest neighbours of song 5.
similar_songs, similarity_scores = recommender.get_similar_songs(5, top_n=5)

# Print each neighbour with the genres/interaction of its first row in the raw log.
for song_id, score in zip(similar_songs, similarity_scores):
    song_info = df.loc[df['song_id'].eq(song_id)].iloc[0]
    print(f"Song ID: {song_id}, Similarity Score: {score:.3f}")
    print(f"Genres: {song_info['genres']}, Interactions:{song_info['interaction']}")


ATTENTION: <class 'pandas.core.series.Series'>
Song ID: 340, Similarity Score: 0.709
Genres: 187     ['Reggae', 'Classical', 'Blues', 'K-Pop']
379     ['Reggae', 'Classical', 'Blues', 'K-Pop']
1120    ['Reggae', 'Classical', 'Blues', 'K-Pop']
1361    ['Reggae', 'Classical', 'Blues', 'K-Pop']
2087    ['Reggae', 'Classical', 'Blues', 'K-Pop']
2093    ['Reggae', 'Classical', 'Blues', 'K-Pop']
2140    ['Reggae', 'Classical', 'Blues', 'K-Pop']
2631    ['Reggae', 'Classical', 'Blues', 'K-Pop']
2838    ['Reggae', 'Classical', 'Blues', 'K-Pop']
3812    ['Reggae', 'Classical', 'Blues', 'K-Pop']
Name: genres, dtype: object, Interactions:187    -1
379    -1
1120   -1
1361    1
2087   -1
2093   -1
2140    1
2631    1
2838    1
3812   -1
Name: interaction, dtype: int64
Song ID: 312, Similarity Score: 0.708
Genres: 616     ['Blues']
1454    ['Blues']
3180    ['Blues']
Name: genres, dtype: object, Interactions:616     1
1454   -1
3180   -1
Name: interaction, dtype: int64
Song ID: 89, Similarity Score