In [1]:
import json
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
import os
from scipy.sparse import csr_matrix, vstack
from scipy.sparse.linalg import svds

In [2]:
# We just take the first playlist as the playlist of the user that we are giving recommendations to
USER_CHOSEN_PLAYLIST = 0

In [3]:
# Create an empty list to hold all the playlists from both files
all_playlists = []

# Read the first JSON file and extract the "playlists" array (This will be users's playlist for example, for a real user, this would be replaced with his/her playlist) 
with open('Raw Data/mpd.slice.0-999.json', 'r') as f:
    data = json.load(f)
    playlists = data['playlists']
    
# Append each playlist in the "playlists" array to the list created in step 1
all_playlists.extend(playlists)

counter = 1
# Loop through all the second JSON files
for filename in os.listdir('Raw Data'):
    if counter == 30:
        break
    if filename.endswith('.json'):
        # Read the second JSON file and extract the "playlists" array
        with open(os.path.join('Raw Data', filename), 'r') as f:
            data = json.load(f)
            playlists = data['playlists']
        
        # Append each playlist in the "playlists" array to the list created in step 1
        all_playlists.extend(playlists)
        counter += 1

# Create a new dictionary to hold the combined playlist data
combined_data = {
    'info': data['info'],  # Use the "info" object from the last file read
    'playlists': all_playlists
}

# Write the combined playlist data to a new JSON file
with open('combined.json', 'w') as f:
    json.dump(combined_data, f)

FileNotFoundError: [Errno 2] No such file or directory: 'Raw Data/mpd.slice.0-999.json'

In [None]:
with open('combined.json', 'r') as f:
    data = json.load(f)
    playlists = data['playlists']

In [None]:
playlists[0]

{'name': 'Throwbacks',
 'collaborative': 'false',
 'pid': 0,
 'modified_at': 1493424000,
 'num_tracks': 52,
 'num_albums': 47,
 'num_followers': 1,
 'tracks': [{'pos': 0,
   'artist_name': 'Missy Elliott',
   'track_uri': 'spotify:track:0UaMYEvWZi0ZqiDOoHU3YI',
   'artist_uri': 'spotify:artist:2wIVse2owClT7go1WT98tk',
   'track_name': 'Lose Control (feat. Ciara & Fat Man Scoop)',
   'album_uri': 'spotify:album:6vV5UrXcfyQD1wu4Qo2I9K',
   'duration_ms': 226863,
   'album_name': 'The Cookbook'},
  {'pos': 1,
   'artist_name': 'Britney Spears',
   'track_uri': 'spotify:track:6I9VzXrHxO9rA9A5euc8Ak',
   'artist_uri': 'spotify:artist:26dSoYclwsYLMAKD3tpOr4',
   'track_name': 'Toxic',
   'album_uri': 'spotify:album:0z7pVBGOD7HCIB7S8eLkLI',
   'duration_ms': 198800,
   'album_name': 'In The Zone'},
  {'pos': 2,
   'artist_name': 'Beyoncé',
   'track_uri': 'spotify:track:0WqIKmW4BTrj3eJFmnCKMv',
   'artist_uri': 'spotify:artist:6vWDO969PvNqNYHIOW5v0m',
   'track_name': 'Crazy In Love',
   'alb

In [None]:
# Step 1: Data Collection and Preprocessing
def load_and_preprocess_data(playlists, min_tracks_per_playlist=5, min_track_frequency=10):

    # Filter out irrelevant information
    for playlist in playlists:
        playlist.pop('modified_at', None)
        for track in playlist['tracks']:
            track.pop('album_name', None)
            track.pop('duration_ms', None)
            track.pop('album_uri', None)
            track.pop('artist_uri', None)

    # Filter playlists with fewer tracks than the minimum threshold
    playlists = [playlist for playlist in playlists if len(playlist['tracks']) >= min_tracks_per_playlist]

    # Calculate the frequency of each track in the dataset
    track_frequencies = {}
    for playlist in playlists:
        for track in playlist['tracks']:
            track_uri = track['track_uri']
            if track_uri not in track_frequencies:
                track_frequencies[track_uri] = 0
            track_frequencies[track_uri] += 1

    # Filter out tracks with a frequency lower than the minimum threshold
    for playlist in playlists:
        playlist['tracks'] = [track for track in playlist['tracks'] if track_frequencies[track['track_uri']] >= min_track_frequency]

    return playlists

In [None]:
def create_feature_matrix(data):
    processed_data = []
    for playlist in data:
        pid = playlist['pid']
        for track in playlist['tracks']:
            track_data = {
                'user_id': pid,
                'track': track['track_uri'],
            }
            processed_data.append(track_data)

    df = pd.DataFrame(processed_data)
    matrix = pd.crosstab(df['user_id'], df['track'])

    return matrix


In [None]:
def compute_cosine_similarity(feature_matrix):
    return cosine_similarity(feature_matrix)

# Model based approach

In [None]:
filtered_playlists = load_and_preprocess_data(playlists, min_tracks_per_playlist=5, min_track_frequency=10)
feature_matrix = create_feature_matrix(filtered_playlists)

In [None]:
feature_matrix

track,spotify:track:000xQL6tZNLJzIrtIgxqSl,spotify:track:00AxNl4D4jHL2AEf1W55j5,spotify:track:00BnfL75e8vHSGCmwUWbEk,spotify:track:00BuKLSAFkaEkaVAgIMbeA,spotify:track:00Ci0EXS4fNPnkTbS6wkOh,spotify:track:00LfFm08VWeZwB0Zlm24AT,spotify:track:00MI0oGDVJYM1qWbyUOIhH,spotify:track:00NUqFMIpCsrYPbM9YpVHQ,spotify:track:00YeYHw6zhoy1y7EPTgIkj,spotify:track:00fNdIFKoMxxt8Hnm2kAKL,...,spotify:track:7zWj09xkFgA9tcV6YhfU6q,spotify:track:7zXfy6kN7tOn3jxNRX1jW1,spotify:track:7zbq8RT5Kd3ExOGVTiUQbR,spotify:track:7zeCIWu37bKFJuem8MTyM1,spotify:track:7zkLpY72g6lKQbiHDqri1S,spotify:track:7zmJyZL3DfYBBYbY8Rve6W,spotify:track:7zrPswSV1yxrill5OyCuaU,spotify:track:7zsw78LtXUD7JfEwH64HK2,spotify:track:7zuwaenG5AF0vG7o7kMduX,spotify:track:7zxRMhXxJMQCeDDg0rKAVo
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9994,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9995,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9996,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9998,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [None]:
# Convert the utility matrix to a sparse CSR matrix
sparse_utility_matrix = csr_matrix(feature_matrix)

# apply the svds function to perform truncated SVD:
sparse_utility_matrix = sparse_utility_matrix.astype(np.float32)
U, sigma, Vt = svds(sparse_utility_matrix, k =15)

# Since sigma is returned as a 1D array, convert it to a diagonal matrix
sigma_matrix = np.diag(sigma)

# Reconstruct the utility matrix
reconstructed_utility_matrix = np.dot(np.dot(U, sigma_matrix), Vt)

# Values are very small, use apply minmaxscaler
reconstructed_utility_matrix_scaled = (reconstructed_utility_matrix - reconstructed_utility_matrix.min()) / (reconstructed_utility_matrix.max() - reconstructed_utility_matrix.min())

In [11]:
pid = USER_CHOSEN_PLAYLIST
num_recommendations = 20

original_row = feature_matrix.loc[pid]
reconstructed_row = reconstructed_utility_matrix_scaled[pid]

recommendations = []
for track, original_presence in zip(original_row.index, original_row):
    if original_presence == 0:  # We only consider tracks not already in the playlist
        # Getting track name, similarity matrix
        recommendations.append((track, reconstructed_row[feature_matrix.columns.get_loc(track)]))

sorted_recommendations = sorted(recommendations, key=lambda x: x[1], reverse=True)
recommended_track_uris = [track_uri for track_uri, _ in sorted_recommendations[:num_recommendations]]
print(recommended_track_uris)

['spotify:track:6RcQOut9fWL6FSqeIr5M1r', 'spotify:track:5i66xrvSh1MjjyDd6zcwgj', 'spotify:track:1D066zixBwqFYqBhKgdPzp', 'spotify:track:3ZFTkvIE7kyPt6Nu3PEa7V', 'spotify:track:5dNfHmqgr128gMY2tc5CeJ', 'spotify:track:2q4rjDy9WhaN3o9MvDbO21', 'spotify:track:5XJJdNPkwmbUwE79gv0NxK', 'spotify:track:5OMwQFBcte0aWFJFqrr5oj', 'spotify:track:0CAfXk7DXMnon4gLudAp7J', 'spotify:track:1hGy2eLcmC8eKx7qr1tOqx', 'spotify:track:4gFXY5yvHayGckJndafYQE', 'spotify:track:0O45fw2L5vsWpdsOdXwNAR', 'spotify:track:2CvOqDpQIMw69cCzWqr5yr', 'spotify:track:7uKcScNXuO3MWw6LowBjW1', 'spotify:track:6cmm1LMvZdB5zsCwX5BjqE', 'spotify:track:1QV6tiMFM6fSOKOGLMHYYg', 'spotify:track:2CEgGE6aESpnmtfiZwYlbV', 'spotify:track:66TRwr5uJwPt15mfFkzhbi', 'spotify:track:30VrBsh1STRBoIrhQOAwzK', 'spotify:track:3f7gYMirBEKuc57218BjOY']


# Dictionary of all songs with keys being track uri

In [12]:
all_tracks = {}

for playlist in playlists:
    for track in playlist['tracks']:
        all_tracks[track['track_uri']] = [track['artist_name'], track['track_name'], playlist['name']]


# {'name': 'Throwbacks',
#  'collaborative': 'false',
#  'pid': 0,
#  'modified_at': 1493424000,
#  'num_tracks': 52,
#  'num_albums': 47,
#  'num_followers': 1,
#  'tracks': [{'pos': 0,
#    'artist_name': 'Missy Elliott',
#    'track_uri': 'spotify:track:0UaMYEvWZi0ZqiDOoHU3YI',
#    'artist_uri': 'spotify:artist:2wIVse2owClT7go1WT98tk',
#    'track_name': 'Lose Control (feat. Ciara & Fat Man Scoop)',
#    'album_uri': 'spotify:album:6vV5UrXcfyQD1wu4Qo2I9K',
#    'duration_ms': 226863,
#    'album_name': 'The Cookbook'},        

In [13]:
recommended_songs_names = []
for recommended_track in recommended_track_uris:
    recommended_songs_names.append(all_tracks[recommended_track][1] + ' by ' + all_tracks[recommended_track][0])
recommended_songs_names

['Hollaback Girl by Gwen Stefani',
 'Umbrella by Rihanna',
 'Fergalicious by Fergie',
 "Hips Don't Lie by Shakira",
 'Ignition - Remix by R. Kelly',
 'Kiss Me Thru The Phone by Soulja Boy',
 'Gold Digger by Kanye West',
 'TiK ToK by Kesha',
 'Low (feat T-Pain) - Feat T-Pain Album Version by Flo Rida',
 'Beautiful Girls by Sean Kingston',
 "Buy U a Drank (Shawty Snappin') by T-Pain",
 'SexyBack by Justin Timberlake',
 'Halo by Beyoncé',
 'One, Two Step by Ciara',
 'Down by Jay Sean',
 'Poker Face by Lady Gaga',
 'Dynamite by Taio Cruz',
 'Crank That (Soulja Boy) by Soulja Boy',
 'Disturbia by Rihanna',
 'California Gurls - feat. Snoop Dogg by Katy Perry']

In [14]:
print('Playlist Name: ' , playlists[0]['name'])
print('#####################################################')

tester_playlist_track_names = []
for track in playlists[0]['tracks']:
    print(track['artist_name'] + ' by ' + track['track_name'])

Playlist Name:  Throwbacks
#####################################################
Missy Elliott by Lose Control (feat. Ciara & Fat Man Scoop)
Britney Spears by Toxic
Beyoncé by Crazy In Love
Justin Timberlake by Rock Your Body
Shaggy by It Wasn't Me
Usher by Yeah!
Usher by My Boo
The Pussycat Dolls by Buttons
Destiny's Child by Say My Name
OutKast by Hey Ya! - Radio Mix / Club Mix
Nelly Furtado by Promiscuous
Jesse McCartney by Right Where You Want Me - Radio Edit Version
Jesse McCartney by Beautiful Soul
Jesse McCartney by Leavin'
Cassie by Me & U
Omarion by Ice Box
Avril Lavigne by Sk8er Boi
Chris Brown by Run It!
Beyoncé by Check On It - feat. Bun B and Slim Thug
Destiny's Child by Jumpin', Jumpin'
Sheryl Crow by Soak Up The Sun
The Black Eyed Peas by Where Is The Love?
Bowling For Soup by Stacy's Mom
The Click Five by Just The Girl
Chris Brown by Yo (Excuse Me Miss)
Jonas Brothers by Year 3000
Lil Mama by Lip Gloss
Cascada by Everytime We Touch - Radio Edit
Jason Derulo by Whatcha S