In [14]:
'''
Processing datasets. 
@author: Avi Jain
Based off NCF
'''
import scipy.sparse as sp
import numpy as np
import pandas as pd
import os
import json
import sys



class Dataset(object):
    '''
    Playlist/track interaction data loaded from a playlist JSON file.

    The JSON document must contain a top-level ``playlists`` list; every
    playlist must have a ``tracks`` list and every track a ``track_uri``.
    Playlists are numbered by their position in the file, tracks by the
    order in which their URI is first encountered.

    Attributes:
        trainMatrix: scipy.sparse dok_matrix (float32) holding 1.0 at every
            (playlist index, track index) pair found in the file.
        playlistMatrix: dict mapping those same index pairs to 1.0.
        num_playlists, num_tracks: the two dimensions of ``trainMatrix``
            (they include the padding described in
            ``load_rating_file_as_matrix``, so they are not exact counts).
    '''

    def __init__(self, path):
        '''
        Load the playlist JSON file at ``path`` and build both views of it.

        The file is parsed a single time and the result is shared by the
        sparse matrix and the dict.
        '''
        interactions, seen_playlists, seen_tracks = self._read_interactions(path)
        self.trainMatrix = self._build_matrix(interactions, seen_playlists, seen_tracks)
        self.playlistMatrix = interactions
        self.num_playlists, self.num_tracks = self.trainMatrix.shape

    def _read_interactions(self, filename):
        '''
        Parse the playlist JSON file and index its playlist/track pairs.

        Returns:
            (interactions, playlist_count, track_count) where ``interactions``
            is a dict {(playlist index, track index): 1.0}, ``playlist_count``
            is the number of playlists in the file (empty ones included) and
            ``track_count`` is the number of distinct track URIs.
        '''
        # Context manager so the file handle is closed deterministically.
        with open(filename) as json_file:
            data = json.load(json_file)

        playlists = data['playlists']
        track_uri_to_id = {}
        interactions = {}
        for playlist_id, playlist in enumerate(playlists):
            for track in playlist['tracks']:
                track_uri = track['track_uri']
                if track_uri not in track_uri_to_id:
                    # Next free index == number of URIs seen so far.
                    track_uri_to_id[track_uri] = len(track_uri_to_id)
                # A track repeated inside one playlist collapses to one entry.
                interactions[(playlist_id, track_uri_to_id[track_uri])] = 1.0
        return interactions, len(playlists), len(track_uri_to_id)

    def _build_matrix(self, interactions, seen_playlists, seen_tracks,
                      num_playlists=1000, num_tracks=34443):
        '''
        Turn an interaction dict into a float32 dok matrix.

        The shape is at least (num_playlists + 1, num_tracks + 1) and is
        enlarged when the data contains more playlists or tracks than that.
        '''
        n_rows = max(num_playlists + 1, seen_playlists)
        n_cols = max(num_tracks + 1, seen_tracks)
        mat = sp.dok_matrix((n_rows, n_cols), dtype=np.float32)
        for playlist_id, track_id in interactions:
            mat[playlist_id, track_id] = 1.0
        return mat

    def get_playlist_dict(self, filename):
        '''
        Read the playlist JSON file and return its interactions as a dict
        {(playlist index, track index): 1.0}.
        '''
        return self._read_interactions(filename)[0]

    def load_rating_file_as_list(self, filename):
        '''
        Read a tab-separated rating file and return [[user, item], ...].

        Only the first two columns of each line are used; both must be
        integers. Blank lines are skipped.
        '''
        ratingList = []
        with open(filename, "r") as f:
            for line in f:
                line = line.strip()
                if not line:
                    continue
                arr = line.split("\t")
                user, item = int(arr[0]), int(arr[1])
                ratingList.append([user, item])
        return ratingList

    def load_negative_file(self, filename):
        '''
        Read a tab-separated negatives file and return one list of integer
        item ids per line.

        The first column of each line is ignored; every remaining column is
        parsed as an integer. Blank lines are skipped.
        '''
        negativeList = []
        with open(filename, "r") as f:
            for line in f:
                line = line.strip()
                if not line:
                    continue
                arr = line.split("\t")
                negativeList.append([int(x) for x in arr[1:]])
        return negativeList

    def load_rating_file_as_matrix(self, filename, num_playlists=1000, num_tracks=34443):
        '''
        Read the playlist JSON file and return a dok matrix.

        Args:
            filename: path of the playlist JSON file.
            num_playlists, num_tracks: minimum sizing. The matrix has at
                least (num_playlists + 1, num_tracks + 1) cells, matching the
                previously hard-coded shape, and grows automatically if the
                file holds more playlists or distinct tracks, so larger
                files no longer raise IndexError.

        Returns:
            scipy.sparse dok_matrix (float32) with 1.0 at each
            (playlist index, track index) pair present in the file.
        '''
        interactions, seen_playlists, seen_tracks = self._read_interactions(filename)
        return self._build_matrix(interactions, seen_playlists, seen_tracks,
                                  num_playlists, num_tracks)
    


In [15]:
# Build the dataset from the playlist slice file, then pull out its two views:
# `train` is the sparse interaction matrix, `mat` the dict keyed by index pairs.
dataset = Dataset('mpd.slice.0-999.json')
train, mat = dataset.trainMatrix, dataset.playlistMatrix
# Matrix dimensions (rows index playlists, columns index tracks).
num_users, num_items = train.shape

In [20]:
# Quick sanity report on the loaded data.
# NOTE(review): the bare print() shows no data, yet the saved output for this
# cell looks like a dump of `mat` — confirm whether print(mat) was intended.
print()
# Number of (playlist index, track index) entries in the interaction dict.
print(len(mat))
# Dimensions of the sparse training matrix.
print(num_users,num_items)

{(94, 4219): 1.0, (17, 979): 1.0, (331, 13828): 1.0, (104, 5479): 1.0, (481, 20172): 1.0, (792, 28958): 1.0, (965, 3542): 1.0, (26, 931): 1.0, (105, 5577): 1.0, (134, 350): 1.0, (483, 20281): 1.0, (946, 28275): 1.0, (130, 6777): 1.0, (262, 12086): 1.0, (955, 2063): 1.0, (454, 10010): 1.0, (456, 18970): 1.0, (272, 5569): 1.0, (751, 1046): 1.0, (294, 7527): 1.0, (381, 14077): 1.0, (452, 1064): 1.0, (968, 18113): 1.0, (380, 311): 1.0, (182, 8986): 1.0, (584, 8020): 1.0, (978, 33872): 1.0, (212, 10057): 1.0, (673, 25801): 1.0, (942, 1059): 1.0, (730, 8880): 1.0, (442, 6514): 1.0, (634, 6173): 1.0, (41, 2156): 1.0, (523, 21261): 1.0, (801, 29040): 1.0, (960, 33402): 1.0, (270, 12517): 1.0, (648, 15680): 1.0, (656, 25154): 1.0, (880, 3080): 1.0, (334, 4422): 1.0, (586, 7384): 1.0, (548, 22083): 1.0, (395, 11475): 1.0, (811, 29453): 1.0, (146, 7712): 1.0, (522, 21253): 1.0, (243, 11497): 1.0, (37, 1957): 1.0, (704, 10639): 1.0, (462, 3579): 1.0, (223, 6703): 1.0, (133, 6963): 1.0, (803, 2857)