In [1]:
from collections import defaultdict
import math
import random
import numpy as np
import pandas as pd

In [2]:
from pyspark.sql import functions as f
from pyspark.sql import SparkSession
import os

In [3]:
# Reuse the active Spark session if there is one, otherwise start a new one.
spark = SparkSession.builder.getOrCreate()

In [4]:
# Absolute path of the sampled user-session CSV on the external volume.
spark_fp = os.path.join("/", "Volumes", "Marceline Jr.", "Spotify Dataset", "sampled_users.csv")
# Echo the path so the cell output shows exactly which file is read.
spark_fp

'/Volumes/Marceline Jr./Spotify Dataset/sampled_users.csv'

In [5]:
# Read the CSV, using its first row as column names.
# NOTE(review): no schema / inferSchema option is given, so Spark presumably
# loads every column as a string — confirm before doing numeric work on it.
df = spark.read.option("header", "true").csv(spark_fp)

In [6]:
# Collect the Spark DataFrame into a local pandas DataFrame.
users = df.toPandas()

In [7]:
# Paths of the two track-feature CSV shards (same external volume as the user sample).
tf_path_one = os.path.join("/", "Volumes", "Marceline Jr.", "Spotify Dataset", "track_features", "tf_000000000000.csv")
tf_path_two = os.path.join("/", "Volumes", "Marceline Jr.", "Spotify Dataset", "track_features", "tf_000000000001.csv")

In [8]:
# Load each track-feature shard into its own pandas DataFrame.
track_features_one = pd.read_csv(tf_path_one)
track_features_two = pd.read_csv(tf_path_two)

In [9]:
# Stack the two shards into a single track-feature table.
# NOTE(review): ignore_index is not set, so each shard keeps its own row
# labels and labels may repeat — confirm nothing relies on a unique index.
track_features = pd.concat([track_features_one, track_features_two])

In [10]:
# Attach the track features to every user/track row by joining the session
# table's 'track_id_clean' against the feature table's 'track_id'.
userFeatures = pd.merge(users, track_features, left_on = 'track_id_clean', right_on = 'track_id')
# Untouched snapshot of the merge result, kept for reference after
# userFeatures is modified. Copying is used instead of repeating the
# identical (and expensive) join a second time.
nonModifiedFeatures = userFeatures.copy()

In [11]:
# Encode 'mode' as 1 for 'major' and 0 for anything else.
# The value is derived from the untouched snapshot rather than from
# userFeatures itself so the cell is safe to re-run: encoding the already
# encoded column would compare integers to 'major' and zero every row.
userFeatures['mode'] = nonModifiedFeatures['mode'].eq('major').astype(int)

In [12]:
# List the column names of the merged session/track frame.
userFeatures.columns

Index(['session_id', 'session_position', 'session_length', 'track_id_clean',
       'skip_1', 'skip_2', 'skip_3', 'not_skipped', 'context_switch',
       'no_pause_before_play', 'short_pause_before_play',
       'long_pause_before_play', 'hist_user_behavior_n_seekfwd',
       'hist_user_behavior_n_seekback', 'hist_user_behavior_is_shuffle',
       'hour_of_day', 'date', 'premium', 'context_type',
       'hist_user_behavior_reason_start', 'hist_user_behavior_reason_end',
       'track_id', 'duration', 'release_year', 'us_popularity_estimate',
       'acousticness', 'beat_strength', 'bounciness', 'danceability',
       'dyn_range_mean', 'energy', 'flatness', 'instrumentalness', 'key',
       'liveness', 'loudness', 'mechanism', 'mode', 'organism', 'speechiness',
       'tempo', 'time_signature', 'valence', 'acoustic_vector_0',
       'acoustic_vector_1', 'acoustic_vector_2', 'acoustic_vector_3',
       'acoustic_vector_4', 'acoustic_vector_5', 'acoustic_vector_6',
       'acoustic_vector

In [13]:
# Remove the limit on how many columns pandas shows when displaying a frame.
pd.set_option('display.max_columns', None)

In [16]:
class songRecommenderDraft():
    '''
    User-to-user recommender built from listening records.

    Every record is a dictionary with two parts:
        'behaviors' - the session / user-behavior columns of one play
        'features'  - the audio features of the track that was played

    From these records the class derives three lookup tables
    (see generateDictionaries) and uses them to score how similar
    two users are (see CosineUser / mostSimilarUser).
    '''

    # Columns that are NOT part of a track's feature vector (session
    # behavior plus track metadata). Every other column of a row is
    # stored under the record's 'features' key.
    _NON_FEATURE_COLUMNS = frozenset([
        'session_id', 'session_position', 'session_length', 'track_id_clean',
        'skip_1', 'skip_2', 'skip_3', 'not_skipped', 'context_switch',
        'no_pause_before_play', 'short_pause_before_play',
        'long_pause_before_play', 'hist_user_behavior_n_seekfwd',
        'hist_user_behavior_n_seekback', 'hist_user_behavior_is_shuffle',
        'hour_of_day', 'date', 'premium', 'context_type',
        'hist_user_behavior_reason_start', 'hist_user_behavior_reason_end',
        'track_id', 'duration', 'release_year', 'us_popularity_estimate'])

    # Audio-feature columns. Every other column of a row is stored under
    # the record's 'behaviors' key.
    _AUDIO_FEATURE_COLUMNS = frozenset([
        'acousticness', 'beat_strength', 'bounciness', 'danceability',
        'dyn_range_mean', 'energy', 'flatness', 'instrumentalness', 'key',
        'liveness', 'loudness', 'mechanism', 'mode', 'organism', 'speechiness',
        'tempo', 'time_signature', 'valence', 'acoustic_vector_0',
        'acoustic_vector_1', 'acoustic_vector_2', 'acoustic_vector_3',
        'acoustic_vector_4', 'acoustic_vector_5', 'acoustic_vector_6',
        'acoustic_vector_7'])

    def __init__(self, data, *args, **kwargs):
        '''
        Our constructor.
        data - either a pandas DataFrame with one row per (user, track) play,
               or an already parsed list of
               {'behaviors': {...}, 'features': {...}} records.
        '''
        super().__init__(*args, **kwargs)

        if isinstance(data, pd.DataFrame):
            self.data = self.parseDataFrame(data)
        else:
            self.data = data

        self.generateDictionaries()

    def parseDataFrame(self, data):
        '''
        Takes in a DataFrame object and parses it into our desired format.
        The result is a list with one dictionary per row:

        {'behaviors': {every column that is not an audio feature},
         'features':  {every column that is not a session/metadata column}}
        '''
        import json

        # Round-trip through JSON so each row becomes a plain dict of
        # built-in Python values.
        rows = json.loads(data.to_json(orient = 'records'))

        cleanData = []
        for row in rows:
            cleanData.append({
                'behaviors': {k: v for k, v in row.items()
                              if k not in self._AUDIO_FEATURE_COLUMNS},
                'features': {k: v for k, v in row.items()
                             if k not in self._NON_FEATURE_COLUMNS},
            })

        return cleanData

    def getData(self):
        '''
        Return the parsed records (list of behaviors/features dictionaries).
        '''
        return self.data

    def generateDictionaries(self, usersColumn = 'session_id', tracksColumn = 'track_id'):
        '''
        Generate our utility data structures for recommendation.
        usersPerTrack : key: track, value: the users who have listened to this track
        tracksPerUser : key: user, value: the tracks the user has listened to
        featureDict   : key: track, value: that track's feature dictionary

        usersColumn / tracksColumn name the 'behaviors' entries that hold
        the user id and the track id.
        '''
        self.usersPerTrack = defaultdict(set)
        self.tracksPerUser = defaultdict(set)
        self.featureDict = {}

        userKey, trackKey = str(usersColumn), str(tracksColumn)
        for record in self.data:
            user = record['behaviors'][userKey]
            track = record['behaviors'][trackKey]
            self.usersPerTrack[track].add(user)
            self.tracksPerUser[user].add(track)
            # A track seen more than once keeps the features of its last record.
            self.featureDict[track] = record['features']

    def getUsersPerTrack(self):
        '''
        Return the users per each track.
        '''
        return self.usersPerTrack

    def getTracksPerUser(self):
        '''
        Return the tracks per each user.
        '''
        return self.tracksPerUser

    def getFeatureDict(self):
        '''
        Return the feature dictionary of each track.
        '''
        return self.featureDict

    def CosineUser(self, u1, u2):
        '''
        This generates the cosine similarity between two users.

        A user is represented by the feature values of every track they
        listened to. The numerator sums the squared feature values of the
        tracks both users share; each denominator term sums the squared
        feature values of all of one user's tracks.

        Returns 0 when either user has no usable tracks (including a user
        id that is not known at all).
        '''
        tracksPerUser = self.getTracksPerUser()
        featureDict = self.getFeatureDict()

        # .get() rather than [] : indexing the defaultdict with an unknown
        # user would insert that user, which breaks any caller that is
        # iterating over the dictionary (mostSimilarUser does).
        tracks1 = tracksPerUser.get(u1, set())
        tracks2 = tracksPerUser.get(u2, set())

        numer = 0
        for t in tracks1.intersection(tracks2):
            if t not in featureDict:
                # Skip tracks without features; keep what was summed so far.
                continue
            # A shared track has the same feature vector for both users,
            # so the element-wise product is each value times itself.
            numer += sum([v * v for v in featureDict[t].values()])

        def squaredNorm(tracks):
            # Sum of squared feature values over the tracks that have features.
            total = 0
            for t in tracks:
                if t not in featureDict:
                    continue
                total += sum([v ** 2 for v in featureDict[t].values()])
            return total

        denom = math.sqrt(squaredNorm(tracks1)) * math.sqrt(squaredNorm(tracks2))
        if denom == 0: return 0
        return numer / denom

    def mostSimilarUser(self, u, N):
        '''
        Our similarity function for users.
        Returns up to N (similarity, user) pairs for the users most similar
        to u, highest similarity first. u itself is never included.
        '''
        tracksPerUser = self.getTracksPerUser()

        similarities = []
        for u2 in tracksPerUser:
            if u2 == u:
                continue
            sim = self.CosineUser(u, u2)
            similarities.append((sim,u2))
        similarities.sort(reverse = True)
        return similarities[:N]
        


In [110]:
# Build the recommender from the merged session/track frame.
# Uses songRecommenderDraft, the class this notebook defines; the name
# 'songRecommender' is not defined by any cell and fails on a fresh kernel.
s = songRecommenderDraft(userFeatures)

In [111]:
# Ask for the 10 users most similar to this session id; the output below
# lists (similarity, user) pairs in descending order of similarity.
s.mostSimilarUser('37_5d831937-4681-4e68-a245-6168d16eee19', 10)

[(0.13135717234483252, '5_0dd87232-c564-493b-8e23-6fd2763ffe4c'),
 (0.12053354028027405, '14_d70eae71-5e55-4856-b275-14e8ba39b282'),
 (0.10125046246533033, '64_72cc0289-3cb6-47e0-8794-1655571a6922'),
 (0.07166377425017433, '5_1951099f-a089-4f52-8013-ae1187809e95'),
 (0.07087410334923262, '49_4606c3f8-26fc-4bea-8f34-a87a56574353'),
 (0.026052813848477557, '43_80a0bfe5-d348-4d31-8482-c7ce0988eee3'),
 (0.02240982391997432, '17_0dbb1ec5-a7f6-4581-ba62-8c28b10d8679'),
 (0.0, '9_fdc1ed7a-118b-4300-bc1d-0f46a1137e7a'),
 (0.0, '9_8dd92d30-7e39-4920-8423-7543712af4ad'),
 (0.0, '9_62595f2a-e5cc-467d-a4ce-9c5e67251c4d')]

In [None]:
class songRecommenderFix():
    