In [1]:
from pyspark.sql import functions as f
from pyspark.sql import SparkSession
import os
import pandas as pd
import random

In [2]:
# Obtain a Spark session via the builder (getOrCreate reuses an existing one if present).
spark = SparkSession.builder.getOrCreate()

In [3]:
# Path of the sampled user-session CSV.
# NOTE(review): absolute path on a mounted volume -- the notebook will not run
# on another machine unless this location exists.
spark_fp = os.path.join("/", "Volumes", "Marceline Jr.", "Spotify Dataset", "sampled_users.csv")
spark_fp

'/Volumes/Marceline Jr./Spotify Dataset/sampled_users.csv'

In [4]:
# Read the session CSV, using its first row as the column names.
# NOTE(review): no schema is given and inferSchema is not enabled, so Spark is
# expected to type every column as string -- confirm before relying on boolean
# or numeric comparisons on these columns downstream.
df = spark.read.option("header", "true").csv(spark_fp)

In [5]:
# Convert the Spark DataFrame into a pandas DataFrame for the rest of the analysis.
users = df.toPandas()

In [6]:
# Both track-feature shards live in the same directory; only the file name differs.
tf_path_one, tf_path_two = (
    os.path.join("/", "Volumes", "Marceline Jr.", "Spotify Dataset", "track_features", shard_name)
    for shard_name in ("tf_000000000000.csv", "tf_000000000001.csv")
)

In [7]:
# Load the two track-feature shards, in order, into separate pandas frames.
track_features_one, track_features_two = (
    pd.read_csv(shard_path) for shard_path in (tf_path_one, tf_path_two)
)

In [8]:
# Stack the two shards row-wise into a single track-feature table
# (each shard keeps its own row labels; the index is not renumbered).
track_features = pd.concat(objs=(track_features_one, track_features_two), axis=0)

In [9]:
# Attach the audio features of the played track to every session row
# (inner join: session rows whose track has no feature record are dropped).
userFeatures = pd.merge(users, track_features, left_on = 'track_id_clean', right_on = 'track_id')
# Keep an untouched snapshot of the joined table for later use; `userFeatures`
# itself is trimmed and re-encoded below. Copying the result avoids running
# the identical (and expensive) merge a second time.
nonModifiedFeatures = userFeatures.copy()

In [10]:
# Preview the first rows of the joined session/track table.
userFeatures.head()

Unnamed: 0,session_id,session_position,session_length,track_id_clean,skip_1,skip_2,skip_3,not_skipped,context_switch,no_pause_before_play,...,time_signature,valence,acoustic_vector_0,acoustic_vector_1,acoustic_vector_2,acoustic_vector_3,acoustic_vector_4,acoustic_vector_5,acoustic_vector_6,acoustic_vector_7
0,42_50cb77d0-9f69-4948-98a6-445f4f1d98da,1,18,t_3bc02be3-ab02-4c9f-94af-04910b00a14c,True,True,True,False,0,0,...,4,0.332708,-0.747017,0.236945,0.15764,0.053084,-0.247379,-0.015617,-0.557144,0.094659
1,42_50cb77d0-9f69-4948-98a6-445f4f1d98da,2,18,t_529537c9-bfb0-4058-bb9a-1d93ef071263,True,True,True,False,0,1,...,4,0.081849,-0.859858,0.301731,0.208637,0.108421,-0.310408,-0.036791,-0.545892,0.162957
2,42_50cb77d0-9f69-4948-98a6-445f4f1d98da,3,18,t_c29129f2-7b64-4c1e-9587-676632b7800d,True,True,True,False,0,0,...,4,0.325166,-0.517902,0.00619,0.087467,0.19187,-0.131095,-0.131147,-0.435564,-0.100854
3,42_50cb77d0-9f69-4948-98a6-445f4f1d98da,4,18,t_6c1ce50d-d33a-4502-8c58-335f23b03145,True,True,True,False,0,1,...,4,0.290325,-0.854975,0.354534,0.233016,0.039705,-0.422565,0.041233,-0.445123,0.352062
4,42_50cb77d0-9f69-4948-98a6-445f4f1d98da,5,18,t_85418d4c-2b2d-4270-836e-ef789bb28555,True,True,True,False,0,1,...,4,0.425356,-0.891446,0.229755,0.25636,0.193858,-0.322325,-0.040302,-0.553673,0.162334


In [11]:
# Column names of the joined table, in order, as a plain Python list.
cols = userFeatures.columns.tolist()

In [12]:
# Columns at positions 1..24 are scheduled for removal; position 0 and
# everything from position 25 onward are kept.
# NOTE(review): the slice bounds are positional magic numbers -- presumably
# position 0 is session_id and 25+ are the audio features; confirm against
# the actual column order before changing the input files.
drop = cols[1:25]

In [13]:
# Remove the selected columns by rebinding the name to the trimmed frame.
userFeatures = userFeatures.drop(columns=drop)

In [14]:
# Encode the musical mode numerically: 'major' -> 1, anything else -> 0.
userFeatures['mode'] = userFeatures['mode'].eq('major').astype('int64')

In [15]:
# Average every remaining column per session: one row per session_id,
# with session_id becoming the index of the result.
features = userFeatures.groupby('session_id').mean()

In [16]:
# Grouping first means we cluster on each session's average song features.
# Discard the session_id index so only the feature columns go into the model.
X = features.reset_index(drop=True)

Drop the session identifier so that only the track features remain for clustering.

In [17]:
from sklearn.cluster import KMeans
# Partition the per-session feature averages into three groups.
# A fixed random_state makes the centroid initialisation, and therefore the
# cluster ids 0/1/2 that later define the three persona users, reproducible
# across kernel restarts. Without it, "cluster 0" can be a different group on
# every run.
cluster = KMeans(n_clusters = 3, random_state = 42)
cluster.fit(X)

KMeans(n_clusters=3)

In [18]:
# Cluster id assigned to each row of X, in row order.
cluster.labels_

array([1, 2, 0, 1, 0, 0, 2, 0, 0, 0, 0, 2, 0, 2, 0, 1, 0, 1, 2, 2, 2, 0,
       2, 0, 2, 2, 1, 0, 2, 2, 1, 2, 2, 2, 2, 2, 2, 1, 2, 2, 0, 0, 1, 1,
       2, 2, 0, 2, 1, 2, 0, 0, 1, 2, 2, 2, 2, 2, 2, 0, 0, 1, 2, 0, 2, 0,
       2, 0, 2, 0, 0, 0, 0, 1, 2, 1, 0, 2, 0, 1, 2, 2, 2, 1, 0, 0, 2, 2,
       2, 1, 1, 2, 1, 2, 2, 0, 0, 1, 2, 1], dtype=int32)

Grab the cluster labels and add them back to the per-session features, so that each session_id is paired with its cluster.

In [19]:
# Recompute the per-session means, keeping session_id as the index this time.
# This is the same groupby used to build X, so the rows come out in the same
# order as the labels produced by the fitted model.
y = userFeatures.groupby('session_id').mean()

In [20]:
# Attach each session's cluster id; assignment is positional, so it relies on
# y having the same row order as the matrix the model was fitted on.
y['Cluster'] = cluster.labels_

In [21]:
# Split the sessions into one frame per cluster id (0, 1, 2); each frame is
# treated as one "persona" user from here on.
userOne, userTwo, userThree = (
    y[y['Cluster'] == cluster_id] for cluster_id in (0, 1, 2)
)

In [22]:
# For every persona, pull back all original (unaggregated) rows of its sessions.
# Columns present on both sides get pandas' default suffixes: '_x' for the
# per-session means, '_y' for the original per-track values.
userOneFeatures = pd.merge(userOne, nonModifiedFeatures, how='inner', on='session_id')
userTwoFeatures = pd.merge(userTwo, nonModifiedFeatures, how='inner', on='session_id')
userThreeFeatures = pd.merge(userThree, nonModifiedFeatures, how='inner', on='session_id')

In [23]:
# Encode not_skipped as 1/0 for each persona frame.
# The session table was read through Spark's CSV reader without schema
# inference, so these values are presumably the strings 'True'/'False' rather
# than booleans (TODO confirm); comparing such strings with `== True` maps
# every row to 0. Normalising through text handles real booleans and their
# string forms alike, and accepting '1' keeps the cell safe to re-run.
for _persona_frame in (userOneFeatures, userTwoFeatures, userThreeFeatures):
    _persona_frame['not_skipped'] = (
        _persona_frame['not_skipped']
        .astype(str)
        .str.strip()
        .str.lower()
        .isin(['true', '1'])
        .astype('int64')
    )

In [24]:
# Encode the premium and shuffle flags as 1/0 for each persona frame.
# The original identity test (`x is True`) only matches the Python singleton
# True: it yields 0 for string values such as 'True' (which is presumably what
# the schema-less Spark CSV read produced -- TODO confirm) and for any
# non-singleton truthy value. Normalising through text handles booleans and
# their string forms alike, and accepting '1' keeps the cell safe to re-run.
for _persona_frame in (userOneFeatures, userTwoFeatures, userThreeFeatures):
    for _flag_column in ('premium', 'hist_user_behavior_is_shuffle'):
        _persona_frame[_flag_column] = (
            _persona_frame[_flag_column]
            .astype(str)
            .str.strip()
            .str.lower()
            .isin(['true', '1'])
            .astype('int64')
        )

In [25]:
# Columns removed from every persona frame after the merge:
#   * the '_x' columns -- the per-session means carried over from the
#     clustering frame (the per-track '_y' values are kept),
#   * the cluster id, and
#   * session-behaviour and track-metadata columns.
# The list is defined once so the three frames cannot drift apart.
_merged_columns_to_drop = [
    'acousticness_x', 'beat_strength_x', 'bounciness_x',
    'danceability_x', 'dyn_range_mean_x', 'energy_x', 'flatness_x',
    'instrumentalness_x', 'key_x', 'liveness_x', 'loudness_x',
    'mechanism_x', 'mode_x', 'organism_x', 'speechiness_x', 'tempo_x',
    'time_signature_x', 'valence_x', 'acoustic_vector_0_x',
    'acoustic_vector_1_x', 'acoustic_vector_2_x', 'acoustic_vector_3_x',
    'acoustic_vector_4_x', 'acoustic_vector_5_x', 'acoustic_vector_6_x',
    'acoustic_vector_7_x', 'Cluster', 'session_position', 'session_length', 'not_skipped', 'context_switch',
    'no_pause_before_play', 'short_pause_before_play',
    'long_pause_before_play', 'hist_user_behavior_n_seekfwd',
    'hist_user_behavior_n_seekback', 'hist_user_behavior_is_shuffle',
    'hour_of_day', 'premium', 'context_type',
    'hist_user_behavior_reason_start', 'duration', 'release_year',
    'us_popularity_estimate',
]
for _persona_frame in (userOneFeatures, userTwoFeatures, userThreeFeatures):
    _persona_frame.drop(_merged_columns_to_drop, axis = 1, inplace = True)

In [26]:
def _strip_y_suffix(column_name):
    """Return the name without a trailing '_y'; other names pass through unchanged."""
    return column_name[:-2] if column_name.endswith('_y') else column_name


# Restore the plain feature names on the per-track columns kept from the merge.
for _persona_frame in (userOneFeatures, userTwoFeatures, userThreeFeatures):
    _persona_frame.rename(columns = _strip_y_suffix, inplace = True)

In [27]:
# Remove track identifiers, skip flags, the end-reason and the date from persona one.
userOneFeatures = userOneFeatures.drop(
    columns=[
        'track_id_clean', 'skip_1', 'skip_2', 'skip_3',
        'hist_user_behavior_reason_end', 'track_id', 'date',
    ]
)

In [28]:
# Remove track identifiers, skip flags, the end-reason and the date from persona two.
userTwoFeatures = userTwoFeatures.drop(
    columns=[
        'track_id_clean', 'skip_1', 'skip_2', 'skip_3',
        'hist_user_behavior_reason_end', 'track_id', 'date',
    ]
)

In [29]:
# Remove track identifiers, skip flags, the end-reason and the date from persona three.
userThreeFeatures = userThreeFeatures.drop(
    columns=[
        'track_id_clean', 'skip_1', 'skip_2', 'skip_3',
        'hist_user_behavior_reason_end', 'track_id', 'date',
    ]
)

In [88]:
# Drop the acoustic-vector components and the remaining audio descriptors
# listed here from persona one.
userOneFeatures = userOneFeatures.drop(
    columns=[
        'acoustic_vector_0', 'acoustic_vector_1', 'acoustic_vector_2', 'acoustic_vector_3',
        'acoustic_vector_4', 'acoustic_vector_5', 'acoustic_vector_6', 'acoustic_vector_7',
        'beat_strength', 'bounciness', 'dyn_range_mean', 'flatness', 'mechanism', 'organism',
    ]
)

In [89]:
# Drop the acoustic-vector components and the remaining audio descriptors
# listed here from persona two.
userTwoFeatures = userTwoFeatures.drop(
    columns=[
        'acoustic_vector_0', 'acoustic_vector_1', 'acoustic_vector_2', 'acoustic_vector_3',
        'acoustic_vector_4', 'acoustic_vector_5', 'acoustic_vector_6', 'acoustic_vector_7',
        'beat_strength', 'bounciness', 'dyn_range_mean', 'flatness', 'mechanism', 'organism',
    ]
)

In [90]:
# Drop the acoustic-vector components and the remaining audio descriptors
# listed here from persona three.
userThreeFeatures = userThreeFeatures.drop(
    columns=[
        'acoustic_vector_0', 'acoustic_vector_1', 'acoustic_vector_2', 'acoustic_vector_3',
        'acoustic_vector_4', 'acoustic_vector_5', 'acoustic_vector_6', 'acoustic_vector_7',
        'beat_strength', 'bounciness', 'dyn_range_mean', 'flatness', 'mechanism', 'organism',
    ]
)

In [192]:
# Encode the musical mode numerically in every persona frame:
# 'major' -> 1, anything else -> 0.
for _persona_frame in (userOneFeatures, userTwoFeatures, userThreeFeatures):
    _persona_frame['mode'] = _persona_frame['mode'].eq('major').astype('int64')

In [194]:
from spotifyAPI import Spotify

In [195]:
# Create the project's Spotify API helper; as the cell output shows, this
# prompts for a Client ID and Client Secret.
s = Spotify()

Client ID:
········
Client Secret:
········


In [197]:
# Fetch features for the tracks of the 'Top 50 - USA' playlist via the API helper.
# NOTE(review): this rebinds `features`, which earlier held the per-session
# mean frame -- cells that need the old value must run before this one.
features = s.get_playlist_features('Top 50 - USA')

In [306]:
from sklearn.decomposition import PCA

In [292]:
import math
from spotifyAPI import Spotify

class songRecommender():
    '''
    Match candidate songs to a persona user's listening data by cosine
    similarity between audio-feature vectors.

    A "feature vector" throughout this class is a tuple ``(identifier, values)``
    where ``values`` is a dict of feature name -> number whose keys are in
    sorted order.
    '''

    # Class-level defaults; every instance overwrites all three in __init__.
    data = {}
    features = []
    predictFeatures = []

    def __init__(self, data, predict):
        '''
        data - our persona user's information (a DataFrame with a
               'session_id' column plus audio-feature columns)
        predict - the new songs from the API (an iterable of dicts, each with
                  a 'uri' key plus audio-feature keys)
        '''
        # Parse the persona rows into {session_id: {feature: value}}.
        self.data = self.parseData(data)
        # Feature vectors for the persona data.  TODO: apply PCA
        self.features = self.featureVector(self.data)
        # Cleaned feature vectors for the API songs.  TODO: apply PCA
        self.predictFeatures = self.featureAPIVector(predict)

    def parseData(self, data):
        '''
        Reduce the persona DataFrame to ``{session_id: {feature: value}}``.

        Only the audio-feature columns named below are kept. Rows are
        round-tripped through JSON, so values come back as plain Python
        types. When several rows share a session_id, later rows overwrite
        earlier ones, so each session ends up represented by its last row.
        NOTE(review): confirm that last-row-wins is the intended behaviour.
        '''
        import json

        # Audio-feature columns to keep (everything else, e.g. user
        # behaviour columns, is discarded). A set gives O(1) membership tests.
        featuresSet = {
            'acousticness', 'beat_strength', 'bounciness', 'danceability',
            'dyn_range_mean', 'energy', 'flatness', 'instrumentalness', 'key',
            'liveness', 'loudness', 'mechanism', 'mode', 'organism', 'speechiness',
            'tempo', 'time_signature', 'valence', 'acoustic_vector_0',
            'acoustic_vector_1', 'acoustic_vector_2', 'acoustic_vector_3',
            'acoustic_vector_4', 'acoustic_vector_5', 'acoustic_vector_6',
            'acoustic_vector_7',
        }

        parsed = json.loads(data.to_json(orient = 'records'))
        cleaned = {}

        for line in parsed:
            cleaned[line['session_id']] = {
                k: v for k, v in line.items() if k in featuresSet
            }

        return cleaned

    def featureVector(self, data):
        '''
        Turn ``{session_id: {feature: value}}`` into a list of
        ``(session_id, values)`` tuples with each dict's keys sorted, so that
        positions line up between vectors that share the same feature names.
        '''
        return [(k, dict(sorted(v.items()))) for k, v in data.items()]

    def featureAPIVector(self, data):
        '''
        Turn the API song dicts into ``(uri, values)`` tuples, keeping only
        the audio features listed in ``keep`` and sorting the keys.
        '''
        keep = {'danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness',
                'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo',
                'time_signature'}
        vector = []
        for d in data:
            temp = dict(sorted((k, v) for k, v in d.items() if k in keep))
            vector.append((d['uri'], temp))

        return vector

    def getData(self):
        '''Return the parsed persona data, ``{session_id: {feature: value}}``.'''
        return self.data

    def getFeatures(self):
        '''Return the persona feature vectors.'''
        return self.features

    def getPredict(self):
        '''Return the API-song feature vectors.'''
        return self.predictFeatures

    def cosine(self, feature, features, N):
        '''
        feature - a feature vector tuple, with index 0 being the link and 1
                  being the vector; this is the song from the API
        features - all feature vectors belonging to the current persona user
                   (all the songs in our generated user data)
        N - number of similar songs we want to return

        Returns up to N ``(similarity, identifier)`` tuples, most similar
        first. Values are paired by position, so both vectors are assumed to
        hold the same feature names in the same (sorted) order. A pair in
        which either vector has zero length scores 0 instead of raising
        ZeroDivisionError.
        '''
        # The candidate's values and norm do not change across the loop.
        target = list(feature[1].values())
        target_norm = math.sqrt(sum(v ** 2 for v in target))

        similarities = []
        for featureTwo in features:
            other = list(featureTwo[1].values())
            numer = sum(a * b for a, b in zip(target, other))
            denom = target_norm * math.sqrt(sum(v ** 2 for v in other))
            # Guard the division: an all-zero vector has no direction.
            sim = numer / denom if denom != 0 else 0

            similarities.append((sim, featureTwo[0]))

        similarities.sort(reverse = True)
        return similarities[:N]

    def similar(self, X, y):
        '''
        For every feature vector in X, find its single most similar vector
        in y.

        Returns a list with one dict per element of X, of the form
        ``{identifier_in_X: (similarity, identifier_in_y)}``.
        Raises IndexError if y is empty, because there is no best match.
        '''
        # Call the method on this instance; a bare `cosine` only resolves if
        # a function of that name happens to exist in the notebook namespace.
        return [{feature[0]: self.cosine(feature, y, 1)[0]} for feature in X]


In [293]:
# Build the recommender for the first persona; `features` here is the playlist
# data fetched from the API, not the earlier per-session mean frame.
model = songRecommender(data = userOneFeatures, predict = features)

In [304]:
# For each API song, look up its closest persona session; show the first five matches.
model.similar(model.getPredict(), model.getFeatures())[:5]

[{'spotify:track:27NovPIUIRrOZoCHxABJwK': (0.9998024729284718,
   '16_7820f5ac-fe1a-4129-88e0-d934103cf8ed')},
 {'spotify:track:0gplL1WMoJ6iYaPgMCL0gX': (0.9999051694657801,
   '23_ba2228a3-64c9-4df0-affc-0103afff056f')},
 {'spotify:track:00Blm7zeNqgYLPtW6zg8cj': (0.999894896931811,
   '58_1a705cba-7d2d-418b-9e6c-365e4d578118')},
 {'spotify:track:5HCyWlXZPP0y6Gqq8TgA20': (0.9999660617206557,
   '16_7820f5ac-fe1a-4129-88e0-d934103cf8ed')},
 {'spotify:track:4R67rQNSbbsR4TdUVOIdez': (0.9999225756270735,
   '23_ba2228a3-64c9-4df0-affc-0103afff056f')}]

In [247]:
# # Alternative (not used): classify with the nearest centroid instead
# from sklearn.neighbors.nearest_centroid import NearestCentroid
# clf = NearestCentroid()
# clf.fit(features, userFeatures['session_id'].unique())
# clf.centroids_