In [1]:
from pyspark.sql import functions as f
from pyspark.sql import SparkSession
import os
import pandas as pd
import random

In [2]:
spark = SparkSession.builder.getOrCreate()

In [3]:
spark_fp = os.path.join("sampled_users_100000.csv")
spark_fp

'sampled_users_100000.csv'

In [4]:
df = spark.read.option("header", "true").csv(spark_fp)

In [5]:
users = df.toPandas()

In [6]:
tf_path_one = os.path.join("data", "track_features", "tf_000000000000.csv")
tf_path_two = os.path.join("data", "track_features", "tf_000000000000.csv")

In [7]:
track_features_one = pd.read_csv(tf_path_one)
track_features_two = pd.read_csv(tf_path_two)

In [8]:
track_features = pd.concat([track_features_one, track_features_two])

In [9]:
userFeatures = pd.merge(users, track_features, left_on = 'track_id_clean', right_on = 'track_id')
nonModifiedFeatures = pd.merge(users, track_features, left_on = 'track_id_clean', right_on = 'track_id')

In [14]:
userFeatures.head(1)

Unnamed: 0,session_id,session_position,session_length,track_id_clean,skip_1,skip_2,skip_3,not_skipped,context_switch,no_pause_before_play,...,time_signature,valence,acoustic_vector_0,acoustic_vector_1,acoustic_vector_2,acoustic_vector_3,acoustic_vector_4,acoustic_vector_5,acoustic_vector_6,acoustic_vector_7
0,30_459c8917-3a1b-45b1-8c26-4dc86b179948,3,16,t_d5216514-209f-4ea2-9046-bb8efc8c7a2c,False,False,True,False,0,1,...,4,0.593578,0.076202,0.158479,0.005198,-0.408168,0.50601,0.202745,-0.514325,-0.497011


In [15]:
cols = list(userFeatures.columns)

In [16]:
drop = cols[1:25]

In [17]:
userFeatures.drop(columns = drop, inplace = True)

In [18]:
userFeatures['mode'] = userFeatures['mode'].apply(lambda x: 1 if x == 'major' else 0)

In [19]:
features = userFeatures.groupby('session_id').mean()

In [20]:
X = features.reset_index().drop('session_id', axis = 1)
#we wanted to groupby so we would cluster by user avg song features

Drop our code so we only have the track features for clustering

In [21]:
from sklearn.cluster import KMeans
cluster = KMeans(n_clusters = 3)
cluster.fit(X)

KMeans(n_clusters=3)

In [22]:
cluster.labels_

array([1, 2, 0, ..., 1, 1, 2], dtype=int32)

grab our cluster labels and re-add them to our features so we have session_id and corresponding cluster

In [28]:
y = userFeatures.groupby('session_id').mean()
#we clustered on session id, so y lets us add the labels by user

In [29]:
y['Cluster'] = cluster.labels_

In [30]:
userOne = y[y['Cluster'] == 0]
userTwo = y[y['Cluster'] == 1]
userThree = y[y['Cluster'] == 2]

In [31]:
userOneFeatures = userOne.merge(nonModifiedFeatures, on = 'session_id', how = 'inner')
userTwoFeatures = userTwo.merge(nonModifiedFeatures, on = 'session_id', how = 'inner')
userThreeFeatures = userThree.merge(nonModifiedFeatures, on = 'session_id', how = 'inner')

In [32]:
userOneFeatures['not_skipped'] = userOneFeatures['not_skipped'].apply(lambda x: 1 if x == True else 0)
userTwoFeatures['not_skipped'] = userTwoFeatures['not_skipped'].apply(lambda x: 1 if x == True else 0)
userThreeFeatures['not_skipped'] = userThreeFeatures['not_skipped'].apply(lambda x: 1 if x == True else 0)

In [33]:
userOneFeatures['premium']= userOneFeatures['premium'].apply(lambda x: 1 if x is True else 0)
userOneFeatures['hist_user_behavior_is_shuffle'] = userOneFeatures['hist_user_behavior_is_shuffle'].apply(lambda x: 1 if x is True else 0)

userTwoFeatures['premium']= userTwoFeatures['premium'].apply(lambda x: 1 if x is True else 0)
userTwoFeatures['hist_user_behavior_is_shuffle'] = userTwoFeatures['hist_user_behavior_is_shuffle'].apply(lambda x: 1 if x is True else 0)

userThreeFeatures['premium']= userThreeFeatures['premium'].apply(lambda x: 1 if x is True else 0)
userThreeFeatures['hist_user_behavior_is_shuffle'] = userThreeFeatures['hist_user_behavior_is_shuffle'].apply(lambda x: 1 if x is True else 0)

In [34]:
userOneFeatures.drop(['acousticness_x', 'beat_strength_x', 'bounciness_x',
       'danceability_x', 'dyn_range_mean_x', 'energy_x', 'flatness_x',
       'instrumentalness_x', 'key_x', 'liveness_x', 'loudness_x',
       'mechanism_x', 'mode_x', 'organism_x', 'speechiness_x', 'tempo_x',
       'time_signature_x', 'valence_x', 'acoustic_vector_0_x',
       'acoustic_vector_1_x', 'acoustic_vector_2_x', 'acoustic_vector_3_x',
       'acoustic_vector_4_x', 'acoustic_vector_5_x', 'acoustic_vector_6_x',
       'acoustic_vector_7_x', 'Cluster', 'session_position', 'session_length', 'not_skipped', 'context_switch',
       'no_pause_before_play', 'short_pause_before_play',
       'long_pause_before_play', 'hist_user_behavior_n_seekfwd',
       'hist_user_behavior_n_seekback', 'hist_user_behavior_is_shuffle',
       'hour_of_day', 'premium', 'context_type',
       'hist_user_behavior_reason_start', 'duration', 'release_year',
       'us_popularity_estimate'], axis = 1, inplace = True)
userTwoFeatures.drop(['acousticness_x', 'beat_strength_x', 'bounciness_x',
       'danceability_x', 'dyn_range_mean_x', 'energy_x', 'flatness_x',
       'instrumentalness_x', 'key_x', 'liveness_x', 'loudness_x',
       'mechanism_x', 'mode_x', 'organism_x', 'speechiness_x', 'tempo_x',
       'time_signature_x', 'valence_x', 'acoustic_vector_0_x',
       'acoustic_vector_1_x', 'acoustic_vector_2_x', 'acoustic_vector_3_x',
       'acoustic_vector_4_x', 'acoustic_vector_5_x', 'acoustic_vector_6_x',
       'acoustic_vector_7_x', 'Cluster', 'session_position', 'session_length', 'not_skipped', 'context_switch',
       'no_pause_before_play', 'short_pause_before_play',
       'long_pause_before_play', 'hist_user_behavior_n_seekfwd',
       'hist_user_behavior_n_seekback', 'hist_user_behavior_is_shuffle',
       'hour_of_day', 'premium', 'context_type',
       'hist_user_behavior_reason_start', 'duration', 'release_year',
       'us_popularity_estimate'], axis = 1, inplace = True)
userThreeFeatures.drop(['acousticness_x', 'beat_strength_x', 'bounciness_x',
       'danceability_x', 'dyn_range_mean_x', 'energy_x', 'flatness_x',
       'instrumentalness_x', 'key_x', 'liveness_x', 'loudness_x',
       'mechanism_x', 'mode_x', 'organism_x', 'speechiness_x', 'tempo_x',
       'time_signature_x', 'valence_x', 'acoustic_vector_0_x',
       'acoustic_vector_1_x', 'acoustic_vector_2_x', 'acoustic_vector_3_x',
       'acoustic_vector_4_x', 'acoustic_vector_5_x', 'acoustic_vector_6_x',
       'acoustic_vector_7_x', 'Cluster', 'session_position', 'session_length', 'not_skipped', 'context_switch',
       'no_pause_before_play', 'short_pause_before_play',
       'long_pause_before_play', 'hist_user_behavior_n_seekfwd',
       'hist_user_behavior_n_seekback', 'hist_user_behavior_is_shuffle',
       'hour_of_day', 'premium', 'context_type',
       'hist_user_behavior_reason_start', 'duration', 'release_year',
       'us_popularity_estimate'], axis = 1, inplace = True)

In [35]:
userOneFeatures.rename(columns = lambda x: x[:-2] if x[-2:] == '_y' else x, inplace = True)
userTwoFeatures.rename(columns = lambda x: x[:-2] if x[-2:] == '_y' else x, inplace = True)
userThreeFeatures.rename(columns = lambda x: x[:-2] if x[-2:] == '_y' else x, inplace = True)

In [36]:
userOneFeatures.drop(['track_id_clean', 
         'skip_1', 
         'skip_2', 
         'skip_3',
         'hist_user_behavior_reason_end',
         'track_id',
         'date'], 
        axis = 1, inplace = True)

In [37]:
userTwoFeatures.drop(['track_id_clean', 
         'skip_1', 
         'skip_2', 
         'skip_3',
         'hist_user_behavior_reason_end',
         'track_id',
         'date'], 
        axis = 1, inplace = True)

In [38]:
userThreeFeatures.drop(['track_id_clean', 
         'skip_1', 
         'skip_2', 
         'skip_3',
         'hist_user_behavior_reason_end',
         'track_id',
         'date'], 
        axis = 1, inplace = True)

In [39]:
userOneFeatures.drop(['acoustic_vector_0',
 'acoustic_vector_1',
 'acoustic_vector_2',
 'acoustic_vector_3',
 'acoustic_vector_4',
 'acoustic_vector_5',
 'acoustic_vector_6',
 'acoustic_vector_7',
 'beat_strength',
 'bounciness',
 'dyn_range_mean',
 'flatness',
 'mechanism',
 'organism'], axis = 1, inplace = True)

In [40]:
userTwoFeatures.drop(['acoustic_vector_0',
 'acoustic_vector_1',
 'acoustic_vector_2',
 'acoustic_vector_3',
 'acoustic_vector_4',
 'acoustic_vector_5',
 'acoustic_vector_6',
 'acoustic_vector_7',
 'beat_strength',
 'bounciness',
 'dyn_range_mean',
 'flatness',
 'mechanism',
 'organism'], axis = 1, inplace = True)

In [41]:
userThreeFeatures.drop(['acoustic_vector_0',
 'acoustic_vector_1',
 'acoustic_vector_2',
 'acoustic_vector_3',
 'acoustic_vector_4',
 'acoustic_vector_5',
 'acoustic_vector_6',
 'acoustic_vector_7',
 'beat_strength',
 'bounciness',
 'dyn_range_mean',
 'flatness',
 'mechanism',
 'organism'], axis = 1, inplace = True)

In [42]:
userOneFeatures['mode'] = userOneFeatures['mode'].apply(lambda x: 1 if x == 'major' else 0)
userTwoFeatures['mode'] = userTwoFeatures['mode'].apply(lambda x: 1 if x == 'major' else 0)
userThreeFeatures['mode'] = userThreeFeatures['mode'].apply(lambda x: 1 if x == 'major' else 0)

In [43]:
from spotifyAPI import Spotify

In [44]:
s = Spotify()

Client ID:
········
Client Secret:
········


In [45]:
features = s.get_playlist_features('Top 50 - USA')

In [206]:
import math
from spotifyAPI import Spotify

from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import FunctionTransformer
import numpy as np

class songRecommender():

    data = {}
    features = []
    predictFeatures = []

    def __init__(self, data, predict):
        '''
        data - our persona user's information
        predict - the new songs from the API
        '''
        
        self.data = self.parseData(self.dataPreprocessing(data))
        #parse the new data
        self.features = self.featureVector(self.data) 
        #generate features for the new data
        self.predictFeatures = self.featureAPIVector(predict)
        self.predictFeatures = self.scaleAPI(self.getPredict())
        #clean the api data
        
    def dataPreprocessing(self, data):
        
        cols = data.columns
        
        ss = ['acousticness', 'danceability', 'energy',
       'instrumentalness', 'key', 'liveness', 'loudness', 'mode',
       'speechiness', 'tempo', 'time_signature', 'valence']
        
        as_is = ['session_id']
        
        preproc = ColumnTransformer(
            transformers = [
                ('as_is', FunctionTransformer(lambda x: x), as_is),
                ('standard_scale', StandardScaler(), ss),
            ]
        )
        
        processed = pd.DataFrame(preproc.fit_transform(data), columns = cols)
        return processed
    
    def scaleAPI(self, data):
        p = data

        ss = ['acousticness', 'danceability', 'energy', 'instrumentalness', 'key',
           'liveness', 'loudness', 'mode', 'speechiness', 'tempo',
           'time_signature', 'valence']

        preproc = ColumnTransformer(
            transformers = [
                ('standard_scale', StandardScaler(), ss)
            ]
        )

        df = pd.DataFrame()
        for entry in p:
            temp = pd.DataFrame.from_dict(data = entry[1], orient = 'index').T
            df = pd.concat([temp, df])
        transformed = pd.DataFrame(preproc.fit_transform(df), columns = ss).to_dict(orient = 'records')
        transformedPredict = []

        for i in range(len(p)):
            transformedPredict.append((p[i][0], transformed[i])) 

        return transformedPredict

    def parseData(self, data):

        import json

        parsed = json.loads(data.to_json(orient = 'records'))
        cleaned = {}

        for line in parsed:


            featuresSet = ['acousticness', 'beat_strength', 'bounciness', 'danceability',
               'dyn_range_mean', 'energy', 'flatness', 'instrumentalness', 'key',
               'liveness', 'loudness', 'mechanism', 'mode', 'organism', 'speechiness',
               'tempo', 'time_signature', 'valence', 'acoustic_vector_0',
               'acoustic_vector_1', 'acoustic_vector_2', 'acoustic_vector_3',
               'acoustic_vector_4', 'acoustic_vector_5', 'acoustic_vector_6',
               'acoustic_vector_7']
            #get only user behaviors

            featuresDict = {k:v for k,v in line.items() if k in featuresSet}
            cleaned[line['session_id']] = featuresDict

        return cleaned

    def featureVector(self, data):
        #transform our dictionary of song features into a matrix of feature vectors
        vector = []

        for k in data:
            d = dict(sorted(data[k].items()))
            vector.append((k, d))

        return vector

    def featureAPIVector(self, data):
        #transform our API features into usable data
        vector = []
        keep = ['danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness', 'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo', 'time_signature']
        for d in data:
            temp = {k:v for k, v in d.items() if k in keep}
            temp = dict(sorted(temp.items()))
            vector.append((d['uri'],temp))

        return vector

    def getData(self):
        return self.data
    
    def getFeatures(self):
        return self.features
    
    def getPredict(self):
        return self.predictFeatures

    def cosine(self, feature, features, N):
        '''
        feature - a feature vector of tuples, with index 0 being link and 1 being the vector
        feature is the song from the API
        features - all feature vectors belonging to current persona user
        all the songs in our generated user (data)
        N - number of similiar songs we want to return
        '''
        similarities = []

        numer = 0
        denom1 = 0
        denom2 = 0

        for featureTwo in features:
            sim = 0
            numer = sum([a * b for a, b in zip(list(feature[1].values()), list(featureTwo[1].values()))])
            denom1 = sum([l ** 2 for l in list(feature[1].values())])
            denom2 = sum([l ** 2 for l in list(featureTwo[1].values())])
            denom = math.sqrt(denom1) * math.sqrt(denom2)
            if denom == 0:
                sim = 0
            sim = numer/denom

            similarities.append((sim, featureTwo[0]))

        similarities.sort(reverse = True)
        return similarities[:N]
    
    def similar(self, X, y):
        predictions = []
        for feature in X:
            entry = {feature[0]:self.cosine(feature, y, 1)}
            #figure out why it keeps returning 10 entries
            predictions.append(entry)
        return predictions


In [210]:
model = songRecommender(data = userOneFeatures, predict = features)

In [212]:
model.similar(model.getPredict(), model.getFeatures())[:5]

[{'spotify:track:00Blm7zeNqgYLPtW6zg8cj': [(0.9253165736736747,
    '65_04308840-6345-4fe5-a5f3-631e7c3a0ff5')]},
 {'spotify:track:0gplL1WMoJ6iYaPgMCL0gX': [(0.9419606161901593,
    '48_6f87f333-6bf3-4c4f-86a2-54f63f2dfcfe')]},
 {'spotify:track:27NovPIUIRrOZoCHxABJwK': [(0.9478417575519787,
    '52_48b015ff-bd82-45f3-9548-407271d1cbb2')]},
 {'spotify:track:4R67rQNSbbsR4TdUVOIdez': [(0.9721407698947252,
    '9_6d01c643-1311-4efb-9953-986e23fa3954')]},
 {'spotify:track:4iN16F8JtVxG2UTzp3avGl': [(0.9375976869649639,
    '34_6d6ec1ac-958d-4bd4-9a19-f26ba36162f7')]}]

In [247]:
# #if we wanted to get nearest centroid
# from sklearn.neighbors.nearest_centroid import NearestCentroid
# clf = NearestCentroid()
# clf.fit(features, userFeatures['session_id'].unique())
# clf.centroids_