In [104]:
import pandas as pd
import numpy as np

class BehaviorModel:
    '''Class for modeling user behavior in Spotify'''
    
    def __init__(self, user_list, df):
        self.model = {}
        self.df = df
        #key would be the session_id, value would be the centroids (one for skipped, one for not skipped)
        self.user_list = user_list
        self.universal_skipped = np.zeros(len(self.df.columns) - 2) #we subtract two for session_id and not_skipped
        self.universal_not_skipped = np.zeros(len(self.df.columns) - 2)
        self.cold_start = 0
    
    def train(self, df):
        '''
        user_list - a list of session_ids
        df = dataframe
        '''
        for user in self.user_list:
            temp_df = df[df['session_id'] == user]
            #filter our specific user
            
            group_df = temp_df.groupby('not_skipped').mean()
            #group by skipped or not
            
            centroid_skipped = np.array(group_df.iloc[0])
            #grab all the skipped averages, vectorize it into np array
            centroid_not_skipped = np.array(group_df.iloc[1])
            #grab all the skipped, vectorize it into np array
            
            self.model['user'] = [centroid_skipped, centroid_not_skipped]
            #index 0 is skipped, 1 is not skipped
            
            self.universal_skipped += centroid_skipped
            self.universal_not_skipped += centroid_not_skipped
        
        if self.cold_start > 0:
            self.universal_skipped /= self.cold_start
            self.universal_not_skipped /= self.cold_start
            
    
    def predict(self, user_id, track_id):
        '''
        user_id - the id of the user we want to predict
        track_id - the feature vector for the specific song
        '''
        if user_id in self.user_list:
            centroid_skipped, centroid_not_skipped = self.model['user'][0], self.model['user'][1]
            
            skipped = np.linalg.norm(track_id - centroid_skipped)
            not_skipped = np.linalg.norm(track_id - centroid_not_skipped)
            #euclidean distance for track_id vector and skipped/not_skipped centroids
            
            return skipped < not_skipped
            #return true for skipped, false for not skipped
        else:
            '''
            what happens if for example we have:
            two different users who listened to a single song in a session
            - one skipped
            - one did not
            
            both would generate a distance of zero, how do we know if they skipped or not skipped
            current solution: randomly generate skipped or not skipped
            '''
            
            self.cold_start += 1
        
            #cold start solution for user_id?
            skipped = np.linalg.norm(track_id - self.universal_skipped)
            not_skipped = np.linalg.norm(track_id - self.universal_not_skipped)
            
            return skipped < not_skipped
            
            

In [101]:
from pyspark.sql import functions as f
from pyspark.sql import SparkSession
import os
import pandas as pd

In [3]:
spark = SparkSession.builder.getOrCreate()
#start a spark session

In [4]:
spark_fp = os.path.join("/", "Volumes", "Marceline Jr.", "Spotify Dataset", "sample_sets", "track_features_subset_0.csv")
spark_fp

'/Volumes/Marceline Jr./Spotify Dataset/sample_sets/track_features_subset_0.csv'

In [5]:
df = spark.read.load(spark_fp, 
                      format="csv", inferSchema="true", header="true")
#load our dataframe lazily so we can sample from it

In [6]:
pandas_df = df.limit(1000000).toPandas()
#if your spark session crashes, reduce the limit of the items you want

In [7]:
tf_path_one = os.path.join("/", "Volumes", "Marceline Jr.", "Spotify Dataset", "track_features", "tf_000000000000.csv")
tf_path_two = os.path.join("/", "Volumes", "Marceline Jr.", "Spotify Dataset", "track_features", "tf_000000000001.csv")

In [8]:
track_features_one = pd.read_csv(tf_path_one)
track_features_two = pd.read_csv(tf_path_two)

In [9]:
track_features = pd.concat([track_features_one, track_features_two])

In [115]:
df = pd.merge(pandas_df, track_features, left_on = 'track_id_clean', right_on = 'track_id')

In [118]:
track = df[df['session_id'] == '15_380928ea-5bd7-4b06-8bd9-91c33bb1c0a1']
track

Unnamed: 0,session_id,session_position,session_length,track_id_clean,skip_1,skip_2,skip_3,not_skipped,context_switch,no_pause_before_play,...,time_signature,valence,acoustic_vector_0,acoustic_vector_1,acoustic_vector_2,acoustic_vector_3,acoustic_vector_4,acoustic_vector_5,acoustic_vector_6,acoustic_vector_7
382034,15_380928ea-5bd7-4b06-8bd9-91c33bb1c0a1,1,10,t_4da9d26e-4586-4de8-9e9b-4c7254401217,False,False,False,True,0,0,...,4,0.569242,-0.065038,0.673086,-0.13073,-0.513597,0.135511,0.171054,-0.202468,0.441188
382035,15_380928ea-5bd7-4b06-8bd9-91c33bb1c0a1,9,10,t_4da9d26e-4586-4de8-9e9b-4c7254401217,False,False,False,True,0,1,...,4,0.569242,-0.065038,0.673086,-0.13073,-0.513597,0.135511,0.171054,-0.202468,0.441188
747564,15_380928ea-5bd7-4b06-8bd9-91c33bb1c0a1,3,10,t_9c38085f-2d96-49a3-b415-c759b4cc4aa4,False,False,False,True,0,1,...,4,0.497076,-0.065156,0.6972,-0.101994,-0.536196,0.106963,0.153413,-0.248453,0.323199
765753,15_380928ea-5bd7-4b06-8bd9-91c33bb1c0a1,5,10,t_71a9790b-bb35-439d-b1dd-73e1da07a1bf,False,False,True,False,0,0,...,4,0.70106,-0.011229,0.712856,-0.200752,-0.549358,0.143816,0.172931,-0.210969,0.607714


In [119]:
df.drop(['skip_1', 
         'skip_2', 
         'skip_3',
         'hist_user_behavior_reason_end', 
         'track_id',
         'date'], 
        axis = 1, inplace = True)

In [120]:
df['not_skipped'] = df['not_skipped'].apply(lambda x: 1 if x == True else 0)

In [121]:
df['premium']= df['premium'].apply(lambda x: 1 if x is True else 0)
df['hist_user_behavior_is_shuffle'] = df['hist_user_behavior_is_shuffle'].apply(lambda x: 1 if x is True else 0)

In [122]:
df[df['session_id'] == '15_380928ea-5bd7-4b06-8bd9-91c33bb1c0a1']

Unnamed: 0,session_id,session_position,session_length,track_id_clean,not_skipped,context_switch,no_pause_before_play,short_pause_before_play,long_pause_before_play,hist_user_behavior_n_seekfwd,...,time_signature,valence,acoustic_vector_0,acoustic_vector_1,acoustic_vector_2,acoustic_vector_3,acoustic_vector_4,acoustic_vector_5,acoustic_vector_6,acoustic_vector_7
382034,15_380928ea-5bd7-4b06-8bd9-91c33bb1c0a1,1,10,t_4da9d26e-4586-4de8-9e9b-4c7254401217,1,0,0,0,0,0,...,4,0.569242,-0.065038,0.673086,-0.13073,-0.513597,0.135511,0.171054,-0.202468,0.441188
382035,15_380928ea-5bd7-4b06-8bd9-91c33bb1c0a1,9,10,t_4da9d26e-4586-4de8-9e9b-4c7254401217,1,0,1,0,0,0,...,4,0.569242,-0.065038,0.673086,-0.13073,-0.513597,0.135511,0.171054,-0.202468,0.441188
747564,15_380928ea-5bd7-4b06-8bd9-91c33bb1c0a1,3,10,t_9c38085f-2d96-49a3-b415-c759b4cc4aa4,1,0,1,0,0,0,...,4,0.497076,-0.065156,0.6972,-0.101994,-0.536196,0.106963,0.153413,-0.248453,0.323199
765753,15_380928ea-5bd7-4b06-8bd9-91c33bb1c0a1,5,10,t_71a9790b-bb35-439d-b1dd-73e1da07a1bf,0,0,0,1,1,0,...,4,0.70106,-0.011229,0.712856,-0.200752,-0.549358,0.143816,0.172931,-0.210969,0.607714


# Preprocessing

In [123]:
from sklearn.pipeline import Pipeline

from sklearn.preprocessing import FunctionTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

from sklearn.model_selection import train_test_split

In [137]:
as_is = ['session_position', 'session_length','hist_user_behavior_is_shuffle',
       'hour_of_day','premium','duration', 
       'release_year', 'us_popularity_estimate', 'acousticness',
       'beat_strength', 'bounciness', 'danceability', 'dyn_range_mean',
       'energy', 'flatness', 'instrumentalness', 'liveness', 'loudness',
       'mechanism', 'key', 'organism', 'speechiness', 'tempo',
       'time_signature', 'valence', 'acoustic_vector_0', 'acoustic_vector_1',
       'acoustic_vector_2', 'acoustic_vector_3', 'acoustic_vector_4',
       'acoustic_vector_5', 'acoustic_vector_6', 'acoustic_vector_7', 'context_switch', 'no_pause_before_play', 'short_pause_before_play',
       'long_pause_before_play', 'hist_user_behavior_n_seekfwd',
       'hist_user_behavior_n_seekback', 'session_id', 'track_id_clean']
ohe = ['mode','context_type', 'hist_user_behavior_reason_start']

In [138]:
preproc = ColumnTransformer(
    transformers = [
        ('as_is', FunctionTransformer(lambda x: x), as_is),
        ('one_hot', OneHotEncoder(handle_unknown = 'ignore'), ohe),
    ]
)

In [139]:
pl = Pipeline(steps = [('preprocessor', preproc)])

In [140]:
not_skipped = df['not_skipped']
not_skipped

0         1
1         1
2         1
3         1
4         0
         ..
999995    0
999996    1
999997    0
999998    0
999999    0
Name: not_skipped, Length: 1000000, dtype: int64

In [158]:
transformed = pd.DataFrame(pl.fit_transform(df))

In [159]:
transformed = transformed.assign(not_skipped = not_skipped)

In [160]:
transformed = transformed.astype({i: 'float' for i in range(len(transformed.columns)-1) if i != 39 or i != 40}, errors = 'ignore')

In [161]:
transformed.rename({39:"session_id"},axis = 1, inplace = True)
transformed.rename({40:"track_id"},axis = 1, inplace = True)

In [162]:
transformed.iloc[:, 40:50]

Unnamed: 0,track_id,41,42,43,44,45,46,47,48,49
0,t_1131add1-4106-4a84-a117-63b8145cb4a8,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1,t_1131add1-4106-4a84-a117-63b8145cb4a8,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2,t_1131add1-4106-4a84-a117-63b8145cb4a8,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
3,t_1131add1-4106-4a84-a117-63b8145cb4a8,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
4,t_1131add1-4106-4a84-a117-63b8145cb4a8,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...
999995,t_bc66f6fd-d5fa-4ce4-ad22-3fd547568e40,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
999996,t_72b1ddec-7654-4333-b752-596d53193218,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
999997,t_7841dbcd-44e4-427c-b00a-b2e99eb4a608,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
999998,t_b562eb55-9cea-4a34-b25b-cde66b0fc4a5,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0


In [163]:
len(transformed.columns) - 1

59

In [164]:
transformed.dtypes

0              float64
1              float64
2              float64
3              float64
4              float64
5              float64
6              float64
7              float64
8              float64
9              float64
10             float64
11             float64
12             float64
13             float64
14             float64
15             float64
16             float64
17             float64
18             float64
19             float64
20             float64
21             float64
22             float64
23             float64
24             float64
25             float64
26             float64
27             float64
28             float64
29             float64
30             float64
31             float64
32             float64
33             float64
34             float64
35             float64
36             float64
37             float64
38             float64
session_id      object
track_id        object
41             float64
42             float64
43         

In [168]:
transformed[transformed['session_id'] == '15_380928ea-5bd7-4b06-8bd9-91c33bb1c0a1']['track_id']

382034    t_4da9d26e-4586-4de8-9e9b-4c7254401217
382035    t_4da9d26e-4586-4de8-9e9b-4c7254401217
747564    t_9c38085f-2d96-49a3-b415-c759b4cc4aa4
765753    t_71a9790b-bb35-439d-b1dd-73e1da07a1bf
Name: track_id, dtype: object

In [178]:
transformed[transformed['track_id'] == 't_4da9d26e-4586-4de8-9e9b-4c7254401217'][transformed['session_id'] == '15_380928ea-5bd7-4b06-8bd9-91c33bb1c0a1'][0]

  transformed[transformed['track_id'] == 't_4da9d26e-4586-4de8-9e9b-4c7254401217'][transformed['session_id'] == '15_380928ea-5bd7-4b06-8bd9-91c33bb1c0a1']


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,50,51,52,53,54,55,56,57,58,not_skipped
382034,1.0,10.0,0.0,15.0,1.0,224.0,2017.0,99.986662,4.5e-05,0.318477,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
382035,9.0,10.0,0.0,15.0,1.0,224.0,2017.0,99.986662,4.5e-05,0.318477,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1


In [186]:
transformed.drop('track_id', axis = 1, inplace = True)

In [187]:
model = BehaviorModel(['15_380928ea-5bd7-4b06-8bd9-91c33bb1c0a1'], transformed)

In [188]:
model.train(transformed)

In [None]:
model.predict('15_380928ea-5bd7-4b06-8bd9-91c33bb1c0a1',)

In [86]:
temp_df = transformed[transformed['session_id'] == '15_380928ea-5bd7-4b06-8bd9-91c33bb1c0a1']
temp_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,49,50,51,52,53,54,55,56,57,not_skipped
382034,1.0,10.0,0.0,15.0,1.0,224.0,2017.0,99.986662,4.5e-05,0.318477,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
382035,9.0,10.0,0.0,15.0,1.0,224.0,2017.0,99.986662,4.5e-05,0.318477,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
747564,3.0,10.0,0.0,15.0,1.0,201.479996,2016.0,99.984167,0.000475,0.341451,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1
765753,5.0,10.0,0.0,15.0,1.0,226.792831,2016.0,99.961557,3.3e-05,0.30012,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0


In [180]:
group_df = temp_df.groupby('not_skipped').mean()
group_df

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,48,49,50,51,52,53,54,55,56,57
not_skipped,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,5.0,10.0,0.0,15.0,1.0,226.792831,2016.0,99.961557,3.3e-05,0.30012,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,4.333333,10.0,0.0,15.0,1.0,216.493332,2016.666667,99.98583,0.000189,0.326135,...,0.0,0.666667,0.0,0.0,0.0,0.0,0.0,0.0,0.333333,0.0


In [88]:
centroid_skipped = group_df.iloc[0]
centroid_skipped

0        5.000000
1       10.000000
2        0.000000
3       15.000000
4        1.000000
5      226.792831
6     2016.000000
7       99.961557
8        0.000033
9        0.300120
10       0.250326
11       0.405019
12       4.874842
13       0.952840
14       0.961378
15       0.000331
16       0.220723
17      -3.530000
18       0.903226
19       6.000000
20       0.068430
21       0.077746
22     169.973999
23       4.000000
24       0.701060
25      -0.011229
26       0.712856
27      -0.200752
28      -0.549358
29       0.143816
30       0.172931
31      -0.210969
32       0.607714
33       0.000000
34       0.000000
35       1.000000
36       1.000000
37       0.000000
38       1.000000
40       1.000000
41       0.000000
42       0.000000
43       0.000000
44       0.000000
45       0.000000
46       1.000000
47       0.000000
48       0.000000
49       0.000000
50       0.000000
51       0.000000
52       0.000000
53       0.000000
54       0.000000
55       0.000000
56       1

In [179]:
centroid_not_skipped = np.array(group_df.iloc[1])
centroid_not_skipped

array([ 4.33333333e+00,  1.00000000e+01,  0.00000000e+00,  1.50000000e+01,
        1.00000000e+00,  2.16493332e+02,  2.01666667e+03,  9.99858302e+01,
        1.88696872e-04,  3.26135099e-01,  2.89706826e-01,  5.00326931e-01,
        5.39810912e+00,  9.38764970e-01,  9.50814227e-01,  1.04328429e-04,
        4.08539514e-01, -3.20133336e+00,  7.67115196e-01,  4.33333333e+00,
        1.64674565e-01,  5.68555159e-02,  1.13353999e+02,  4.00000000e+00,
        5.45186490e-01, -6.50775234e-02,  6.81124151e-01, -1.21151465e-01,
       -5.21129608e-01,  1.25994794e-01,  1.65173605e-01, -2.17796529e-01,
        4.01858509e-01,  0.00000000e+00,  6.66666667e-01,  0.00000000e+00,
        0.00000000e+00,  0.00000000e+00,  0.00000000e+00,  6.66666667e-01,
        3.33333333e-01,  0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
        0.00000000e+00,  1.00000000e+00,  0.00000000e+00,  0.00000000e+00,
        6.66666667e-01,  0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
        0.00000000e+00,  

In [99]:
universal_skipped = np.zeros(len(transformed.columns))