# Spotify Behaviors Project
### Brian Huang, Victor Thai, Annie Fan, Aishani Mohapatra

## Generating Sampled Data
Because our dataset is over 500 GB, we worked with a sample of it rather than loading the whole thing at once.

```Python
from pyspark.sql import functions as f
from pyspark.sql import SparkSession

import os
import pandas as pd
import random

# Start a Spark session (or reuse the one that is already running).
spark = SparkSession.builder.getOrCreate()

# Replace with the path to your own copy of the training set.
spark_fp = os.path.join("/", "Volumes", "Marceline Jr.", "Spotify Dataset", "training_set")

# Number of distinct sessions to keep in the sample.
# NOTE: the output file name below says "100000" although 50,000 sessions are
# drawn; the name is left unchanged because the clustering step loads the
# sample by exactly that name.
N_SAMPLED_SESSIONS = 50000

# Load the raw logs, taking column names from the header row.
df = spark.read.option("header", "true").csv(spark_fp)

# Sample whole sessions rather than individual rows, so that every sampled
# session keeps all of its listening events.
ids = df.select('session_id').distinct()
sampled_users = ids.orderBy(f.rand()).limit(N_SAMPLED_SESSIONS)

# Keep every row that belongs to a sampled session. A left-semi join does the
# filtering inside Spark, instead of collecting the ids to the driver and
# handing sampleBy a fractions dict with one entry per session. The final
# select pins the original column order.
samp_df = (
    df.join(f.broadcast(sampled_users), on = "session_id", how = "left_semi")
      .select(*df.columns)
)

# Write the sample out (Spark creates a directory of part files at this path).
samp_df.write.csv("./sampled_users_100000.csv", header = True)
```

### Cluster Users

In [19]:
from pyspark.sql import functions as f
from pyspark.sql import SparkSession
import os
import pandas as pd
import random

### Load in Data

In [25]:
# Get the active Spark session, or start one if none exists yet.
spark = SparkSession.builder.getOrCreate()

In [26]:
# Relative path of the sampled data produced by the sampling step.
spark_fp = "sampled_users_100000.csv"
spark_fp

'sampled_users_100000.csv'

In [27]:
# Read the sampled data with Spark, taking column names from the header row.
# NOTE(review): only the header option is set (no schema inference is requested),
# so every column is presumably loaded as a string — confirm before relying on
# numeric or boolean values from this frame.
df = spark.read.option("header", "true").csv(spark_fp)

In [28]:
# Bring the sampled listening rows onto the driver as a pandas DataFrame.
users = df.toPandas()

# Replace with the location of your own copy of the track-feature files.
tf_path_one = os.path.join("/", "Volumes", "Marceline Jr.", "Spotify Dataset", "track_features", "tf_000000000000.csv")
tf_path_two = os.path.join("/", "Volumes", "Marceline Jr.", "Spotify Dataset", "track_features", "tf_000000000001.csv")

track_features_one = pd.read_csv(tf_path_one)
track_features_two = pd.read_csv(tf_path_two)

# Stack the two feature files into one table. ignore_index gives the result a
# fresh 0..n-1 index instead of repeating each file's own row labels.
track_features = pd.concat([track_features_one, track_features_two], ignore_index = True)

# Attach each track's features to every listening row for that track.
userFeatures = pd.merge(users, track_features, left_on = 'track_id_clean', right_on = 'track_id')

# Keep an untouched copy of the merged table; userFeatures is modified in place
# during clustering. Copying yields the same frame as repeating the merge,
# without paying for a second join.
nonModifiedFeatures = userFeatures.copy()

### Clustering

In [29]:
# Remove the 24 columns at positions 1-24 of the merged frame, keeping column 0
# and everything from position 25 onward (presumably session_id plus the audio
# features — verify against the merged frame's column order).
cols = userFeatures.columns.tolist()
drop = cols[1:25]
userFeatures.drop(drop, axis = 1, inplace = True)

# Encode mode numerically: 1 for 'major', 0 for anything else.
userFeatures['mode'] = (userFeatures['mode'] == 'major').astype('int64')

# One row per session: the mean of each remaining feature over its tracks.
features = userFeatures.groupby('session_id').mean()

In [30]:
# Feature matrix for clustering: the per-session averages with the session_id
# index discarded, so each row is one session's average song features.
X = features.reset_index(drop = True)

In [31]:
from sklearn.cluster import KMeans
# Partition the per-session feature averages into three clusters.
# NOTE(review): no random_state is passed, so which sessions get label 0, 1 or 2
# may differ from run to run — consider fixing a seed for reproducibility.
cluster = KMeans(n_clusters = 3)
cluster.fit(X)

KMeans(n_clusters=3)

In [32]:
# Rebuild the per-session averages (indexed by session_id) and tag each session
# with its cluster label. The rows come from the same groupby the model was
# fitted on, so the labels line up with them by position.
y = userFeatures.groupby('session_id').mean().assign(Cluster = cluster.labels_)

In [33]:
# Split the labelled sessions into one frame per cluster (labels 0, 1 and 2).
userOne, userTwo, userThree = (y[y['Cluster'] == label] for label in range(3))

### Cleaning Clustered Users

In [34]:
def _to_flag(value):
    """Return 1 when value represents true, otherwise 0.

    The listening rows are read through Spark's CSV reader with only the
    header option set, so boolean-looking columns may hold the text
    "true"/"false" rather than real booleans. Comparing such text against
    True never matches, which would turn every flag into 0; text is
    therefore compared case-insensitively against "true" instead.
    """
    if isinstance(value, str):
        return 1 if value.strip().lower() == 'true' else 0
    # Non-text values keep the plain equality check so real booleans
    # (including numpy booleans) still map to 1.
    return 1 if value == True else 0


# Join each cluster's sessions back onto the full, unmodified listening rows.
userOneFeatures = userOne.merge(nonModifiedFeatures, on = 'session_id', how = 'inner')
userTwoFeatures = userTwo.merge(nonModifiedFeatures, on = 'session_id', how = 'inner')
userThreeFeatures = userThree.merge(nonModifiedFeatures, on = 'session_id', how = 'inner')

# Encode the boolean-like columns as 0/1 in each cluster frame.
for _frame in (userOneFeatures, userTwoFeatures, userThreeFeatures):
    for _col in ('not_skipped', 'premium', 'hist_user_behavior_is_shuffle'):
        _frame[_col] = _frame[_col].apply(_to_flag)

In [35]:
# Columns removed from every cluster frame: the `_x` copies produced by the
# merge (the per-session averages), the cluster label, and the session/track
# metadata listed below.
_session_level_columns = [
    'acousticness_x', 'beat_strength_x', 'bounciness_x',
    'danceability_x', 'dyn_range_mean_x', 'energy_x', 'flatness_x',
    'instrumentalness_x', 'key_x', 'liveness_x', 'loudness_x',
    'mechanism_x', 'mode_x', 'organism_x', 'speechiness_x', 'tempo_x',
    'time_signature_x', 'valence_x', 'acoustic_vector_0_x',
    'acoustic_vector_1_x', 'acoustic_vector_2_x', 'acoustic_vector_3_x',
    'acoustic_vector_4_x', 'acoustic_vector_5_x', 'acoustic_vector_6_x',
    'acoustic_vector_7_x', 'Cluster', 'session_position', 'session_length',
    'not_skipped', 'context_switch',
    'no_pause_before_play', 'short_pause_before_play',
    'long_pause_before_play', 'hist_user_behavior_n_seekfwd',
    'hist_user_behavior_n_seekback', 'hist_user_behavior_is_shuffle',
    'hour_of_day', 'premium', 'context_type',
    'hist_user_behavior_reason_start', 'duration', 'release_year',
    'us_popularity_estimate',
]

# Apply the same in-place drop to each of the three cluster frames.
for _frame in (userOneFeatures, userTwoFeatures, userThreeFeatures):
    _frame.drop(columns = _session_level_columns, inplace = True)

In [36]:
def _strip_y_suffix(name):
    """Return the column name without a trailing '_y', if it has one."""
    return name[:-2] if name.endswith('_y') else name


# Restore the plain column names on the remaining `_y` columns of each frame.
for _frame in (userOneFeatures, userTwoFeatures, userThreeFeatures):
    _frame.rename(columns = _strip_y_suffix, inplace = True)

In [37]:
# Track identifiers, the skip indicators, the end reason and the date are
# removed from every cluster frame.
_identifier_columns = [
    'track_id_clean',
    'skip_1',
    'skip_2',
    'skip_3',
    'hist_user_behavior_reason_end',
    'track_id',
    'date',
]

# Apply the same in-place drop to each of the three cluster frames.
for _frame in (userOneFeatures, userTwoFeatures, userThreeFeatures):
    _frame.drop(columns = _identifier_columns, inplace = True)

In [38]:
# Audio features removed from every cluster frame before modelling.
_extra_audio_columns = [
    'acoustic_vector_0',
    'acoustic_vector_1',
    'acoustic_vector_2',
    'acoustic_vector_3',
    'acoustic_vector_4',
    'acoustic_vector_5',
    'acoustic_vector_6',
    'acoustic_vector_7',
    'beat_strength',
    'bounciness',
    'dyn_range_mean',
    'flatness',
    'mechanism',
    'organism',
]

_cluster_frames = (userOneFeatures, userTwoFeatures, userThreeFeatures)

# Drop the extra audio features from all three frames first...
for _frame in _cluster_frames:
    _frame.drop(columns = _extra_audio_columns, inplace = True)

# ...then encode mode numerically: 1 for 'major', 0 for anything else.
for _frame in _cluster_frames:
    _frame['mode'] = (_frame['mode'] == 'major').astype('int64')

### API Calls

In [39]:
from spotifyAPI import Spotify

In [40]:
# Create the project's Spotify API client (from the spotifyAPI module).
# Per the recorded output below, this prompts for a client ID and client secret.
s = Spotify()

Client ID:
········
Client Secret:
········


In [41]:
# Fetch features for the 'Top 50 - USA' playlist (presumably per-track audio
# features — confirm in spotifyAPI).
# Note: this rebinds `features`, which earlier held the per-session feature
# averages used for clustering.
features = s.get_playlist_features('Top 50 - USA')

### Model

In [46]:
from songRecommender import songRecommender

In [49]:
# Build the recommender from the cleaned rows of the cluster labelled 0 (data)
# and the playlist features fetched above (predict).
model = songRecommender(data = userOneFeatures, predict = features)

In [50]:
# Show the first five results. Judging by the recorded output, each entry maps a
# playlist track URI to a list of (similarity score, session id) pairs —
# confirm against songRecommender.similar.
model.similar(model.getPredict(), model.getFeatures())[:5]

[{'spotify:track:00Blm7zeNqgYLPtW6zg8cj': [(0.9398258408776682,
    '17_c91fefd2-fd8e-4f27-a9a4-f30e22790e3b')]},
 {'spotify:track:0gplL1WMoJ6iYaPgMCL0gX': [(0.9713047616087557,
    '21_1735273a-cef0-4c2e-8f4f-e3b232f373d1')]},
 {'spotify:track:27NovPIUIRrOZoCHxABJwK': [(0.9442731254539111,
    '21_ca06c6ed-568f-4263-b478-cb7414ba2fdf')]},
 {'spotify:track:4R67rQNSbbsR4TdUVOIdez': [(0.95308792646697,
    '6_a82686e9-df9e-47b9-8fcb-6876e1b8dfce')]},
 {'spotify:track:4iN16F8JtVxG2UTzp3avGl': [(0.9774440083591326,
    '6_49506d1f-60e7-4b71-a6d6-207377321f9d')]}]