In [1]:
from pyspark.sql import functions as f
from pyspark.sql import SparkSession
import os
import pandas as pd

# Load the data in using PySpark

In [2]:
spark = SparkSession.builder.getOrCreate()
#start a spark session

In [3]:
spark_fp = os.path.join("/", "Volumes", "Marceline Jr.", "Spotify Dataset", "sample_sets", "track_features_subset_0.csv")
spark_fp

'/Volumes/Marceline Jr./Spotify Dataset/sample_sets/track_features_subset_0.csv'

In [4]:
df = spark.read.load(spark_fp, 
                      format="csv", inferSchema="true", header="true")
#load our dataframe lazily so we can sample from it

In [5]:
pandas_df = df.limit(1000000).toPandas()
#if your spark session crashes, reduce the limit of the items you want

# Joining our Behavior Data with Track Features

In [6]:
tf_path_one = os.path.join("/", "Volumes", "Marceline Jr.", "Spotify Dataset", "track_features", "tf_000000000000.csv")
tf_path_two = os.path.join("/", "Volumes", "Marceline Jr.", "Spotify Dataset", "track_features", "tf_000000000001.csv")

In [7]:
track_features_one = pd.read_csv(tf_path_one)
track_features_two = pd.read_csv(tf_path_two)

In [8]:
track_features = pd.concat([track_features_one, track_features_two])

In [9]:
df = pd.merge(pandas_df, track_features, left_on = 'track_id_clean', right_on = 'track_id')

# Cleaning

In [11]:
df.head()

Unnamed: 0,session_id,session_position,session_length,track_id_clean,skip_1,skip_2,skip_3,not_skipped,context_switch,no_pause_before_play,...,time_signature,valence,acoustic_vector_0,acoustic_vector_1,acoustic_vector_2,acoustic_vector_3,acoustic_vector_4,acoustic_vector_5,acoustic_vector_6,acoustic_vector_7
0,5_000074db-eb0b-4ecc-a9db-0e8e39f10198,9,10,t_1131add1-4106-4a84-a117-63b8145cb4a8,False,False,False,True,0,1,...,4,0.47763,-0.234292,0.358869,0.322848,-0.056939,-0.41339,0.21536,0.397757,-0.038523
1,3_e6e51948-e584-4765-b60c-ba039817d8e0,11,14,t_1131add1-4106-4a84-a117-63b8145cb4a8,False,False,False,True,0,1,...,4,0.47763,-0.234292,0.358869,0.322848,-0.056939,-0.41339,0.21536,0.397757,-0.038523
2,12_57db7507-b09f-49d5-a9bf-5aefe0ad15ec,16,20,t_1131add1-4106-4a84-a117-63b8145cb4a8,False,False,False,True,0,1,...,4,0.47763,-0.234292,0.358869,0.322848,-0.056939,-0.41339,0.21536,0.397757,-0.038523
3,53_9efd94ae-4f26-420f-96d9-eeb0996e1a89,9,10,t_1131add1-4106-4a84-a117-63b8145cb4a8,False,False,False,True,0,1,...,4,0.47763,-0.234292,0.358869,0.322848,-0.056939,-0.41339,0.21536,0.397757,-0.038523
4,54_b1108c89-9600-4026-af43-950bdb04b4d5,3,14,t_1131add1-4106-4a84-a117-63b8145cb4a8,False,False,True,False,0,0,...,4,0.47763,-0.234292,0.358869,0.322848,-0.056939,-0.41339,0.21536,0.397757,-0.038523


In [12]:
df.columns

Index(['session_id', 'session_position', 'session_length', 'track_id_clean',
       'skip_1', 'skip_2', 'skip_3', 'not_skipped', 'context_switch',
       'no_pause_before_play', 'short_pause_before_play',
       'long_pause_before_play', 'hist_user_behavior_n_seekfwd',
       'hist_user_behavior_n_seekback', 'hist_user_behavior_is_shuffle',
       'hour_of_day', 'date', 'premium', 'context_type',
       'hist_user_behavior_reason_start', 'hist_user_behavior_reason_end',
       'track_id', 'duration', 'release_year', 'us_popularity_estimate',
       'acousticness', 'beat_strength', 'bounciness', 'danceability',
       'dyn_range_mean', 'energy', 'flatness', 'instrumentalness', 'key',
       'liveness', 'loudness', 'mechanism', 'mode', 'organism', 'speechiness',
       'tempo', 'time_signature', 'valence', 'acoustic_vector_0',
       'acoustic_vector_1', 'acoustic_vector_2', 'acoustic_vector_3',
       'acoustic_vector_4', 'acoustic_vector_5', 'acoustic_vector_6',
       'acoustic_vector

In [13]:
df.drop(['track_id_clean', 
         'skip_1', 
         'skip_2', 
         'skip_3',
         'hist_user_behavior_reason_end', 
         'session_id', #session id would bring our sklearn model down bc its not just a single value
         #different users don't consistently listen to the same amount of songs, vectors could be differing size
         #listen to different types of songs
         #session_id does not encode that information, it tells me nothing about my user's behavior even thought it is a numrical value
         #session_id points to the group of songs that the user listens to
         'track_id',
         'date'], 
        axis = 1, inplace = True)

In [16]:
df['not_skipped'] = df['not_skipped'].apply(lambda x: 1 if x == True else 0)

In [17]:
df['premium']= df['premium'].apply(lambda x: 1 if x is True else 0)
df['hist_user_behavior_is_shuffle'] = df['hist_user_behavior_is_shuffle'].apply(lambda x: 1 if x is True else 0)

# Baseline Model

In [15]:
from sklearn.pipeline import Pipeline

from sklearn.preprocessing import FunctionTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

from sklearn.model_selection import train_test_split

from sklearn.tree import DecisionTreeClassifier

In [18]:
as_is = ['session_position', 'session_length','hist_user_behavior_is_shuffle',
       'hour_of_day','premium','duration', 
       'release_year', 'us_popularity_estimate', 'acousticness',
       'beat_strength', 'bounciness', 'danceability', 'dyn_range_mean',
       'energy', 'flatness', 'instrumentalness', 'liveness', 'loudness',
       'mechanism', 'key', 'organism', 'speechiness', 'tempo',
       'time_signature', 'valence', 'acoustic_vector_0', 'acoustic_vector_1',
       'acoustic_vector_2', 'acoustic_vector_3', 'acoustic_vector_4',
       'acoustic_vector_5', 'acoustic_vector_6', 'acoustic_vector_7', 'context_switch', 'no_pause_before_play', 'short_pause_before_play',
       'long_pause_before_play', 'hist_user_behavior_n_seekfwd',
       'hist_user_behavior_n_seekback']
ohe = ['mode','context_type', 'hist_user_behavior_reason_start']

In [19]:
preproc = ColumnTransformer(
    transformers = [
        ('as_is', FunctionTransformer(lambda x: x), as_is),
        ('one_hot', OneHotEncoder(handle_unknown = 'ignore'), ohe),
    ]
)

In [37]:
pl = Pipeline(steps = [('preprocessor', preproc), ('classifier', DecisionTreeClassifier(max_depth = 10))])
x_train, x_test, y_train, y_test = train_test_split(df.drop('not_skipped', axis = 1), df['not_skipped'], test_size= 0.3)

In [38]:
pl.fit(x_train, y_train)

Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('as_is',
                                                  FunctionTransformer(func=<function <lambda> at 0x7f9c7c554f70>),
                                                  ['session_position',
                                                   'session_length',
                                                   'hist_user_behavior_is_shuffle',
                                                   'hour_of_day', 'premium',
                                                   'duration', 'release_year',
                                                   'us_popularity_estimate',
                                                   'acousticness',
                                                   'beat_strength',
                                                   'bounciness', 'danceability',
                                                   'dyn_range_me...
                                                   '

In [39]:
pl.score(x_test, y_test)

0.8062433333333333

# Model Selection

In [40]:
from sklearn.model_selection import GridSearchCV

In [41]:
parameters = {'classifier__max_depth': [2, 4, 8, 16, 32, 64, 128]}

In [42]:
clf = GridSearchCV(pl, parameters)

In [43]:
clf.fit(df.drop('not_skipped', axis = 1), df['not_skipped'])

GridSearchCV(estimator=Pipeline(steps=[('preprocessor',
                                        ColumnTransformer(transformers=[('as_is',
                                                                         FunctionTransformer(func=<function <lambda> at 0x7f9c7c554f70>),
                                                                         ['session_position',
                                                                          'session_length',
                                                                          'hist_user_behavior_is_shuffle',
                                                                          'hour_of_day',
                                                                          'premium',
                                                                          'duration',
                                                                          'release_year',
                                                                          'us_pop

In [44]:
pd.DataFrame(clf.cv_results_).sort_values('rank_test_score')

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_classifier__max_depth,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
2,15.058947,0.749541,0.33078,0.036264,8,{'classifier__max_depth': 8},0.796565,0.79937,0.80723,0.8083,0.81724,0.805741,0.007291,1
1,11.140927,0.691851,0.416819,0.01009,4,{'classifier__max_depth': 4},0.796015,0.79755,0.805635,0.80706,0.815775,0.804407,0.007145,2
0,6.880691,0.555506,0.44385,0.020228,2,{'classifier__max_depth': 2},0.78893,0.791805,0.799395,0.80206,0.81109,0.798656,0.007852,3
3,33.722463,3.229069,0.452705,0.011839,16,{'classifier__max_depth': 16},0.78687,0.789775,0.7994,0.79919,0.80077,0.795201,0.005717,4
4,45.220576,3.340465,0.481467,0.01119,32,{'classifier__max_depth': 32},0.69805,0.719485,0.74265,0.730505,0.722855,0.722709,0.014676,5
6,62.722947,2.216196,0.525923,0.014851,128,{'classifier__max_depth': 128},0.668495,0.68524,0.704155,0.7016,0.70157,0.692212,0.013634,6
5,53.453913,1.149052,0.521956,0.040741,64,{'classifier__max_depth': 64},0.66578,0.685065,0.703895,0.70123,0.70159,0.691512,0.014512,7


In [49]:
parameters = {'classifier__max_depth': [4, 5, 6, 7, 8, 9, 10, 11, 12]}

In [50]:
clf = GridSearchCV(pl, parameters)

In [51]:
clf.fit(df.drop('not_skipped', axis = 1), df['not_skipped'])

GridSearchCV(estimator=Pipeline(steps=[('preprocessor',
                                        ColumnTransformer(transformers=[('as_is',
                                                                         FunctionTransformer(func=<function <lambda> at 0x7f9c7c554f70>),
                                                                         ['session_position',
                                                                          'session_length',
                                                                          'hist_user_behavior_is_shuffle',
                                                                          'hour_of_day',
                                                                          'premium',
                                                                          'duration',
                                                                          'release_year',
                                                                          'us_pop

In [53]:
pd.DataFrame(clf.cv_results_).sort_values('rank_test_score')

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_classifier__max_depth,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
5,23.395849,1.920926,0.426668,0.015727,9,{'classifier__max_depth': 9},0.79799,0.799875,0.80633,0.808395,0.817105,0.805939,0.006794,1
3,19.347779,1.507222,0.450366,0.012763,7,{'classifier__max_depth': 7},0.797855,0.799585,0.807075,0.808125,0.81701,0.80593,0.006846,2
6,25.924785,2.829886,0.439759,0.016299,10,{'classifier__max_depth': 10},0.79827,0.799805,0.807145,0.807885,0.815935,0.805808,0.006348,3
4,21.677549,1.282225,0.446597,0.008155,8,{'classifier__max_depth': 8},0.796555,0.799345,0.80726,0.80831,0.81727,0.805748,0.007309,4
2,18.711478,2.550253,0.485569,0.035937,6,{'classifier__max_depth': 6},0.79763,0.79912,0.805905,0.808245,0.81703,0.805586,0.006975,5
1,13.460729,1.022904,0.426731,0.012101,5,{'classifier__max_depth': 5},0.797455,0.798855,0.806515,0.808,0.81671,0.805507,0.006954,6
7,29.449067,1.766647,0.475826,0.07696,11,{'classifier__max_depth': 11},0.797165,0.79995,0.806515,0.807665,0.81472,0.805203,0.006173,7
0,11.219257,0.886961,0.430473,0.011639,4,{'classifier__max_depth': 4},0.796015,0.79755,0.805635,0.80706,0.815775,0.804407,0.007145,8
8,32.420366,3.544485,0.450531,0.015929,12,{'classifier__max_depth': 12},0.79599,0.79889,0.80642,0.806365,0.813685,0.80427,0.006248,9


In [None]:
pl = Pipeline(steps = [('preprocessor', preproc), ('classifier', DecisionTreeClassifier(max_depth = 9))])
x_train, x_test, y_train, y_test = train_test_split(df.drop('not_skipped', axis = 1), df['not_skipped'], test_size= 0.3)

In [55]:
pl.fit(x_train, y_train)

Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('as_is',
                                                  FunctionTransformer(func=<function <lambda> at 0x7f9c7c554f70>),
                                                  ['session_position',
                                                   'session_length',
                                                   'hist_user_behavior_is_shuffle',
                                                   'hour_of_day', 'premium',
                                                   'duration', 'release_year',
                                                   'us_popularity_estimate',
                                                   'acousticness',
                                                   'beat_strength',
                                                   'bounciness', 'danceability',
                                                   'dyn_range_me...
                                                   '

In [56]:
pl.score(x_test,y_test)

0.8063466666666667

# Trying a Different Model

In [78]:
from sklearn.ensemble import RandomForestClassifier

In [79]:
pl = Pipeline(steps = [('preprocessor', preproc), ('classifier', RandomForestClassifier(n_estimators = 8, n_jobs = -1))])
x_train, x_test, y_train, y_test = train_test_split(df.drop('not_skipped', axis = 1), df['not_skipped'], test_size= 0.3)

In [80]:
pl.fit(x_train, y_train)

Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('as_is',
                                                  FunctionTransformer(func=<function <lambda> at 0x7f9c7c554f70>),
                                                  ['session_position',
                                                   'session_length',
                                                   'hist_user_behavior_is_shuffle',
                                                   'hour_of_day', 'premium',
                                                   'duration', 'release_year',
                                                   'us_popularity_estimate',
                                                   'acousticness',
                                                   'beat_strength',
                                                   'bounciness', 'danceability',
                                                   'dyn_range_me...
                                                   '

In [81]:
pl.score(x_test,y_test)

0.7837033333333333

In [84]:
parameters = {'classifier__n_estimators': [32, 64, 128, 256, 512, 1024]}

In [85]:
clf = GridSearchCV(pl, parameters)

In [86]:
clf.fit(df.drop('not_skipped', axis = 1), df['not_skipped'])

GridSearchCV(estimator=Pipeline(steps=[('preprocessor',
                                        ColumnTransformer(transformers=[('as_is',
                                                                         FunctionTransformer(func=<function <lambda> at 0x7f9c7c554f70>),
                                                                         ['session_position',
                                                                          'session_length',
                                                                          'hist_user_behavior_is_shuffle',
                                                                          'hour_of_day',
                                                                          'premium',
                                                                          'duration',
                                                                          'release_year',
                                                                          'us_pop

In [87]:
pd.DataFrame(clf.cv_results_).sort_values('rank_test_score')

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_classifier__n_estimators,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
4,493.644721,77.079485,14.885325,4.15041,512,{'classifier__n_estimators': 512},0.78749,0.79635,0.80701,0.80911,0.81764,0.80352,0.010499,1
5,1272.756923,152.589729,37.646357,3.854909,1024,{'classifier__n_estimators': 1024},0.78714,0.796735,0.80653,0.808895,0.817605,0.803381,0.010494,2
3,301.794083,14.355689,5.154152,0.891372,256,{'classifier__n_estimators': 256},0.787365,0.79656,0.806785,0.80852,0.817365,0.803319,0.010359,3
2,158.939946,20.477871,2.975445,0.348243,128,{'classifier__n_estimators': 128},0.78678,0.795955,0.80616,0.80847,0.81687,0.802847,0.010439,4
1,103.558258,29.704227,3.679141,2.331547,64,{'classifier__n_estimators': 64},0.785345,0.79495,0.805725,0.807835,0.81625,0.802021,0.010752,5
0,36.325731,15.800015,1.075524,0.203144,32,{'classifier__n_estimators': 32},0.783085,0.792565,0.80409,0.806215,0.815035,0.800198,0.011159,6


In [72]:
parameters = {'classifier__n_estimators': [4, 5, 6, 7, 8, 9, 10, 11, 12]}
clf = GridSearchCV(pl, parameters)

In [73]:
clf.fit(df.drop('not_skipped', axis = 1), df['not_skipped'])

GridSearchCV(estimator=Pipeline(steps=[('preprocessor',
                                        ColumnTransformer(transformers=[('as_is',
                                                                         FunctionTransformer(func=<function <lambda> at 0x7f9c7c554f70>),
                                                                         ['session_position',
                                                                          'session_length',
                                                                          'hist_user_behavior_is_shuffle',
                                                                          'hour_of_day',
                                                                          'premium',
                                                                          'duration',
                                                                          'release_year',
                                                                          'us_pop

In [74]:
pd.DataFrame(clf.cv_results_).sort_values('rank_test_score')

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_classifier__max_depth,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
7,8.725957,0.236122,0.68597,0.043706,11,{'classifier__max_depth': 11},0.79612,0.796895,0.805025,0.806345,0.81434,0.803745,0.006719,1
8,10.641657,0.690064,0.832519,0.051622,12,{'classifier__max_depth': 12},0.793405,0.797965,0.80538,0.80645,0.81454,0.803548,0.007303,2
6,7.502289,0.08945,0.598998,0.023968,10,{'classifier__max_depth': 10},0.793865,0.797,0.804055,0.805475,0.81316,0.802711,0.006775,3
5,6.407775,0.358372,0.48848,0.046975,9,{'classifier__max_depth': 9},0.793945,0.79458,0.801815,0.80456,0.81525,0.80203,0.007773,4
4,6.241291,0.889217,0.502235,0.060703,8,{'classifier__max_depth': 8},0.793855,0.793375,0.802275,0.80473,0.813525,0.801552,0.007484,5
3,12.495577,8.042026,1.506721,1.268494,7,{'classifier__max_depth': 7},0.790735,0.79507,0.80095,0.805245,0.812245,0.800849,0.007548,6
2,5.634108,0.158232,0.504064,0.026732,6,{'classifier__max_depth': 6},0.78622,0.794165,0.799985,0.803345,0.808325,0.798408,0.007639,7
1,5.205431,0.098034,0.495176,0.009905,5,{'classifier__max_depth': 5},0.791425,0.78269,0.79561,0.80313,0.810925,0.796756,0.009685,8
0,4.716809,0.084837,0.496604,0.016767,4,{'classifier__max_depth': 4},0.77277,0.79242,0.799615,0.803345,0.784635,0.790557,0.010959,9


In [75]:
pl = Pipeline(steps = [('preprocessor', preproc), ('classifier', RandomForestClassifier(n_estimators = 11, n_jobs = -1))])
x_train, x_test, y_train, y_test = train_test_split(df.drop('not_skipped', axis = 1), df['not_skipped'], test_size= 0.3)

In [76]:
pl.fit(x_train, y_train)

Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('as_is',
                                                  FunctionTransformer(func=<function <lambda> at 0x7f9c7c554f70>),
                                                  ['session_position',
                                                   'session_length',
                                                   'hist_user_behavior_is_shuffle',
                                                   'hour_of_day', 'premium',
                                                   'duration', 'release_year',
                                                   'us_popularity_estimate',
                                                   'acousticness',
                                                   'beat_strength',
                                                   'bounciness', 'danceability',
                                                   'dyn_range_me...
                                                   '

In [77]:
pl.score(x_test, y_test)

0.78413