In [1]:
from pyspark.sql import functions as f
from pyspark.sql import SparkSession

import os
import pandas as pd
import random

# Load in the Data with PySpark

In [5]:
spark = SparkSession.builder.getOrCreate()

In [8]:
spark_fp = os.path.join("/", "Volumes", "Marceline Jr.", "Spotify Dataset", "training_set")
spark_fp

'/Volumes/Marceline Jr./Spotify Dataset/training_set'

In [9]:
df = spark.read.option("header", "true").csv(spark_fp)

In [10]:
df.createOrReplaceTempView("table")

In [15]:
df.select("session_id").show(10)

+--------------------+
|          session_id|
+--------------------+
|0_00006f66-33e5-4...|
|0_00006f66-33e5-4...|
|0_00006f66-33e5-4...|
|0_00006f66-33e5-4...|
|0_00006f66-33e5-4...|
|0_00006f66-33e5-4...|
|0_00006f66-33e5-4...|
|0_00006f66-33e5-4...|
|0_00006f66-33e5-4...|
|0_00006f66-33e5-4...|
+--------------------+
only showing top 10 rows



In [16]:
ids = df.select('session_id').distinct()

In [23]:
sampled_users = ids.orderBy(f.rand()).limit(100)

In [24]:
sampled_users_list = list(sampled_users.toPandas()['session_id'])

In [25]:
samp_fracs = {key:1 for key in sampled_users}

TypeError: unhashable type: 'Column'

In [None]:
samp_df = df.sampleBy("session_id", fractions = samp_fracs)

In [None]:
samp_df.write.csv("./sampled_users.csv", header = True)

# Generate Unique Users and Their Behaviors

# Load in the Track Features

In [None]:
tf_path_one = os.path.join("/", "Volumes", "Marceline Jr.", "Spotify Dataset", "track_features", "tf_000000000000.csv")
tf_path_two = os.path.join("/", "Volumes", "Marceline Jr.", "Spotify Dataset", "track_features", "tf_000000000001.csv")

In [None]:
track_features_one = pd.read_csv(tf_path_one)
track_features_two = pd.read_csv(tf_path_two)

In [None]:
track_features = pd.concat([track_features_one, track_features_two])

# Join the DataFrames Together

In [None]:
df = pd.merge(pandas_df, track_features, left_on = 'track_id_clean', right_on = 'track_id')

# Cleaning

In [None]:
df.drop(['track_id_clean', 
         'skip_1', 
         'skip_2', 
         'skip_3',
         'hist_user_behavior_reason_end', 
         'session_id', #session id would bring our sklearn model down bc its not just a single value
         #different users don't consistently listen to the same amount of songs, vectors could be differing size
         #listen to different types of songs
         #session_id does not encode that information, it tells me nothing about my user's behavior even thought it is a numrical value
         #session_id points to the group of songs that the user listens to
         'track_id',
         'date'], 
        axis = 1, inplace = True)

In [None]:
df['not_skipped'] = df['not_skipped'].apply(lambda x: 1 if x == True else 0)

In [None]:
df['premium']= df['premium'].apply(lambda x: 1 if x is True else 0)
df['hist_user_behavior_is_shuffle'] = df['hist_user_behavior_is_shuffle'].apply(lambda x: 1 if x is True else 0)

# Baseline Model

In [None]:
from sklearn.pipeline import Pipeline

from sklearn.preprocessing import FunctionTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

from sklearn.model_selection import train_test_split

from sklearn.tree import DecisionTreeClassifier

In [None]:
as_is = ['session_position', 'session_length','hist_user_behavior_is_shuffle',
       'hour_of_day','premium','duration', 
       'release_year', 'us_popularity_estimate', 'acousticness',
       'beat_strength', 'bounciness', 'danceability', 'dyn_range_mean',
       'energy', 'flatness', 'instrumentalness', 'liveness', 'loudness',
       'mechanism', 'key', 'organism', 'speechiness', 'tempo',
       'time_signature', 'valence', 'acoustic_vector_0', 'acoustic_vector_1',
       'acoustic_vector_2', 'acoustic_vector_3', 'acoustic_vector_4',
       'acoustic_vector_5', 'acoustic_vector_6', 'acoustic_vector_7', 'context_switch', 'no_pause_before_play', 'short_pause_before_play',
       'long_pause_before_play', 'hist_user_behavior_n_seekfwd',
       'hist_user_behavior_n_seekback']
ohe = ['mode','context_type', 'hist_user_behavior_reason_start']

In [None]:
preproc = ColumnTransformer(
    transformers = [
        ('as_is', FunctionTransformer(lambda x: x), as_is),
        ('one_hot', OneHotEncoder(handle_unknown = 'ignore'), ohe),
    ]
)

In [None]:
pl = Pipeline(steps = [('preprocessor', preproc), ('classifier', DecisionTreeClassifier(max_depth = 10))])
x_train, x_test, y_train, y_test = train_test_split(df.drop('not_skipped', axis = 1), df['not_skipped'], test_size= 0.3)

In [None]:
pl.fit(x_train, y_train)

In [None]:
pl.score(x_test,y_test)