In [1]:
import os
import spotipy
from spotipy.oauth2 import SpotifyOAuth
from spotipy.oauth2 import SpotifyClientCredentials

import pandas as pd
import json

import pyspark
from pyspark.sql import *
from pyspark.sql.functions import *

## Instantiate Spotify Session

Create a connection to Spotify's Web API in order to extract song and playlist data.

In [86]:
# Extract credentials from environment variables and instantiate the session.
# SpotifyClientCredentials() is called with no arguments, so nothing secret is
# hardcoded here; spotipy documents SPOTIPY_CLIENT_ID and SPOTIPY_CLIENT_SECRET
# as the variables it reads for this flow.
auth_manager = SpotifyClientCredentials()
sp = spotipy.Spotify(auth_manager=auth_manager)

In [87]:
def fetch_all_playlist_tracks(client, playlist_id):
    """Return a playlist's tracks with every result page merged into one dict.

    ``client.playlist_tracks`` returns a single page of results (spotipy's
    default page size is 100 items), so longer playlists would otherwise be
    silently truncated. This follows the ``next`` links until they run out.

    Parameters
    ----------
    client : spotipy.Spotify
        Authenticated Spotify client.
    playlist_id : str
        Spotify playlist id.

    Returns
    -------
    dict
        A copy of the first page whose ``"items"`` list holds the items of
        all pages and whose ``"next"`` is ``None``.
    """
    first_page = client.playlist_tracks(playlist_id)
    items = list(first_page["items"])

    page = first_page
    while page.get("next"):
        page = client.next(page)
        if page is None:  # defensive: no further page could be fetched
            break
        items.extend(page["items"])

    merged = dict(first_page)
    merged["items"] = items
    merged["next"] = None
    return merged


#grab playlists for user
myPlaylists = sp.user_playlists("ben.doan4366")

#extract tracks for gym and cooking playlists (all pages, not just the first)
gymPlaylist = fetch_all_playlist_tracks(sp, "1Vu8EHOlNPBFVBxmKtGHlr")
cookingPlaylist = fetch_all_playlist_tracks(sp, "5myx61jwlbF2iMDVWNK0Ps")

## Design Helper Functions to Transform Raw Data to Staging Table Format

In [88]:
#function to transform the raw json response from Spotify into a dataframe of artist-to-song rows

def prepArtistStagingTables(raw_playlist_dictionary):
    """Flatten a playlist-tracks response into one row per (song, artist) pair.

    Parameters
    ----------
    raw_playlist_dictionary : dict
        Response shaped like the result of ``sp.playlist_tracks``: an
        ``"items"`` list whose entries each carry a ``"track"`` dict with a
        ``"name"`` and a list of ``"artists"`` dicts.

    Returns
    -------
    pandas.DataFrame
        Columns ``song_name, artist_id, artist_name, type, uri`` (the last
        two describe the artist). The index is the track's position among
        the usable tracks, so it repeats for tracks with several artists.
        An empty playlist yields an empty frame with the same columns.

    Notes
    -----
    Items whose ``"track"`` is missing or ``None`` are skipped, and tracks
    without artists contribute no rows. Rows are built straight from the
    Python dicts rather than via a JSON round-trip, so string values (for
    example a numeric-looking song title) keep their original type.
    """
    columns = ["song_name", "artist_id", "artist_name", "type", "uri"]

    #trim meta data, keeping only items that actually carry a track
    tracks = [
        item["track"]
        for item in raw_playlist_dictionary["items"]
        if item and item.get("track")
    ]

    positions = []
    rows = []
    for position, track in enumerate(tracks):
        for artist in track.get("artists") or []:
            positions.append(position)
            rows.append(
                {
                    "song_name": track.get("name"),
                    "artist_id": artist.get("id"),
                    "artist_name": artist.get("name"),
                    "type": artist.get("type"),
                    "uri": artist.get("uri"),
                }
            )

    return pd.DataFrame(rows, index=positions, columns=columns)

In [5]:
def prepAlbumStagingTables(raw_playlist_dictionary):
    """Flatten a playlist-tracks response into one row per (song, album artist).

    Parameters
    ----------
    raw_playlist_dictionary : dict
        Response shaped like the result of ``sp.playlist_tracks``: an
        ``"items"`` list whose entries each carry a ``"track"`` dict with a
        ``"name"`` and an ``"album"`` dict (which itself has ``"artists"``).

    Returns
    -------
    pandas.DataFrame
        Columns ``album_name, album_id, artist_name, artist_id, song_name,
        release_date, album_uri``. The index is the track's position among
        the usable tracks, so it repeats when an album lists several
        artists. An empty playlist yields an empty frame with these columns.

    Notes
    -----
    Rows are assembled directly instead of exploding the album artists and
    re-joining frames column-wise: that join cannot align an index with
    repeated labels against the un-exploded frames, so it breaks as soon as
    an album has more than one artist. Items whose ``"track"`` is missing or
    ``None`` are skipped; tracks whose album lists no artists add no rows.
    """
    columns = [
        "album_name",
        "album_id",
        "artist_name",
        "artist_id",
        "song_name",
        "release_date",
        "album_uri",
    ]

    #trim meta data, keeping only items that actually carry a track
    tracks = [
        item["track"]
        for item in raw_playlist_dictionary["items"]
        if item and item.get("track")
    ]

    positions = []
    rows = []
    for position, track in enumerate(tracks):
        album = track.get("album") or {}
        for artist in album.get("artists") or []:
            positions.append(position)
            rows.append(
                {
                    "album_name": album.get("name"),
                    "album_id": album.get("id"),
                    "artist_name": artist.get("name"),
                    "artist_id": artist.get("id"),
                    "song_name": track.get("name"),
                    "release_date": album.get("release_date"),
                    "album_uri": album.get("uri"),
                }
            )

    return pd.DataFrame(rows, index=positions, columns=columns)

In [77]:
def generate_song_nodes(raw_playlist_dictionary):
    """Build the song-node table for one playlist.

    Parameters
    ----------
    raw_playlist_dictionary : dict
        Response shaped like the result of ``sp.playlist_tracks``: an
        ``"items"`` list whose entries each carry a ``"track"`` dict.

    Returns
    -------
    pandas.DataFrame
        One row per usable track with columns ``song_name, song_id, uri,
        duration_ms, explicit, popularity``. An empty playlist yields an
        empty frame with these columns.

    Notes
    -----
    Items whose ``"track"`` is missing or ``None`` are skipped. The frame is
    built directly from the track dicts rather than through a
    ``json.dumps`` / ``pd.read_json`` round-trip: that round-trip relies on
    literal-JSON input (deprecated in recent pandas) and lets ``read_json``
    re-infer column types, which can turn all-numeric song titles into
    numbers.
    """
    source_columns = ["name", "id", "uri", "duration_ms", "explicit", "popularity"]

    tracks = [
        item["track"]
        for item in raw_playlist_dictionary["items"]
        if item and item.get("track")
    ]

    # `columns=` picks only the node fields out of each track dict.
    song_nodes = pd.DataFrame(tracks, columns=source_columns)

    return song_nodes.rename(columns={"name": "song_name", "id": "song_id"})

In [60]:
def generate_artist_nodes(artist_df):
    """Project an artist staging table down to its node columns.

    Returns a new DataFrame holding only ``artist_name`` and ``artist_id``
    (in that order). Rows and index labels are carried over unchanged, so
    an artist that appears on several songs appears several times here.
    """
    node_columns = ["artist_name", "artist_id"]
    return artist_df[node_columns]

In [61]:
def generate_album_nodes(album_df):
    """Project an album staging table down to its node columns.

    Returns a new DataFrame holding only ``album_name`` and ``album_id``
    (in that order). Rows and index labels are carried over unchanged, so
    an album that appears on several rows appears several times here.
    """
    node_columns = ["album_name", "album_id"]
    return album_df[node_columns]

## Create Staging Tables

##### Extract data from cooking and gym playlists into staging table for artist-to-song edges

In [89]:
# Build the artist-to-song staging table for each playlist.
cooking_playlist_artists_df = prepArtistStagingTables(cookingPlaylist)
gym_playlist_artists_df = prepArtistStagingTables(gymPlaylist)

# Stack both playlists; pd.concat keeps each frame's own index, so index
# labels repeat across the two playlists.
full_playlist_artists_df = pd.concat([cooking_playlist_artists_df, gym_playlist_artists_df])
# Bare expression: rendered as the cell output.
full_playlist_artists_df

Unnamed: 0,song_name,artist_id,artist_name,type,uri
0,Come Over,7GN9PivdemQRKjDt4z5Zv8,The Internet,artist,spotify:artist:7GN9PivdemQRKjDt4z5Zv8
1,By the Poolside,73dudJ9j0HStIhJDU8MjMI,Feng Suave,artist,spotify:artist:73dudJ9j0HStIhJDU8MjMI
2,Westside,6f96znq79wvlknKHHHhtTW,Jarreau Vandal,artist,spotify:artist:6f96znq79wvlknKHHHhtTW
3,How's It Wrong,6O4EGCCb6DoIiR6B1QCQgp,Toro y Moi,artist,spotify:artist:6O4EGCCb6DoIiR6B1QCQgp
4,Bedroom,2z6JjrrJKNLilqlx8mlxcc,Litany,artist,spotify:artist:2z6JjrrJKNLilqlx8mlxcc
...,...,...,...,...,...
96,Opps (with Yugen Blakrok),3kv1Edgn5HlEWCuEKr1Y9x,Yugen Blakrok,artist,spotify:artist:3kv1Edgn5HlEWCuEKr1Y9x
97,44 More,4xRYI6VqpkE3UwrDrAZL8L,Logic,artist,spotify:artist:4xRYI6VqpkE3UwrDrAZL8L
98,Achoo,6f9bmfkqiYbhCtdAZsv7KI,Keith Ape,artist,spotify:artist:6f9bmfkqiYbhCtdAZsv7KI
98,Achoo,2rhFzFmezpnW82MNqEKVry,Ski Mask The Slump God,artist,spotify:artist:2rhFzFmezpnW82MNqEKVry


##### Extract data from cooking and gym playlists into staging table for album-to-song edges

In [83]:
# Build the album-to-song staging table for each playlist.
cooking_playlist_albums_df = prepAlbumStagingTables(cookingPlaylist)
gym_playlist_albums_df = prepAlbumStagingTables(gymPlaylist)

# Stack both playlists; each frame's own index labels are kept as-is.
full_album_playlist_df = pd.concat([cooking_playlist_albums_df, gym_playlist_albums_df])

##### Create Staging Tables for Song nodes

In [78]:
# Song nodes per playlist, each tagged with the playlist it was read from.
gym_song_nodes = generate_song_nodes(gymPlaylist).assign(
    playlist_id="1Vu8EHOlNPBFVBxmKtGHlr",
    playlist_name="brrrat brrrat",
)

cooking_playlist_nodes = generate_song_nodes(cookingPlaylist).assign(
    playlist_id="5myx61jwlbF2iMDVWNK0Ps",
    playlist_name="wrist whippin",
)

# One row per song id. drop_duplicates keeps the first occurrence, so a song
# present in both playlists keeps the gym playlist's labels.
all_song_nodes = (
    pd.concat([gym_song_nodes, cooking_playlist_nodes])
    .drop_duplicates("song_id")
)
all_song_nodes

Unnamed: 0,song_name,song_id,uri,duration_ms,explicit,popularity,playlist_id,playlist_name
0,Fight Night,2n5gVJ9fzeX2SSWlLQuyS9,spotify:track:2n5gVJ9fzeX2SSWlLQuyS9,216247,True,68,1Vu8EHOlNPBFVBxmKtGHlr,brrrat brrrat
1,Pipe It Up,6eBrlbv2HMYcldwjoMWIrC,spotify:track:6eBrlbv2HMYcldwjoMWIrC,206266,True,53,1Vu8EHOlNPBFVBxmKtGHlr,brrrat brrrat
2,"Floyd Mayweather (feat. Travis Scott, Gucci Ma...",5ALc7rbru6QOLGodVSDocc,spotify:track:5ALc7rbru6QOLGodVSDocc,358506,True,57,1Vu8EHOlNPBFVBxmKtGHlr,brrrat brrrat
3,With Them,0tISnxqgVmxqhVghsTi2Rr,spotify:track:0tISnxqgVmxqhVghsTi2Rr,197746,True,58,1Vu8EHOlNPBFVBxmKtGHlr,brrrat brrrat
4,F Cancer (Boosie) [feat. Quavo],1qPK58q90l8KNoWyXsLI2Y,spotify:track:1qPK58q90l8KNoWyXsLI2Y,249639,True,49,1Vu8EHOlNPBFVBxmKtGHlr,brrrat brrrat
...,...,...,...,...,...,...,...,...
95,Think,5TocQE0LESbiqk6sd3Y5Hm,spotify:track:5TocQE0LESbiqk6sd3Y5Hm,231927,False,33,5myx61jwlbF2iMDVWNK0Ps,wrist whippin
96,All Yours,76S0dx36oCAMgqfXlVUQeZ,spotify:track:76S0dx36oCAMgqfXlVUQeZ,173586,False,51,5myx61jwlbF2iMDVWNK0Ps,wrist whippin
97,DaNcing in a RoOm,2zTjF6OGxklU5Ulv7iTf4R,spotify:track:2zTjF6OGxklU5Ulv7iTf4R,208605,False,0,5myx61jwlbF2iMDVWNK0Ps,wrist whippin
98,Enginn eins og þú,3bzP8lZJWmuhpdnfDP1rTq,spotify:track:3bzP8lZJWmuhpdnfDP1rTq,181226,False,40,5myx61jwlbF2iMDVWNK0Ps,wrist whippin


##### Create staging tables for album and artist nodes

In [84]:
# Reduce the combined staging tables to their node columns
# (artist_name/artist_id and album_name/album_id respectively).
all_artist_nodes = generate_artist_nodes(full_playlist_artists_df)
all_album_nodes = generate_album_nodes(full_album_playlist_df)

In [85]:
# Bare expression: rendered as the cell output for a visual check.
all_artist_nodes

Unnamed: 0,artist_name,artist_id
0,The Internet,7GN9PivdemQRKjDt4z5Zv8
1,Feng Suave,73dudJ9j0HStIhJDU8MjMI
2,Moon Boots,3cIXmCH7iNcslTbwrwS7zy
2,Nic Hanson,1NrFTpkB0RvbVLYl0p5Xvc
3,Jarreau Vandal,6f96znq79wvlknKHHHhtTW
...,...,...
96,Yugen Blakrok,3kv1Edgn5HlEWCuEKr1Y9x
97,Logic,4xRYI6VqpkE3UwrDrAZL8L
98,Keith Ape,6f9bmfkqiYbhCtdAZsv7KI
98,Ski Mask The Slump God,2rhFzFmezpnW82MNqEKVry


In [73]:
# Attach song-node attributes to the album and artist staging rows.
# NOTE(review): the join key is the song title only. If two different tracks
# share a title, every staging row with that title is paired with every
# matching song node. The staging tables carry no song id to join on instead;
# confirm titles are unique for these playlists.
album_edges_staging = full_album_playlist_df.merge(all_song_nodes, on=["song_name"])
# Both inputs have a "uri" column (artist uri vs. track uri), so pandas' default
# merge suffixes apply and the result holds "uri_x" and "uri_y".
artist_edges_staging = full_playlist_artists_df.merge(all_song_nodes, on=["song_name"])

##### Export to CSV for loading into Neo4j

In [71]:
# Write the node tables. to_csv is called with its defaults, so the DataFrame
# index is written as the first (unnamed) column of each file.
# Paths are relative to the notebook's working directory; ../data must exist.
all_album_nodes.to_csv("../data/album_nodes.csv")
all_artist_nodes.to_csv("../data/artist_nodes.csv")
all_song_nodes.to_csv("../data/song_nodes.csv")

# Write the edge (relationship) staging tables.
album_edges_staging.to_csv("../data/album_edges.csv")
artist_edges_staging.to_csv("../data/artist_edges.csv")

## Professor Nelson's Data

Repeat all the steps above for Professor Nelson's Spotify Playlists

In [48]:
# List the user's playlists so the playlist ids used below can be read off the output.
profNelsonPlaylists = sp.user_playlists("vyrvddd99x5ejdq4b7ikuzgst")
# Bare expression: rendered as the cell output.
profNelsonPlaylists

{'href': 'https://api.spotify.com/v1/users/vyrvddd99x5ejdq4b7ikuzgst/playlists?offset=0&limit=50',
 'items': [{'collaborative': False,
   'description': '',
   'external_urls': {'spotify': 'https://open.spotify.com/playlist/1uY4UVKMPbOFj9cdtNc6a6'},
   'href': 'https://api.spotify.com/v1/playlists/1uY4UVKMPbOFj9cdtNc6a6',
   'id': '1uY4UVKMPbOFj9cdtNc6a6',
   'images': [{'height': 640,
     'url': 'https://mosaic.scdn.co/640/ab67616d0000b2730bf8ea3f7755f12f7a45d90eab67616d0000b2734fdb0337978c7464cdfb1783ab67616d0000b2738a29e873f75006aa646db111ab67616d0000b273b065002ec760593ec9dbe305',
     'width': 640},
    {'height': 300,
     'url': 'https://mosaic.scdn.co/300/ab67616d0000b2730bf8ea3f7755f12f7a45d90eab67616d0000b2734fdb0337978c7464cdfb1783ab67616d0000b2738a29e873f75006aa646db111ab67616d0000b273b065002ec760593ec9dbe305',
     'width': 300},
    {'height': 60,
     'url': 'https://mosaic.scdn.co/60/ab67616d0000b2730bf8ea3f7755f12f7a45d90eab67616d0000b2734fdb0337978c7464cdfb1783ab67616

In [49]:
# Fetch the raw track listing for each of the three playlists.
# NOTE(review): each call returns a single result page; if a playlist is longer
# than the page size, the remaining tracks are not fetched here — confirm
# against the playlist lengths.
sailing_raw_json = sp.playlist_tracks("1uY4UVKMPbOFj9cdtNc6a6")
bestOfJB_raw_json = sp.playlist_tracks("6pBkmo0Pn8HSjtcudqqGDE")
jbLive_raw_json = sp.playlist_tracks("5QRStOSftF9l9A9Jgi4HlC")

In [50]:
# Song nodes for each playlist, each tagged with the playlist it was read from.
sailing_song_nodes = generate_song_nodes(sailing_raw_json).assign(
    playlist_id="1uY4UVKMPbOFj9cdtNc6a6",
    playlist_name="sailing",
)
bestOfJB_song_nodes = generate_song_nodes(bestOfJB_raw_json).assign(
    playlist_id="6pBkmo0Pn8HSjtcudqqGDE",
    playlist_name="Jimmy Buffett Best Of",
)
jbLive_song_nodes = generate_song_nodes(jbLive_raw_json).assign(
    playlist_id="5QRStOSftF9l9A9Jgi4HlC",
    playlist_name="Jimmy Buffet: Live",
)

# Stack the three playlists. No de-duplication is applied here, so a song that
# is on more than one playlist appears once per playlist.
nelson_all_song_nodes = pd.concat(
    [sailing_song_nodes, bestOfJB_song_nodes, jbLive_song_nodes]
)

In [51]:
# Build the artist-to-song staging table for each playlist.
sailing_artists_df = prepArtistStagingTables(sailing_raw_json)
bestOfJB_artists_df = prepArtistStagingTables(bestOfJB_raw_json)
jbLive_artists_df = prepArtistStagingTables(jbLive_raw_json)


# Stack the three playlists; each frame's own index labels are kept as-is.
nelson_full_playlist_artists_df = pd.concat([sailing_artists_df, bestOfJB_artists_df,jbLive_artists_df])

In [52]:
# Build the album-to-song staging table for each playlist.
sailing_album_df = prepAlbumStagingTables(sailing_raw_json)
bestOfJB_album_df = prepAlbumStagingTables(bestOfJB_raw_json)
jbLive_albums_df = prepAlbumStagingTables(jbLive_raw_json)

# Stack the three playlists; each frame's own index labels are kept as-is.
nelson_full_playlist_albums_df = pd.concat([sailing_album_df, bestOfJB_album_df,jbLive_albums_df])

In [53]:
# Reduce the combined staging tables to their node columns.
nelson_all_artist_nodes = generate_artist_nodes(nelson_full_playlist_artists_df)
nelson_all_album_nodes = generate_album_nodes(nelson_full_playlist_albums_df)

# Attach song-node attributes to the artist and album staging rows.
# NOTE(review): the join key is the song title only, and nelson_all_song_nodes
# is not de-duplicated. Tracks that share a title (e.g. presumably a studio and
# a live version) would be paired with each other's rows — confirm.
nelson_artist_edges_staging = nelson_full_playlist_artists_df.merge(nelson_all_song_nodes, on=["song_name"])
nelson_album_edges_staging = nelson_full_playlist_albums_df.merge(nelson_all_song_nodes, on=["song_name"])

In [54]:
# Write the node tables. to_csv is called with its defaults, so the DataFrame
# index is written as the first (unnamed) column of each file.
nelson_all_artist_nodes.to_csv("../data/nelson_artist_nodes.csv")
nelson_all_album_nodes.to_csv("../data/nelson_album_nodes.csv")
nelson_all_song_nodes.to_csv("../data/nelson_song_nodes.csv")

# Write the edge (relationship) staging tables.
nelson_artist_edges_staging.to_csv("../data/nelson_artist_edges.csv")
nelson_album_edges_staging.to_csv("../data/nelson_album_edges.csv")

In [58]:
# Combine the album edge tables from both data sets into a single import file.
# Only the album edge tables are included; the artist edge tables are not.
master_import_file = pd.concat([album_edges_staging,nelson_album_edges_staging])
master_import_file.to_csv("../data/master_import.csv")