In [2]:
# standard imports
import numpy as np
import pandas as pd
import networkx as nx

# for data exploration, cleaning, and visualization
import pandas_profiling
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
raw = pd.read_csv('spotify_dataset.csv', on_bad_lines='skip', engine='python')

In [4]:
raw.head()

Unnamed: 0,user_id,"""artistname""","""trackname""","""playlistname"""
0,9cc0cfd4d7d7885102480dd99e7a90d6,Elvis Costello,(The Angels Wanna Wear My) Red Shoes,HARD ROCK 2010
1,9cc0cfd4d7d7885102480dd99e7a90d6,Elvis Costello & The Attractions,"(What's So Funny 'Bout) Peace, Love And Unders...",HARD ROCK 2010
2,9cc0cfd4d7d7885102480dd99e7a90d6,Tiffany Page,7 Years Too Late,HARD ROCK 2010
3,9cc0cfd4d7d7885102480dd99e7a90d6,Elvis Costello & The Attractions,Accidents Will Happen,HARD ROCK 2010
4,9cc0cfd4d7d7885102480dd99e7a90d6,Elvis Costello,Alison,HARD ROCK 2010


Renaming column names, doing some quick initial exploration of the data

In [5]:
# define a function to rename the columns
def rename_cols(df):
    columns = ['user_id', 'artist_name', 'track_name', 'playlist_name']
    df.columns = columns
    return df

data = rename_cols(raw)
data.head()

Unnamed: 0,user_id,artist_name,track_name,playlist_name
0,9cc0cfd4d7d7885102480dd99e7a90d6,Elvis Costello,(The Angels Wanna Wear My) Red Shoes,HARD ROCK 2010
1,9cc0cfd4d7d7885102480dd99e7a90d6,Elvis Costello & The Attractions,"(What's So Funny 'Bout) Peace, Love And Unders...",HARD ROCK 2010
2,9cc0cfd4d7d7885102480dd99e7a90d6,Tiffany Page,7 Years Too Late,HARD ROCK 2010
3,9cc0cfd4d7d7885102480dd99e7a90d6,Elvis Costello & The Attractions,Accidents Will Happen,HARD ROCK 2010
4,9cc0cfd4d7d7885102480dd99e7a90d6,Elvis Costello,Alison,HARD ROCK 2010


In [6]:
data.shape

(302639, 4)

In [7]:
data.describe()

Unnamed: 0,user_id,artist_name,track_name,playlist_name
count,302639,302229,302636,302639
unique,407,27792,154139,5192
top,61baddf7207fea410abdc56e680fa869,Johnny Cash,Intro,Starred
freq,18835,1128,164,30622


In [8]:
data.artist_name.value_counts()

Johnny Cash                            1128
Coldplay                               1127
Red Hot Chili Peppers                  1117
Daft Punk                              1097
Arctic Monkeys                         1003
                                       ... 
Gerardo Iacoucci                          1
Jamin Winans                              1
Michelle Pfeiffer                         1
Magic Philharmonic Disney Orchestra       1
The Finn Brothers                         1
Name: artist_name, Length: 27792, dtype: int64

In [9]:
data.playlist_name.value_counts()

Starred                             30622
Everything at once                   7877
Spotify Library                      7640
Strane                               6812
Rich's iPhone                        4603
                                    ...  
¿Con quién se queda el perro?           1
DMX — Where The Hood At                 1
Mindy Smith                             1
Dangerous (feat. Howard Jones) 2        1
Never Say Never                         1
Name: playlist_name, Length: 5192, dtype: int64

We want to connect two artists if they are placed in the same playlist.

In [10]:
# sort the playlists by playlist
data = data.sort_values(by=['playlist_name'])

playlists = {}
# create a dictionary of playlist : [*artists] key-value pairs
for artist, playlist in zip(data.artist_name, data.playlist_name):
    if playlist not in playlists:
        playlists[playlist] = [artist]
    else:
        # check if this artist is already in the list
        if artist not in playlists[playlist]:
            playlists[playlist].append(artist)

In [30]:
playlists

{' 3 point shot- jerry bergonzi': ['Jerry Bergonzi'],
 ' Chopin Complete Edition': ['Frédéric Chopin'],
 ' DMB - Live in Rio': ['Dave Matthews Band'],
 ' Diamond Jubilee - A Classical Celebration': ['Della Jones',
  'Bournemouth Symphony Orchestra',
  'Waynflete Singers',
  'Philip Jones Ensemble',
  'Academy of St. Martin in the Fields',
  'East of England Orchestra',
  'The Royal Choral Society',
  'BBC Concert Orchestra',
  'Martin Loveday',
  'Robert Ferriman',
  'London Symphony Chorus'],
 ' Hercules Original Soundtrack': ['Ricky Martin',
  'Danny DeVito',
  'Alan Menken',
  'Jocelyn Brown',
  'Lillias White',
  'Roz Ryan',
  'Boyzone',
  'William Shakespeare',
  'Roger Bart',
  'Sounds Of Blackness',
  'Belinda Carlisle',
  'Charlton Heston'],
 ' Punk-O-Rama': ['Sage Francis',
  'I Against I',
  'NOFX',
  'Danger Doom',
  'The Offspring',
  'This Is Me Smiling',
  'Millencolin',
  'The Humpers',
  'Guttermouth',
  'Ten Foot Pole',
  'Dropkick Murphys',
  'Straight Faced',
  'Horr

In [None]:
# define undirected graph
G = nx.Graph()
# create all the nodes, each one representing an artist
for playlist, artist_list in playlists.items():
    G.add_nodes_from(artist_list)
    # create the edge list where we fully connect all the nodes to each other
    # edge_list = get_edges(artist_list)
    # G.add_edges_from(edge_list)