# Apriori Algorithm for Recommending Artists for Spotify Playlists

## Spotify Dataset
----

In [1]:
# Import libraries
import pandas as pd
import numpy as np
import random
import unicodedata

In [115]:
# Read the raw playlist export into a four-column frame.
# - skiprows=1 discards the file's own first line; `names` supplies the column labels.
# - Fields are split on the three-character sequence `","`; a separator longer
#   than one character requires the python parsing engine.
# - Splitting this way leaves the opening quote on user_id and the closing
#   quote on playlistname (visible in the output below); a later cell strips them.
# - Rows with any missing field are dropped and the index is renumbered from 0.
spotify = (
    pd.read_csv(
        "spotify_dataset.csv",
        sep='","',
        names=["user_id", "artistname", "trackname", "playlistname"],
        skiprows=1,
        engine='python',
    )
    .dropna()
    .reset_index(drop=True)
)
spotify

Unnamed: 0,user_id,artistname,trackname,playlistname
0,"""9cc0cfd4d7d7885102480dd99e7a90d6",Elvis Costello,(The Angels Wanna Wear My) Red Shoes,"HARD ROCK 2010"""
1,"""9cc0cfd4d7d7885102480dd99e7a90d6",Elvis Costello & The Attractions,"(What's So Funny 'Bout) Peace, Love And Unders...","HARD ROCK 2010"""
2,"""9cc0cfd4d7d7885102480dd99e7a90d6",Tiffany Page,7 Years Too Late,"HARD ROCK 2010"""
3,"""9cc0cfd4d7d7885102480dd99e7a90d6",Elvis Costello & The Attractions,Accidents Will Happen,"HARD ROCK 2010"""
4,"""9cc0cfd4d7d7885102480dd99e7a90d6",Elvis Costello,Alison,"HARD ROCK 2010"""
...,...,...,...,...
12868976,"""2302bf9c64dc63d88a750215ed187f2c",Mötley Crüe,Wild Side,"iPhone"""
12868977,"""2302bf9c64dc63d88a750215ed187f2c",John Lennon,Woman,"iPhone"""
12868978,"""2302bf9c64dc63d88a750215ed187f2c",Tom Petty,You Don't Know How It Feels,"iPhone"""
12868979,"""2302bf9c64dc63d88a750215ed187f2c",Tom Petty,You Wreck Me,"iPhone"""


### Data Cleaning

In [3]:
# Transliterate artist names to plain ASCII.
# NFKD normalisation splits an accented character into its base letter plus a
# combining mark; the ASCII encode/decode round trip with errors="ignore" then
# drops every non-ASCII code point (e.g. "Mötley Crüe" -> "Motley Crue").
# The vectorised .str accessors replace a per-row Python loop, so the cell no
# longer depends on the index being exactly 0..n-1.
# NOTE: characters with no ASCII decomposition are removed outright, so a name
# written entirely in a non-Latin script becomes an empty string.
spotify["artistname"] = (
    spotify["artistname"]
    .str.normalize("NFKD")
    .str.encode("ascii", errors="ignore")
    .str.decode("ascii")
)

NameError: name 'spotify' is not defined

In [117]:
# Remove every double-quote character from each of the four text columns
# (the raw load leaves stray quotes on the first and last fields).
for column in ("user_id", "artistname", "trackname", "playlistname"):
    spotify[column] = spotify[column].str.replace('"', '')

In [122]:
# Remove every row in which any of the four columns is an empty string or the
# literal text "null"; the surviving rows keep their original index labels.
text_columns = ["user_id", "artistname", "trackname", "playlistname"]
has_placeholder = spotify[text_columns].isin(["", "null"]).any(axis=1)
spotify = spotify[~has_placeholder]

In [123]:
# Sanity check: list any rows whose playlistname is still "" or "null".
# An empty frame means the filter worked (only playlistname is inspected here).
spotify[spotify["playlistname"].isin(["null", ""])]

Unnamed: 0,user_id,artistname,trackname,playlistname


In [64]:
# Display the frame to confirm the string clean-up worked: the quotation marks
# should be gone and artist names should be plain ASCII.
spotify

Unnamed: 0,user_id,artistname,trackname,playlistname
0,9cc0cfd4d7d7885102480dd99e7a90d6,Elvis Costello,(The Angels Wanna Wear My) Red Shoes,HARD ROCK 2010
1,9cc0cfd4d7d7885102480dd99e7a90d6,Elvis Costello & The Attractions,"(What's So Funny 'Bout) Peace, Love And Unders...",HARD ROCK 2010
2,9cc0cfd4d7d7885102480dd99e7a90d6,Tiffany Page,7 Years Too Late,HARD ROCK 2010
3,9cc0cfd4d7d7885102480dd99e7a90d6,Elvis Costello & The Attractions,Accidents Will Happen,HARD ROCK 2010
4,9cc0cfd4d7d7885102480dd99e7a90d6,Elvis Costello,Alison,HARD ROCK 2010
...,...,...,...,...
12868976,2302bf9c64dc63d88a750215ed187f2c,Motley Crue,Wild Side,iPhone
12868977,2302bf9c64dc63d88a750215ed187f2c,John Lennon,Woman,iPhone
12868978,2302bf9c64dc63d88a750215ed187f2c,Tom Petty,You Don't Know How It Feels,iPhone
12868979,2302bf9c64dc63d88a750215ed187f2c,Tom Petty,You Wreck Me,iPhone


In [134]:
# Export the cleaned dataset to 'spotify_clean.csv'.
# Fields are separated by ';' and the row index is not written (index=False).
spotify.to_csv('spotify_clean.csv', sep = ";", index=False)

### Dataset Analysis

In [2]:
# Load the clean dataset written by the export cell (';'-separated, with the
# column names taken from the first line of the file).
# NOTE(review): read_csv applies its default missing-value parsing here, so a
# name that is literally "NA", "nan", "None", etc. would presumably be read
# back as NaN -- confirm this is acceptable or pass keep_default_na=False.
spotify = pd.read_csv("spotify_clean.csv", sep = ";", header = 0)
spotify

Unnamed: 0,user_id,artistname,trackname,playlistname
0,9cc0cfd4d7d7885102480dd99e7a90d6,Elvis Costello,(The Angels Wanna Wear My) Red Shoes,HARD ROCK 2010
1,9cc0cfd4d7d7885102480dd99e7a90d6,Elvis Costello & The Attractions,"(What's So Funny 'Bout) Peace, Love And Unders...",HARD ROCK 2010
2,9cc0cfd4d7d7885102480dd99e7a90d6,Tiffany Page,7 Years Too Late,HARD ROCK 2010
3,9cc0cfd4d7d7885102480dd99e7a90d6,Elvis Costello & The Attractions,Accidents Will Happen,HARD ROCK 2010
4,9cc0cfd4d7d7885102480dd99e7a90d6,Elvis Costello,Alison,HARD ROCK 2010
...,...,...,...,...
12865156,2302bf9c64dc63d88a750215ed187f2c,Motley Crue,Wild Side,iPhone
12865157,2302bf9c64dc63d88a750215ed187f2c,John Lennon,Woman,iPhone
12865158,2302bf9c64dc63d88a750215ed187f2c,Tom Petty,You Don't Know How It Feels,iPhone
12865159,2302bf9c64dc63d88a750215ed187f2c,Tom Petty,You Wreck Me,iPhone


In [3]:
# Report how many distinct users, artists and playlist names the data contains.
distinct_values_by_caption = {
    "number of unique users:": spotify["user_id"].unique(),
    "number of unique artist:": spotify["artistname"].unique(),
    "number of unique playlists:": spotify["playlistname"].unique(),
}
for caption, distinct_values in distinct_values_by_caption.items():
    print(caption, len(distinct_values))

number of unique users: 15913
number of unique artist: 288158
number of unique playlists: 157289


In [4]:
# Number of distinct artists per playlist name, largest first, as a two-column
# frame (playlistname, NumArtistInPlaylist).
# NOTE(review): playlists are keyed by name only, so same-named playlists that
# belong to different users are counted as one -- confirm this is intended.
artists_per_playlist = spotify.groupby(['playlistname'])['artistname'].nunique()
uniqueArtisit_playlist = (
    artists_per_playlist
    .sort_values(ascending=False)
    .rename('NumArtistInPlaylist')
    .reset_index()
)
uniqueArtisit_playlist.describe()

Unnamed: 0,NumArtistInPlaylist
count,157289.0
mean,22.967156
std,214.379121
min,1.0
25%,1.0
50%,3.0
75%,20.0
max,76500.0


In [5]:
# Number of rows (track entries) recorded for each artist, most frequent first,
# as a two-column frame (artistname, count).
rows_per_artist = spotify.groupby('artistname')['artistname'].count()
uniqueArtisit = (
    rows_per_artist
    .sort_values(ascending=False)
    .rename('count')
    .reset_index()
)
uniqueArtisit.describe()

Unnamed: 0,count
count,288158.0
mean,44.646205
std,439.937957
min,1.0
25%,1.0
50%,2.0
75%,10.0
max,36086.0


In [6]:
# Show the playlists that contain exactly one distinct artist.
single_artist_mask = uniqueArtisit_playlist["NumArtistInPlaylist"].eq(1)
uniqueArtisit_playlist[single_artist_mask]

Unnamed: 0,playlistname,NumArtistInPlaylist
91990,Fat Boy Slim,1
91991,Eels — Meet The EELS: Essential EELS 1996-2006...,1
91992,Ellie Goulding – Bright Lights,1
91993,'Mats,1
91994,Fat Jon – afterthought,1
...,...,...
157284,Longpigs — The Sun Is Often Out,1
157285,Longview,1
157286,Lonnie Johnson – Blues By Lonnie Johnson,1
157287,Lonnie Liston Smith - Explorations: The Columb...,1


## The Apriori Algorithm Implementation

### Data Setup
---

In [21]:
# Summary statistics of the artists-per-playlist counts, repeated here to
# motivate the subset thresholds chosen in the next cell.
uniqueArtisit_playlist.describe()

Unnamed: 0,NumArtistInPlaylist
count,157289.0
mean,22.967156
std,214.379121
min,1.0
25%,1.0
50%,3.0
75%,20.0
max,76500.0


In [8]:
# Keep only playlists with more than 3 and fewer than 10 distinct artists.
artist_counts = uniqueArtisit_playlist['NumArtistInPlaylist']
in_size_range = artist_counts.gt(3) & artist_counts.lt(10)
uniqueArtist_subset = uniqueArtisit_playlist[in_size_range]
uniqueArtist_subset

Unnamed: 0,playlistname,NumArtistInPlaylist
60291,toteking,9
60292,valentine,9
60293,Mom & Dad's Kinda Music,9
60294,2 min 160 bpm,9
60295,MIXY MIX APR 14,9
...,...,...
77310,Roar같은곡,4
77311,Prince – The Hits/The B-Sides,4
77312,Agent Sasco – Shellings Riddim,4
77313,9th Wonder,4


In [9]:
# Build one row per playlist in the size-filtered subset: keep only the tracks
# whose playlist name is in the subset, then collect each playlist's distinct
# artists into an array stored in the 'unique' column.
in_subset = spotify['playlistname'].isin(uniqueArtist_subset['playlistname'])
spotifyML = (
    spotify[in_subset]
    .groupby('playlistname')['artistname']
    .agg(['unique'])
    .reset_index()
)
spotifyML

Unnamed: 0,playlistname,unique
0,Frida,"[Elliot Goldenthal, Traditional, Luis Arcaraz ..."
1,#glgsongaday Day 1: New Beginning,"[The Olivia Tremor Control, David Gray, Lily A..."
2,2,"[Kanye West, D.J. Stevie Tee, K's Choice, Done..."
3,80s Gung 2012,"[Michael McDonald, Russ Ballard, Foreigner, Fl..."
4,ASLA Birthday April 13/13,"[The Black Eyed Peas, Swedish House Mafia, Mad..."
...,...,...
17019,😴,"[The 1975, Phaeleh, The xx, Deptford Goth, Sir..."
17020,🙌🇫🇷,"[Phoenix, Tahiti 80, Breakbot, Uffie, Daft Pun..."
17021,🚀,"[Kasabian, Death From Above 1979, The Futurehe..."
17022,🚇,"[Koan Sound, Future Islands, Classixx, Seapony..."


In [10]:
# Alternative kept for reference: build the artist lists from the ENTIRE
# dataset instead of the 4-9 artist subset. The assignment is commented out,
# so the frame displayed below is still the subset version from the previous cell.
# Group Rows on 'playlistname' column and get List for 'artistname' column
#spotifyML = spotify.groupby('playlistname')['artistname'].agg(['unique']).reset_index()
spotifyML

Unnamed: 0,playlistname,unique
0,Frida,"[Elliot Goldenthal, Traditional, Luis Arcaraz ..."
1,#glgsongaday Day 1: New Beginning,"[The Olivia Tremor Control, David Gray, Lily A..."
2,2,"[Kanye West, D.J. Stevie Tee, K's Choice, Done..."
3,80s Gung 2012,"[Michael McDonald, Russ Ballard, Foreigner, Fl..."
4,ASLA Birthday April 13/13,"[The Black Eyed Peas, Swedish House Mafia, Mad..."
...,...,...
17019,😴,"[The 1975, Phaeleh, The xx, Deptford Goth, Sir..."
17020,🙌🇫🇷,"[Phoenix, Tahiti 80, Breakbot, Uffie, Daft Pun..."
17021,🚀,"[Kasabian, Death From Above 1979, The Futurehe..."
17022,🚇,"[Koan Sound, Future Islands, Classixx, Seapony..."


In [11]:
# Prepare the transactions for the model: one tuple of artist names per
# playlist, in the same row order as spotifyML.
listArtist = [tuple(artists) for artists in spotifyML["unique"]]

In [12]:
# Display the transactions (one tuple of artist names per playlist).
# NOTE(review): this prints the whole list; consider listArtist[:5] to keep the output short.
listArtist

[('Elliot Goldenthal',
  'Traditional',
  'Luis Arcaraz Torras',
  'Los Cojolites',
  'Luis Mars',
  'Tomas Mendez'),
 ('The Olivia Tremor Control',
  'David Gray',
  'Lily Allen',
  'Little Joy',
  'Husker Du',
  'Arcade Fire',
  'The The',
  'The Mountain Goats'),
 ('Kanye West',
  'D.J. Stevie Tee',
  "K's Choice",
  'Done Again',
  'Space',
  'Stuph'),
 ('Michael McDonald',
  'Russ Ballard',
  'Foreigner',
  'Fleetwood Mac',
  'Billy Ocean',
  'Eric Carmen',
  'The Doobie Brothers'),
 ('The Black Eyed Peas',
  'Swedish House Mafia',
  'Madonna',
  'Justin Timberlake',
  'One Direction',
  'will.i.am',
  'Disclosure'),
 ('Unknown', 'Trazan', 'Gullan Bornemark', 'Lasse Berghagen'),
 ('Megadeth',
  'Black Sabbath',
  'White Zombie',
  'Kasabian',
  'Classic Rock Monsters',
  'Bruce Dickinson',
  'Jack White'),
 ('Jay McShann',
  'Lucky Millinder',
  'Mongo Santamaria',
  'Oscar Peterson',
  'Sonny Terry',
  'Chet Baker'),
 ('Patrick Doyle',
  'Julie Fowlis',
  'Mumford & Sons',
  'Emm

In [13]:
# Number of playlists (transactions) that will be passed to the model.
len(listArtist)

17024

### Model
----

In [9]:
# Tutorial followed for this section:
# https://towardsdatascience.com/the-apriori-algorithm-5da3db9aea95

# Dependency: efficient-apriori. Uncomment the line below to install it if the
# package is missing from the environment.
#pip install efficient-apriori

Collecting efficient-apriori
  Downloading efficient_apriori-2.0.3-py3-none-any.whl (14 kB)
Installing collected packages: efficient-apriori
Successfully installed efficient-apriori-2.0.3
Note: you may need to restart the kernel to use updated packages.


In [14]:
from efficient_apriori import apriori

# Minimum support as an absolute count: an itemset must occur in at least this
# many playlists to be kept. efficient-apriori takes min_support as a fraction
# of all transactions, so the count is divided by the number of playlists.
min_support_count = 2
min_support = min_support_count / len(listArtist)

# min_confidence discards rules whose confidence is below the threshold.
# It is 0 here so that every rule is returned and can be filtered later.
min_confidence = 0

# itemsets: the frequent itemsets found; rules: the association rules derived from them.
itemsets, rules = apriori(listArtist, min_support=min_support, min_confidence=min_confidence)

In [15]:
# Display the association rules returned by apriori.
# NOTE(review): this prints every rule; consider rules[:20] to keep the output short.
rules

[{  } -> { },
 { } -> {  },
 {The Smashing Pumpkins} -> { (Crosses)},
 { (Crosses)} -> {The Smashing Pumpkins},
 {Daft Punk} -> {!!!},
 {!!!} -> {Daft Punk},
 {Hot Chip} -> {!!!},
 {!!!} -> {Hot Chip},
 {Jagwar Ma} -> {!!!},
 {!!!} -> {Jagwar Ma},
 {Kavinsky} -> {!!!},
 {!!!} -> {Kavinsky},
 {LCD Soundsystem} -> {!!!},
 {!!!} -> {LCD Soundsystem},
 {Angels & Airwaves} -> {+44},
 {+44} -> {Angels & Airwaves},
 {Box Car Racer} -> {+44},
 {+44} -> {Box Car Racer},
 {Taking Back Sunday} -> {+44},
 {+44} -> {Taking Back Sunday},
 {blink-182} -> {+44},
 {+44} -> {blink-182},
 {Yann Tiersen} -> {10,000 Maniacs},
 {10,000 Maniacs} -> {Yann Tiersen},
 {Blue Swede} -> {10cc},
 {10cc} -> {Blue Swede},
 {Redbone} -> {10cc},
 {10cc} -> {Redbone},
 {Thirty Seconds To Mars} -> {12 Stones},
 {12 Stones} -> {Thirty Seconds To Mars},
 {A$AP Rocky} -> {2 Chainz},
 {2 Chainz} -> {A$AP Rocky},
 {Chris Brown} -> {2 Chainz},
 {2 Chainz} -> {Chris Brown},
 {DJ Drama} -> {2 Chainz},
 {2 Chainz} -> {DJ Drama},


In [16]:
# Test output of recommended artists for a single seed artist.
# Keep only one-artist -> one-artist rules whose left-hand side is the seed and
# whose lift is above 1 (the pair co-occurs more often than chance), then
# print them from highest to lowest confidence.
# Filtering before sorting means only the handful of matching rules is sorted;
# sorted() is stable, so the printed order is the same as sorting all rules first.
seed_lhs = ("Kanye West", )
seed_rules = [
    rule for rule in rules
    if len(rule.lhs) == 1 and len(rule.rhs) == 1
    and rule.lhs == seed_lhs and rule.lift > 1
]
for rule in sorted(seed_rules, key=lambda rule: rule.confidence, reverse=True):
    print(rule)

{Kanye West} -> {JAY Z} (conf: 0.299, supp: 0.004, lift: 21.768, conv: 1.407)
{Kanye West} -> {Drake} (conf: 0.134, supp: 0.002, lift: 15.294, conv: 1.144)
{Kanye West} -> {Kid Cudi} (conf: 0.110, supp: 0.002, lift: 26.065, conv: 1.119)
{Kanye West} -> {Eminem} (conf: 0.075, supp: 0.001, lift: 8.378, conv: 1.071)
{Kanye West} -> {Beyonce} (conf: 0.067, supp: 0.001, lift: 5.726, conv: 1.059)
{Kanye West} -> {John Legend} (conf: 0.063, supp: 0.001, lift: 12.326, conv: 1.062)
{Kanye West} -> {Kendrick Lamar} (conf: 0.059, supp: 0.001, lift: 11.048, conv: 1.057)
{Kanye West} -> {Big Sean} (conf: 0.051, supp: 0.001, lift: 22.929, conv: 1.052)
{Kanye West} -> {Rihanna} (conf: 0.051, supp: 0.001, lift: 4.378, conv: 1.042)
{Kanye West} -> {DJ Khaled} (conf: 0.047, supp: 0.001, lift: 25.945, conv: 1.048)
{Kanye West} -> {Lupe Fiasco} (conf: 0.047, supp: 0.001, lift: 20.107, conv: 1.047)
{Kanye West} -> {J. Cole} (conf: 0.043, supp: 0.001, lift: 21.684, conv: 1.043)
{Kanye West} -> {The-Dream} (

## Tests

In [98]:
# https://github.com/tommyod/Efficient-Apriori

# `apriori` is imported here as well so this cell does not depend on the Model
# cell having been run first.
from efficient_apriori import apriori
from efficient_apriori.rules import (
    Rule,
    generate_rules_simple,
    generate_rules_apriori,
)
from efficient_apriori.itemsets import itemsets_from_transactions, ItemsetCount
import unittest

# Run the apriori algorithm on a tiny, hand-checkable pair of playlists.
listArtist_test = (['Justin Timberlake', 'One Direction', 'will.i.am', 'Disclosure'],
                   ['5 Seconds of Summer', 'NSYNC', 'One Direction'])
# 1 / (number of playlists): an artist appearing in a single playlist is kept.
min_support_test = 1/len(listArtist_test)
min_confidence_test = 0
# output_transaction_ids=True makes each itemset carry an ItemsetCount holding
# the count and the ids (positions) of the playlists that contain it.
itemsets_subset, rules_subset = apriori(listArtist_test, min_support=min_support_test,
                                        min_confidence=min_confidence_test,
                                        output_transaction_ids=True)

# Hand-built rules: Rule(lhs, rhs, <four counts>). The assertions below rely on
# confidence being the first count divided by the second (75/100 and 25/125).
r = Rule(('Harry Styles', 'Niall Horan'), ('One Direction',), 75, 100, 125, 250)
r2 = Rule(('Jay-Z', 'Tyler, The Creator', 'Kanye West'), ('Frank Ocean', "Pharrell Williams"), 25, 125, 150, 200)


class TestOutput(unittest.TestCase):
    """Checks that efficient-apriori behaves as expected on small, known inputs."""

    def test_output_subset(self):
        """Single-artist itemsets report the right count and playlist ids."""
        self.assertEqual(itemsets_subset[1], {('Justin Timberlake',): ItemsetCount(itemset_count=1, members={0}),
                                              ('One Direction',): ItemsetCount(itemset_count=2, members={0, 1}),
                                              ('will.i.am',): ItemsetCount(itemset_count=1, members={0}),
                                              ('Disclosure',): ItemsetCount(itemset_count=1, members={0}),
                                              ('5 Seconds of Summer',): ItemsetCount(itemset_count=1, members={1}),
                                              ('NSYNC',): ItemsetCount(itemset_count=1, members={1})})

    def test_rule_functions(self):
        """Rule exposes lhs, rhs and confidence, and compares equal to a count-less Rule with the same sides."""
        self.assertEqual(r.lhs, ('Harry Styles', 'Niall Horan'))
        self.assertEqual(r.rhs, ('One Direction',))
        self.assertEqual(r.confidence, 0.75)
        self.assertEqual(r2.lhs, ('Jay-Z', 'Tyler, The Creator', 'Kanye West'))
        self.assertEqual(r2.rhs, ('Frank Ocean', "Pharrell Williams"))
        self.assertEqual(r2.confidence, 0.2)
        self.assertEqual(r, Rule(('Harry Styles', 'Niall Horan'), ('One Direction',)))
        self.assertEqual(r2, Rule(('Jay-Z', 'Tyler, The Creator', 'Kanye West'), ('Frank Ocean', 'Pharrell Williams')))


# Run the tests inside the notebook: argv=[''] stops unittest from parsing the
# kernel's command line, exit=False stops it from shutting the kernel down.
res = unittest.main(argv = [''], verbosity = 3, exit = False)
# wasSuccessful() covers both assertion failures and unexpected errors;
# checking only `failures` would let a test that raised an exception pass.
assert res.result.wasSuccessful(), "unit tests failed or errored"

test_output_subset (__main__.TestOutput) ... ok
test_rule_functions (__main__.TestOutput) ... ok

----------------------------------------------------------------------
Ran 2 tests in 0.004s

OK


## GUI Implementation

In [None]:
import matplotlib
# NOTE(review): the Agg backend is selected before Tk is used; presumably this
# avoids a backend clash with tkinter -- confirm it is still required.
matplotlib.use('Agg')
import tkinter
# Explicit names instead of a wildcard import, so it is clear where each widget comes from.
from tkinter import Tk, Canvas, Label, StringVar, Entry, Button

# https://www.geeksforgeeks.org/python-gui-tkinter/
# https://tkdocs.com/pyref/stringvar.html

# Maximum number of artists shown in the result window.
max_recommendations = 5

# ---------------------------------------------------------------------------
# 1. Input window: ask the user for an artist name
# ---------------------------------------------------------------------------
master = Tk()
master.title('Recommender')

# thin canvas used only to give the window a minimum width
canvas = Canvas(master, width = 300, height = 10)
canvas.pack()

# prompt shown above the entry box
label = Label(master, text = 'Type in the name of a musical artist')
label.pack()

# text typed by the user
var = StringVar()

entry = Entry(master, textvariable = var)
entry.focus_set()
entry.pack()

# Filled in by submit_artist(); stays "" if nothing was typed.
artist_query = ""


def submit_artist():
    """Store the typed artist name, then close the input window.

    The value is read while the window still exists, so nothing has to be
    read from the Tk variable after the root window has been destroyed.
    """
    global artist_query
    artist_query = var.get()
    master.destroy()


# Submit button and the window's close button both go through submit_artist,
# so whatever was typed is used in either case.
button = Button(master, text = 'Submit', width = 25, command = submit_artist)
button.pack()
master.protocol("WM_DELETE_WINDOW", submit_artist)

master.mainloop()

# ---------------------------------------------------------------------------
# 2. Look up recommendations in `rules` (produced by the Model cell)
# ---------------------------------------------------------------------------
# Keep one-artist -> one-artist rules whose left-hand side is the typed name
# and whose lift is above 1, ordered from highest to lowest confidence.
query_lhs = (artist_query, )
matching_rules = [
    rule for rule in rules
    if len(rule.lhs) == 1 and len(rule.rhs) == 1
    and rule.lhs == query_lhs and rule.lift > 1
]
matching_rules.sort(key = lambda rule: rule.confidence, reverse = True)

recommended = list()
for rule in matching_rules:
    recommended.append(rule.rhs[0])
    print(rule)

# compile the top recommendations into a string, one artist per line
recommendations = "".join(name + '\n' for name in recommended[:max_recommendations])

# ---------------------------------------------------------------------------
# 3. Output window: recommendations, or an error message if there are none
# ---------------------------------------------------------------------------
output = Tk()
if len(recommended) > 0:
    output.title('Recommender')
    canvas2 = Canvas(output, width = 500, height = 10)
    canvas2.pack()
    w = Label(output, text = "Recommendations:" + "\n" + recommendations)
else:
    # Shown whenever no qualifying rule has the typed name as its left-hand
    # side; that includes artists that are in the data but have no rule with lift > 1.
    # NOTE(review): the message wording ("Please exit try") looks like it is missing a word.
    output.title('Error')
    w = Label(output, text = "Sorry that artist does not exist in our database. Please exit try a new artist.")
w.pack()
output.mainloop()