In [1]:
# Reference: https://towardsdatascience.com/building-spotifys-discover-weekly-with-spark-4370d5d0df2f
from pyspark.mllib import recommendation
from pyspark.mllib.recommendation import *

In [2]:
# Directory holding the three "small" input files. Change this one constant to
# point the notebook at a different copy of the data.
DATA_DIR = "D:/HKUST/Big Data Computing/Project/Reference/MusicRecommendation-Spark--master"

# Each RDD holds the raw text lines of one input file.
rawUserArtistData = sc.textFile(DATA_DIR + "/user_artist_data_small.txt")
rawArtistData = sc.textFile(DATA_DIR + "/artist_data_small.txt")
rawArtistAlias = sc.textFile(DATA_DIR + "/artist_alias_small.txt")

# Preprocess data

We want to obtain a lookup of raw artist data, with each artist ID and name stored as a tuple. Let's use artist_data.txt (here, the artist_data_small.txt sample loaded above) to create it.

In [3]:
def pairsplit(singlePair):
    """Parse one tab-separated ``id<TAB>name`` line into an (int, str) pair.

    Args:
        singlePair: A single text line expected to hold exactly two
            tab-separated fields.

    Returns:
        ``[(id, name)]`` when the line has exactly two fields and the first
        one parses as an integer; otherwise ``[]``. Returning a list lets the
        caller use ``flatMap`` to drop malformed lines.
    """
    fields = singlePair.split('\t')
    if len(fields) != 2:
        return []
    try:
        return [(int(fields[0]), fields[1])]
    except ValueError:
        # Non-numeric ID: skip the line, but let unrelated errors surface
        # instead of silently swallowing them with a bare ``except``.
        return []
# Parse every artist line, drop the malformed ones, and gather the surviving
# (ID, name) pairs into a dictionary on the driver.
artistPairs = rawArtistData.flatMap(pairsplit)
artistByID = artistPairs.collectAsMap()

In [4]:
# Inspect the artist ID -> name dictionary built above.
# NOTE(review): this displays the whole dictionary; consider showing only a sample.
artistByID

{1240105: 'André Visior',
 1240113: 'riow arai',
 1240132: 'Outkast & Rage Against the Machine',
 6776115: '小松正夫',
 1030848: "Raver's Nature",
 6671601: 'Erguner, Kudsi',
 1106617: 'Bloque',
 1240185: 'Lexy & K. Paul',
 6671631: 'Rev. W.M. Mosley',
 6671632: 'Labelle, Patti',
 1240238: 'the Chinese Stars',
 1240262: 'The Gufs',
 6718605: 'Bali Music',
 6828988: 'Southern Conference Featuring Dr. Ace',
 1240415: 'Paul & Paula',
 1009439: 'Cinnamon',
 1018275: 'School Of Fish',
 6671680: 'Armstrong, Louis & His Hot Five',
 1240508: 'The Ozark Mountain Daredevils',
 1240510: 'The Mercury Program',
 1240516: 'Del Close & John Brent',
 1002584: 'Nena',
 6990766: 'Phil Hendrie - 11/06/98',
 1240554: 'Ami Yoshida',
 1124756: 'utabi',
 10023740: 'Red & Blue feat. Cathy Dennis',
 1240589: 'Sebastian Bach & Friends',
 1240603: 'The Wake',
 6748187: 'Eric Darling',
 1238620: 'Juno Reactor, Don Davis',
 10585028: '大友良英ニュー・ジャズ・クインテット',
 10113150: 'wouter van veldhoven',
 3055: 'Montag',
 1240848: '

We also use artist_alias.txt (here, the artist_alias_small.txt sample) to map “bad” artist IDs to “good” ones, rather than treating it as raw pairs of artist IDs. The code below builds that bad-ID-to-good-ID lookup. An entry that maps ID 6803336 to 1000010, for instance, maps “Aerosmith (unplugged)” to “Aerosmith.”

In [5]:
def aliaslookup(alias):
    """Parse one tab-separated ``id<TAB>id`` line into an (int, int) pair.

    Args:
        alias: A single text line expected to hold exactly two tab-separated
            integer fields.

    Returns:
        ``[(first, second)]`` when the line has exactly two fields and both
        parse as integers; otherwise ``[]``. Returning a list lets the caller
        use ``flatMap`` to drop malformed lines.
    """
    fields = alias.split('\t')
    if len(fields) != 2:
        return []
    try:
        return [(int(fields[0]), int(fields[1]))]
    except ValueError:
        # Empty or non-numeric ID: skip the line, but let unrelated errors
        # surface instead of silently swallowing them with a bare ``except``.
        return []
# Parse every alias line, drop the malformed ones, and gather the surviving
# ID pairs into a dictionary on the driver.
aliasPairs = rawArtistAlias.flatMap(aliaslookup)
artistAlias = aliasPairs.collectAsMap()

In [6]:
# Inspect the alias dictionary built above.
# NOTE(review): this displays the whole dictionary; consider showing only a sample.
artistAlias

{1027859: 1252408,
 1017615: 668,
 6745885: 1268522,
 1018110: 1018110,
 1014609: 1014609,
 6713071: 2976,
 1014175: 1014175,
 1008798: 1008798,
 1013851: 1013851,
 6696814: 1030672,
 1036747: 1239516,
 1278781: 1021980,
 2035175: 1007565,
 1327067: 1308328,
 2006482: 1140837,
 1314530: 1237371,
 1160800: 1345290,
 1255401: 1055061,
 1307351: 1055061,
 1234249: 1005225,
 6622310: 1094137,
 1261919: 6977528,
 2103190: 1002909,
 9929875: 1009048,
 2118737: 1011363,
 9929864: 1000699,
 6666813: 1305683,
 1172822: 1127113,
 2026635: 1001597,
 6726078: 1018408,
 1039896: 1277013,
 1239168: 1266817,
 6819291: 1277876,
 2030690: 2060894,
 6786886: 166,
 1051692: 1307569,
 1239193: 1012079,
 1291581: 78,
 6642817: 1010969,
 1293171: 1007614,
 1070350: 1034635,
 6603691: 1279932,
 1027851: 1063053,
 2060513: 2029258,
 1277348: 668,
 1253023: 1033862,
 1002892: 1002451,
 2060435: 1256876,
 6612396: 1301739,
 1280154: 1021970,
 6617155: 1039381,
 1006102: 1034635,
 6697417: 2013670,
 1059007: 265

# Build a model

We then create a lookup function to convert the data into Rating objects. Note that MLlib's recommendation models are expressed in terms of users and “products”; in our model, the products are artists. We will therefore use user_artist_data.txt (here, the user_artist_data_small.txt sample) as our training data.

In [7]:
# Wrap the alias dictionary in a Spark broadcast variable; ratinglookup reads it
# back through bArtistAlias.value.
bArtistAlias = sc.broadcast(artistAlias)

In [8]:
def ratinglookup(x):
    """Turn one whitespace-separated ``user artist count`` line into a Rating.

    The artist ID is replaced by its entry in the broadcast alias dictionary
    when one exists; otherwise the ID from the line is kept as is.

    Args:
        x: A text line holding three integer fields separated by whitespace.

    Returns:
        ``Rating(userID, artistID-or-alias, count)``.
    """
    userID, artistID, count = map(int, x.split())
    aliasTarget = bArtistAlias.value.get(artistID)
    # No alias recorded for this artist: fall back to the ID from the line.
    resolvedID = artistID if aliasTarget is None else aliasTarget
    return Rating(userID, resolvedID, count)
# Convert every user/artist/count line into a Rating, then mark the RDD as
# cached because it is reused by the cells that follow.
trainData = rawUserArtistData.map(ratinglookup)
trainData.cache()

PythonRDD[8] at RDD at PythonRDD.scala:53

In [9]:
# Peek at the first 10 Rating records to sanity-check the parsing.
trainData.take(10)

[Rating(user=1059637, product=1000010, rating=238.0),
 Rating(user=1059637, product=1000049, rating=1.0),
 Rating(user=1059637, product=1000056, rating=1.0),
 Rating(user=1059637, product=1000062, rating=11.0),
 Rating(user=1059637, product=1000094, rating=1.0),
 Rating(user=1059637, product=1000112, rating=423.0),
 Rating(user=1059637, product=1000113, rating=5.0),
 Rating(user=1059637, product=1000114, rating=2.0),
 Rating(user=1059637, product=1000123, rating=2.0),
 Rating(user=1059637, product=1000130, rating=19129.0)]

Finally, we build our model using the alternating least squares (ALS) collaborative filtering algorithm, as follows. The operation will likely take minutes or more depending on your cluster. It took me around 15 minutes to run the model.

In [10]:
# Fit an implicit-feedback ALS model on the Rating records.
# The two numbers are the rank (number of latent factors) and the number of
# ALS iterations, spelled out by keyword.
model = ALS.trainImplicit(
    trainData,
    rank=10,
    iterations=5,
)

We should first see if the artist recommendations make any intuitive sense, by examining a user, his or her plays, and the recommendations for that user. Take, for example, user 1059637. Extract the IDs of the artists that this user has listened to and print their names. This means searching the input for this user's artist IDs, and then looking those IDs up in the artist dictionary so you can collect and print the names in order:

In [14]:
spotcheckingID = 1059637  # user whose plays and recommendations are inspected
bArtistByID = sc.broadcast(artistByID)

# Keep only this user's ratings, then swap each artist ID for its name
# (None when the ID is missing from the lookup).
ratingsForUser = trainData.filter(lambda rating: rating.user == spotcheckingID)
rawArtistsForUser = ratingsForUser.map(
    lambda rating: bArtistByID.value.get(rating.product)
).collect()
print(rawArtistsForUser)

['Aerosmith', "Edna's Goldfish", 'The Mighty Mighty Bosstones', 'Foo Fighters', 'The Bouncing Souls', 'Alkaline Trio', 'The Beatles', 'Pennywise', 'Incubus', 'Bright Eyes', 'Muse', 'Jason Mraz', 'Jimmy Eat World', 'Meat Loaf', 'The Lightning Seeds', 'MxPx', 'At the Drive-In', 'New Found Glory', 'Blind Melon', 'Chicago', 'Hoobastank', 'Loreena McKennitt', 'Moulin Rouge', 'Goo Goo Dolls', 'Spin Doctors', 'Dave Matthews Band', 'Cast', 'Mundy', 'Ben Kweller', 'Wham!', 'Third Eye Blind', 'Sugarcult', 'The Ataris', 'Descendents', 'Strung Out', 'The Muffs', 'Kenny Loggins', 'Fountains of Wayne', 'Sheryl Crow', 'The Cranberries', 'blink-182', 'OMC', 'Funeral for a Friend', 'Finch', 'Saves the Day', 'The Starting Line', 'Modest Mouse', 'Simple Minds', 'A Flock Of Seagulls', 'The Hippos', 'Something Corporate', 'Taking Back Sunday', 'The Movielife', 'The Darkness', 'Phantom Planet', 'Sunny Day Real Estate', 'Saturday Night Live', 'Guster', 'Good Charlotte', 'Cyndi Lauper', 'Nena', '[unknown]', '

In [18]:
# Ask the model for the top 10 products (artists) for the spot-checked user via
# the public recommendProducts method rather than the internal model.call(...)
# hook, and translate each artist ID to its name (None if the ID is unknown).
# A list is kept instead of a lazy map object so `recommendations` is still
# usable after printing rather than being left exhausted.
recommendations = [
    artistByID.get(rating.product)
    for rating in model.recommendProducts(spotcheckingID, 10)
]
print(recommendations)

['Green Day', 'My Chemical Romance', 'The Starting Line', 'Something Corporate', 'Bright Eyes', 'Taking Back Sunday', 'Straylight Run', 'New Found Glory', 'Thrice', 'Incubus']
