In [1]:
from pyspark import SparkContext
sc = SparkContext("local[2]", "Music Recommendation")

In [2]:
#Reading user data
rawUserArtistData=sc.textFile("user_artist_data_small.txt").map(lambda x:x.split()).map(lambda x:[int(x[0]),int(x[1]),int(x[2])])

In [3]:
#Reading Artist Data
rawArtistData=sc.textFile("artist_data_small.txt").map(lambda x:x.split("\t")).map(lambda x:[int(x[0]),x[1].strip()])

In [4]:
#user ID, an artist ID, and a play count
rawUserArtistData.take(1)

[[1059637, 1000010, 238]]

In [5]:
#ArtistID,Name
rawArtistData.take(1)

[[1240105, 'André Visior']]

In [6]:
#Artist Alias Name
artistAlias=sc.textFile("artist_alias_small.txt").map(lambda x:x.split("\t"))

In [7]:
artistAlias.take(1)

[['1027859', '1252408']]

In [8]:
#sample Artist Names
rawArtistData.filter(lambda x:str(x[0]) in ['1027859', '1252408']).collect()

[[1252408, 'Trevor Jones & Randy Edelman'],
 [1027859, 'Trevor Jones and Randy Edelman']]

In [9]:
#Finding the Alias of artist id if present else artist id
def final_artist(y):
    if y in mnk.keys():
        return mnk[y]
    else:
        return y

In [10]:
artistAlias.count()

587

In [11]:
#broadcasting the artist Name
mnk=dict(sc.broadcast(artistAlias.collect()).value)

In [12]:
##Building Ratings RDD
from pyspark.mllib.recommendation import ALS, MatrixFactorizationModel, Rating
ratings=rawUserArtistData.map(lambda x: Rating(x[0],final_artist(x[1]),x[2]))

In [13]:
#Sample Ratings RDD
ratings.take(1)

[Rating(user=1059637, product=1000010, rating=238.0)]

In [26]:
#Training Model with rating and 10 iterations
rank = 10
numIterations = 10
model = ALS.trainImplicit(ratings, rank, numIterations,0.1,alpha=0.1)

In [29]:
#Training Model with rating and 10 iterations
rank = 10
numIterations = 10
model = ALS.trainImplicit(ratings, rank)

In [30]:
# Evaluate the model on training data
testdata = ratings.map(lambda p: (p[0], p[1]))
predictions = model.predictAll(testdata).map(lambda r: ((r[0], r[1]), r[2]))
ratesAndPreds = ratings.map(lambda r: ((r[0], r[1]), r[2])).join(predictions)
MSE = ratesAndPreds.map(lambda r: (r[1][0] - r[1][1])**2).mean()
print("Mean Squared Error = " + str(MSE))

Mean Squared Error = 9224142.20900244


In [28]:
#Artists Recommended for Users
user_recommendation=model.recommendProducts(1070932,10)
artists=[]
for i in user_recommendation:
    artists.append(i[1])
artists_names=rawArtistData.filter( lambda x:x[0] in artists).map(lambda x:x[1]).collect()
for i in artists_names[:9]:
    print(i)


Azure Ray
Mates of State
Bishop Allen
Tilly and the Wall
Insane Clown Posse
Nichole Nordeman
Rise Against
Ashlee Simpson
Ginny Owens


In [17]:
#Artists listened by user
artists_for_user=rawUserArtistData.filter(lambda x:x[0]==1070932).map(lambda x:x[1]).collect()
artists_names=rawArtistData.filter( lambda x:x[0] in artists_for_user).map(lambda x:x[1]).collect()
for i in artists_names[:9]:
    print(i)

time and distance
Zoli Band
desperate measures
Scraps And Heart Attacks
Razor Crusade
Staygold
Midtown
Minor Threat
The Reputation


In [18]:
#Sample Predictions
predictions.take(10)

[((1024631, 7009918), 0.9574341347765185),
 ((1059334, 10103194), 0.9682550082666361),
 ((1059334, 6682484), 0.9660616664234093),
 ((2023686, 1002836), 0.9474381829936663),
 ((1031009, 1002836), 0.15710094271201763),
 ((1024631, 2136096), 0.9574341347765185),
 ((1070932, 1281578), 0.1493135480650973),
 ((1059334, 2146068), 0.9663039617082493),
 ((2023686, 1154200), 0.949529268513658),
 ((1035511, 6660614), 0.8831503884054149)]

In [31]:
# Evaluate the model on training data
import math
testdata = ratings.map(lambda p: (p[0], p[1]))
predictions = model.predictAll(testdata).map(lambda r: ((r[0], r[1]), r[2]))
ratesAndPreds = ratings.map(lambda r: ((r[0], r[1]), r[2])).join(predictions)
MSE = ratesAndPreds.map(lambda r: math.sqrt((r[1][0] - r[1][1])**2)).mean()
print("Mean Squared Error = " + str(MSE))

# Save and load model
model.save(sc, "target/tmp/myCollaborativeFilter3")
sameModel = MatrixFactorizationModel.load(sc, "target/tmp/myCollaborativeFilter3")

Mean Squared Error = 129.91350043907113
