# Audio Recommender
### Team Members: 
### 1. Basil Vetas (bsv2111)
### 2. Daniel First (df2450)
### 3. Elizabet Doliar (ed2758)
### 4. Alexandra Sudomoeva (as5402)

# Check SparkContext

In [1]:
sc

In [2]:
# import necessary packages
import numpy as np

# Preparing the Data

3 columns: userid artistid playcount

Data is kept in the format of "user id", "artist id", "count"

In [3]:
raw_user_artist_data = sc.textFile("data/user_artist_data.txt")
#compute statistics on user id
user_artist_data = raw_user_artist_data.map(lambda l: l.split(" "))

Data is kept in the format of "artist id", "artist name"

In [4]:
raw_artist_data = sc.textFile("data/artist_data.txt")

def dropbad(line):
    l = line.split("\t")
    if len(l) == 2:
        try:
            int(l[0])
        except ValueError:
            return False
        else:
            return (l[0] != '') and (l[1] != '')
    else:
        return False

artist_by_id = raw_artist_data.filter(lambda l: dropbad(l))
artist_by_id = artist_by_id.map(lambda l: (int(l.split("\t")[0]), l.split("\t")[1].strip()))

In [5]:
artist_by_id.take(5)

[(1134999, u'06Crazy Life'),
 (6821360, u'Pang Nakarin'),
 (10113088, u'Terfel, Bartoli- Mozart: Don'),
 (10151459, u'The Flaming Sidebur'),
 (6826647, u'Bodenstandig 3000')]

Data is kept in the format of "badid", "goodid"

In [6]:
raw_artist_alias = sc.textFile("data/artist_alias.txt")
artist_alias = raw_artist_alias.map(lambda l: (l.split("\t")[0], l.split("\t")[1]))
artist_alias = artist_alias.filter(lambda l: (l[0] != '') and (l[1] != ''))
artist_alias = artist_alias.map(lambda l: (int(l[0]), int(l[1])))

In [7]:
artist_alias.take(5)

[(1092764, 1000311),
 (1095122, 1000557),
 (6708070, 1007267),
 (10088054, 1042317),
 (1195917, 1042317)]

In [8]:
artist_alias = artist_alias.collectAsMap()

# Building The Model

In [15]:
import pyspark.mllib.recommendation
from pyspark.mllib.recommendation import *

In [16]:
# create broadcast variable with dict of bad and good ids
b_artist_alias = sc.broadcast(artist_alias)

# create a train data subset
train_data = raw_user_artist_data.map(lambda l: (int(l.split(" ")[0]), int(l.split(" ")[1]), int(l.split(" ")[2])))

# correct for bad id's using broadcast variable
train_data = train_data.map(lambda l: (l[0], (b_artist_alias.value.get(l[1], l[1])), l[2]))

In [17]:
train_data.take(5)

[(1000002, 1, 55),
 (1000002, 1000006, 33),
 (1000002, 1000007, 8),
 (1000002, 1000009, 144),
 (1000002, 1000010, 314)]

In [18]:
# lastly, build the model (using the finalized parameters form the textbook right away)
model = ALS.trainImplicit(train_data, 50, 5, 1.0, 40)

In [19]:
# see somee feature vectors
model.userFeatures().map(lambda l: ", ".join([str(x) for x in l])).first()

"120, array('d', [0.0005237431032583117, 0.0004703843442257494, 0.0014566885074600577, -0.0006635559839196503, -0.00020361749920994043, -0.0003671517479233444, -0.0003718723892234266, -0.0004033832810819149, -0.00021437308168970048, 0.002071031602099538, -0.0009212664444930851, 0.0006792983622290194, 0.0007889423286542296, -0.0013239282416179776, -0.0009274594485759735, 0.002002900931984186, -8.646740752737969e-05, 0.0006754693458788097, -0.001245456631295383, -0.0006359285325743258, 0.0013886566739529371, 0.0004942137165926397, -0.0013273025397211313, -0.0015008763875812292, -0.0019119471544399858, 0.0015779894310981035, 0.0014287496451288462, 0.0003408659831620753, -0.0008630104130133986, 0.001232461421750486, -0.0005332815926522017, 0.0010429283138364553, -0.0011241633910685778, 0.0003035206173080951, 0.001245268969796598, -0.000546677561942488, -0.0006837468245066702, 0.0009355540387332439, 0.00032499339431524277, -0.00030262969085015357, 0.0018360851099714637, 0.001875509042292833

## Spot Checking Recommendations

In [20]:
# find lines whose user is 2093760
raw_artists_for_user = raw_user_artist_data.map(lambda l: l.split(' ')).filter(lambda l: int(l[0]) == 2093760)

In [21]:
raw_artists_for_user.take(5)

[[u'2093760', u'1180', u'1'],
 [u'2093760', u'1255340', u'3'],
 [u'2093760', u'378', u'1'],
 [u'2093760', u'813', u'2'],
 [u'2093760', u'942', u'7']]

In [22]:
# collect unique artists
existing_products = set(raw_artists_for_user.map(lambda l: int(l[1])).collect())

In [23]:
existing_products

{378, 813, 942, 1180, 1255340}

In [24]:
# filter in those artists, get just artist, and print
artists = artist_by_id.filter(lambda l: l[0] in existing_products).collect()

for a in artists:
    print a[1]

David Gray
Blackalicious
Jurassic 5
The Saw Doctors
Xzibit


In [25]:
recommendations = model.recommendProducts(2093760, 5)

for r in recommendations:
    print r

Rating(user=2093760, product=1002095, rating=0.0010288296135225262)
Rating(user=2093760, product=6707474, rating=0.000881346625919118)
Rating(user=2093760, product=1137790, rating=0.0007521678076580716)
Rating(user=2093760, product=6790283, rating=0.0006663027795484252)
Rating(user=2093760, product=2106072, rating=0.000655114189134187)


In [26]:
recommended_product_ids = set(map(lambda l: l.product, recommendations))

In [27]:
recommended_artists = artist_by_id.filter(lambda l: l[0] in recommended_product_ids).collect()

for r_a in recommended_artists:
    print r_a[1]

Something Corporate
Julian Theory
Matthew Walker
Domestic Disturbance
The NSG


## Making Recommendations

In [28]:
recommendations = model.recommendProducts(2093760, 10)

for r in recommendations:
    print r

Rating(user=2093760, product=1002095, rating=0.0010288296135225262)
Rating(user=2093760, product=6707474, rating=0.000881346625919118)
Rating(user=2093760, product=1137790, rating=0.0007521678076580716)
Rating(user=2093760, product=6790283, rating=0.0006663027795484252)
Rating(user=2093760, product=2106072, rating=0.000655114189134187)
Rating(user=2093760, product=1074252, rating=0.000591530832306063)
Rating(user=2093760, product=1054637, rating=0.0005780886481855233)
Rating(user=2093760, product=1096232, rating=0.0005743829204416402)
Rating(user=2093760, product=6859227, rating=0.0005698285058338971)
Rating(user=2093760, product=1294291, rating=0.0005576622196469571)


In [29]:
recommended_product_ids = set(map(lambda l: l.product, recommendations))

In [30]:
recommended_artists = artist_by_id.filter(lambda l: l[0] in recommended_product_ids).collect()

for r_a in recommended_artists:
    print r_a[1]

The Nightmare Scenario
Something Corporate
Julian Theory
Matthew Walker
Whatever It Takes
Peter Brame
Domestic Disturbance
The NSG
reaching forward
Lonely Kings
