In [46]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.getOrCreate()
spark.sparkContext.setLogLevel("OFF")

### Reading user-artist interaction data

In [47]:
user_artist_data_path = "data/audioscrobbler/user_artist_data.txt"
raw_user_artist_data = spark.read.text(user_artist_data_path)

In [48]:
raw_user_artist_data.show(3) # user ID, artist ID, and play count

+------------------+
|             value|
+------------------+
|      1000002 1 55|
|1000002 1000006 33|
| 1000002 1000007 8|
+------------------+
only showing top 3 rows



`raw_user_artist_data` is like a list of strings. Not very useful. We parse it into a DataFrame with 3 integer-type columns.

In [49]:
raw_user_artist_data.createOrReplaceTempView("raw_user_artist_data")

query = """
    select 
        cast(split(value, ' ')[0] as int) as user,
        cast(split(value, ' ')[1] as int) as artist,
        cast(split(value, ' ')[2] as int) as count
    from raw_user_artist_data
"""
user_artist_data = spark.sql(query)

In [50]:
user_artist_data.show(3)

+-------+-------+-----+
|   user| artist|count|
+-------+-------+-----+
|1000002|      1|   55|
|1000002|1000006|   33|
|1000002|1000007|    8|
+-------+-------+-----+
only showing top 3 rows



### Reading artist aliases

In [51]:
artist_alias_path = "data/audioscrobbler/artist_alias.txt"
raw_artist_alias = spark.read.text(artist_alias_path)

raw_artist_alias.createOrReplaceTempView("raw_artist_alias")

query = """
    select 
        cast(split(value, '\t')[0] as int) as artist,
        cast(split(value, '\t')[1] as int) as alias
    from raw_artist_alias
"""
artist_alias = spark.sql(query)

In [52]:
artist_alias.show(3)

+-------+-------+
| artist|  alias|
+-------+-------+
|1092764|1000311|
|1095122|1000557|
|6708070|1007267|
+-------+-------+
only showing top 3 rows



### Prepare the training and testing data

Using `artist_alias` to replace artist IDs by their appropriate alias.

In [None]:
from pyspark.sql.functions import when, col

user_artist_data.createOrReplaceTempView("user_artist_data")
artist_alias.createOrReplaceTempView("artist_alias")

df = spark.sql("""
    select * 
    from user_artist_data 
        natural left join artist_alias;
""")
df.createOrReplaceTempView('matrix')

M = spark.sql("""
    select user, count, (case when alias is null then artist else alias end) as artist
    from matrix
""")

In [54]:
M = M.sample(0.02) # only taking 2% of the data because I'm poor
train, test = M.randomSplit([8.0, 2.0])

### Training the model

In [55]:
train.cache()

DataFrame[user: int, count: int, artist: int]

In [56]:
from pyspark.ml.recommendation import ALS

als = ALS(userCol='user', itemCol='artist', ratingCol='count')
model = als.fit(train)

                                                                                

In [64]:
predictions = model.transform(test)
predictions.na.drop().show(10)

[Stage 3109:>                                                       (0 + 1) / 1]

+-------+-----+-------+-----------+
|   user|count| artist| prediction|
+-------+-----+-------+-----------+
|  10016|    1|1021808|-0.96562994|
|  10027|    2|1033211| 0.15959628|
|  10137|    1|1009147| -2.3264525|
|  10311|    1|1026533| -3.4218102|
|  10311|    2|   4455| -23.744621|
|1000002|    1|1000716| -22.899334|
|1000002|   19|   4324|  6.2149944|
|1000019|    1|   1194|  -4.168953|
|1000019|    1|1004226|  1.2598414|
|1000019|    5|    425|   2.562903|
+-------+-----+-------+-----------+
only showing top 10 rows



                                                                                

In [65]:
from pyspark.ml.evaluation import RegressionEvaluator

evaluator = RegressionEvaluator(labelCol="count")
rmse = evaluator.evaluate(predictions.na.drop())
print(rmse)

[Stage 3163:>                                                       (0 + 8) / 8]

88.87805079889526


                                                                                

### Hyperparameter tuning 
Not in lab manual.

In [None]:
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder

param_grid = ParamGridBuilder() \
    .addGrid(als.rank, [1, 5, 10]) \
    .addGrid(als.maxIter, [5, 10]) \
    .addGrid(als.regParam, [0.05, 0.1]) \
    .build()

cv = CrossValidator(estimator=als, estimatorParamMaps=param_grid, evaluator=evaluator)
cv_model = cv.fit(train)

[Stage 5842:>                                                       (0 + 8) / 8]

280.4069919793438
CrossValidatorModel_80b85c43b165


                                                                                

In [67]:
predictions = cv_model.transform(test)
rmse = evaluator.evaluate(predictions.na.drop())
print(rmse)

[Stage 5877:>                                                       (0 + 8) / 8]

280.4069919793438


                                                                                

In [68]:
model = cv_model.bestModel
print(model)

ALSModel: uid=ALS_749749e5c52e, rank=1
