In [None]:
from pyspark.sql import SparkSession

# Reuse the already-active Spark session if there is one; otherwise start a new one.
session_builder = SparkSession.builder
spark = session_builder.getOrCreate()

### Reading user-artist interaction data

In [2]:
# Load the interaction log as plain text: one row per line, in a single string column `value`.
user_artist_data_path = "data/audioscrobbler/user_artist_data.txt"
raw_user_artist_data = spark.read.format("text").load(user_artist_data_path)

In [None]:
# Each raw line holds: user ID, artist ID, and play count.
raw_user_artist_data.show(n=3)

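`raw_user_artist_data` is a DataFrame with a single string column (`value`) holding one raw line per row, which is not very useful as is. We parse it into a DataFrame with three integer columns: `user`, `artist`, and `count`.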

In [4]:
# Register the raw text DataFrame as a temp view so the SQL below can refer to it by name.
raw_user_artist_data.createOrReplaceTempView("raw_user_artist_data")

# Split each line on a single space and cast the three fields to integers:
# field 0 -> `user`, field 1 -> `artist`, field 2 -> `count` (the play count).
# NOTE(review): fields that fail the int cast presumably become null under
# Spark's default (non-ANSI) settings; confirm if malformed lines are possible.
query = """
    select 
        cast(split(value, ' ')[0] as int) as user,
        cast(split(value, ' ')[1] as int) as artist,
        cast(split(value, ' ')[2] as int) as count
    from raw_user_artist_data
"""
user_artist_data = spark.sql(query)

In [None]:
# Preview the parsed (user, artist, count) rows.
user_artist_data.show(n=3)

### Reading artist aliases

In [8]:
# Load the alias file as plain text: one row per line, in a single string column `value`.
artist_alias_path = "data/audioscrobbler/artist_alias.txt"
raw_artist_alias = spark.read.text(artist_alias_path)

# Register the raw text DataFrame as a temp view so the SQL below can refer to it by name.
raw_artist_alias.createOrReplaceTempView("raw_artist_alias")

# Split each line on a tab and cast both fields to integers:
# field 0 -> `artist`, field 1 -> `alias` (the ID it should be replaced with).
# The query is a regular (non-raw) Python string, so '\t' is already a literal
# tab character by the time Spark parses the SQL.
# NOTE(review): fields that fail the int cast presumably become null under
# Spark's default (non-ANSI) settings; confirm if the file has incomplete lines.
query = """
    select 
        cast(split(value, '\t')[0] as int) as artist,
        cast(split(value, '\t')[1] as int) as alias
    from raw_artist_alias
"""
artist_alias = spark.sql(query)

In [None]:
# Preview the parsed (artist, alias) pairs.
artist_alias.show(n=3)

### Prepare the training and testing data

We use `artist_alias` to replace each artist ID with its alias wherever one exists; artist IDs without an alias entry are left unchanged.

In [40]:
from pyspark.sql.functions import when, col

# Expose both DataFrames to Spark SQL under the names used in the join below.
user_artist_data.createOrReplaceTempView("user_artist_data")
artist_alias.createOrReplaceTempView("artist_alias")

# Natural left join on the shared `artist` column: every play-count row is kept,
# and `alias` is null for artists that have no alias entry.
# The statement has no trailing ';' because Spark releases before 3.0 reject it
# with a ParseException.
df = spark.sql("""
    select *
    from user_artist_data
        natural left join artist_alias
""")

# Overwrite `artist` with its alias where one exists, then drop the helper column.
df = df.withColumn(
    "artist",
    when(col("alias").isNotNull(), col("alias")).otherwise(col("artist")),
).drop("alias")

In [61]:
# Fixed seed so the sample and the train/test split can be reproduced across runs.
# NOTE(review): Spark's seeded sampling is only repeatable when the input
# partitioning is the same between runs; confirm if exact reproducibility matters.
SEED = 42

# Only 2% of the data is used to keep the runtime affordable on limited hardware.
# The sample goes into its own name instead of overwriting `df`, so re-running
# this cell does not shrink the data again (2% of 2% of ...).
df_sampled = df.sample(fraction=0.02, seed=SEED)

# 80% / 20% split; randomSplit normalizes the weights.
train, test = df_sampled.randomSplit([8.0, 2.0], seed=SEED)

### Training the model

In [None]:
# Mark the training set for caching: it is reused by the ALS fit and by the
# cross-validation fit in the cells below.
train.cache()

In [None]:
from pyspark.ml.recommendation import ALS

# ALS recommender that treats the play count as the rating.
# - coldStartStrategy="drop": users/artists absent from the training data would
#   otherwise get NaN predictions, which makes any evaluation metric NaN
#   (including the ones CrossValidator computes internally on each fold).
# - seed: fixes the random factor initialization so results are reproducible.
als = ALS(
    userCol='user',
    itemCol='artist',
    ratingCol='count',
    coldStartStrategy='drop',
    seed=42,
)
model = als.fit(train)

In [None]:
# Score the held-out interactions, then preview only rows without null/NaN values.
predictions = model.transform(test)
predictions.dropna().show(n=5)

In [None]:
from pyspark.ml.evaluation import RegressionEvaluator

# Compare predictions against the observed play count using the evaluator's
# default metric (RMSE, per the PySpark documentation).
evaluator = RegressionEvaluator(labelCol="count")

# Rows with null/NaN values are removed first so they cannot poison the metric.
scored_predictions = predictions.dropna()
evaluator.evaluate(scored_predictions)

### Hyperparameter tuning

In [None]:
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder

# CrossValidator scores each fold with the evaluator directly, with no NaN
# filtering. Users/artists missing from a training fold would get NaN
# predictions and turn every fold metric into NaN, so no "best" model could be
# chosen. Dropping cold-start rows inside ALS avoids that. Setting it here is
# idempotent and keeps this cell correct regardless of how `als` was constructed.
als.setColdStartStrategy("drop")

# Grid of 3 ranks x 2 iteration counts x 2 regularization strengths = 12 candidates.
param_grid = (
    ParamGridBuilder()
    .addGrid(als.rank, [1, 5, 10])
    .addGrid(als.maxIter, [5, 10])
    .addGrid(als.regParam, [0.05, 0.1])
    .build()
)

# The seed fixes the fold assignment so the selected model is reproducible.
cv = CrossValidator(
    estimator=als,
    estimatorParamMaps=param_grid,
    evaluator=evaluator,
    seed=42,
)

# Fit every candidate on the training set, then score the best model on the test set.
cv_model = cv.fit(train)
predictions = cv_model.transform(test)
evaluator.evaluate(predictions.na.drop())