In [1]:
%load_ext autoreload
%env SPARK_HOME=/usr/hdp/current/spark2-client

import findspark
findspark.init()

print('findspark initialized ...')

import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.functions import expr, col, column, max, min

# Create (or reuse) the notebook's SparkSession, named 'mlonspark',
# requesting 7 executor instances.
spark = (
    SparkSession.builder
    .appName('mlonspark')
    .config('spark.executor.instances', '7')
    .getOrCreate()
)

print('pyspark ready ...')

env: SPARK_HOME=/usr/hdp/current/spark2-client
findspark initialized ...
pyspark ready ...


In [2]:
# Filtered Last.fm 360K listening data; this frame is what the model is
# fitted on and evaluated against in the cells below.
df = (
    spark.read
    .load(path="/data/lastfm-dataset-360K/data-filtered.parquet")
)

In [22]:
# Peek at the raw (unfiltered) data: list up to 10 artist names recorded for
# one specific user, identified by their hash.
dforig = spark.read.load("/data/lastfm-dataset-360K/data-raw.parquet")
(
    dforig
    .where(col("userHash") == "f37640d943bf48a0ae42058733dd18e48d66f337")
    .select("artistName")
    .show(10)
)

+--------------------+
|          artistName|
+--------------------+
|   infected mushroom|
|      chase & status|
|       high contrast|
|            syncopix|
|           aphrodite|
|          stereo mcs|
|          underworld|
|bachelors of science|
|  london elektricity|
|                  tc|
+--------------------+
only showing top 10 rows



In [4]:
# Lookup tables: users (userHash -> userId) and artists (used later to turn
# recommended artist ids back into names).
users = spark.read.load("/data/lastfm-dataset-360K/users.parquet")
artists = spark.read.load("/data/lastfm-dataset-360K/artists.parquet")

# Resolve the numeric userId for the user hash inspected in the raw data.
(
    users
    .where(col("userHash") == "f37640d943bf48a0ae42058733dd18e48d66f337")
    .show()
)

+--------------------+------+
|            userHash|userId|
+--------------------+------+
|f37640d943bf48a0a...| 54113|
+--------------------+------+



In [6]:
from mlonspark import AlternatingLeastSquare

# Configure the project's ALS estimator: implicit-preference mode, rank 25,
# alpha 10.0 and a regularisation parameter of 0.0, with listenCount as the
# rating signal.
alg = (
    AlternatingLeastSquare()
    .setUserCol("userId")
    .setItemCol("artists-artistId")
    .setRatingCol("listenCount")
    .setRank(25)
    .setRegParam(0.0)
    .setAlpha(10.0)
    .setImplicitPrefs(True)
)

# Fit on the full filtered data set.
model = alg.fit(df)

In [7]:
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.sql.functions import isnan

# RMSE between the model's `prediction` column and the observed listenCount.
evaluator = RegressionEvaluator(
    metricName="rmse",
    labelCol="listenCount",
    predictionCol="prediction",
)

# NOTE(review): `df` is the same frame the model was fitted on, so this is a
# training-set error, not a held-out estimate.
# NOTE(review): the model was configured with implicitPrefs=True; confirm that
# its predictions are on the same scale as raw listenCount before reading
# anything into this number.
predictions = model.transform(df)

# Rows whose prediction is NaN are excluded before scoring.
predictionsFiltered = predictions.filter(~isnan(col("prediction")))
rmse = evaluator.evaluate(predictionsFiltered)

print("RMSE = %f" % rmse)

RMSE = 213.865507


In [8]:
# Keep only the rows of user 54113 (the userId shown for the inspected
# user hash in the users lookup output).
dfSingleUser = df.where(col("userId") == 54113)

In [9]:
# Ask the fitted model for recommendations for the user(s) present in
# dfSingleUser; the second argument (10) is presumably the number of items
# to return per user -- confirm against the mlonspark model API.
recommendations = model.recommendForUserSubset(dfSingleUser, 10)

In [12]:
# Inspect the result layout: one row per userId with a `recommendations`
# array of (artists-artistId, rating) structs.
recommendations.printSchema()

root
 |-- userId: integer (nullable = false)
 |-- recommendations: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- artists-artistId: integer (nullable = true)
 |    |    |-- rating: float (nullable = true)



In [18]:
# Flatten the per-user `recommendations` array into one row per recommended
# artist, with columns (artistId, rating).
#
# This is done with a Spark SQL explode instead of rdd.flatMap(...).toDF():
#   * the rows never leave the JVM (no Python serialisation round trip);
#   * the struct's declared field types are kept rather than re-inferred
#     from Python values;
#   * an empty result yields an empty DataFrame, whereas toDF() has to
#     infer a schema from the first row and raises on an empty RDD.
recs = (
    recommendations
    .selectExpr("explode(recommendations) AS rec")
    .select("rec.*")
    .withColumnRenamed("artists-artistId", "artistId")
)

In [20]:
# Translate recommended artist ids into names.
# A join gives no ordering guarantee, so sort by the model's rating
# (best first) before projecting; otherwise the displayed "top N" list
# comes out in arbitrary order.
recsFinal = (
    recs
    .join(artists, recs.artistId == artists.artistId, 'inner')
    .orderBy(recs["rating"].desc())
    .select(artists.artistName)
)

recsFinal.show()

+------------------+
|        artistName|
+------------------+
|         logistics|
|   1200 micrograms|
|     high contrast|
|            john b|
|      concord dawn|
|london elektricity|
|             klute|
|         aphrodite|
|          dj fresh|
|         ltj bukem|
+------------------+

