In [1]:
%load_ext autoreload
%env SPARK_HOME=/usr/hdp/current/spark2-client

import findspark
findspark.init()

print('findspark initialized ...')

import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.functions import expr, col, column, max, min

spark = SparkSession.builder.appName('mlonspark')\
    .config('spark.executor.instances', '7')\
    .getOrCreate()

print('pyspark ready ...')

env: SPARK_HOME=/usr/hdp/current/spark2-client
findspark initialized ...
pyspark ready ...


In [2]:
# Data set the model is later fitted on and scored against.
df = spark.read.load(path="/data/lastfm-dataset-360K/data-final.parquet")

In [3]:
# Show the raw listening rows for one user, selected by user hash.
dforig = spark.read.load(path="/data/lastfm-dataset-360K/data-raw.parquet")
(
    dforig
    .where(dforig["userHash"] == "f37640d943bf48a0ae42058733dd18e48d66f337")
    .show()
)

+--------------------+--------------------+--------------------+-----------+
|            userHash|          artistMBID|          artistName|listenCount|
+--------------------+--------------------+--------------------+-----------+
|f37640d943bf48a0a...|eab76c9f-ff91-443...|   infected mushroom|      330.0|
|f37640d943bf48a0a...|82e454e2-38ee-4e6...|      chase & status|      323.0|
|f37640d943bf48a0a...|889b69e3-7b18-488...|       high contrast|      298.0|
|f37640d943bf48a0a...|b4afc42e-29d4-427...|            syncopix|      217.0|
|f37640d943bf48a0a...|84800c46-2211-43e...|           aphrodite|      214.0|
|f37640d943bf48a0a...|5e0ca89b-f5a6-403...|          stereo mcs|      206.0|
|f37640d943bf48a0a...|ba2f4f3b-0293-4bc...|          underworld|      193.0|
|f37640d943bf48a0a...|18234037-c0e5-4be...|bachelors of science|      168.0|
|f37640d943bf48a0a...|0356daee-ec48-449...|  london elektricity|      166.0|
|f37640d943bf48a0a...|32a509d0-6c4c-43c...|                  tc|      153.0|

In [4]:
# Resolve the same user hash to its numeric userId.
users = spark.read.load(path="/data/lastfm-dataset-360K/users.parquet")
(
    users
    .where(users["userHash"] == "f37640d943bf48a0ae42058733dd18e48d66f337")
    .show()
)

# Artist lookup table, joined against the recommendations further down.
artists = spark.read.load(path="/data/lastfm-dataset-360K/artists.parquet")

+--------------------+------+
|            userHash|userId|
+--------------------+------+
|f37640d943bf48a0a...| 54113|
+--------------------+------+



In [5]:
from mlonspark import AlternatingLeastSquare

# Configure the estimator: users and items are identified by the `userId` and
# `artists-artistId` columns, `listenCount` is the rating signal, and implicit
# preferences are switched on.
alg = (
    AlternatingLeastSquare()
    .setUserCol("userId")
    .setItemCol("artists-artistId")
    .setRatingCol("listenCount")
    .setImplicitPrefs(True)
)

# Fit on the full prepared data set.
model = alg.fit(df)

In [6]:
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.sql.functions import isnan

# RMSE between the `listenCount` label and the model's `prediction` column.
evaluator = RegressionEvaluator(
    metricName="rmse",
    labelCol="listenCount",
    predictionCol="prediction",
)

# NOTE(review): predictions are computed on the same `df` the model was fitted
# on, and the model was configured with implicit preferences while the label
# is the raw listen count -- confirm this RMSE is the intended quality measure.
predictions = model.transform(df)

# Rows whose prediction is NaN are excluded before scoring.
predictionsFiltered = predictions.filter(~isnan(predictions["prediction"]))
rmse = evaluator.evaluate(predictionsFiltered)

print("RMSE = %f" % rmse)

RMSE = 650.775256


In [7]:
# 54113 is the userId printed by the users lookup for hash f37640d9...
dfSingleUser = df.where(df["userId"] == 54113)

In [8]:
# Ask the fitted model for recommendations restricted to the user(s) present
# in dfSingleUser. Presumably the second argument is the number of items per
# user, as in Spark's ALSModel.recommendForUserSubset -- confirm against mlonspark.
recommendations = model.recommendForUserSubset(dfSingleUser, 10)

In [9]:
# Flatten the per-user `recommendations` array into one row per recommended
# item, then give the item column a join-friendly name.
#
# This is done with a native DataFrame explode rather than rdd.flatMap + toDF:
# it avoids shipping rows through Python, keeps the struct's declared schema
# instead of re-inferring it, and yields an empty DataFrame (instead of raising
# on schema inference) when there are no recommendations.
recs = (
    recommendations
    .selectExpr("explode(recommendations) AS rec")
    .select("rec.*")
    .withColumnRenamed("artists-artistId", "artistId")
)

In [10]:
# Translate recommended artist ids into names via the artists lookup table;
# only the artist name is kept in the result.
recsFinal = (
    recs
    .join(artists, on=recs["artistId"] == artists["artistId"], how='inner')
    .select(artists["artistName"])
)

recsFinal.show()

+------------------+
|        artistName|
+------------------+
|     high contrast|
|              krec|
|london elektricity|
|          deadmau5|
|         aphrodite|
|         junkie xl|
|            hybrid|
|         logistics|
|     groove armada|
| nightmares on wax|
+------------------+

