In [3]:
%load_ext autoreload
%env SPARK_HOME=/usr/hdp/current/spark2-client

import findspark
findspark.init()

print('findspark initialized ...')

import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.functions import expr, col, column, max, min

spark = SparkSession.builder.appName('mlonspark')\
    .config('spark.executor.instances', '7')\
    .getOrCreate()

print('pyspark ready ...')

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
env: SPARK_HOME=/usr/hdp/current/spark2-client
findspark initialized ...
pyspark ready ...


In [4]:
# Read the train and test splits. The Parquet reader is named explicitly so the
# load does not depend on the session's `spark.sql.sources.default` setting.
train = spark.read.parquet("/data/lastfm-dataset-360K/data-filtered-std-pos-train.parquet")
test = spark.read.parquet("/data/lastfm-dataset-360K/data-filtered-std-pos-test.parquet")

In [5]:
from pyspark.ml.recommendation import ALS

# ALS starts from randomly initialised factors; pinning the seed makes refits
# repeatable as far as Spark allows instead of relying on the library default.
ALS_SEED = 42

# Implicit-feedback ALS: `stdCountPos` is passed as the rating column and is
# interpreted as interaction strength because implicitPrefs=True.
alg = ALS(
    userCol="userId",
    itemCol="artistId",
    ratingCol="stdCountPos",
    implicitPrefs=True,
    seed=ALS_SEED,
)

# Fit on the training split only; `train` is defined by the data-loading cell.
model = alg.fit(train)

In [6]:
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.sql.functions import isnan

# RMSE between the model's "prediction" column and the "stdCountPos" label.
# NOTE(review): the model is trained with implicitPrefs=True, so its predictions
# may not be on the same scale as stdCountPos -- confirm RMSE is the intended
# metric here (a ranking metric may be more informative).
evaluator = RegressionEvaluator(
    metricName="rmse",
    labelCol="stdCountPos",
    predictionCol="prediction",
)


def scoreAndEvaluate(fittedModel, dataset, metricEvaluator):
    """Score `dataset` with `fittedModel` and evaluate the non-NaN predictions.

    Args:
        fittedModel: fitted model exposing ``transform(dataset)``.
        dataset: DataFrame to score.
        metricEvaluator: evaluator exposing ``evaluate(predictions)``.

    Returns:
        Tuple ``(predictions, filteredPredictions, metric)`` where
        ``filteredPredictions`` excludes rows whose "prediction" is NaN and
        ``metric`` is the evaluator's result on those remaining rows.
    """
    predictions = fittedModel.transform(dataset)
    # Rows with a NaN prediction are removed before scoring so they cannot turn
    # the metric into NaN (presumably users/artists unseen during training --
    # TODO confirm). `col` is imported in the notebook's setup cell.
    filteredPredictions = predictions.where(~isnan(col("prediction")))
    return predictions, filteredPredictions, metricEvaluator.evaluate(filteredPredictions)


trainPredictions, trainPredictionsFiltered, trainRmse = scoreAndEvaluate(model, train, evaluator)
testPredictions, testPredictionsFiltered, testRmse = scoreAndEvaluate(model, test, evaluator)

print("train RMSE = %f" % trainRmse)
print("test RMSE = %f" % testRmse)

train RMSE = 1.297908
test RMSE = 1.310257
