In [1]:
%load_ext autoreload
%env SPARK_HOME=/usr/hdp/current/spark2-client

import findspark
findspark.init()

print('findspark initialized ...')

import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.functions import expr, col, column, max, min

spark = SparkSession.builder.appName('mlonspark')\
    .config('spark.executor.instances', '7')\
    .config('spark.jars', '/opt/dev/target/ml-on-spark-1.0.jar')\
    .getOrCreate()

print('pyspark ready ...')

env: SPARK_HOME=/usr/hdp/current/spark2-client
findspark initialized ...
pyspark ready ...


In [2]:
train = spark.read.load("/data/lastfm-dataset-360K/coo-data-train.parquet")
print(train.rdd.getNumPartitions())

train.printSchema()

7
root
 |-- userId: integer (nullable = true)
 |-- artistId: integer (nullable = true)
 |-- listenCount: float (nullable = true)



In [3]:
%autoreload
from mlonspark.scaler import Scaler

scaler = Scaler().setGroupCol("userId").setInputCol("listenCount").setOutputCol("scaled")
train = scaler.fit(train).transform(train)

train.show(10)


+------+--------+-----------+---------+
|userId|artistId|listenCount|   scaled|
+------+--------+-----------+---------+
|   148|   90773|       52.0|1.1935484|
|   148|  219383|       29.0|1.0875576|
|   148|  289745|       17.0| 1.032258|
|   148|  151567|       14.0|1.0184332|
|   148|  215340|       14.0|1.0184332|
|   148|  152525|       26.0|1.0737327|
|   148|   17395|       15.0|1.0230415|
|   148|  108389|       13.0|1.0138249|
|   148|  108464|       11.0|1.0046083|
|   148|  155233|       17.0| 1.032258|
+------+--------+-----------+---------+
only showing top 10 rows



In [4]:
from pyspark.sql.functions import isnan

#nan = train.filter(isnan("listenCount"))
#nan.count()
train.agg(min("scaled"), max("scaled")).show()

+-----------+-----------+
|min(scaled)|max(scaled)|
+-----------+-----------+
|        1.0|        2.0|
+-----------+-----------+



In [5]:
%autoreload
from mlonspark.alternating_least_square import AlternatingLeastSquare
from pyspark.ml.recommendation import ALS

als = AlternatingLeastSquare()\
    .setUserCol("userId")\
    .setItemCol("artistId")\
    .setRatingCol("scaled")\
    .setNumUserBlocks(7)\
    .setNumItemBlocks(7)\
    .setMaxIter(10)\

In [6]:
model = als.fit(train)
print(model)

AlternatingLeastSquare_4d98b163838babc1267a


In [7]:
test = spark.read.load("/data/lastfm-dataset-360K/coo-data-test.parquet")
print(test.rdd.getNumPartitions())

7


In [8]:
predictions = model.transform(test)
predictions.printSchema()
predictions.show(10)

root
 |-- userId: integer (nullable = true)
 |-- artistId: integer (nullable = true)
 |-- listenCount: float (nullable = true)
 |-- prediction: float (nullable = false)

+------+--------+-----------+------------+
|userId|artistId|listenCount|  prediction|
+------+--------+-----------+------------+
| 66166|     148|      325.0|-8.043704E-6|
| 84435|     148|       99.0| 2.265907E-6|
| 74849|     148|      146.0|5.1805373E-6|
|117955|     148|       16.0|-9.346508E-6|
|328713|     148|       82.0|-4.837967E-6|
|310068|     148|      127.0|1.4713196E-6|
|133763|     148|       35.0|1.0321969E-5|
|179718|     148|      218.0|-9.583543E-6|
|292575|     148|       23.0|4.6273835E-5|
| 60056|     148|       71.0|-1.172958E-6|
+------+--------+-----------+------------+
only showing top 10 rows



In [9]:
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.sql.functions import isnan

evaluator = RegressionEvaluator()\
    .setMetricName("rmse")\
    .setLabelCol("listenCount")\
    .setPredictionCol("prediction")

predictionsFiltered = predictions.where(~isnan(col("prediction")))                                
rmse = evaluator.evaluate(predictionsFiltered)

print("RMSE = %f" % rmse)

RMSE = 662.300926
