In [1]:
%load_ext autoreload
%env SPARK_HOME=/usr/hdp/current/spark2-client

import findspark
findspark.init()

print('findspark initialized ...')

import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.functions import expr, col, column, max, min

spark = SparkSession.builder.appName('mlonspark')\
    .config('spark.jars', '/opt/dev/target/ml-on-spark-1.0.jar')\
    .getOrCreate()

print('pyspark ready ...')

env: SPARK_HOME=/usr/hdp/current/spark2-client
findspark initialized ...
pyspark ready ...


In [2]:
# Read the training ratings from parquet and print the column layout.
train = spark.read.load("/data/books/ratings-train.parquet", format="parquet")
train.printSchema()

root
 |-- User-ID: string (nullable = true)
 |-- ISBN: string (nullable = true)
 |-- Book-Rating: string (nullable = true)
 |-- rating: float (nullable = true)
 |-- bookId: integer (nullable = true)
 |-- Location: string (nullable = true)
 |-- Age: string (nullable = true)
 |-- userId: integer (nullable = true)



In [31]:
%autoreload
from mlonspark.alternating_least_square import AlternatingLeastSquare

als = AlternatingLeastSquare()\
    .setUserCol("userId")\
    .setItemCol("bookId")\
    .setRatingCol("rating")

print(als)

AlternatingLeastSquare_43d8ae8f361ae5a9859a


In [27]:
# Fit the `als` estimator on the `train` DataFrame and print the resulting
# model object. `model` is rebound by later cells.
model = als.fit(train)
print(model)

AlternatingLeastSquare_49b98f48befb7be115c7


In [5]:
# Persist `model`, overwriting anything already stored at the target path.
(
    model.write()
    .overwrite()
    .save('/data/books/model.alsmodel')
)

In [6]:
%autoreload
from mlonspark.alternating_least_square import AlternatingLeastSquareModel

model = AlternatingLeastSquareModel.read().load('/data/books/model.alsmodel')

In [7]:
# Print the current `model` binding (the instance reloaded from
# '/data/books/model.alsmodel' when the cells are run top to bottom).
print(model)

AlternatingLeastSquare_4fddabe90be03d11c617


In [8]:
# Read the held-out ratings from parquet and print the column layout.
test = spark.read.load("/data/books/ratings-test.parquet", format="parquet")
test.printSchema()

root
 |-- User-ID: string (nullable = true)
 |-- ISBN: string (nullable = true)
 |-- Book-Rating: string (nullable = true)
 |-- rating: float (nullable = true)
 |-- bookId: integer (nullable = true)
 |-- Location: string (nullable = true)
 |-- Age: string (nullable = true)
 |-- userId: integer (nullable = true)



In [9]:
# Apply `model` to the test ratings; the printed schema shows the result is
# the input columns plus a float `prediction` column.
predictions = model.transform(test)
predictions.printSchema()
# Display at most 10 rows of the scored data.
predictions.show(10)

root
 |-- User-ID: string (nullable = true)
 |-- ISBN: string (nullable = true)
 |-- Book-Rating: string (nullable = true)
 |-- rating: float (nullable = true)
 |-- bookId: integer (nullable = true)
 |-- Location: string (nullable = true)
 |-- Age: string (nullable = true)
 |-- userId: integer (nullable = true)
 |-- prediction: float (nullable = false)

+-------+----------+-----------+------+------+--------------------+---+------+------------+
|User-ID|      ISBN|Book-Rating|rating|bookId|            Location|Age|userId|  prediction|
+-------+----------+-----------+------+------+--------------------+---+------+------------+
| 100004|0061015725|          0|   0.0|  2430|san ysidro, calif...|  0|100004| 0.041647617|
| 100004|0345339738|          0|   0.0|  4206|san ysidro, calif...|  0|100004|  0.16113673|
| 100004|0345380371|          0|   0.0|  1860|san ysidro, calif...|  0|100004| 0.029794015|
| 100004|0440414806|          0|   0.0|  5649|san ysidro, calif...|  0|100004|  0.06666696|


In [10]:
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.sql.functions import isnan

# RMSE between the `rating` label and the model's `prediction` column.
# `evaluator` is reused by the cross-validation cell further down.
evaluator = RegressionEvaluator(
    metricName="rmse",
    labelCol="rating",
    predictionCol="prediction",
)

# Rows whose prediction is NaN are excluded before scoring.
predictionsFiltered = predictions.where(
    ~isnan(col("prediction"))
)

print("number of predictions = %i" % predictionsFiltered.count())

rmse = evaluator.evaluate(predictionsFiltered)

print("RMSE = %f" % rmse)

number of predictions = 279246
RMSE = 4.674371


In [32]:
from pyspark.ml import Pipeline
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.ml.recommendation import ALS
from mlonspark.alternating_least_square import AlternatingLeastSquare

# Cross-validate the factorisation rank over 5 folds, scored with `evaluator`
# (the RMSE RegressionEvaluator defined in an earlier cell).
#
# KNOWN ISSUE: the recorded run of this cell with the custom estimator failed with
#   java.lang.NoSuchMethodException:
#       mlonspark.AlternatingLeastSquareModel.<init>(java.lang.String)
# raised from Params.defaultCopy inside AlternatingLeastSquareModel.copy.
# defaultCopy looks up a constructor that takes only the uid String, so the fix
# belongs in the Scala model class (add such a constructor or override copy());
# it cannot be worked around from this notebook.
#
# NOTE(review): the evaluation cell above drops NaN predictions before scoring,
# while CrossValidator hands the unfiltered fold predictions to `evaluator` -
# confirm the model cannot emit NaN here, otherwise the fold metrics become NaN.

# Spark's built-in ALS on the same columns, kept as a reference estimator that
# can be swapped in below.
als2 = ALS()\
    .setUserCol("userId")\
    .setItemCol("bookId")\
    .setRatingCol("rating")

# Single switch for the estimator under test: the grid and the CrossValidator
# must refer to the same instance, so both are derived from this one binding.
# Set to `als2` to run the built-in implementation instead.
cv_estimator = als

params = ParamGridBuilder()\
    .addGrid(cv_estimator.rank, [2, 5, 10])\
    .build()

# A fixed seed keeps the fold assignment, and therefore the reported metrics,
# reproducible across runs.
cv = CrossValidator()\
    .setEstimator(cv_estimator)\
    .setEstimatorParamMaps(params)\
    .setEvaluator(evaluator)\
    .setNumFolds(5)\
    .setSeed(42)

# NOTE: this rebinds `model` from the ALS model to the CrossValidator result;
# the cells below rely on that.
model = cv.fit(train)

Py4JJavaError: An error occurred while calling o4417.copy.
: java.lang.NoSuchMethodException: mlonspark.AlternatingLeastSquareModel.<init>(java.lang.String)
	at java.lang.Class.getConstructor0(Class.java:3082)
	at java.lang.Class.getConstructor(Class.java:1825)
	at org.apache.spark.ml.param.Params$class.defaultCopy(params.scala:846)
	at org.apache.spark.ml.PipelineStage.defaultCopy(Pipeline.scala:43)
	at mlonspark.AlternatingLeastSquareModel.copy(AlternatingLeastSquare.scala:170)
	at mlonspark.AlternatingLeastSquareModel.copy(AlternatingLeastSquare.scala:158)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:238)
	at java.lang.Thread.run(Thread.java:748)


In [29]:
# Inspect the cross-validation result held in `model`.
# Only the last expression of a cell is echoed, so the best model is printed
# explicitly; as a bare first-line expression its value was silently discarded.
print(model.bestModel)
# Average evaluator metric for each entry of the parameter grid.
model.avgMetrics

[4.655923867162846]

In [13]:
# Persist whatever `model` currently refers to (the CrossValidator result when
# the cells are run top to bottom), overwriting any previous save.
(
    model.write()
    .overwrite()
    .save('/data/books/crossvalidation.model')
)