In [4]:
%load_ext autoreload
%env SPARK_HOME=/usr/hdp/current/spark2-client

import findspark
findspark.init()

print('findspark initialized ...')

import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.functions import expr, col, column, max, min

spark = SparkSession.builder.appName('mlonspark')\
    .config('spark.executor.instances', '6')\
    .config('spark.jars', '/opt/dev/target/ml-on-spark-1.0.jar')\
    .getOrCreate()

#   
    
print('pyspark ready ...')

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
env: SPARK_HOME=/usr/hdp/current/spark2-client
findspark initialized ...
pyspark ready ...


In [5]:
train = spark.read.format("parquet").load("/data/books/ratings-train.parquet")
train.printSchema()

root
 |-- User-ID: string (nullable = true)
 |-- ISBN: string (nullable = true)
 |-- Book-Rating: string (nullable = true)
 |-- rating: float (nullable = true)
 |-- bookId: integer (nullable = true)
 |-- Location: string (nullable = true)
 |-- Age: string (nullable = true)
 |-- userId: integer (nullable = true)



In [6]:
%autoreload
from mlonspark.alternating_least_square import AlternatingLeastSquare
from pyspark.ml.recommendation import ALS
#from mlonspark.alternating_least_square import XALS

als = AlternatingLeastSquare()\
    .setUserCol("userId")\
    .setItemCol("bookId")\
    .setRatingCol("rating")\
    .setMaxIter(10)

#\
#    .setImplicitPrefs(True)

print(als)

AlternatingLeastSquare_4fc384839c675b4b1dd9


In [7]:
model = als.fit(train)
print(model)

AlternatingLeastSquare_4fc384839c675b4b1dd9


In [8]:
model.write().overwrite().save('/data/books/model.alsmodel')

In [9]:
%autoreload
#from mlonspark.alternating_least_square import AlternatingLeastSquareModel

#model = AlternatingLeastSquareModel.read().load('/data/books/model.alsmodel')

In [10]:
print(model)

AlternatingLeastSquare_4fc384839c675b4b1dd9


In [11]:
test = spark.read.format("parquet").load("/data/books/ratings-test.parquet")
test.printSchema()

root
 |-- User-ID: string (nullable = true)
 |-- ISBN: string (nullable = true)
 |-- Book-Rating: string (nullable = true)
 |-- rating: float (nullable = true)
 |-- bookId: integer (nullable = true)
 |-- Location: string (nullable = true)
 |-- Age: string (nullable = true)
 |-- userId: integer (nullable = true)



In [12]:
predictions = model.transform(test)
predictions.printSchema()
predictions.show(10)

root
 |-- User-ID: string (nullable = true)
 |-- ISBN: string (nullable = true)
 |-- Book-Rating: string (nullable = true)
 |-- rating: float (nullable = true)
 |-- bookId: integer (nullable = true)
 |-- Location: string (nullable = true)
 |-- Age: string (nullable = true)
 |-- userId: integer (nullable = true)
 |-- prediction: float (nullable = false)

+-------+----------+-----------+------+------+--------------------+----+------+-----------+
|User-ID|      ISBN|Book-Rating|rating|bookId|            Location| Age|userId| prediction|
+-------+----------+-----------+------+------+--------------------+----+------+-----------+
| 127429|0380778556|          0|   1.0|   148|kansas city, miss...|  23|127429| 0.09679905|
| 250764|0380778556|         10|   4.0|   148|   cove, oregon, usa|NULL|250764|0.021571795|
|  76352|0380778556|          0|   1.0|   148|olympia, washingt...|  58| 76352| 0.06522451|
| 264317|0380778556|          0|   1.0|   148|portela de sacav�...|  25|264317|0.102816366|


In [13]:
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.sql.functions import isnan

evaluator = RegressionEvaluator()\
    .setMetricName("rmse")\
    .setLabelCol("rating")\
    .setPredictionCol("prediction")

predictionsFiltered = predictions.where(~isnan(col("prediction")))
                                
print("number of predictions = %i" % predictionsFiltered.count())

rmse = evaluator.evaluate(predictionsFiltered)

print("RMSE = %f" % rmse)

number of predictions = 279866
RMSE = 1.615744


In [14]:
from pyspark.ml import Pipeline
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.ml.recommendation import ALS

#als2 = XALS()\
#    .setUserCol("userId")\
#    .setItemCol("bookId")\
#    .setRatingCol("rating")

pipeline = Pipeline().setStages([als])

params = ParamGridBuilder()\
    .addGrid(als.rank, range(6, 24, 2))\
    .build()
#    .addGrid(als.rank, [2, 5, 10])\
    

cv = CrossValidator()\
    .setEstimator(als)\
    .setEstimatorParamMaps(params)\
    .setEvaluator(evaluator)\
    .setNumFolds(5)

model = cv.fit(train)

In [15]:
model.bestModel
model.avgMetrics

[1.6239172917349451,
 1.6222531662774091,
 1.6215336166588599,
 1.6210818098192958,
 1.6206428002718851,
 1.6203356037407715,
 1.6200454984194144,
 1.61983906159896,
 1.6197185058577894]

In [16]:
bestModel = model.bestModel

bestModel.write().overwrite().save('/data/books/bestModel-rank.model')

In [17]:
predictionsBestModel = bestModel.transform(test)
bestModelRmse = evaluator.evaluate(predictionsBestModel)

print("RMSE = %f" % bestModelRmse)

RMSE = 1.612789


In [1]:
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.sql.functions import isnan

evaluator = RegressionEvaluator()\
    .setMetricName("rmse")\
    .setLabelCol("listenCount")\
    .setPredictionCol("prediction")

predictionsFiltered = predictions.where(~isnan(col("prediction")))                                
rmse = evaluator.evaluate(predictionsFiltered)

print("RMSE = %f" % rmse)

ModuleNotFoundError: No module named 'pyspark'