In [1]:
sc

In [2]:
from pyspark.ml.recommendation import ALS
from pyspark.sql import Row
ratings = spark.read.text("file:///home/ubuntu/Spark-The-Definitive-Guide/data/sample_movielens_ratings.txt")\
  .rdd.toDF()\
  .selectExpr("split(value , '::') as col")\
  .selectExpr(
    "cast(col[0] as int) as userId",
    "cast(col[1] as int) as movieId",
    "cast(col[2] as float) as rating",
    "cast(col[3] as long) as timestamp")
training, test = ratings.randomSplit([0.8, 0.2])
als = ALS()\
  .setMaxIter(5)\
  .setRegParam(0.01)\
  .setUserCol("userId")\
  .setItemCol("movieId")\
  .setRatingCol("rating")
print(als.explainParams())
alsModel = als.fit(training)
predictions = alsModel.transform(test)

alpha: alpha for implicit preference (default: 1.0)
checkpointInterval: set checkpoint interval (>= 1) or disable checkpoint (-1). E.g. 10 means that the cache will get checkpointed every 10 iterations. Note: this setting will be ignored if the checkpoint directory is not set in the SparkContext. (default: 10)
coldStartStrategy: strategy for dealing with unknown or new users/items at prediction time. This may be useful in cross-validation or production scenarios, for handling user/item ids the model has not seen in the training data. Supported values: 'nan', 'drop'. (default: nan)
finalStorageLevel: StorageLevel for ALS model factors. (default: MEMORY_AND_DISK)
implicitPrefs: whether to use implicit preference (default: False)
intermediateStorageLevel: StorageLevel for intermediate datasets. Cannot be 'NONE'. (default: MEMORY_AND_DISK)
itemCol: column name for item ids. Ids must be within the integer value range. (default: item, current: movieId)
maxIter: max number of iterations (>= 0

In [3]:
alsModel.recommendForAllUsers(10)\
  .selectExpr("userId", "explode(recommendations)").show()
alsModel.recommendForAllItems(10)\
  .selectExpr("movieId", "explode(recommendations)").show()

+------+---------------+
|userId|            col|
+------+---------------+
|    28| [92, 5.123827]|
|    28|[46, 4.8833604]|
|    28| [12, 4.716059]|
|    28|[81, 4.4394255]|
|    28|[89, 4.1382756]|
|    28|[82, 4.0096335]|
|    28|[49, 3.8630824]|
|    28|  [2, 3.733239]|
|    28|[22, 3.6181092]|
|    28|[38, 3.3592396]|
|    26| [12, 7.031254]|
|    26|[34, 6.9927745]|
|    26|[87, 6.9834156]|
|    26|[19, 6.5419154]|
|    26| [32, 5.562295]|
|    26| [7, 5.2005177]|
|    26|[23, 5.0199385]|
|    26|  [88, 5.00247]|
|    26|[80, 4.9992037]|
|    26| [92, 4.986472]|
+------+---------------+
only showing top 20 rows

+-------+---------------+
|movieId|            col|
+-------+---------------+
|     31|[17, 3.9742641]|
|     31| [12, 3.776509]|
|     31| [8, 3.0678513]|
|     31| [9, 2.9888442]|
|     31|[14, 2.8119066]|
|     31| [23, 2.489113]|
|     31|[21, 2.4762306]|
|     31| [6, 2.2760584]|
|     31| [2, 1.9432878]|
|     31|[25, 1.7884777]|
|     85| [8, 4.5071225]|
|     85| 

In [4]:
from pyspark.ml.evaluation import RegressionEvaluator
evaluator = RegressionEvaluator()\
  .setMetricName("rmse")\
  .setLabelCol("rating")\
  .setPredictionCol("prediction")
rmse = evaluator.evaluate(predictions)
print(("Root-mean-square error = %f" % rmse))

Root-mean-square error = 1.863871


In [5]:
from pyspark.mllib.evaluation import RegressionMetrics
regComparison = predictions.select("rating", "prediction")\
  .rdd.map(lambda x: (x(0), x(1)))
metrics = RegressionMetrics(regComparison)

In [6]:
from pyspark.mllib.evaluation import RankingMetrics, RegressionMetrics
from pyspark.sql.functions import col, expr
perUserActual = predictions\
  .where("rating > 2.5")\
  .groupBy("userId")\
  .agg(expr("collect_set(movieId) as movies"))

In [7]:
perUserPredictions = predictions\
  .orderBy(col("userId"), expr("prediction DESC"))\
  .groupBy("userId")\
  .agg(expr("collect_list(movieId) as movies"))

In [8]:
perUserActualvPred = perUserActual.join(perUserPredictions, ["userId"]).rdd\
  .map(lambda row: (row[1], row[2][:15]))
ranks = RankingMetrics(perUserActualvPred)

In [9]:
ranks.meanAveragePrecision
ranks.precisionAt(5)

0.5076923076923078