In [2]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName('ALS').getOrCreate()

In [2]:
!head ../data/ratings.csv

1,31,2.5,1260759144
1,1029,3.0,1260759179
1,1061,3.0,1260759182
1,1129,2.0,1260759185
1,1172,4.0,1260759205
1,1263,2.0,1260759151
1,1287,2.0,1260759187
1,1293,2.0,1260759148
1,1339,3.5,1260759125
1,1343,2.0,1260759131


In [3]:
from pyspark.sql import Row

ratings_rdd = spark.sparkContext \
                   .textFile('../data/ratings.csv') \
                   .map(lambda line: line.split(',')) \
                   .map(lambda values: Row(user_id=int(values[0]), 
                                           movie_id=int(values[1]), 
                                           rating=float(values[2])))

ratings_df = spark.createDataFrame(ratings_rdd)
ratings_df.show(5)

+--------+------+-------+
|movie_id|rating|user_id|
+--------+------+-------+
|      31|   2.5|      1|
|    1029|   3.0|      1|
|    1061|   3.0|      1|
|    1129|   2.0|      1|
|    1172|   4.0|      1|
+--------+------+-------+
only showing top 5 rows



In [4]:
from pyspark.ml.recommendation import ALS

(training, test) = ratings_df.randomSplit([0.8, 0.2])

# TODO: train ALS model with rank=8, maxIter=10 
# and nonnegative=True

als = ALS(rank=8, 
          maxIter=10, 
          userCol='user_id',
          itemCol='movie_id')

model = als.fit(training)

In [6]:
model.userFactors.show(5, truncate=False)

+---+--------------------------------------------------------------------------------------------------+
|id |features                                                                                          |
+---+--------------------------------------------------------------------------------------------------+
|10 |[0.29715133, -0.67561156, 1.4933176, 0.09223974, 0.30582935, 0.53732026, 0.64945066, 0.50541675]  |
|20 |[0.01752448, -1.1346962, 0.5675352, -0.6137143, 0.044949386, 1.3254719, 0.85583174, -0.5330784]   |
|30 |[-0.18940143, -1.3753406, 0.7915036, -0.36732832, -0.30896997, 0.66650254, 0.16912146, 1.0029738] |
|40 |[0.51354086, -0.62774503, 1.2171535, -0.5378062, -0.50973356, 0.917853, -0.027028423, 0.7851973]  |
|50 |[-0.23169115, -0.88650054, 1.1549692, -0.45893127, 0.064756595, 0.62241155, 0.2911138, 0.38844243]|
+---+--------------------------------------------------------------------------------------------------+
only showing top 5 rows



In [7]:
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.sql.functions import isnan

# TODO: make predictions for test data

pred = model.transform(test)

evaluator = RegressionEvaluator(metricName='rmse',
                                labelCol='rating',
                                predictionCol='prediction')

# TODO: evaluate predictions with RegressionEvaluator
# NOTE: remove records with isnan('prediction') == True

In [8]:
evaluator.evaluate(pred.filter(~isnan('prediction')))

0.9121674395309523