In [1]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName('ALS').getOrCreate()

In [2]:
!head ../data/ratings.csv

1,31,2.5,1260759144
1,1029,3.0,1260759179
1,1061,3.0,1260759182
1,1129,2.0,1260759185
1,1172,4.0,1260759205
1,1263,2.0,1260759151
1,1287,2.0,1260759187
1,1293,2.0,1260759148
1,1339,3.5,1260759125
1,1343,2.0,1260759131


In [3]:
from pyspark.sql import Row

ratings_rdd = spark.sparkContext \
                   .textFile('../data/ratings.csv') \
                   .map(lambda line: line.split(',')) \
                   .map(lambda values: Row(user_id=int(values[0]), 
                                           movie_id=int(values[1]), 
                                           rating=float(values[2])))

ratings_df = spark.createDataFrame(ratings_rdd)
ratings_df.show()

+--------+------+-------+
|movie_id|rating|user_id|
+--------+------+-------+
|      31|   2.5|      1|
|    1029|   3.0|      1|
|    1061|   3.0|      1|
|    1129|   2.0|      1|
|    1172|   4.0|      1|
|    1263|   2.0|      1|
|    1287|   2.0|      1|
|    1293|   2.0|      1|
|    1339|   3.5|      1|
|    1343|   2.0|      1|
|    1371|   2.5|      1|
|    1405|   1.0|      1|
|    1953|   4.0|      1|
|    2105|   4.0|      1|
|    2150|   3.0|      1|
|    2193|   2.0|      1|
|    2294|   2.0|      1|
|    2455|   2.5|      1|
|    2968|   1.0|      1|
|    3671|   3.0|      1|
+--------+------+-------+
only showing top 20 rows



In [9]:
from pyspark.ml.recommendation import ALS

(training, test) = ratings_df.randomSplit([0.8, 0.2])

# TODO: train ALS model with rank=8, maxIter=10 
# and nonnegative=True

als = ALS(rank=8, 
          maxIter=10, 
          userCol='user_id',
          itemCol='movie_id')

model = als.fit(training)

In [10]:
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.sql.functions import isnan

# TODO: make predictions for test data

pred = model.transform(test)

evaluator = RegressionEvaluator(metricName='rmse',
                                labelCol='rating',
                                predictionCol='prediction')


# TODO: evaluate predictions with RegressionEvaluator
# NOTE: remove records with isnan('prediction') == True

In [14]:
evaluator.evaluate(pred.filter(~isnan('prediction')))

0.9168141598305382

In [15]:
pred_filtered = pred.filter(~isnan('prediction'))

In [18]:
pred_filtered.show()

+--------+------+-------+----------+
|movie_id|rating|user_id|prediction|
+--------+------+-------+----------+
|     463|   2.0|    452| 2.8102794|
|     471|   3.0|    588| 3.6835876|
|     471|   5.0|    126| 3.7061648|
|     471|   3.0|    440| 3.2982986|
|     471|   3.0|    306| 3.6193082|
|     471|   3.0|    452| 3.6841536|
|     471|   4.5|    299|  4.564473|
|     471|   4.0|    585| 4.0434856|
|     471|   5.0|    399| 3.6553977|
|     471|   4.0|    468| 3.3675385|
|     496|   2.0|    497| 2.3909516|
|     496|   3.0|    509|  2.233948|
|     833|   4.5|    296| 2.1368804|
|    1088|   3.0|    580| 2.6765797|
|    1088|   4.0|    372| 3.5060534|
|    1088|   4.0|    500|  3.418509|
|    1088|   2.0|    262| 1.5738536|
|    1088|   3.0|    358| 2.9140196|
|    1088|   4.0|    160| 4.2165084|
|    1088|   3.0|    213| 2.3162425|
+--------+------+-------+----------+
only showing top 20 rows



In [19]:
import numpy as np
from sklearn.metrics import classification_report
y_pred = np.array(pred_filtered.select(['prediction'])\
                  .collect())
y_target = np.array(pred_filtered.select(['rating'])\
                    .collect())

print(classification_report(np.round(y_target),
                            np.round(y_pred)))

             precision    recall  f1-score   support

        0.0       0.10      0.02      0.03       206
        1.0       0.20      0.09      0.13       601
        2.0       0.37      0.29      0.32      2466
        3.0       0.30      0.58      0.39      3938
        4.0       0.59      0.56      0.57      9169
        5.0       0.56      0.17      0.26      2959
        6.0       0.00      0.00      0.00         0

avg / total       0.48      0.45      0.44     19339



  'recall', 'true', average, warn_for)


In [11]:
pred.show()

+--------+------+-------+----------+
|movie_id|rating|user_id|prediction|
+--------+------+-------+----------+
|     148|   4.0|    575|       NaN|
|     463|   2.0|    452| 2.8102794|
|     471|   3.0|    588| 3.6835876|
|     471|   5.0|    126| 3.7061648|
|     471|   3.0|    440| 3.2982986|
|     471|   3.0|    306| 3.6193082|
|     471|   3.0|    452| 3.6841536|
|     471|   4.5|    299|  4.564473|
|     471|   4.0|    585| 4.0434856|
|     471|   5.0|    399| 3.6553977|
|     471|   4.0|    468| 3.3675385|
|     496|   2.0|    497| 2.3909516|
|     496|   3.0|    509|  2.233948|
|     833|   4.5|    296| 2.1368804|
|    1088|   3.0|    580| 2.6765797|
|    1088|   4.0|    372| 3.5060534|
|    1088|   4.0|    500|  3.418509|
|    1088|   2.0|    262| 1.5738536|
|    1088|   3.0|    358| 2.9140196|
|    1088|   4.0|    160| 4.2165084|
+--------+------+-------+----------+
only showing top 20 rows

