# Movie Ratings Matrix Factorization (Collaborative Filtering)

## Imports

In [1]:
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.functions import *

## Spark Session

In [2]:
# Change the number of cores in this code block
# by setting `spark.master` to `local[n]` where
# n is the number of cores

import matplotlib.pyplot as plt

conf = pyspark.SparkConf().setAll([('spark.master', 'local[4]'),
                                   ('spark.app.name', 'MatrixFactorization'),
                                   ('spark.memory.offHeap.enabled', True),
                                   ('spark.memory.offHeap.size','4g'),
                                   ('spark.executor.memory', '4g'), 
                                   ('spark.driver.memory','6g')])
spark = SparkSession.builder.config(conf=conf).getOrCreate()

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
2022-05-23 01:58:14,592 WARN util.NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [3]:
spark.sparkContext.setLogLevel("off")

## Load final ratings files

In [4]:
ratings_df = spark.read.csv("file:///home/work/data/ratings_100_max_wo_ohe.csv", inferSchema=True, header=True).repartition(100)
ratings_df.printSchema()



root
 |-- userId: integer (nullable = true)
 |-- movieId: integer (nullable = true)
 |-- rating: double (nullable = true)
 |-- genres: string (nullable = true)



                                                                                

In [5]:
ratings_df.count()

                                                                                

4678708

In [7]:
# train, test = ratings_df.randomSplit([0.8, 0.2], seed=0)
train, test = ratings_df.randomSplit([0.5, 0.5], seed=0)
print(f"train count: {train.count()} test count: {test.count()}")



train count: 2339367 test count: 2339341


                                                                                

In [8]:
#checking number of partitions 
# train.rdd.getNumPartitions()
ratings_df.unpersist()
test.unpersist()

DataFrame[userId: int, movieId: int, rating: double, genres: string]

## Building ALS model

### Alternating Least Squares (ALS) matrix factorization

In [12]:
#Alternating Least Squares (ALS) matrix factorization
from pyspark.ml.recommendation import ALS
als = ALS(userCol='userId',
          itemCol='movieId',
          ratingCol='rating',
          nonnegative=True, #setting this to true since we are using ratings > 0.
          implicitPrefs=False, #setting this to false as we are using explicit ratings.
          coldStartStrategy='drop', # to make sure we don't get NaN evaluation metrics
          maxIter=10
)

### Hyperparameter Tuning

In [13]:
from pyspark.ml.tuning import ParamGridBuilder
param_grid = ParamGridBuilder() \
                .addGrid(als.rank,[50, 75, 100, 125]) \
                .addGrid(als.regParam,[.1, .2, .3, .4]) \
                .build()

In [14]:
from pyspark.ml.evaluation import RegressionEvaluator
evaluator = RegressionEvaluator(metricName='rmse',
                                labelCol='rating',
                                predictionCol='prediction')                                

In [15]:
# from pyspark.ml.tuning import CrossValidator
# cv = CrossValidator(estimator=als,
#                     estimatorParamMaps=param_grid,
#                     evaluator=evaluator,
#                     numFolds=10)
# cv.fit(test)
from pyspark.ml.tuning import TrainValidationSplit
tvs = TrainValidationSplit(estimator=als, estimatorParamMaps=param_grid, evaluator=evaluator, parallelism=1, seed=0)


In [16]:
%%time
tvs_model = tvs.fit(train)



CPU times: user 1.3 s, sys: 392 ms, total: 1.69 s
Wall time: 34min 22s


                                                                                

In [19]:
best_model = tvs_model.bestModel

In [20]:
%%time
print("Test RMSE = ",evaluator.evaluate(best_model.transform(test)))



Test RMSE =  0.9091514461489086
CPU times: user 15.3 ms, sys: 1.87 ms, total: 17.2 ms
Wall time: 15.1 s


                                                                                

In [21]:
print("Train RMSE = ",evaluator.evaluate(best_model.transform(train)))



Train RMSE =  0.7920835132705324


                                                                                

In [22]:
model_path = "file:///home/work/data/als_model_v2.0"

In [23]:
write_files = True
if write_files:
    tvs_model.save(model_path)

                                                                                

In [24]:
from pyspark.ml.tuning import TrainValidationSplitModel
tvsModelRead = TrainValidationSplitModel.read().load(model_path)

In [25]:
tvsModelRead.validationMetrics

[0.9479053994041635,
 0.9221359702902987,
 0.9560366413532954,
 1.0015068616712193,
 0.9465487440341056,
 0.9223445542522011,
 0.9560000850943212,
 1.0013656163827065,
 0.9457746297001982,
 0.9225150075076441,
 0.9561453037570703,
 1.0015428256898005,
 0.9447042865116653,
 0.9224164127975071,
 0.956147839090789,
 1.001402419828829]

In [26]:
tvsModelRead.explainParams()

"estimator: estimator to be cross-validated (current: ALS_0a4f06b475f9)\nestimatorParamMaps: estimator param maps (current: [{Param(parent='ALS_0a4f06b475f9', name='regParam', doc='regularization parameter (>= 0).'): 0.1, Param(parent='ALS_0a4f06b475f9', name='rank', doc='rank of the factorization'): 50}, {Param(parent='ALS_0a4f06b475f9', name='regParam', doc='regularization parameter (>= 0).'): 0.2, Param(parent='ALS_0a4f06b475f9', name='rank', doc='rank of the factorization'): 50}, {Param(parent='ALS_0a4f06b475f9', name='regParam', doc='regularization parameter (>= 0).'): 0.3, Param(parent='ALS_0a4f06b475f9', name='rank', doc='rank of the factorization'): 50}, {Param(parent='ALS_0a4f06b475f9', name='regParam', doc='regularization parameter (>= 0).'): 0.4, Param(parent='ALS_0a4f06b475f9', name='rank', doc='rank of the factorization'): 50}, {Param(parent='ALS_0a4f06b475f9', name='regParam', doc='regularization parameter (>= 0).'): 0.1, Param(parent='ALS_0a4f06b475f9', name='rank', doc=

In [27]:
best_model = tvsModelRead.bestModel

In [28]:
print("Best Model Train RMSE = ",evaluator.evaluate(best_model.transform(train)))



Best Model Train RMSE =  0.7920835132705324


                                                                                

In [29]:
%%time
print("Best Model Test RMSE = ",evaluator.evaluate(best_model.transform(test)))

[Stage 2310:>                                                       (0 + 4) / 4]

Best Model Test RMSE =  0.9091514461489084
CPU times: user 12.1 ms, sys: 1.19 ms, total: 13.3 ms
Wall time: 12 s


                                                                                

In [30]:
recommendations = best_model.recommendForAllUsers(5)
recommendations.show()



+------+--------------------+
|userId|     recommendations|
+------+--------------------+
|     1|[{176403, 5.58552...|
|     6|[{176403, 5.84705...|
|    16|[{176403, 5.92769...|
|    22|[{7898, 6.203899}...|
|    26|[{7898, 5.0351887...|
|    27|[{176403, 5.16576...|
|    28|[{176403, 6.50009...|
|    34|[{176403, 5.19818...|
|    44|[{93022, 5.894025...|
|    47|[{176403, 5.22548...|
|    52|[{176403, 5.69629...|
|    53|[{176403, 6.53043...|
|    65|[{171207, 5.49963...|
|    78|[{176403, 6.19332...|
|    81|[{176403, 4.12929...|
|    85|[{7898, 4.817173}...|
|   103|[{176403, 5.90586...|
|   108|[{7898, 5.028264}...|
|   111|[{7898, 5.8993635...|
|   115|[{176403, 6.26260...|
+------+--------------------+
only showing top 20 rows



                                                                                

In [31]:
top_5_recommendations = recommendations.withColumn('recommendation', explode('recommendations')) \
                .select('userId',col('recommendation.movieId'),col('recommendation.rating'))


In [32]:
#Recommendations
top_5_recommendations.join(test.select('movieId','genres'), on='movieId') \
                    .filter('userId = 1') \
                    .sort('rating', ascending = False).show(10)

                                                                                

+-------+------+---------+------+
|movieId|userId|   rating|genres|
+-------+------+---------+------+
| 176403|     1| 5.585524| Drama|
|   8332|     1| 5.082441| Drama|
|   8332|     1| 5.082441| Drama|
| 172455|     1|4.9383826|Comedy|
| 172455|     1|4.9383826|Comedy|
+-------+------+---------+------+



In [33]:
#Actual User Prefernces
test.select('userId','movieId','genres') \
    .filter('userId = 1') \
    .sort('rating', ascending = False).show(10)



+------+-------+--------------------+
|userId|movieId|              genres|
+------+-------+--------------------+
|     1|   6711|Comedy|Drama|Romance|
|     1|   3569|        Comedy|Drama|
|     1|   5767|        Comedy|Crime|
|     1|  32591|Comedy|Drama|Romance|
|     1|    307|               Drama|
|     1|   4325|               Drama|
|     1|   7361|Drama|Romance|Sci-Fi|
|     1|   2632|Adventure|Drama|M...|
|     1|   2351|               Drama|
|     1|   7940|               Drama|
+------+-------+--------------------+
only showing top 10 rows



                                                                                

In [None]:
spark.stop()

                                                                                