# Movie Ratings Matrix Factorization (Collaborative Filtering)

## Imports

In [1]:
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.functions import *

In [2]:
# Notebook-level switch: when True, the fitted model is saved to disk by the
# `if write_files:` cell further down; when False nothing is written.
write_files = False

## Spark Session

In [3]:
# Change the number of cores in this code block
# by setting `spark.master` to `local[n]` where
# n is the number of cores

import matplotlib.pyplot as plt

# Build the Spark configuration first, then create (or reuse) the session.
conf = pyspark.SparkConf()
conf.setAll([
    ('spark.master', 'local[4]'),
    ('spark.app.name', 'MatrixFactorization'),
    ('spark.memory.offHeap.enabled', True),
    ('spark.memory.offHeap.size', '4g'),
    ('spark.executor.memory', '4g'),
    ('spark.driver.memory', '6g'),
])
spark = SparkSession.builder.config(conf=conf).getOrCreate()

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
2022-05-23 04:49:26,046 WARN util.NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
2022-05-23 04:49:27,644 WARN util.Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


In [4]:
# Turn Spark's console logging off for the rest of the session.
spark.sparkContext.setLogLevel("off")

In [5]:
# Record the Spark version this notebook was run against.
spark.version

'3.2.1'

## Load final ratings files

In [6]:
# ratings_df = spark.read.csv("file:///home/work/data/ratings_100_max.csv", inferSchema=True, header=True) \
#                         .select('userId','movieId','rating','genres')
# ratings_df.printSchema()

In [7]:
#ratings_df.write.csv("file:///home/work/data/ratings_100_max_wo_ohe.csv", header=True)

In [8]:
# Load the ratings CSV; the first line is a header and column types are inferred.
ratings_df = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .csv("file:///home/work/data/ratings_100_max_wo_ohe.csv")
)

                                                                                

In [9]:
# Even 50/50 split with a fixed seed.
# Alternative kept for reference: ratings_df.randomSplit([0.8, 0.2], seed=0)
train, test = ratings_df.randomSplit(weights=[0.5, 0.5], seed=0)

In [10]:
# Scratch checks kept for reference:
# train.rdd.getNumPartitions()   # number of partitions
# ratings_df.unpersist()
# NOTE(review): no cache()/persist() call on `test` is visible in this notebook,
# so this unpersist is presumably a no-op — confirm or remove.
test.unpersist()

DataFrame[userId: int, movieId: int, rating: double, genres: string]

## Building ALS model

### Alternating Least Squares (ALS) matrix factorization

In [11]:
#Alternating Least Squares (ALS) matrix factorization
from pyspark.ml.recommendation import ALS
# Explicit-feedback ALS recommender over (userId, movieId, rating) columns.
als = ALS(
    userCol='userId',
    itemCol='movieId',
    ratingCol='rating',
    implicitPrefs=False,       # ratings are explicit scores, not implicit signals
    nonnegative=True,          # non-negative factors, since ratings are > 0
    coldStartStrategy='drop',  # drop NaN predictions so evaluation metrics are not NaN
)

### Hyperparameter Tuning

In [12]:
from pyspark.ml.tuning import ParamGridBuilder
# 4 ranks x 4 regularisation strengths = 16 candidate settings.
param_grid = (
    ParamGridBuilder()
    .addGrid(als.rank, [50, 75, 100, 125])
    .addGrid(als.regParam, [.1, .2, .3, .4])
    .build()
)

In [13]:
from pyspark.ml.evaluation import RegressionEvaluator
# RMSE between the true `rating` column and the model's `prediction` column.
evaluator = RegressionEvaluator(
    predictionCol='prediction',
    labelCol='rating',
    metricName='rmse',
)

In [14]:
# from pyspark.ml.tuning import CrossValidator
# cv = CrossValidator(estimator=als,
#                     estimatorParamMaps=param_grid,
#                     evaluator=evaluator,
#                     numFolds=10)
# cv.fit(test)
from pyspark.ml.tuning import TrainValidationSplit
# Single train/validation split over the parameter grid, scored with `evaluator`;
# candidates are fitted one at a time (parallelism=1) with a fixed seed.
tvs = TrainValidationSplit(
    estimator=als,
    estimatorParamMaps=param_grid,
    evaluator=evaluator,
    parallelism=1,
    seed=0,
)


In [15]:
%%time
#tvs_model = tvs.fit(train)

CPU times: user 4 µs, sys: 2 µs, total: 6 µs
Wall time: 40.8 µs


In [16]:
# Location of the persisted TrainValidationSplit model: written when
# write_files is True, and read back by the load cell below.
model_path = "file:///home/work/data/als_model_v2.0"

In [17]:
# Persist the fitted TrainValidationSplit model only when file writing is enabled.
# NOTE(review): this needs `tvs_model` from the fit cell above to exist in the
# kernel; only enable write_files together with an executed fit.
if write_files:
    tvs_model.save(model_path)

In [18]:
from pyspark.ml.tuning import TrainValidationSplitModel
# Restore the previously tuned model from disk instead of re-fitting.
tvsModelRead = TrainValidationSplitModel.load(model_path)

                                                                                

In [19]:
# Validation metric for each candidate parameter map of the loaded model
# (16 values; presumably RMSE, in the same order as estimatorParamMaps — confirm).
tvsModelRead.validationMetrics

[0.9479053994041635,
 0.9221359702902987,
 0.9560366413532954,
 1.0015068616712193,
 0.9465487440341056,
 0.9223445542522011,
 0.9560000850943212,
 1.0013656163827065,
 0.9457746297001982,
 0.9225150075076441,
 0.9561453037570703,
 1.0015428256898005,
 0.9447042865116653,
 0.9224164127975071,
 0.956147839090789,
 1.001402419828829]

In [20]:
# Dump the loaded model's parameters, including the parameter maps that were searched.
tvsModelRead.explainParams()

"estimator: estimator to be cross-validated (current: ALS_0a4f06b475f9)\nestimatorParamMaps: estimator param maps (current: [{Param(parent='ALS_0a4f06b475f9', name='regParam', doc='regularization parameter (>= 0).'): 0.1, Param(parent='ALS_0a4f06b475f9', name='rank', doc='rank of the factorization'): 50}, {Param(parent='ALS_0a4f06b475f9', name='regParam', doc='regularization parameter (>= 0).'): 0.2, Param(parent='ALS_0a4f06b475f9', name='rank', doc='rank of the factorization'): 50}, {Param(parent='ALS_0a4f06b475f9', name='regParam', doc='regularization parameter (>= 0).'): 0.3, Param(parent='ALS_0a4f06b475f9', name='rank', doc='rank of the factorization'): 50}, {Param(parent='ALS_0a4f06b475f9', name='regParam', doc='regularization parameter (>= 0).'): 0.4, Param(parent='ALS_0a4f06b475f9', name='rank', doc='rank of the factorization'): 50}, {Param(parent='ALS_0a4f06b475f9', name='regParam', doc='regularization parameter (>= 0).'): 0.1, Param(parent='ALS_0a4f06b475f9', name='rank', doc=

In [21]:
# The model selected by the train/validation search; used for all scoring below.
best_model = tvsModelRead.bestModel

In [22]:
# Score the selected model on the training split.
train_rmse = evaluator.evaluate(best_model.transform(train))
print("Best Model Train RMSE = ", train_rmse)



Best Model Train RMSE =  0.8527373346517281


                                                                                

In [23]:
%%time
print("Best Model Test RMSE = ",evaluator.evaluate(best_model.transform(test)))



Best Model Test RMSE =  0.8524302570365064
CPU times: user 31 ms, sys: 13 ms, total: 44 ms
Wall time: 19.2 s


                                                                                

In [24]:
# Top-5 movie recommendations for every user, as an array column per user.
recommendations = best_model.recommendForAllUsers(numItems=5)
recommendations.show(n=10, truncate=False)



+------+------------------------------------------------------------------------------------------------------+
|userId|recommendations                                                                                       |
+------+------------------------------------------------------------------------------------------------------+
|1     |[{176403, 5.585524}, {7898, 5.184587}, {8332, 5.082441}, {68831, 4.957301}, {172455, 4.9383826}]      |
|6     |[{176403, 5.84705}, {7898, 5.7286277}, {192391, 5.521527}, {172455, 5.4141035}, {72557, 5.2496595}]   |
|16    |[{176403, 5.9276967}, {7898, 5.6642647}, {49265, 5.2970176}, {172455, 5.2871194}, {207311, 5.2752876}]|
|22    |[{7898, 6.203899}, {176403, 5.888308}, {172455, 5.884922}, {26167, 5.7084074}, {118760, 5.670126}]    |
|26    |[{7898, 5.0351887}, {176403, 4.911462}, {172455, 4.746614}, {99841, 4.554283}, {2257, 4.517469}]      |
|27    |[{176403, 5.1657653}, {118760, 5.085511}, {7898, 5.0501847}, {8761, 4.992553}, {134579, 4.900047

                                                                                

In [25]:
# Flatten the per-user recommendation array into one row per (user, movie),
# exposing the struct fields as rec_movie_id / rec_rating.
exploded_recs = recommendations.select(
    'userId',
    explode('recommendations').alias('recommendation'),
)
top_5_recommendations = exploded_recs.select(
    'userId',
    col('recommendation.movieId').alias('rec_movie_id'),
    col('recommendation.rating').alias('rec_rating'),
)


In [26]:
# Summary statistics of the predicted ratings across all top-5 recommendations.
# NOTE(review): the recorded max exceeds 5 — predictions are not clipped to the
# rating scale (presumably 0-5; confirm) before being ranked.
top_5_recommendations.select('rec_rating').describe().toPandas()

                                                                                

Unnamed: 0,summary,rec_rating
0,count,497195.0
1,mean,5.058202126536634
2,stddev,0.6114788883792666
3,min,0.6190477
4,max,7.513843


Fetching each user's top recommended rating (sorted ascending, so the users with the lowest top ratings are shown first)

In [27]:
from pyspark.sql.window import Window
from pyspark.sql.functions import col, row_number
# Keep only each user's single highest-scoring recommendation (rank 1 inside
# the per-user window), then list users in ascending order of that score.
windowDept = Window.partitionBy("userId").orderBy(col("rec_rating").desc())
(
    top_5_recommendations
    .withColumn("row", row_number().over(windowDept))
    .where(col("row") == 1)
    .drop("row")
    .orderBy("rec_rating")
    .show()
)

[Stage 38:>                                                         (0 + 4) / 4]

+------+------------+----------+
|userId|rec_movie_id|rec_rating|
+------+------------+----------+
| 86408|      176403|0.70916986|
| 13677|        7898| 0.7104447|
|122697|      176403| 0.7112727|
| 61102|        7898| 0.7116633|
| 55553|      176403|0.71313316|
|124552|      176403|0.72123104|
| 95542|      176403| 0.7220734|
|  5821|      176403|0.72314626|
| 63044|        7898| 0.7246245|
|113767|      176403| 0.7258566|
| 94280|        7898|0.72810245|
| 60674|      176403|0.73048496|
| 38998|      176403|0.74399805|
|121218|      176403| 0.7501817|
| 10364|      176403|0.75109565|
|  9152|      176403| 0.7574091|
| 37091|      176403| 0.7816114|
| 15547|        7898|0.92563355|
| 66805|      176403|   1.02988|
|138720|      176403| 1.1151667|
+------+------------+----------+
only showing top 20 rows



                                                                                

Fetching the maximum recommended rating

In [28]:
# Highest predicted rating over all recommendations (dict-form aggregation).
max_rec_rating = top_5_recommendations.agg({'rec_rating': 'max'}).collect()[0][0]
max_rec_rating

                                                                                

7.513843059539795

Finding the user with max recommended rating

In [29]:
# Show the recommendation row(s) whose predicted rating equals the maximum.
top_5_recommendations.where(col('rec_rating') == max_rec_rating).toPandas()

                                                                                

Unnamed: 0,userId,rec_movie_id,rec_rating
0,66365,176403,7.513843


In [30]:
# Take the userId from the first row that carries the maximum predicted rating.
rows_at_max = top_5_recommendations.filter(col('rec_rating') == max_rec_rating).collect()
user_id_with_max_rec_rating = rows_at_max[0]['userId']
print("User ID with max recommendation rating: ", user_id_with_max_rec_rating)



User ID with max recommendation rating:  66365


                                                                                

Referring to the movies file to get the titles

In [31]:
# Movie metadata (used for the title lookup); header row present, types inferred.
movies_df = spark.read.csv(
    "file:///home/work/data/cleaned_movies.csv",
    header=True,
    inferSchema=True,
)

Getting top 5 recommended movies for User ID with max recommendation rating

In [32]:
# Recommendations for the user with the highest predicted rating.
# Filter to that user first and de-duplicate the movieId -> genres lookup
# *before* joining: ratings_df holds one row per rating, so joining it as-is
# repeats every recommendation once per rating the movie received, only for a
# later .distinct() to collapse the copies again. Each (user, movie) pair
# appears once in top_5_recommendations, so no de-duplication is needed after
# the join. Output columns and rows are the same as before.
user_recs = top_5_recommendations.filter(col('userId') == user_id_with_max_rec_rating)
movie_genres = ratings_df.select('movieId', 'genres').distinct()

user_recs.join(movie_genres, user_recs.rec_movie_id == movie_genres.movieId) \
         .join(movies_df.select('movieId', 'title'), on='movieId') \
         .sort('rec_rating', ascending=False) \
         .show(10, truncate=False)

                                                                                

+-------+------+------------+----------+--------------------+------------------------+
|movieId|userId|rec_movie_id|rec_rating|genres              |title                   |
+-------+------+------------+----------+--------------------+------------------------+
|176403 |66365 |176403      |7.513843  |Drama               |The Divine Order (2017) |
|7898   |66365 |7898        |7.4206233 |Comedy|Drama|Western|Junior Bonner (1972)    |
|172455 |66365 |172455      |7.154274  |Comedy              |Beatriz at Dinner (2017)|
|118760 |66365 |118760      |6.964354  |Drama               |The Good Lie (2014)     |
|8013   |66365 |8013        |6.882248  |Comedy|Romance      |I'm with Lucy (2002)    |
+-------+------+------------+----------+--------------------+------------------------+



Getting the movies rated most highly (in the test split) by the User ID with max recommendation rating

In [34]:
# Actual user preferences: this user's highest ratings in the test split,
# with movie titles attached.
(
    test
    .select('userId', 'movieId', 'genres', 'rating')
    .distinct()
    .join(movies_df.select('movieId', 'title'), on='movieId')
    .where(col('userId') == user_id_with_max_rec_rating)
    .orderBy(col('rating').desc())
    .show(10, truncate=False)
)



+-------+------+------------------------------------+------+-------------------------------------+
|movieId|userId|genres                              |rating|title                                |
+-------+------+------------------------------------+------+-------------------------------------+
|196    |66365 |Horror|Sci-Fi                       |5.0   |Species (1995)                       |
|552    |66365 |Action|Adventure|Comedy|Romance     |5.0   |Three Musketeers, The (1993)         |
|1909   |66365 |Action|Crime|Mystery|Sci-Fi|Thriller|5.0   |X-Files: Fight the Future, The (1998)|
|158    |66365 |Adventure|Children                  |5.0   |Casper (1995)                        |
|5      |66365 |Comedy                              |5.0   |Father of the Bride Part II (1995)   |
|2763   |66365 |Action|Mystery                      |5.0   |Thomas Crown Affair, The (1999)      |
|3175   |66365 |Adventure|Comedy|Sci-Fi             |5.0   |Galaxy Quest (1999)                  |
|3176   |6

                                                                                

In [35]:
# Shut down the Spark session; no Spark cell can run after this without recreating it.
spark.stop()