# Movie Ratings Matrix Factorization (Collaborative Filtering)

## Imports

In [1]:
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.functions import *

In [2]:
# Notebook run configuration: plain Python flags, edited by hand before running.

# Location the tuned model is saved to and loaded from.
model_path = "file:///home/work/data/als_model_v3.0"
# When True, the hyperparameter search is (re-)fitted on the training data.
fit_model = False
# When True, the fitted model is saved to `model_path`.
write_files = False

## Spark Session

In [3]:
# To change the number of cores used, edit the `spark.master` entry below:
# `local[n]` runs Spark locally with n cores.

import matplotlib.pyplot as plt

# Session settings kept as (key, value) pairs so they are easy to scan and tune.
spark_settings = [
    ('spark.master', 'local[4]'),
    ('spark.app.name', 'MatrixFactorization'),
    ('spark.memory.offHeap.enabled', True),
    ('spark.memory.offHeap.size', '4g'),
    ('spark.executor.memory', '4g'),
    ('spark.driver.memory', '6g'),
]
conf = pyspark.SparkConf().setAll(spark_settings)

# Build the session from `conf` (or pick up one that already exists).
spark = SparkSession.builder.config(conf=conf).getOrCreate()

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
2022-05-24 00:43:22,099 WARN util.NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [4]:
# Turn Spark's logging off for the rest of the session to keep cell output clean.
spark_context = spark.sparkContext
spark_context.setLogLevel("off")

In [5]:
# Display the version of the running Spark session (recorded for reproducibility).
spark.version

'3.2.1'

## Load final ratings files

In [6]:
# ratings_df = spark.read.csv("file:///home/work/data/ratings_100_max.csv", inferSchema=True, header=True) \
#                         .select('userId','movieId','rating','genres')
# ratings_df.printSchema()

In [7]:
#ratings_df.write.csv("file:///home/work/data/ratings_100_max_wo_ohe.csv", header=True)

In [8]:
# Training split of the ratings; the first CSV line is the header and column
# types are inferred from the file contents.
train = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv("file:///home/work/data/ratings_train.csv")
)

                                                                                

## Building ALS model

### Alternating Least Squares (ALS) matrix factorization

In [9]:
# Alternating Least Squares (ALS) matrix factorization
from pyspark.ml.recommendation import ALS

als = ALS(userCol='userId',
          itemCol='movieId',
          ratingCol='rating',
          nonnegative=True,  # use non-negative constraints in the least-squares solves (all ratings are > 0)
          implicitPrefs=False,  # the data holds explicit ratings, not implicit feedback
          coldStartStrategy='drop',  # drop rows that would get NaN predictions so evaluation metrics stay finite
          seed=0,  # pin the random factor initialisation so a refit is reproducible across kernel restarts
)

### Hyperparameter Tuning

In [10]:
from pyspark.ml.tuning import ParamGridBuilder

# Candidate values searched for each ALS hyperparameter.
rank_candidates = [50, 75, 100, 125]
reg_param_candidates = [.1, .2, .3, .4]

# Full grid: every rank combined with every regParam (4 x 4 = 16 candidates).
param_grid = (
    ParamGridBuilder()
    .addGrid(als.rank, rank_candidates)
    .addGrid(als.regParam, reg_param_candidates)
    .build()
)

In [11]:
from pyspark.ml.evaluation import RegressionEvaluator

# RMSE between the observed `rating` column and the model's `prediction` column.
evaluator = RegressionEvaluator(
    predictionCol='prediction',
    labelCol='rating',
    metricName='rmse',
)

In [12]:
# from pyspark.ml.tuning import CrossValidator
# cv = CrossValidator(estimator=als,
#                     estimatorParamMaps=param_grid,
#                     evaluator=evaluator,
#                     numFolds=10)
# cv.fit(test)
from pyspark.ml.tuning import TrainValidationSplit

# Single train/validation split over the parameter grid, evaluated one
# candidate at a time (parallelism=1) with a fixed split seed.
tvs = TrainValidationSplit(
    estimator=als,
    estimatorParamMaps=param_grid,
    evaluator=evaluator,
    parallelism=1,
    seed=0,
)


In [13]:
%%time
# Run the hyperparameter search only when `fit_model` is enabled. When it is
# disabled this cell does nothing and `tvs_model` is left undefined.
if fit_model:
    tvs_model = tvs.fit(train)

CPU times: user 5 µs, sys: 1 µs, total: 6 µs
Wall time: 7.87 µs


In [14]:
# Persist the freshly fitted model to `model_path`.
# `tvs_model` only exists when the fitting cell actually ran, so fail with an
# actionable message instead of a bare NameError when saving is requested
# without a fitted model in the kernel.
if write_files:
    if 'tvs_model' not in globals():
        raise RuntimeError(
            "write_files is True but no fitted model is available; "
            "set fit_model = True and run the fitting cell before saving."
        )
    tvs_model.save(model_path)

In [15]:
from pyspark.ml.tuning import TrainValidationSplitModel

# Restore the saved tuning result from `model_path`.
tvsModelRead = TrainValidationSplitModel.load(model_path)
# train = spark.read.csv("file:///home/work/data/ratings_train.csv", inferSchema=True, header=True)

# Test split of the ratings, read the same way as the training split.
test = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv("file:///home/work/data/ratings_test.csv")
)

                                                                                

In [16]:
# Movie metadata (used later to attach titles and genres to movie ids).
movies_df = spark.read.csv(
    "file:///home/work/data/cleaned_movies.csv",
    header=True,
    inferSchema=True,
)

In [17]:
# Validation metric recorded for each candidate of the parameter grid.
# NOTE(review): presumably RMSE, assuming the model was saved with the
# evaluator defined in this notebook — confirm.
tvsModelRead.validationMetrics

[0.8905777707351172,
 0.9010232162538442,
 0.9454123025737626,
 0.9913658875456827,
 0.8896598583663039,
 0.9012318321097895,
 0.9456857052263995,
 0.9915404477757173,
 0.8898603884721867,
 0.9015462862606608,
 0.9458674191100008,
 0.991629804403516,
 0.8892534517948678,
 0.90146664920257,
 0.9458295484781136,
 0.9915637274683925]

In [18]:
# Dump the documentation and current value of every param on the loaded
# tuning model (the output is a single long string).
tvsModelRead.explainParams()

"estimator: estimator to be cross-validated (current: ALS_b6e83d8d5f39)\nestimatorParamMaps: estimator param maps (current: [{Param(parent='ALS_b6e83d8d5f39', name='rank', doc='rank of the factorization'): 50, Param(parent='ALS_b6e83d8d5f39', name='regParam', doc='regularization parameter (>= 0).'): 0.1}, {Param(parent='ALS_b6e83d8d5f39', name='rank', doc='rank of the factorization'): 50, Param(parent='ALS_b6e83d8d5f39', name='regParam', doc='regularization parameter (>= 0).'): 0.2}, {Param(parent='ALS_b6e83d8d5f39', name='rank', doc='rank of the factorization'): 50, Param(parent='ALS_b6e83d8d5f39', name='regParam', doc='regularization parameter (>= 0).'): 0.3}, {Param(parent='ALS_b6e83d8d5f39', name='rank', doc='rank of the factorization'): 50, Param(parent='ALS_b6e83d8d5f39', name='regParam', doc='regularization parameter (>= 0).'): 0.4}, {Param(parent='ALS_b6e83d8d5f39', name='rank', doc='rank of the factorization'): 75, Param(parent='ALS_b6e83d8d5f39', name='regParam', doc='regular

In [19]:
# Model selected by the train/validation split; used for all evaluation and
# recommendations below.
best_model = tvsModelRead.bestModel

In [20]:
# Score the training split with the selected model and report its RMSE.
train_predictions = best_model.transform(train)
train_rmse = evaluator.evaluate(train_predictions)
print("Best Model Train RMSE = ", train_rmse)



Best Model Train RMSE =  0.6290511606387795


                                                                                

In [21]:
%%time
print("Best Model Test RMSE = ",evaluator.evaluate(best_model.transform(test)))



Best Model Test RMSE =  0.8647913965554024
CPU times: user 14.3 ms, sys: 1.05 ms, total: 15.4 ms
Wall time: 27.9 s


                                                                                

In [22]:
# Top 5 recommended movies for every user known to the model; preview 10 users.
recommendations = best_model.recommendForAllUsers(numItems=5)
recommendations.show(n=10, truncate=False)



+------+--------------------------------------------------------------------------------------------------------+
|userId|recommendations                                                                                         |
+------+--------------------------------------------------------------------------------------------------------+
|1     |[{26520, 4.9192705}, {2632, 4.610244}, {8629, 4.5221386}, {8377, 4.4997187}, {7096, 4.4996743}]         |
|6     |[{25941, 5.2045436}, {5911, 4.9134603}, {31086, 4.8424063}, {146327, 4.8311415}, {8377, 4.817455}]      |
|16    |[{163512, 5.1467366}, {170425, 5.1442766}, {25941, 5.127475}, {72850, 5.077169}, {152842, 5.048527}]    |
|22    |[{118760, 5.2957597}, {36799, 5.0936933}, {8377, 5.0924625}, {170777, 5.0544353}, {6411, 5.034553}]     |
|26    |[{163512, 4.3564367}, {170425, 4.3407555}, {163112, 4.2501783}, {32705, 4.2499933}, {179795, 4.2499933}]|
|27    |[{185571, 4.915963}, {96991, 4.8671374}, {113315, 4.839786}, {118760, 4.821211},

                                                                                

In [23]:
# Flatten the per-user recommendation arrays into one row per
# (userId, recommended movie, predicted rating).
# The result feeds many later actions (describe, window ranking, min/max,
# filters, joins). Without caching, each of those would recompute the full
# all-users recommendation job, so keep the flattened frame cached.
top_5_recommendations = (
    recommendations
    .withColumn('recommendation', explode('recommendations'))
    .select('userId',
            col('recommendation.movieId').alias('rec_movie_id'),
            col('recommendation.rating').alias('rec_rating'))
    .cache()
)


In [24]:
# Summary statistics (count, mean, stddev, min, max) of the predicted ratings.
top_5_recommendations.describe('rec_rating').toPandas()

                                                                                

Unnamed: 0,summary,rec_rating
0,count,554385.0
1,mean,4.620933781641176
2,stddev,0.4763044881822583
3,min,0.5628935
4,max,6.4984093


Fetching each user's single highest-rated recommendation, listed in ascending order of predicted rating

In [25]:
from pyspark.sql.functions import col, row_number
from pyspark.sql.window import Window

# Rank each user's recommendations from highest to lowest predicted rating.
windowDept = Window.partitionBy("userId").orderBy(col("rec_rating").desc())
ranked = top_5_recommendations.withColumn("row", row_number().over(windowDept))

# Keep only the best recommendation per user, then list them starting from
# the users whose best predicted rating is lowest.
best_per_user = ranked.where(col("row") == 1).drop("row")
best_per_user.orderBy('rec_rating').show()

[Stage 42:>                                                         (0 + 4) / 4]

+------+------------+----------+
|userId|rec_movie_id|rec_rating|
+------+------------+----------+
| 86408|      163512| 0.5783854|
|113767|      118760| 0.5990427|
| 13677|      118760| 0.6035718|
|122697|       25941| 0.6085643|
|  9152|      118760|0.60947174|
|  5821|      163512|0.61109954|
|121218|      163512|0.61887234|
| 61102|      118760| 0.6210604|
| 63044|      118760|0.62464553|
| 38998|      170425| 0.6257034|
| 60674|      118760| 0.6270056|
|124552|      118760| 0.6310857|
| 94280|      118760|0.63461524|
| 10364|        8377|0.63687766|
| 37091|         914| 0.7694206|
| 99438|         273| 0.9350204|
|138720|       93432| 1.0294752|
|121401|         223| 1.0479981|
| 15547|        1093| 1.0568098|
| 50119|        2826| 1.0912154|
+------+------------+----------+
only showing top 20 rows



                                                                                

Fetching the maximum and minimum recommended ratings

In [26]:
# Compute the smallest and largest predicted rating in a single aggregation
# (one Spark job instead of two). `F.min` / `F.max` are used explicitly so the
# code does not depend on the star import shadowing Python's builtin min/max.
rating_bounds = top_5_recommendations.agg(
    F.min('rec_rating').alias('min_rec_rating'),
    F.max('rec_rating').alias('max_rec_rating'),
).first()
min_rec_rating = rating_bounds['min_rec_rating']
max_rec_rating = rating_bounds['max_rec_rating']
max_rec_rating

                                                                                

6.498409271240234

Finding the user with max recommended rating

In [27]:
# Recommendation row(s) whose predicted rating equals the overall maximum.
has_max_rating = col('rec_rating') == max_rec_rating
top_5_recommendations.where(has_max_rating).toPandas()

                                                                                

Unnamed: 0,userId,rec_movie_id,rec_rating
0,18230,118760,6.498409


In [28]:
# Look up one user at each extreme of the predicted-rating range.
# Only the `userId` column of a single row is fetched, and it is read by name,
# so the lookup neither pulls every matching row to the driver nor depends on
# the column order of `top_5_recommendations`.
max_rating_row = (
    top_5_recommendations
    .filter(col('rec_rating') == max_rec_rating)
    .select('userId')
    .first()
)
user_id_with_max_rec_rating = max_rating_row['userId']
print("User ID with max recommendation rating: ", user_id_with_max_rec_rating)

min_rating_row = (
    top_5_recommendations
    .filter(col('rec_rating') == min_rec_rating)
    .select('userId')
    .first()
)
user_id_with_min_rec_rating = min_rating_row['userId']
print("User ID with min recommendation rating: ", user_id_with_min_rec_rating)

                                                                                

User ID with max recommendation rating:  18230




User ID with min recommendation rating:  86408


                                                                                

In [29]:
def print_reccomendations(u_id, recs, train, test, movies_df):
    """Print one user's recommendations next to their known ratings.

    Shows three tables, each joined with the movie title and genres:
    the user's rows in `recs`, the user's ratings in `train`, and the
    user's ratings in `test`.

    Parameters:
        u_id: value matched against the `userId` column.
        recs: DataFrame with `userId` and `rec_movie_id` columns.
        train: DataFrame with `userId`, `movieId` and `rating` columns.
        test: DataFrame with `userId`, `movieId` and `rating` columns.
        movies_df: DataFrame with `movieId`, `title` and `genres` columns.

    Returns:
        None; the tables are printed via `show`.
    """
    # Shared pieces: the movie columns to attach and the per-user row filter.
    movie_info = movies_df.select('movieId', 'title', 'genres')
    belongs_to_user = col('userId') == u_id

    print(f"user {u_id} top preferences:")
    user_recs = recs.filter(belongs_to_user)
    user_recs.join(movie_info, recs.rec_movie_id == movies_df.movieId).show(truncate=False)

    # The train and test splits are displayed in exactly the same way.
    labelled_splits = (
        (f"user {u_id} train data:", train),
        (f"user {u_id} test data:", test),
    )
    for label, ratings in labelled_splits:
        print(label)
        user_ratings = ratings.select('userId', 'movieId', 'rating').filter(belongs_to_user)
        user_ratings.join(movie_info, on='movieId').show(truncate=False)

In [30]:
# Inspect the user who received the highest predicted rating.
print("Max Rec Stats")
print_reccomendations(
    u_id=user_id_with_max_rec_rating,
    recs=top_5_recommendations,
    train=train,
    test=test,
    movies_df=movies_df,
)

Max Rec Stats
user 18230 top preferences:


                                                                                

+------+------------+----------+-------+-----------------------------------+--------------------+
|userId|rec_movie_id|rec_rating|movieId|title                              |genres              |
+------+------------+----------+-------+-----------------------------------+--------------------+
|18230 |118760      |6.4984093 |118760 |The Good Lie (2014)                |Drama               |
|18230 |8377        |6.2914524 |8377   |City of Joy (1992)                 |Drama               |
|18230 |163512      |6.2710433 |163512 |Aquarius (2016)                    |Drama               |
|18230 |142875      |6.119466  |142875 |Star (2014)                        |Comedy|Drama|Romance|
|18230 |25941       |6.0998225 |25941  |Letter from an Unknown Woman (1948)|Drama|Romance       |
+------+------------+----------+-------+-----------------------------------+--------------------+

user 18230 train data:


                                                                                

+-------+------+------+-------------------------------------+----------------------------------------+
|movieId|userId|rating|title                                |genres                                  |
+-------+------+------+-------------------------------------+----------------------------------------+
|5      |18230 |5.0   |Father of the Bride Part II (1995)   |Comedy                                  |
|31     |18230 |5.0   |Dangerous Minds (1995)               |Drama                                   |
|48     |18230 |5.0   |Pocahontas (1995)                    |Animation|Children|Drama|Musical|Romance|
|105    |18230 |5.0   |Bridges of Madison County, The (1995)|Drama|Romance                           |
|256    |18230 |5.0   |Junior (1994)                        |Comedy|Sci-Fi                           |
|355    |18230 |5.0   |Flintstones, The (1994)              |Children|Comedy|Fantasy                 |
|596    |18230 |5.0   |Pinocchio (1940)                     |Animation|Ch

In [31]:
# Inspect the user who received the lowest predicted rating.
print("Min Rec Stats")
print_reccomendations(
    u_id=user_id_with_min_rec_rating,
    recs=top_5_recommendations,
    train=train,
    test=test,
    movies_df=movies_df,
)

Min Rec Stats
user 86408 top preferences:


                                                                                

+------+------------+----------+-------+----------------------------------------------+--------------------------------+
|userId|rec_movie_id|rec_rating|movieId|title                                         |genres                          |
+------+------------+----------+-------+----------------------------------------------+--------------------------------+
|86408 |163512      |0.5783854 |163512 |Aquarius (2016)                               |Drama                           |
|86408 |170425      |0.5776877 |170425 |Monsieur & Madame Adelman (2017)              |Comedy|Drama                    |
|86408 |25941       |0.5721399 |25941  |Letter from an Unknown Woman (1948)           |Drama|Romance                   |
|86408 |8377        |0.5697266 |8377   |City of Joy (1992)                            |Drama                           |
|86408 |26875       |0.5628935 |26875  |Pure Formality, A (Pura formalità, Una) (1994)|Crime|Film-Noir|Mystery|Thriller|
+------+------------+----------+

                                                                                

+-------+------+------+--------------------------------------+----------------------+
|movieId|userId|rating|title                                 |genres                |
+-------+------+------+--------------------------------------+----------------------+
|318    |86408 |0.5   |Shawshank Redemption, The (1994)      |Crime|Drama           |
|508    |86408 |0.5   |Philadelphia (1993)                   |Drama                 |
|524    |86408 |0.5   |Rudy (1993)                           |Drama                 |
|1193   |86408 |0.5   |One Flew Over the Cuckoo's Nest (1975)|Drama                 |
|1225   |86408 |0.5   |Amadeus (1984)                        |Drama                 |
|1271   |86408 |0.5   |Fried Green Tomatoes (1991)           |Comedy|Crime|Drama    |
|1302   |86408 |0.5   |Field of Dreams (1989)                |Children|Drama|Fantasy|
|1673   |86408 |0.5   |Boogie Nights (1997)                  |Drama                 |
|1682   |86408 |0.5   |Truman Show, The (1998)        

In [32]:
# Draw one row from the test split with a fixed seed and take its first field
# as the user id.
# NOTE(review): assumes `userId` is the first column of the test CSV — confirm.
sampled_rows = test.rdd.takeSample(withReplacement=False, num=1, seed=0)
rand_id = sampled_rows[0][0]

print("Rand Rec Stats")
print_reccomendations(
    u_id=rand_id,
    recs=top_5_recommendations,
    train=train,
    test=test,
    movies_df=movies_df,
)

                                                                                

Rand Rec Stats
user 44730 top preferences:


                                                                                

+------+------------+----------+-------+-------------------------------------------------------+-------------+
|userId|rec_movie_id|rec_rating|movieId|title                                                  |genres       |
+------+------------+----------+-------+-------------------------------------------------------+-------------+
|44730 |118760      |4.452896  |118760 |The Good Lie (2014)                                    |Drama        |
|44730 |25941       |4.4283395 |25941  |Letter from an Unknown Woman (1948)                    |Drama|Romance|
|44730 |26520       |4.425784  |26520  |Full Moon in Paris (Les nuits de la pleine lune) (1984)|Drama|Romance|
|44730 |8377        |4.4134445 |8377   |City of Joy (1992)                                     |Drama        |
|44730 |147330      |4.4011326 |147330 |Sherlock Holmes and Dr. Watson: Acquaintance (1979)    |Crime        |
+------+------------+----------+-------+-------------------------------------------------------+-------------+



                                                                                

+-------+------+------+--------------------------------------+-------------------------------------------+
|movieId|userId|rating|title                                 |genres                                     |
+-------+------+------+--------------------------------------+-------------------------------------------+
|282    |44730 |2.0   |Nell (1994)                           |Drama                                      |
|364    |44730 |4.0   |Lion King, The (1994)                 |Adventure|Animation|Children|Drama|Musical |
|466    |44730 |4.0   |Hot Shots! Part Deux (1993)           |Action|Comedy|War                          |
|543    |44730 |3.5   |So I Married an Axe Murderer (1993)   |Comedy|Romance|Thriller                    |
|586    |44730 |3.5   |Home Alone (1990)                     |Children|Comedy                            |
|588    |44730 |4.0   |Aladdin (1992)                        |Adventure|Animation|Children|Comedy|Musical|
|593    |44730 |4.0   |Silence of the

In [33]:
# spark.stop()