# Movie Ratings Matrix Factorization (Collaborative Filtering)

### Imports

In [1]:
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.ml.tuning import ParamGridBuilder
from pyspark.ml.tuning import TrainValidationSplit
from pyspark.ml.tuning import TrainValidationSplitModel
import time
import matplotlib.pyplot as plt

### Environment Variables

In [2]:
write_files = False
fit_model = False
model_path = "file:///home/work/data/als_recommender_model"

### Spark Session

In [3]:
# Change the number of cores in this code block
# by setting `spark.master` to `local[n]` where
# n is the number of cores

conf = pyspark.SparkConf().setAll([('spark.master', 'local[4]'),
                                   ('spark.app.name', 'MatrixFactorization'),
                                   ('spark.memory.offHeap.enabled', True),
                                   ('spark.memory.offHeap.size','4g'),
                                   ('spark.executor.memory', '4g'), 
                                   ('spark.driver.memory','6g')])
spark = SparkSession.builder.config(conf=conf).getOrCreate()

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
2022-05-30 23:00:25,196 WARN util.NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


### Spark Context

In [4]:
sc = spark.sparkContext
#setting log level off 
sc.setLogLevel("off")

### Load Final Ratings Train Data

In [5]:
train = spark.read.csv("file:///home/work/data/als_ratings_train", inferSchema=True, header=True)

                                                                                

### Load Final Ratings Test Data

In [6]:
test = spark.read.csv("file:///home/work/data/als_ratings_test", inferSchema=True, header=True)

                                                                                

## Building ALS model

### Alternating Least Squares (ALS) matrix factorization

In [7]:
#Alternating Least Squares (ALS) matrix factorization
from pyspark.ml.recommendation import ALS
als = ALS(userCol='userId',
          itemCol='movieId',
          ratingCol='rating',
          nonnegative=True, #setting this to true since we are using ratings > 0.
          implicitPrefs=False, #setting this to false as we are using explicit ratings.
          coldStartStrategy='drop', # to make sure we don't get NaN evaluation metrics
)

### Hyperparameter Tuning

In [8]:
param_grid = ParamGridBuilder() \
                .addGrid(als.rank,[50, 75, 100, 125]) \
                .addGrid(als.regParam,[.1, .2, .3, .4]) \
                .build()

In [9]:
from pyspark.ml.evaluation import RegressionEvaluator
evaluator = RegressionEvaluator(metricName='rmse',
                                labelCol='rating',
                                predictionCol='prediction')                                

In [10]:
tvs = TrainValidationSplit(estimator=als, estimatorParamMaps=param_grid, evaluator=evaluator, parallelism=1, seed=0)


In [11]:
start = time.time()
if fit_model:
    tvs_model = tvs.fit(train)
tot_runtime = time.time() - start
print(f"Total runtime to fit ALS optimization for all the grids: {tot_runtime}")

Total runtime to fit ALS optimization for all the grids: 0.00018358230590820312


### Save Tuned Model

In [12]:
if write_files:
    tvs_model.save(model_path)

### Read Saved Model 

In [13]:
tvsModelRead = TrainValidationSplitModel.read().load(model_path)

                                                                                

### Check validation Metrics i.e RMSE for all the models

In [14]:
tvsModelRead.validationMetrics

[0.8905777707351172,
 0.9010232162538442,
 0.9454123025737626,
 0.9913658875456827,
 0.8896598583663039,
 0.9012318321097895,
 0.9456857052263995,
 0.9915404477757173,
 0.8898603884721867,
 0.9015462862606608,
 0.9458674191100008,
 0.991629804403516,
 0.8892534517948678,
 0.90146664920257,
 0.9458295484781136,
 0.9915637274683925]

### Explain all the parameters from Hyperparameter Tuning

In [15]:
tvsModelRead.explainParams()

"estimator: estimator to be cross-validated (current: ALS_b6e83d8d5f39)\nestimatorParamMaps: estimator param maps (current: [{Param(parent='ALS_b6e83d8d5f39', name='rank', doc='rank of the factorization'): 50, Param(parent='ALS_b6e83d8d5f39', name='regParam', doc='regularization parameter (>= 0).'): 0.1}, {Param(parent='ALS_b6e83d8d5f39', name='rank', doc='rank of the factorization'): 50, Param(parent='ALS_b6e83d8d5f39', name='regParam', doc='regularization parameter (>= 0).'): 0.2}, {Param(parent='ALS_b6e83d8d5f39', name='rank', doc='rank of the factorization'): 50, Param(parent='ALS_b6e83d8d5f39', name='regParam', doc='regularization parameter (>= 0).'): 0.3}, {Param(parent='ALS_b6e83d8d5f39', name='rank', doc='rank of the factorization'): 50, Param(parent='ALS_b6e83d8d5f39', name='regParam', doc='regularization parameter (>= 0).'): 0.4}, {Param(parent='ALS_b6e83d8d5f39', name='rank', doc='rank of the factorization'): 75, Param(parent='ALS_b6e83d8d5f39', name='regParam', doc='regular

### Best Model after tuning

In [16]:
best_model = tvsModelRead.bestModel

In [17]:
print("Best Model Train RMSE = ",evaluator.evaluate(best_model.transform(train)))



Best Model Train RMSE =  0.6290511606387795


                                                                                

In [18]:
print("Best Model Test RMSE = ",evaluator.evaluate(best_model.transform(test)))



Best Model Test RMSE =  0.8647913965554024


                                                                                

### Model factors

In [19]:
print("Model Rank:",best_model.rank)
user_factors = best_model.userFactors.orderBy('id').limit(1).collect()
print(f"User Latent Factors for ID: {user_factors[0][0]} :- \n{user_factors[0][1]} ")



Model Rank: 125




User Latent Factors for ID: 1 :- 
[0.23951619863510132, 0.021939320489764214, 0.16278468072414398, 0.14137627184391022, 0.38238295912742615, 0.0, 0.027870215475559235, 0.09165193140506744, 0.1637464314699173, 0.24965836107730865, 0.3340897262096405, 0.18262438476085663, 0.4963119626045227, 0.10208820551633835, 0.003951970487833023, 0.2899414896965027, 0.11626213788986206, 0.010605011135339737, 0.25845080614089966, 0.005914095789194107, 0.016693610697984695, 0.038840558379888535, 0.20357799530029297, 0.35085204243659973, 0.33390697836875916, 0.11779268831014633, 0.2005985975265503, 0.040152113884687424, 0.1704455316066742, 0.1159241795539856, 0.3218660354614258, 0.10789263248443604, 0.08509869128465652, 0.2192203253507614, 0.14105072617530823, 0.005512623116374016, 0.6597520709037781, 0.059967949986457825, 0.047319069504737854, 0.1724238246679306, 0.1600998193025589, 0.06128586828708649, 0.2078404575586319, 0.06138043478131294, 0.26897597312927246, 0.2640194892883301, 0.0368093661963939

                                                                                

In [20]:
movies_df = spark.read.option("header",True).csv("file:///home/work/data/cleaned_movies", inferSchema=True)

### Top 5 recommended Movies

In [21]:
recommendations = best_model.recommendForAllUsers(5)
recommendations.show(10,truncate=False)



+------+--------------------------------------------------------------------------------------------------------+
|userId|recommendations                                                                                         |
+------+--------------------------------------------------------------------------------------------------------+
|1     |[{26520, 4.9192705}, {2632, 4.610244}, {8629, 4.5221386}, {8377, 4.4997187}, {7096, 4.4996743}]         |
|6     |[{25941, 5.2045436}, {5911, 4.9134603}, {31086, 4.8424063}, {146327, 4.8311415}, {8377, 4.817455}]      |
|16    |[{163512, 5.1467366}, {170425, 5.1442766}, {25941, 5.127475}, {72850, 5.077169}, {152842, 5.048527}]    |
|22    |[{118760, 5.2957597}, {36799, 5.0936933}, {8377, 5.0924625}, {170777, 5.0544353}, {6411, 5.034553}]     |
|26    |[{163512, 4.3564367}, {170425, 4.3407555}, {163112, 4.2501783}, {32705, 4.2499933}, {179795, 4.2499933}]|
|27    |[{185571, 4.915963}, {96991, 4.8671374}, {113315, 4.839786}, {118760, 4.821211},

                                                                                

In [22]:
top_5_recommendations = recommendations.withColumn('recommendation', explode('recommendations')) \
                .select('userId',col('recommendation.movieId').alias('rec_movie_id'),col('recommendation.rating').alias('rec_rating'))


In [23]:
top_5_recommendations.select('rec_rating').describe().toPandas()

                                                                                

Unnamed: 0,summary,rec_rating
0,count,554385.0
1,mean,4.620933781641176
2,stddev,0.4763044881822583
3,min,0.5628935
4,max,6.4984093


### Fetching groups of user id and top recommended rating

In [24]:
from pyspark.sql.window import Window
from pyspark.sql.functions import col, row_number
windowDept = Window.partitionBy("userId").orderBy(col("rec_rating").desc())
top_5_recommendations.withColumn("row",row_number().over(windowDept)) \
  .filter(col("row") == 1).drop("row") \
  .sort('rec_rating') \
  .show()

                                                                                

+------+------------+----------+
|userId|rec_movie_id|rec_rating|
+------+------------+----------+
| 86408|      163512| 0.5783854|
|113767|      118760| 0.5990427|
| 13677|      118760| 0.6035718|
|122697|       25941| 0.6085643|
|  9152|      118760|0.60947174|
|  5821|      163512|0.61109954|
|121218|      163512|0.61887234|
| 61102|      118760| 0.6210604|
| 63044|      118760|0.62464553|
| 38998|      170425| 0.6257034|
| 60674|      118760| 0.6270056|
|124552|      118760| 0.6310857|
| 94280|      118760|0.63461524|
| 10364|        8377|0.63687766|
| 37091|         914| 0.7694206|
| 99438|         273| 0.9350204|
|138720|       93432| 1.0294752|
|121401|         223| 1.0479981|
| 15547|        1093| 1.0568098|
| 50119|        2826| 1.0912154|
+------+------------+----------+
only showing top 20 rows



### Fetching max recommended rating !!

In [None]:
max_rec_rating = top_5_recommendations.select(max('rec_rating')).collect()[0][0]
min_rec_rating = top_5_recommendations.select(min('rec_rating')).collect()[0][0]

### Finding the user with max recommended rating

In [26]:
top_5_recommendations.filter(col('rec_rating')==max_rec_rating).toPandas()

                                                                                

Unnamed: 0,userId,rec_movie_id,rec_rating
0,18230,118760,6.498409


In [27]:
user_id_with_max_rec_rating = top_5_recommendations.filter(col('rec_rating')==max_rec_rating).collect()[0][0]
print("User ID with max recommendation rating: ", user_id_with_max_rec_rating)
user_id_with_min_rec_rating = top_5_recommendations.filter(col('rec_rating')==min_rec_rating).collect()[0][0]
print("User ID with min recommendation rating: ", user_id_with_min_rec_rating)

                                                                                

User ID with max recommendation rating:  18230




User ID with min recommendation rating:  86408


                                                                                

In [28]:
def print_reccomendations(u_id, recs, train, test, movies_df):
    print(f"Top recommendations for user = {u_id}:")
    recs.filter(col('userId')==u_id) \
                    .join(movies_df.select('movieId','title', 'genres'), recs.rec_movie_id ==  movies_df.movieId) \
                    .sort('rec_rating', ascending=False) \
                    .show(truncate=False)
    print(f"User = {u_id}'s actual prefernces in train data:")
    train.select('userId','movieId','rating').filter(col('userId')==u_id) \
                .join(movies_df.select('movieId','title', 'genres'), on='movieId') \
                .sort('rating', ascending=False) \
                .show(10,truncate=False)
    print(f"User {u_id}'s actual prefernces in test data:")
    test.select('userId','movieId','rating').filter(col('userId')==u_id) \
                .join(movies_df.select('movieId','title', 'genres'), on='movieId') \
                .sort('rating', ascending=False) \
                .show(10,truncate=False)

In [29]:
#print("Max Rec Stats")
#print_reccomendations(user_id_with_max_rec_rating, top_5_recommendations, train, test, movies_df)

In [30]:
#print("Min Rec Stats")
#print_reccomendations(user_id_with_min_rec_rating, top_5_recommendations, train, test, movies_df)

In [31]:
rand_id = test.rdd.takeSample(False, 1, seed=0)[0][0]
print("Rand Rec Stats")
print_reccomendations(rand_id, top_5_recommendations, train, test, movies_df)

                                                                                

Rand Rec Stats
Top recommendations for user = 44730:


                                                                                

+------+------------+----------+-------+-------------------------------------------------------+-------------+
|userId|rec_movie_id|rec_rating|movieId|title                                                  |genres       |
+------+------------+----------+-------+-------------------------------------------------------+-------------+
|44730 |118760      |4.452896  |118760 |The Good Lie (2014)                                    |Drama        |
|44730 |25941       |4.4283395 |25941  |Letter from an Unknown Woman (1948)                    |Drama|Romance|
|44730 |26520       |4.425784  |26520  |Full Moon in Paris (Les nuits de la pleine lune) (1984)|Drama|Romance|
|44730 |8377        |4.4134445 |8377   |City of Joy (1992)                                     |Drama        |
|44730 |147330      |4.4011326 |147330 |Sherlock Holmes and Dr. Watson: Acquaintance (1979)    |Crime        |
+------+------------+----------+-------+-------------------------------------------------------+-------------+



                                                                                

+-------+------+------+--------------------------------------+------------------------------+
|movieId|userId|rating|title                                 |genres                        |
+-------+------+------+--------------------------------------+------------------------------+
|1408   |44730 |4.5   |Last of the Mohicans, The (1992)      |Action|Romance|War|Western    |
|1500   |44730 |4.5   |Grosse Pointe Blank (1997)            |Comedy|Crime|Romance          |
|904    |44730 |4.5   |Rear Window (1954)                    |Mystery|Thriller              |
|1721   |44730 |4.5   |Titanic (1997)                        |Drama|Romance                 |
|1219   |44730 |4.0   |Psycho (1960)                         |Crime|Horror                  |
|1380   |44730 |4.0   |Grease (1978)                         |Comedy|Musical|Romance        |
|1043   |44730 |4.0   |To Gillian on Her 37th Birthday (1996)|Drama|Romance                 |
|593    |44730 |4.0   |Silence of the Lambs, The (1991)     

[Stage 80:>                                                         (0 + 4) / 4]

+-------+------+------+--------------------------------------+--------------------------------------+
|movieId|userId|rating|title                                 |genres                                |
+-------+------+------+--------------------------------------+--------------------------------------+
|2599   |44730 |4.5   |Election (1999)                       |Comedy                                |
|2797   |44730 |4.5   |Big (1988)                            |Comedy|Drama|Fantasy|Romance          |
|953    |44730 |4.5   |It's a Wonderful Life (1946)          |Children|Drama|Fantasy|Romance        |
|1923   |44730 |4.5   |There's Something About Mary (1998)   |Comedy|Romance                        |
|6503   |44730 |4.0   |Charlie's Angels: Full Throttle (2003)|Action|Adventure|Comedy|Crime|Thriller|
|1207   |44730 |4.0   |To Kill a Mockingbird (1962)          |Drama                                 |
|7771   |44730 |4.0   |Zorba the Greek (Alexis Zorbas) (1964)|Adventure|Drama     

                                                                                

### Top 5 recommended Users

In [32]:
#Top 5 Users for movies
top_5_user_recommendations = best_model.recommendForAllItems(5) \
                .withColumn('recommendation', explode('recommendations')) \
                .select('movieId',col('recommendation.userId').alias('rec_user_id'),col('recommendation.rating').alias('rec_rating'))
top_5_user_recommendations.show(truncate=False)



+-------+-----------+----------+
|movieId|rec_user_id|rec_rating|
+-------+-----------+----------+
|1      |137831     |5.3221583 |
|1      |18230      |5.228027  |
|1      |27173      |5.2063    |
|1      |66365      |5.1933675 |
|1      |89631      |5.1882453 |
|12     |110061     |4.5251412 |
|12     |96471      |4.4960876 |
|12     |50715      |4.4597187 |
|12     |34979      |4.437086  |
|12     |93649      |4.4292912 |
|13     |52924      |4.7441325 |
|13     |54192      |4.66887   |
|13     |108346     |4.664882  |
|13     |18230      |4.639894  |
|13     |151238     |4.638042  |
|22     |96471      |5.019087  |
|22     |86668      |4.920675  |
|22     |74833      |4.9188876 |
|22     |70505      |4.9117274 |
|22     |66365      |4.8727326 |
+-------+-----------+----------+
only showing top 20 rows



                                                                                

In [33]:
#Show top 5 user recommendation for movieId = 1
top_5_user_recommendations.filter(col('movieId')==1) \
                    .join(movies_df.select('movieId', 'title', 'genres'), top_5_user_recommendations.movieId ==  movies_df.movieId) \
                    .sort('rec_rating', ascending=False) \
                    .show(truncate=False)



+-------+-----------+----------+-------+----------------+-------------------------------------------+
|movieId|rec_user_id|rec_rating|movieId|title           |genres                                     |
+-------+-----------+----------+-------+----------------+-------------------------------------------+
|1      |137831     |5.3221583 |1      |Toy Story (1995)|Adventure|Animation|Children|Comedy|Fantasy|
|1      |18230      |5.228027  |1      |Toy Story (1995)|Adventure|Animation|Children|Comedy|Fantasy|
|1      |27173      |5.2063    |1      |Toy Story (1995)|Adventure|Animation|Children|Comedy|Fantasy|
|1      |66365      |5.1933675 |1      |Toy Story (1995)|Adventure|Animation|Children|Comedy|Fantasy|
|1      |89631      |5.1882453 |1      |Toy Story (1995)|Adventure|Animation|Children|Comedy|Fantasy|
+-------+-----------+----------+-------+----------------+-------------------------------------------+



                                                                                

In [34]:
#Top 5 users recommended for movieId = 1
top_5_user_recommendations = top_5_user_recommendations.filter(col('movieId')==1) \
                    .join(movies_df.select('movieId', 'title', 'genres'), top_5_user_recommendations.movieId ==  movies_df.movieId) \
                    .sort('rec_rating', ascending=False) \
                    .collect()

                                                                                

In [35]:
import numpy as np
user_list = list(np.asarray(top_5_user_recommendations)[:,1])
print("Users we want to validate for movie recommendations",user_list)

Users we want to validate for movie recommendations ['137831', '18230', '27173', '66365', '89631']


#### Let's check the top rated movie genres by top 5 users recommended for movieId = 1

In [36]:
#Sample user's actual top 3 choices
windowDept = Window.partitionBy("userId").orderBy(col("rating").desc())
train.filter(train.userId.isin(user_list)) \
   .withColumn("row",row_number().over(windowDept)) \
  .filter(col("row") <= 3).drop("row","avg_rating") \
  .sort('userId','rating', ascending=False) \
  .show(truncate=False)



+------+-------+------+----------------------------------------+
|userId|movieId|rating|genres                                  |
+------+-------+------+----------------------------------------+
|137831|107    |5.0   |Adventure|Children|Comedy|Musical       |
|137831|104    |5.0   |Comedy                                  |
|137831|158    |5.0   |Adventure|Children                      |
|89631 |246    |5.0   |Documentary                             |
|89631 |31     |5.0   |Drama                                   |
|89631 |355    |5.0   |Children|Comedy|Fantasy                 |
|66365 |158    |5.0   |Adventure|Children                      |
|66365 |58     |5.0   |Comedy|Drama|Romance                    |
|66365 |70     |5.0   |Action|Comedy|Horror|Thriller           |
|27173 |34     |5.0   |Children|Drama                          |
|27173 |110    |5.0   |Action|Drama|War                        |
|27173 |10     |5.0   |Action|Adventure|Thriller               |
|18230 |5      |5.0   |Co

                                                                                

In [37]:
spark.stop()