# Movie Ratings Matrix Factorization (Collaborative Filtering)

## Imports

In [1]:
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.functions import *

## Spark Session

In [2]:
# Change the number of cores in this code block
# by setting `spark.master` to `local[n]` where
# n is the number of cores

import matplotlib.pyplot as plt

conf = pyspark.SparkConf().setAll([('spark.master', 'local[4]'),
                                   ('spark.app.name', 'MatrixFactorization'),
                                   ('spark.memory.offHeap.enabled', True),
                                   ('spark.memory.offHeap.size','4g'),
                                   ('spark.executor.memory', '4g'), 
                                   ('spark.driver.memory','6g')])
spark = SparkSession.builder.config(conf=conf).getOrCreate()

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
2022-05-21 23:00:37,566 WARN util.NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [3]:
spark.sparkContext.setLogLevel("off")

## Load final ratings files

In [4]:
ratings_df = spark.read.csv("file:///home/work/data/ratings_100_max.csv", inferSchema=True, header=True).repartition(100)
ratings_df.printSchema()



root
 |-- userId: integer (nullable = true)
 |-- movieId: integer (nullable = true)
 |-- rating: double (nullable = true)
 |-- year: integer (nullable = true)
 |-- Action: double (nullable = true)
 |-- Adventure: double (nullable = true)
 |-- Animation: double (nullable = true)
 |-- Children: double (nullable = true)
 |-- Comedy: double (nullable = true)
 |-- Crime: double (nullable = true)
 |-- Documentary: double (nullable = true)
 |-- Drama: double (nullable = true)
 |-- Fantasy: double (nullable = true)
 |-- Film-Noir: double (nullable = true)
 |-- Horror: double (nullable = true)
 |-- Musical: double (nullable = true)
 |-- Mystery: double (nullable = true)
 |-- Romance: double (nullable = true)
 |-- Sci-Fi: double (nullable = true)
 |-- Thriller: double (nullable = true)
 |-- War: double (nullable = true)
 |-- Western: double (nullable = true)
 |-- avg_rating: double (nullable = true)



                                                                                

In [5]:
# train, test = ratings_df.randomSplit([0.8, 0.2], seed=0)
train, test = ratings_df.randomSplit([0.5, 0.5], seed=0)

In [6]:
#checking number of partitions 
# train.rdd.getNumPartitions()
ratings_df.unpersist()
train.unpersist()

DataFrame[userId: int, movieId: int, rating: double, year: int, Action: double, Adventure: double, Animation: double, Children: double, Comedy: double, Crime: double, Documentary: double, Drama: double, Fantasy: double, Film-Noir: double, Horror: double, Musical: double, Mystery: double, Romance: double, Sci-Fi: double, Thriller: double, War: double, Western: double, avg_rating: double]

## Building ALS model

### Alternating Least Squares (ALS) matrix factorization

In [7]:
#Alternating Least Squares (ALS) matrix factorization
from pyspark.ml.recommendation import ALS
als = ALS(userCol='userId',
          itemCol='movieId',
          ratingCol='rating',
          nonnegative=True, #setting this to true since we are using ratings > 0.
          implicitPrefs=False, #setting this to false as we are using explicit ratings.
          coldStartStrategy='drop' # to make sure we don't get NaN evaluation metrics
)

### Hyperparameter Tuning

In [8]:
from pyspark.ml.tuning import ParamGridBuilder
param_grid = ParamGridBuilder() \
                .addGrid(als.rank,[10, 50, 100, 150]) \
                .addGrid(als.regParam,[.01, .05, .1, .15]) \
                .build()

The above will generate 4 x 4 = 16 models for training.

In [9]:
from pyspark.ml.evaluation import RegressionEvaluator
evaluator = RegressionEvaluator(metricName='rmse',
                                labelCol='rating',
                                predictionCol='prediction')                                

In [10]:
# from pyspark.ml.tuning import CrossValidator
# cv = CrossValidator(estimator=als,
#                     estimatorParamMaps=param_grid,
#                     evaluator=evaluator,
#                     numFolds=10)
# cv.fit(test)
from pyspark.ml.tuning import TrainValidationSplit
tvs = TrainValidationSplit(estimator=als, estimatorParamMaps=param_grid, evaluator=evaluator, parallelism=1, seed=0)


In [11]:
%%time
tvs_model = tvs.fit(test)



CPU times: user 1.62 s, sys: 360 ms, total: 1.98 s
Wall time: 1h 23min 46s


                                                                                

In [14]:
tvs_model.bestModel

ALSModel: uid=ALS_2055d90fade1, rank=100

In [15]:
%%time
evaluator.evaluate(tvs_model.transform(test))



CPU times: user 9.5 ms, sys: 6.99 ms, total: 16.5 ms
Wall time: 28.7 s


                                                                                

0.6888509596295516

In [16]:
evaluator.evaluate(tvs_model.transform(train))

                                                                                

0.8984551544862247

In [20]:
!pwd

/home/work/dse230_project/models


In [23]:
write_files = True
if write_files:
    tvs_model.save("file:///home/work/data/als_model_test")
    # !hadoop fs -ls /tsv_model
    # !hadoop fs -copyToLocal /tsv_model/* /home/work/data/als_model_test

                                                                                

In [19]:
!hadoop fs -rm -r /tsv_model

rm: `/tsv_model': No such file or directory


In [13]:
#spark.stop()