# Movie Ratings Matrix Factorization (Collaborative Filtering)

## Imports

In [1]:
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.functions import *


## Spark Session

In [2]:
# The number of local cores Spark uses is the `n` in the `local[n]`
# master URL below; edit that value to change it.

import matplotlib.pyplot as plt

# Master URL and application name for this notebook's Spark session.
conf = (
    pyspark.SparkConf()
    .setMaster('local[4]')
    .setAppName('MatrixFactorization')
)
spark = SparkSession.builder.config(conf=conf).getOrCreate()

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
2022-05-21 07:47:36,564 WARN util.NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
2022-05-21 07:47:38,346 WARN util.Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


In [3]:
# Set the Spark context's log level to "off" so that WARN lines like the
# ones printed at session start-up do not clutter later cell output.
spark.sparkContext.setLogLevel("off")

## Load final ratings files

In [4]:
# Read the ratings CSV (first row is the header, column types are inferred)
# and redistribute the rows over 100 partitions.
ratings_df = (
    spark.read
    .csv(
        "file:///home/work/data/ratings_10.csv",
        header=True,
        inferSchema=True,
    )
    .repartition(100)
)

# Print the inferred column names and types.
ratings_df.printSchema()



root
 |-- userId: integer (nullable = true)
 |-- movieId: integer (nullable = true)
 |-- rating: double (nullable = true)
 |-- year: integer (nullable = true)
 |-- Action: double (nullable = true)
 |-- Adventure: double (nullable = true)
 |-- Animation: double (nullable = true)
 |-- Children: double (nullable = true)
 |-- Comedy: double (nullable = true)
 |-- Crime: double (nullable = true)
 |-- Documentary: double (nullable = true)
 |-- Drama: double (nullable = true)
 |-- Fantasy: double (nullable = true)
 |-- Film-Noir: double (nullable = true)
 |-- Horror: double (nullable = true)
 |-- Musical: double (nullable = true)
 |-- Mystery: double (nullable = true)
 |-- Romance: double (nullable = true)
 |-- Sci-Fi: double (nullable = true)
 |-- Thriller: double (nullable = true)
 |-- War: double (nullable = true)
 |-- Western: double (nullable = true)
 |-- avg_rating: double (nullable = true)



                                                                                

In [5]:
# Random 80/20 split of the ratings into training and test sets; the seed
# is fixed so the split is repeatable.
train, test = ratings_df.randomSplit(weights=[0.8, 0.2], seed=0)

In [6]:
# Display how many partitions the training split is stored in.
train.rdd.getNumPartitions()



100

## Building ALS model

### Alternating Least Squares (ALS) matrix factorization

In [7]:
# Alternating Least Squares (ALS) matrix factorization estimator.
# Only the fixed settings are given here; `rank` and `regParam` are
# supplied by the hyperparameter grid during tuning.
from pyspark.ml.recommendation import ALS
als = ALS(userCol='userId',
          itemCol='movieId',
          ratingCol='rating',
          nonnegative=True,  # constrain factors to be non-negative, since the ratings used are > 0
          implicitPrefs=False,  # ratings are explicit feedback, not implicit preferences
          coldStartStrategy='drop',  # drop unpredictable rows so evaluation metrics are not NaN
          seed=0  # fixed seed (same value as the train/test split) for repeatable results
)

### Hyperparameter Tuning

In [8]:
from pyspark.ml.tuning import ParamGridBuilder

# Every combination of the listed `rank` and `regParam` values is one
# candidate configuration for the ALS estimator.
param_grid = (
    ParamGridBuilder()
    .addGrid(als.rank, [10, 50, 100, 150])
    .addGrid(als.regParam, [0.01, 0.05, 0.1, 0.15])
    .build()
)

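The grid above defines 4 x 4 = 16 hyperparameter combinations (4 ranks x 4 regularization values). With 10-fold cross-validation each combination is fitted once per fold, i.e. 16 x 10 = 160 model fits, plus one final refit of the best combination.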

In [9]:
from pyspark.ml.evaluation import RegressionEvaluator

# Scores predictions by RMSE, comparing the `prediction` column against
# the true `rating` column.
evaluator = RegressionEvaluator(
    predictionCol='prediction',
    labelCol='rating',
    metricName='rmse',
)

In [10]:
from pyspark.ml.tuning import CrossValidator

# 10-fold cross-validation of the ALS estimator over every entry of
# `param_grid`, scored with `evaluator`.
cv = CrossValidator(estimator=als,
                    estimatorParamMaps=param_grid,
                    evaluator=evaluator,
                    numFolds=10,
                    seed=0)  # fixed seed so the fold assignment is repeatable

In [None]:
# Tune on the training split only. `test` must stay unseen during
# hyperparameter selection so it can give an unbiased final evaluation.
# The fitted model is kept in `cv_model` so later cells can use it.
cv_model = cv.fit(train)
cv_model



In [22]:
# Uncomment to shut down the Spark session once all work is finished:
# spark.stop()

ConnectionRefusedError: [Errno 111] Connection refused