# Movie Ratings Matrix Factorization (Collaborative Filtering)

## Imports

In [1]:
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.functions import *

## Spark Session

In [None]:
# Spark runs in local mode in this notebook. The `n` in `local[n]` below is
# the number of cores; edit that value to change how many cores are used.

import matplotlib.pyplot as plt

# Build the configuration first, then hand it to the session builder.
conf = pyspark.SparkConf()
conf.setAll([
    ('spark.master', 'local[4]'),
    ('spark.app.name', 'MatrixFactorization'),
    ('spark.memory.offHeap.enabled', True),
    ('spark.memory.offHeap.size', '4g'),
    ('spark.executor.memory', '4g'),
    ('spark.driver.memory', '6g'),
])
spark = SparkSession.builder.config(conf=conf).getOrCreate()

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


In [None]:
# Set the SparkContext log level to "off" for the remainder of the session.
spark.sparkContext.setLogLevel("off")

In [None]:
# Display the session as the cell result. (A trailing comma here would turn
# the expression into a one-element tuple and show the tuple instead.)
spark

## Load final ratings files

In [None]:
# Read the ratings CSV (first row is the header, column types are inferred),
# spread it over 100 partitions, and print the resulting schema.
ratings_df = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv("file:///home/work/data/ratings_100_max.csv")
    .repartition(100)
)
ratings_df.printSchema()

In [None]:
# Split the ratings half-and-half into train and test with a fixed seed.
# Alternative weights kept for reference: [0.8, 0.2].
train, test = ratings_df.randomSplit(weights=[0.5, 0.5], seed=0)

In [None]:
# Partition-count check, left disabled:
# train.rdd.getNumPartitions()
# NOTE(review): no cache()/persist() call on these DataFrames is visible in
# this notebook, so the unpersist() calls below look like no-ops -- confirm.
ratings_df.unpersist()
train.unpersist()

## Building ALS model

### Alternating Least Squares (ALS) matrix factorization

In [None]:
# Alternating Least Squares (ALS) matrix factorization on explicit ratings.
from pyspark.ml.recommendation import ALS
als = ALS(userCol='userId',
          itemCol='movieId',
          ratingCol='rating',
          nonnegative=True,  # setting this to true since we are using ratings > 0.
          implicitPrefs=False,  # setting this to false as we are using explicit ratings.
          coldStartStrategy='drop',  # to make sure we don't get NaN evaluation metrics
          maxIter=15,
          # Pin the factor-initialisation seed so repeated runs are comparable;
          # the train/test split and the tuner in this notebook also use seed=0.
          seed=0
)

### Hyperparameter Tuning

In [None]:
from pyspark.ml.tuning import ParamGridBuilder
# Candidate hyperparameters: four ALS ranks crossed with four regularisation
# strengths (the grid order below is kept as-is).
param_grid = (
    ParamGridBuilder()
    .addGrid(als.rank, [50, 75, 100, 125])
    .addGrid(als.regParam, [.1, .2, .3, .4])
    .build()
)

In [None]:
from pyspark.ml.evaluation import RegressionEvaluator
# RMSE between the `rating` label column and the `prediction` column.
evaluator = RegressionEvaluator(
    predictionCol='prediction',
    labelCol='rating',
    metricName='rmse',
)

In [None]:
from pyspark.ml.tuning import TrainValidationSplit
# Tune `als` over `param_grid` with a single train/validation split, scoring
# each candidate with `evaluator`, running up to two fits in parallel.
# (Disabled alternative: a 10-fold pyspark.ml.tuning.CrossValidator built from
# the same estimator, parameter grid and evaluator.)
tvs = TrainValidationSplit(
    estimator=als,
    estimatorParamMaps=param_grid,
    evaluator=evaluator,
    parallelism=2,
    seed=0,
)


In [None]:
%%time
# Fit the train/validation-split tuner on the training half; the fitted
# tuner model is kept in `tvs_model`. The cell is timed with %%time.
tvs_model = tvs.fit(train)

In [None]:
%%time
print("Test RMSE = ",evaluator.evaluate(best_model.transform(test)))

In [None]:
# Score the freshly tuned best model on the training split.
# `tvs_model.bestModel` is used directly because `best_model` is only
# assigned further down the notebook (after the model is reloaded from disk).
print("Train RMSE = ",evaluator.evaluate(tvs_model.bestModel.transform(train)))

In [None]:
# Location the tuned model is saved to and later reloaded from.
model_path = "file:///home/work/data/als_model_v2.0"

In [None]:
# Set to False to skip persisting the tuned model.
write_files = True
if write_files:
    # Overwrite any model already stored at this path: a plain save() fails
    # when the path exists, which would abort a re-run after the long fit and
    # leave a stale model on disk for the load step that follows.
    tvs_model.write().overwrite().save(model_path)

In [None]:
from pyspark.ml.tuning import TrainValidationSplitModel
# Reload the persisted tuner model from `model_path`.
tvsModelRead = TrainValidationSplitModel.load(model_path)

In [None]:
# Show the validation metrics stored on the reloaded tuner model.
# NOTE(review): presumably one RMSE per entry of `param_grid`, in grid
# order -- confirm against the Spark documentation.
tvsModelRead.validationMetrics

In [None]:
# explainParams() returns one multi-line string; print it so the newlines are
# rendered instead of showing the quoted repr with escaped "\n" sequences.
print(tvsModelRead.explainParams())

In [None]:
# Take the best model from the reloaded tuner model; the cells below use
# it for evaluation and for generating recommendations.
best_model = tvsModelRead.bestModel

In [None]:
# Score the reloaded best model on the training split.
train_predictions = best_model.transform(train)
print("Best Model Train RMSE = ", evaluator.evaluate(train_predictions))

In [None]:
%%time
print("Best Model Test RMSE = ",evaluator.evaluate(best_model.transform(test)))

In [None]:
# Ask the best model for five recommended items per user and preview the
# result.
recommendations = best_model.recommendForAllUsers(numItems=5)
recommendations.show()

In [None]:
# Flatten the per-user `recommendations` array: one output row per
# recommended item, with the struct's movieId and rating as plain columns.
exploded = recommendations.select(
    'userId',
    explode('recommendations').alias('recommendation'),
)
top_5_recommendations = exploded.select(
    'userId',
    'recommendation.movieId',
    'recommendation.rating',
)


In [None]:
# Recommendations for user 1, with each movie's genres attached.
# The genre lookup keeps exactly one row per movie. Joining straight against
# the per-rating rows would repeat every recommendation once for each rating
# of that movie, and looking movies up only in `test` would drop recommended
# movies that happen to have no rating in the test split.
movie_genres = ratings_df.select('movieId', 'genres').dropDuplicates(['movieId'])
top_5_recommendations.filter('userId = 1') \
                    .join(movie_genres, on='movieId') \
                    .sort('rating', ascending = False).show(10)

In [None]:
# Actual user preferences: user 1's highest-rated movies in the test split.
# `rating` is kept in the projection so the sort key is an ordinary visible
# column and the displayed rows show the values they are ordered by.
test.filter('userId = 1') \
    .select('userId', 'movieId', 'genres', 'rating') \
    .sort('rating', ascending = False).show(10)

In [None]:
# Stop the Spark session now that the notebook is finished with it.
spark.stop()