In [1]:
# Use the MovieLens dataset to build a recommender system
from pyspark.sql import SparkSession

In [2]:
# Create the Spark session for this notebook, or reuse one that is already running.
spark = (
    SparkSession.builder
    .appName('rec')
    .getOrCreate()
)

In [3]:
from pyspark.ml.recommendation import ALS
from pyspark.ml.evaluation import RegressionEvaluator

In [4]:
# Load the ratings CSV: the first row supplies the column names and Spark
# infers each column's type from the data.
# NOTE(review): the path is relative — confirm it resolves in the target environment.
data = spark.read.csv(
    'FileStore/tables/movielens_ratings.csv',
    header=True,
    inferSchema=True,
)

In [5]:
# Preview the first rows of the ratings table (20 is show()'s default row count).
data.show(n=20)

In [6]:
# Summary statistics for the columns of the ratings table.
summary_stats = data.describe()
summary_stats.show()

In [7]:
# Train/test split: roughly 80% of the ratings for training, 20% held out
# for evaluation.
# A fixed seed makes the split repeatable for the same input data, so the
# metrics computed later do not change from one run to the next.
training, test = data.randomSplit([0.8, 0.2], seed=42)

In [8]:
# Alternating Least Squares collaborative-filtering estimator.
# coldStartStrategy='drop' removes rows whose user or movie did not appear in
# the training split; by default ALS predicts NaN for them, which makes the
# RMSE computed over the predictions NaN as well (needs Spark >= 2.2).
# seed fixes the random initialisation so repeated runs fit the same model.
als = ALS(
    maxIter=5,
    regParam=0.01,
    userCol='userId',
    itemCol='movieId',
    ratingCol='rating',
    coldStartStrategy='drop',
    seed=42,
)

In [9]:
# Fit the ALS estimator on the training split.
model = als.fit(dataset=training)

In [10]:
# Score the held-out ratings; the result carries a 'prediction' column.
prediction = model.transform(dataset=test)

In [11]:
# 'rating' is the rating the user actually gave the movie;
# 'prediction' is the rating the model predicted for that user/movie pair.
# Comparing the two columns gives a first impression of how well (or badly)
# the model performs.
prediction.show(n=20)

In [12]:
# Formal evaluation: root-mean-square error between the actual 'rating'
# column and the model's 'prediction' column.
evaluator = RegressionEvaluator(
    labelCol='rating',
    predictionCol='prediction',
    metricName='rmse',
)

In [13]:
# RMSE of the model over the held-out test predictions.
rmse = evaluator.evaluate(dataset=prediction)

In [14]:
# Author's note: star ratings run from 1 to 5, so the RMSE printed here was
# judged too large. Given how small the dataset is, it is not surprising
# that the model performs this poorly.
print('RMSE', rmse, sep='\n')

In [15]:
# How do we recommend a movie to a single user?
# Take that user's (movieId, userId) pairs from the held-out split as the
# candidate movies to score.
# The column is spelled 'userId', matching the rest of the notebook; the
# lowercase spelling only resolves while Spark's column lookup is
# case-insensitive.
target_user_id = 11
single_user = (
    test
    .filter(test['userId'] == target_user_id)
    .select(['movieId', 'userId'])
)

In [16]:
# Idea: predict the user's rating for movies he/she has not seen yet and
# recommend the one with the highest predicted rating.
# NOTE(review): these candidates come from the test split, i.e. movies this
# user has in fact rated; they stand in for unseen movies here.
single_user.show(n=20)

In [17]:

# Predict this user's rating for each candidate movie.
recommendations = model.transform(dataset=single_user)

In [18]:
# Best candidates first: sort by predicted rating, highest to lowest.
by_predicted_rating = recommendations['prediction'].desc()
recommendations.orderBy(by_predicted_rating).show()

In [19]:
# Cold-start problem: users who have not watched any movie on this platform yet.
# One option is a survey: ask them which movies they like.
# Another is to ask which typical customer profile they are closest to.
# In general, though, the cold-start problem remains a challenge in recommender systems.