In [1]:
from pyspark.sql import SparkSession

# Create (or reuse) the Spark session for this notebook: local mode on all
# available cores ("local[*]") with the driver memory set to 2 GB.
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName("Spark ALS MovieLens")
    .config("spark.driver.memory", "2G")
    .getOrCreate()
)

In [2]:
# Column names and types for ratings.dat, declared up front so Spark does not
# need an extra pass over the ~10M-row file just to infer them. These are the
# same names/types that schema inference produced (see the printSchema()
# output in the next cell).
RATING_SCHEMA = "user INT, movie INT, rating DOUBLE, timestamp INT"

# ratings.dat is "::"-delimited with no header row.
rating_df = (
    spark.read
    .schema(RATING_SCHEMA)
    .option("delimiter", "::")
    .csv("./ml-10M100K/ratings.dat")
)

# Preview the first 20 rows, then report the total number of ratings.
rating_df.show()
rating_df.count()

+----+-----+------+---------+
|user|movie|rating|timestamp|
+----+-----+------+---------+
|   1|  122|   5.0|838985046|
|   1|  185|   5.0|838983525|
|   1|  231|   5.0|838983392|
|   1|  292|   5.0|838983421|
|   1|  316|   5.0|838983392|
|   1|  329|   5.0|838983392|
|   1|  355|   5.0|838984474|
|   1|  356|   5.0|838983653|
|   1|  362|   5.0|838984885|
|   1|  364|   5.0|838983707|
|   1|  370|   5.0|838984596|
|   1|  377|   5.0|838983834|
|   1|  420|   5.0|838983834|
|   1|  466|   5.0|838984679|
|   1|  480|   5.0|838983653|
|   1|  520|   5.0|838984679|
|   1|  539|   5.0|838984068|
|   1|  586|   5.0|838984068|
|   1|  588|   5.0|838983339|
|   1|  589|   5.0|838983778|
+----+-----+------+---------+
only showing top 20 rows



10000054

In [3]:
# Print the column names, types and nullability of the ratings DataFrame.
rating_df.printSchema()

root
 |-- user: integer (nullable = true)
 |-- movie: integer (nullable = true)
 |-- rating: double (nullable = true)
 |-- timestamp: integer (nullable = true)



In [4]:
# Load the movie catalogue ("::"-delimited, no header row), letting Spark infer
# the column types, and name the three columns.
movie_df = (
    spark.read
    .option("delimiter", "::")
    .option("inferSchema", True)
    .csv("./ml-10M100K/movies.dat")
    .toDF("movie", "title", "genres")
)

# Preview the first rows, then report how many movies are in the catalogue.
movie_df.show()
movie_df.count()

+-----+--------------------+--------------------+
|movie|               title|              genres|
+-----+--------------------+--------------------+
|    1|    Toy Story (1995)|Adventure|Animati...|
|    2|      Jumanji (1995)|Adventure|Childre...|
|    3|Grumpier Old Men ...|      Comedy|Romance|
|    4|Waiting to Exhale...|Comedy|Drama|Romance|
|    5|Father of the Bri...|              Comedy|
|    6|         Heat (1995)|Action|Crime|Thri...|
|    7|      Sabrina (1995)|      Comedy|Romance|
|    8| Tom and Huck (1995)|  Adventure|Children|
|    9| Sudden Death (1995)|              Action|
|   10|    GoldenEye (1995)|Action|Adventure|...|
|   11|American Presiden...|Comedy|Drama|Romance|
|   12|Dracula: Dead and...|       Comedy|Horror|
|   13|        Balto (1995)|  Animation|Children|
|   14|        Nixon (1995)|               Drama|
|   15|Cutthroat Island ...|Action|Adventure|...|
|   16|       Casino (1995)|         Crime|Drama|
|   17|Sense and Sensibi...|Comedy|Drama|Romance|


10681

In [5]:
# Number of distinct users that have submitted at least one rating.
(
    rating_df
    .select("user")
    .distinct()
    .count()
)

69878

In [6]:
# Number of distinct movies that have received at least one rating.
(
    rating_df
    .select("movie")
    .distinct()
    .count()
)

10677

In [7]:
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS

# Hold out roughly 20% of the ratings for evaluation. The seed is fixed so the
# same split (and therefore comparable metrics) is produced on every
# Restart & Run All; without it the split changes on each run.
(training, test) = rating_df.randomSplit([0.8, 0.2], seed=42)

In [8]:
# Configure the ALS matrix-factorisation recommender.
#   rank=50                  -> number of latent factors per user/movie
#   maxIter=10               -> number of ALS iterations
#   regParam=0.01            -> regularisation strength
#   coldStartStrategy="drop" -> drop rows whose prediction is NaN so that
#                               evaluation metrics stay finite
#   seed=42                  -> explicit random seed so the fitted model is
#                               repeatable across kernel restarts
als = ALS(
    rank=50,
    maxIter=10,
    regParam=0.01,
    userCol="user",
    itemCol="movie",
    ratingCol="rating",
    coldStartStrategy="drop",
    seed=42,
)

# Fit the model on the training split only.
als_model = als.fit(training)

In [9]:
# RMSE evaluator comparing the true "rating" column with the model's
# "prediction" column.
evaluate = RegressionEvaluator(
    predictionCol="prediction",
    labelCol="rating",
    metricName="rmse",
)

# Score the held-out ratings; this appends a "prediction" column to `test`.
predictions = als_model.transform(test)

In [10]:
# Preview the first 20 held-out rows with the predicted rating next to the actual one.
predictions.show()

+----+-----+------+----------+----------+
|user|movie|rating| timestamp|prediction|
+----+-----+------+----------+----------+
|  34| 1645|   3.0| 981824935| 3.5155551|
|  78| 1238|   4.0|1089332965|  4.644592|
| 193|  471|   2.0| 834884702| 3.0558429|
| 211| 2366|   3.0| 945880564| 3.8609197|
| 243| 1580|   4.0| 957754091| 3.3620322|
| 251| 1342|   3.0| 857935706|   2.97529|
| 368| 1088|   3.0|1079497543| 2.2806375|
| 368| 1591|   2.0| 959913362| 2.8712552|
| 375| 1580|   4.0| 945890762| 3.7205749|
| 463| 3175|   5.0| 959928331|  3.490912|
| 496| 2122|   5.0|1022347867| 4.6039705|
| 513| 2366|   3.0| 945892327| 3.8949018|
| 683| 1645|   2.0|1084463816|  3.086763|
| 683| 5803|   4.0|1084467353| 3.9383574|
| 898| 1580|   3.0| 948417027| 3.8344905|
|1084| 1645|   3.0| 941672162|  4.181403|
|1088| 2366|   5.0| 948472675| 3.2913835|
|1143| 1580|   5.0| 868401181| 4.3149858|
|1198| 1580|   3.0|1188661152|   2.99805|
|1265| 2142|   1.0| 942069714| 2.5040548|
+----+-----+------+----------+----

In [11]:
# Root-mean-square error between the "rating" and "prediction" columns of the
# held-out predictions (lower is better).
rmse = evaluate.evaluate(predictions)
print(rmse)

0.8789375752306133


In [12]:
# Top-10 movie recommendations for every user known to the model. Each row
# holds a user id and an array of (movie, predicted rating) structs.
user_recommendations = als_model.recommendForAllUsers(numItems=10)
user_recommendations.show()

+----+--------------------+
|user|     recommendations|
+----+--------------------+
|  34|[{4147, 5.187317}...|
|  53|[{63131, 7.019035...|
|  65|[{27648, 5.382845...|
|  78|[{4984, 5.602451}...|
|  85|[{6127, 7.8254423...|
| 108|[{6843, 6.453687}...|
| 137|[{1555, 5.0681005...|
| 148|[{7992, 7.0827208...|
| 155|[{43708, 5.447749...|
| 193|[{25856, 5.735428...|
| 211|[{4984, 6.5412374...|
| 243|[{2627, 7.3930717...|
| 251|[{25753, 6.182688...|
| 255|[{5607, 6.1743064...|
| 296|[{32444, 4.976081...|
| 321|[{4201, 6.6201468...|
| 322|[{1773, 7.9488263...|
| 362|[{7002, 7.9144716...|
| 368|[{5924, 5.3957434...|
| 375|[{4952, 7.8887124...|
+----+--------------------+
only showing top 20 rows



In [13]:
# Pick five distinct user ids to generate recommendations for.
# NOTE(review): limit() without an explicit ordering does not guarantee which
# five users are returned; later cells look at user 471 specifically.
users = (
    rating_df
    .select("user")
    .distinct()
    .limit(5)
)
users.show()

+----+
|user|
+----+
| 148|
| 463|
| 471|
| 496|
| 833|
+----+



In [14]:
# Top-10 recommendations restricted to the five sampled users; printed
# untruncated so every (movie, predicted rating) pair is visible.
user_sub_recommendations = als_model.recommendForUserSubset(users, numItems=10)
user_sub_recommendations.show(truncate=False)

+----+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|user|recommendations                                                                                                                                                                           |
+----+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|471 |[{55267, 7.451526}, {6522, 7.3697414}, {6784, 7.062359}, {4208, 6.839902}, {3319, 6.7705474}, {1685, 6.7576485}, {1420, 6.6014614}, {4438, 6.5824327}, {2158, 6.525417}, {6260, 6.492599}]|
|463 |[{1555, 5.591772}, {25961, 5.033029}, {593, 5.000145}, {260, 4.9220104}, {32444, 4.8941374}, {2762, 4.893883}, {7767, 4.8551598}, {31116, 4.8466377}, {1198, 4.82191}, {110, 4.821346}]   |
|833 |[{32444, 5.826062}, {638

In [15]:
# Fetch the first row as a Python Row to inspect the nested structure:
# each recommendation is a Row with `movie` and `rating` fields.
user_sub_recommendations.take(1)

[Row(user=471, recommendations=[Row(movie=55267, rating=7.451526165008545), Row(movie=6522, rating=7.369741439819336), Row(movie=6784, rating=7.062358856201172), Row(movie=4208, rating=6.839901924133301), Row(movie=3319, rating=6.770547389984131), Row(movie=1685, rating=6.757648468017578), Row(movie=1420, rating=6.601461410522461), Row(movie=4438, rating=6.582432746887207), Row(movie=2158, rating=6.525416851043701), Row(movie=6260, rating=6.492599010467529)])]

In [16]:
# Ratings actually submitted by user 471, for comparison with the
# recommendations generated for that user: preview them, then count them.
user_471_ratings = rating_df.where("user=471")
user_471_ratings.show()
user_471_ratings.count()

+----+-----+------+---------+
|user|movie|rating|timestamp|
+----+-----+------+---------+
| 471|    3|   5.0|834924367|
| 471|    5|   5.0|834924367|
| 471|   10|   3.0|834923999|
| 471|   31|   5.0|834924267|
| 471|   34|   4.0|834924068|
| 471|   48|   5.0|834924245|
| 471|  158|   5.0|834924215|
| 471|  236|   5.0|834924153|
| 471|  237|   5.0|834924302|
| 471|  300|   3.0|834924042|
| 471|  317|   3.0|834924068|
| 471|  329|   4.0|834923961|
| 471|  350|   5.0|834924233|
| 471|  356|   3.0|834924153|
| 471|  364|   4.0|834924124|
| 471|  410|   3.0|834924042|
| 471|  454|   3.0|834924098|
| 471|  457|   5.0|834924042|
| 471|  480|   3.0|834924187|
| 471|  500|   2.0|834924280|
+----+-----+------+---------+
only showing top 20 rows



22

In [17]:
# Show up to 30 rows so that all 22 of user 471's ratings are listed
# (the default show() stops at 20 rows).
rating_df.where("user=471").show(30)

+----+-----+------+---------+
|user|movie|rating|timestamp|
+----+-----+------+---------+
| 471|    3|   5.0|834924367|
| 471|    5|   5.0|834924367|
| 471|   10|   3.0|834923999|
| 471|   31|   5.0|834924267|
| 471|   34|   4.0|834924068|
| 471|   48|   5.0|834924245|
| 471|  158|   5.0|834924215|
| 471|  236|   5.0|834924153|
| 471|  237|   5.0|834924302|
| 471|  300|   3.0|834924042|
| 471|  317|   3.0|834924068|
| 471|  329|   4.0|834923961|
| 471|  350|   5.0|834924233|
| 471|  356|   3.0|834924153|
| 471|  364|   4.0|834924124|
| 471|  410|   3.0|834924042|
| 471|  454|   3.0|834924098|
| 471|  457|   5.0|834924042|
| 471|  480|   3.0|834924187|
| 471|  500|   2.0|834924280|
| 471|  589|   3.0|834924233|
| 471|  595|   3.0|834923961|
+----+-----+------+---------+



In [18]:
# Re-display the untruncated recommendations for the sampled users
# (same call as earlier in the notebook), next to user 471's ratings shown above.
user_sub_recommendations.show(truncate=False)

+----+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|user|recommendations                                                                                                                                                                           |
+----+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|471 |[{55267, 7.451526}, {6522, 7.3697414}, {6784, 7.062359}, {4208, 6.839902}, {3319, 6.7705474}, {1685, 6.7576485}, {1420, 6.6014614}, {4438, 6.5824327}, {2158, 6.525417}, {6260, 6.492599}]|
|463 |[{1555, 5.591772}, {25961, 5.033029}, {593, 5.000145}, {260, 4.9220104}, {32444, 4.8941374}, {2762, 4.893883}, {7767, 4.8551598}, {31116, 4.8466377}, {1198, 4.82191}, {110, 4.821346}]   |
|833 |[{32444, 5.826062}, {638

In [19]:
import pyspark.sql.functions as fn

# Flatten the recommendations array: one output row per (user, recommended
# movie) pair, with the {movie, rating} struct kept in "moving_rating".
recommendation_struct = fn.explode(fn.col("recommendations")).alias("moving_rating")
user_movies = user_sub_recommendations.select(fn.col("user"), recommendation_struct)
user_movies.show()

+----+------------------+
|user|     moving_rating|
+----+------------------+
| 471| {55267, 7.451526}|
| 471| {6522, 7.3697414}|
| 471|  {6784, 7.062359}|
| 471|  {4208, 6.839902}|
| 471| {3319, 6.7705474}|
| 471| {1685, 6.7576485}|
| 471| {1420, 6.6014614}|
| 471| {4438, 6.5824327}|
| 471|  {2158, 6.525417}|
| 471|  {6260, 6.492599}|
| 463|  {1555, 5.591772}|
| 463| {25961, 5.033029}|
| 463|   {593, 5.000145}|
| 463|  {260, 4.9220104}|
| 463|{32444, 4.8941374}|
| 463|  {2762, 4.893883}|
| 463| {7767, 4.8551598}|
| 463|{31116, 4.8466377}|
| 463|   {1198, 4.82191}|
| 463|   {110, 4.821346}|
+----+------------------+
only showing top 20 rows



In [20]:
# Re-display the movie catalogue (movie id, title, genres) ahead of joining it
# to the recommendations.
movie_df.show()

+-----+--------------------+--------------------+
|movie|               title|              genres|
+-----+--------------------+--------------------+
|    1|    Toy Story (1995)|Adventure|Animati...|
|    2|      Jumanji (1995)|Adventure|Childre...|
|    3|Grumpier Old Men ...|      Comedy|Romance|
|    4|Waiting to Exhale...|Comedy|Drama|Romance|
|    5|Father of the Bri...|              Comedy|
|    6|         Heat (1995)|Action|Crime|Thri...|
|    7|      Sabrina (1995)|      Comedy|Romance|
|    8| Tom and Huck (1995)|  Adventure|Children|
|    9| Sudden Death (1995)|              Action|
|   10|    GoldenEye (1995)|Action|Adventure|...|
|   11|American Presiden...|Comedy|Drama|Romance|
|   12|Dracula: Dead and...|       Comedy|Horror|
|   13|        Balto (1995)|  Animation|Children|
|   14|        Nixon (1995)|               Drama|
|   15|Cutthroat Island ...|Action|Adventure|...|
|   16|       Casino (1995)|         Crime|Drama|
|   17|Sense and Sensibi...|Comedy|Drama|Romance|


In [21]:
# Attach title and genres to each recommendation by matching the movie id
# inside the "moving_rating" struct against the catalogue's "movie" column.
movie_id_matches = user_movies["moving_rating"]["movie"] == movie_df["movie"]
user_movies_title = user_movies.join(movie_df, on=movie_id_matches)
user_movies_title.show()

+----+------------------+-----+--------------------+--------------------+
|user|     moving_rating|movie|               title|              genres|
+----+------------------+-----+--------------------+--------------------+
| 471| {55267, 7.451526}|55267|Dan in Real Life ...|Comedy|Drama|Romance|
| 471| {6522, 7.3697414}| 6522|Man's Favorite Sp...|              Comedy|
| 471|  {6784, 7.062359}| 6784|Song Remains the ...| Documentary|Musical|
| 471|  {4208, 6.839902}| 4208|  Unmade Beds (1997)|         Documentary|
| 471| {3319, 6.7705474}| 3319|  Judy Berlin (1999)|               Drama|
| 471| {1685, 6.7576485}| 1685|I Love You, I Lov...|       Drama|Romance|
| 471| {1420, 6.6014614}| 1420|Message to Love: ...|         Documentary|
| 471| {4438, 6.5824327}| 4438|Chinese Connectio...|              Action|
| 471|  {2158, 6.525417}| 2158|Henry: Portrait o...|        Crime|Horror|
| 471|  {6260, 6.492599}| 6260|    Robe, The (1953)|               Drama|
| 463|  {1555, 5.591772}| 1555|To Have