In [2]:
!pip install pyspark

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pyspark
  Downloading pyspark-3.3.2.tar.gz (281.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m281.4/281.4 MB[0m [31m4.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting py4j==0.10.9.5
  Downloading py4j-0.10.9.5-py2.py3-none-any.whl (199 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m199.7/199.7 kB[0m [31m16.7 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.3.2-py2.py3-none-any.whl size=281824028 sha256=a359eb3b266803f990cf03338d56a5bdf94475a1d55d2f05a8dde8d1c4a8ae8f
  Stored in directory: /root/.cache/pip/wheels/6c/e3/9b/0525ce8a69478916513509d43693511463c6468db0de237c86
Successfully built pyspark
Installing collected packages: py4j, pyspa

In [5]:
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS
from pyspark.sql import Row, SparkSession

In [6]:
# Create a SparkSession
spark = SparkSession.builder \
        .master("local") \
        .appName("myApp") \
        .config("spark.sql.session.timeZone", "UTC") \
        .config("spark.sql.session.timeout", "48h") \
        .getOrCreate()

In [9]:
lines = spark.read.text("./sample_data/sample_movielens_ratings.txt").rdd
parts = lines.map(lambda row: row.value.split("::"))
ratingsRDD = parts.map(lambda p: Row(userId=int(p[0]), movieId=int(p[1]),
                                     rating=float(p[2]), timestamp=int(p[3])))
ratings = spark.createDataFrame(ratingsRDD)
(training, test) = ratings.randomSplit([0.8, 0.2])

In [20]:
max_iters = [5, 10, 20]
reg_params = [0.1, 0.5, 1.0]

In [24]:
# Build the recommendation model using ALS on the training data
# Note we set cold start strategy to 'drop' to ensure we don't get NaN evaluation metrics
for max_iter in max_iters:
  for reg_param in reg_params:
    als = ALS(maxIter=max_iter, regParam=reg_param, userCol="userId", itemCol="movieId", ratingCol="rating",
              coldStartStrategy="drop")
    model = als.fit(training)
    # Evaluate the model by computing the RMSE on the test data
    predictions = model.transform(test)
    evaluator = RegressionEvaluator(metricName="rmse", labelCol="rating",
                                    predictionCol="prediction")
    rmse = evaluator.evaluate(predictions)
    print("Root-mean-square error for max_iter="+ str(max_iter) + " and reg_param=" + str(reg_param) + " is " + str(rmse))

Root-mean-square error for max_iter=5 and reg_param=0.1 is 1.0076346872213067
Root-mean-square error for max_iter=5 and reg_param=0.5 is 1.1860752829689025
Root-mean-square error for max_iter=5 and reg_param=1.0 is 1.4561520789487696
Root-mean-square error for max_iter=10 and reg_param=0.1 is 0.9731089097261111
Root-mean-square error for max_iter=10 and reg_param=0.5 is 1.1850439989125914
Root-mean-square error for max_iter=10 and reg_param=1.0 is 1.4561520736969142
Root-mean-square error for max_iter=20 and reg_param=0.1 is 0.9601777435202717
Root-mean-square error for max_iter=20 and reg_param=0.5 is 1.1852264469153184
Root-mean-square error for max_iter=20 and reg_param=1.0 is 1.4561520814341633


As we can see, by set max_iter=20 and reg_param=0.1 will give us small rmse that is 0.9601777435202717.

In [25]:
# Use maxIter = 20 and regParam = 0.1
als = ALS(maxIter=20, regParam=0.1, userCol="userId", itemCol="movieId", ratingCol="rating",
          coldStartStrategy="drop")
model = als.fit(training)
# Evaluate the model by computing the RMSE on the test data
predictions = model.transform(test)
evaluator = RegressionEvaluator(metricName="rmse", labelCol="rating",
                                predictionCol="prediction")
rmse = evaluator.evaluate(predictions)
print("Root-mean-square error = " + str(rmse))

Root-mean-square error = 0.9601777435202717


In [30]:
# Generate top 10 movie recommendations for each user
userRecs = model.recommendForAllUsers(10)
userRecs.show()
# Generate top 10 user recommendations for each movie
movieRecs = model.recommendForAllItems(10)
movieRecs.show()

+------+--------------------+
|userId|     recommendations|
+------+--------------------+
|    20|[{22, 4.109164}, ...|
|    10|[{49, 2.5834336},...|
|     0|[{92, 3.2258756},...|
|     1|[{68, 3.158934}, ...|
|    21|[{29, 4.399133}, ...|
|    11|[{18, 4.786726}, ...|
|    12|[{46, 4.6337705},...|
|    22|[{74, 4.6178}, {8...|
|     2|[{93, 4.5418878},...|
|    13|[{93, 3.108034}, ...|
|     3|[{18, 3.4956603},...|
|    23|[{46, 4.954659}, ...|
|     4|[{29, 3.3519177},...|
|    24|[{29, 4.484409}, ...|
|    14|[{29, 4.66772}, {...|
|     5|[{46, 4.1608276},...|
|    15|[{46, 3.8676603},...|
|    25|[{25, 3.0491807},...|
|    26|[{94, 4.6550374},...|
|     6|[{25, 3.8814223},...|
+------+--------------------+
only showing top 20 rows

+-------+--------------------+
|movieId|     recommendations|
+-------+--------------------+
|     20|[{17, 4.167}, {5,...|
|     40|[{2, 3.4115524}, ...|
|     10|[{23, 3.3799043},...|
|     50|[{23, 3.7085476},...|
|     80|[{18, 2.8414414},...|
|     

In [31]:
# Generate top 10 movie recommendations for a specified set of users
users = ratings.select(als.getUserCol()).distinct().limit(3)
userSubsetRecs = model.recommendForUserSubset(users, 10)
userSubsetRecs.show()

+------+--------------------+
|userId|     recommendations|
+------+--------------------+
|    26|[{94, 4.6550374},...|
|    19|[{94, 3.4853394},...|
|    29|[{46, 4.2793827},...|
+------+--------------------+



In [32]:
# Generate top 10 user recommendations for a specified set of movies
movies = ratings.select(als.getItemCol()).distinct().limit(3)
movieSubSetRecs = model.recommendForItemSubset(movies, 10)
movieSubSetRecs.show()

+-------+--------------------+
|movieId|     recommendations|
+-------+--------------------+
|     65|[{23, 4.140495}, ...|
|     26|[{28, 2.4152143},...|
|     29|[{8, 4.8092146}, ...|
+-------+--------------------+

