In [1]:
!pip install pyspark

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [2]:
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS
from pyspark.sql import Row
from pyspark.context import SparkContext
# Import Apache Spark SQL
from pyspark.sql import SparkSession

# Create Spark Session/Context
# We are using local machine with all the CPU cores [*]
spark = SparkSession.builder \
    .master("local[*]") \
    .appName("Hello Pyspark") \
    .config("spark.some.config.option", "some-value") \
    .getOrCreate()
lines = spark.read.text("sample_movielens_ratings.txt").rdd
parts = lines.map(lambda row: row.value.split("::"))
ratingsRDD = parts.map(lambda p: Row(userId=int(p[0]), movieId=int(p[1]),
                                     rating=float(p[2]), timestamp=int(p[3])))
ratings = spark.createDataFrame(ratingsRDD)
(training, test) = ratings.randomSplit([0.8, 0.2])

In [3]:
# Build the recommendation model using ALS on the training data
# Note we set cold start strategy to 'drop' to ensure we don't get NaN evaluation metrics
als = ALS(maxIter=20, regParam=1, userCol="userId", itemCol="movieId", ratingCol="rating",
          coldStartStrategy="drop")
model = als.fit(training)

In [4]:
# Evaluate the model by computing the RMSE on the test data
predictions = model.transform(test)
evaluator = RegressionEvaluator(metricName="rmse", labelCol="rating",
                                predictionCol="prediction")
rmse = evaluator.evaluate(predictions)
print("Root-mean-square error = " + str(rmse))

Root-mean-square error = 1.5277466600446912


In [5]:
# Build the recommendation model using ALS on the training data
# Note we set cold start strategy to 'drop' to ensure we don't get NaN evaluation metrics
als2 = ALS(maxIter=20, regParam=0.1, userCol="userId", itemCol="movieId", ratingCol="rating",
          coldStartStrategy="drop")
model2 = als2.fit(training)
# Evaluate the model by computing the RMSE on the test data
predictions2 = model2.transform(test)
evaluator2 = RegressionEvaluator(metricName="rmse", labelCol="rating",
                                predictionCol="prediction")
rmse = evaluator2.evaluate(predictions2)
print("Root-mean-square error = " + str(rmse))

Root-mean-square error = 1.0638611684649135


In [6]:
# Build the recommendation model using ALS on the training data
# Note we set cold start strategy to 'drop' to ensure we don't get NaN evaluation metrics
als3 = ALS(maxIter=10, regParam=0.5, userCol="userId", itemCol="movieId", ratingCol="rating",
          coldStartStrategy="drop")
model3 = als3.fit(training)
# Evaluate the model by computing the RMSE on the test data
predictions3 = model3.transform(test)
evaluator3 = RegressionEvaluator(metricName="rmse", labelCol="rating",
                                predictionCol="prediction")
rmse = evaluator3.evaluate(predictions3)
print("Root-mean-square error = " + str(rmse))

Root-mean-square error = 1.2575275346967845


In [7]:
# Build the recommendation model using ALS on the training data
# Note we set cold start strategy to 'drop' to ensure we don't get NaN evaluation metrics
als4 = ALS(maxIter=5, regParam=1.0, userCol="userId", itemCol="movieId", ratingCol="rating",
          coldStartStrategy="drop")
model4 = als4.fit(training)
# Evaluate the model by computing the RMSE on the test data
predictions4 = model4.transform(test)
evaluator4 = RegressionEvaluator(metricName="rmse", labelCol="rating",
                                predictionCol="prediction")
rmse = evaluator4.evaluate(predictions4)
print("Root-mean-square error = " + str(rmse))

Root-mean-square error = 1.5277441005627375


In [12]:
# Build the recommendation model using ALS on the training data
# Note we set cold start strategy to 'drop' to ensure we don't get NaN evaluation metrics
als2 = ALS(maxIter=20, regParam=0.5, userCol="userId", itemCol="movieId", ratingCol="rating",
          coldStartStrategy="drop")
model2 = als2.fit(training)
# Evaluate the model by computing the RMSE on the test data
predictions2 = model2.transform(test)
evaluator2 = RegressionEvaluator(metricName="rmse", labelCol="rating",
                                predictionCol="prediction")
rmse = evaluator2.evaluate(predictions2)
print("Root-mean-square error = " + str(rmse))

Root-mean-square error = 1.257690528002235


In [8]:
# Generate top 10 movie recommendations for each user
userRecs = model.recommendForAllUsers(10)
userRecs.head()

Row(userId=20, recommendations=[Row(movieId=32, rating=1.4831053018569946), Row(movieId=49, rating=1.3540387153625488), Row(movieId=94, rating=1.3452517986297607), Row(movieId=30, rating=1.2596983909606934), Row(movieId=90, rating=1.2493435144424438), Row(movieId=18, rating=1.2146931886672974), Row(movieId=52, rating=1.2136410474777222), Row(movieId=68, rating=1.1748870611190796), Row(movieId=27, rating=1.162322759628296), Row(movieId=62, rating=1.1508246660232544)])

In [9]:
# Generate top 10 user recommendations for each movie
movieRecs = model.recommendForAllItems(10)
movieRecs.head()

Row(movieId=20, recommendations=[Row(userId=26, rating=0.5942370295524597), Row(userId=23, rating=0.583170473575592), Row(userId=11, rating=0.5588732361793518), Row(userId=22, rating=0.5500363707542419), Row(userId=24, rating=0.5025194883346558), Row(userId=8, rating=0.49650076031684875), Row(userId=12, rating=0.4912295937538147), Row(userId=3, rating=0.4894353449344635), Row(userId=2, rating=0.4868123531341553), Row(userId=20, rating=0.47705399990081787)])

In [10]:
# Generate top 10 movie recommendations for a specified set of users
users = ratings.select(als.getUserCol()).distinct().limit(3)
userSubsetRecs = model.recommendForUserSubset(users, 10)
userSubsetRecs.head()

Row(userId=26, recommendations=[Row(movieId=32, rating=1.8474136590957642), Row(movieId=49, rating=1.6866432428359985), Row(movieId=94, rating=1.6756980419158936), Row(movieId=30, rating=1.5691293478012085), Row(movieId=90, rating=1.556230902671814), Row(movieId=18, rating=1.5130691528320312), Row(movieId=52, rating=1.5117586851119995), Row(movieId=68, rating=1.4634851217269897), Row(movieId=27, rating=1.4478346109390259), Row(movieId=62, rating=1.4335118532180786)])

In [11]:
# Generate top 10 user recommendations for a specified set of movies
movies = ratings.select(als.getItemCol()).distinct().limit(3)
movieSubSetRecs = model.recommendForItemSubset(movies, 10)
movieSubSetRecs.head()

Row(movieId=65, recommendations=[Row(userId=26, rating=1.0939332246780396), Row(userId=23, rating=1.0735608339309692), Row(userId=11, rating=1.028831958770752), Row(userId=22, rating=1.0125640630722046), Row(userId=24, rating=0.9250901341438293), Row(userId=8, rating=0.9140101671218872), Row(userId=12, rating=0.9043064117431641), Row(userId=3, rating=0.9010034799575806), Row(userId=2, rating=0.8961748480796814), Row(userId=20, rating=0.8782106041908264)])

# Best parameters
According to the RMSE the best parameters is 20 for max Iter and 0.1 for reg param with the rmse of 1.06


In [13]:
# Build the recommendation model using ALS on the training data
# Note we set cold start strategy to 'drop' to ensure we don't get NaN evaluation metrics
als2 = ALS(maxIter=20, regParam=0.1, userCol="userId", itemCol="movieId", ratingCol="rating",
          coldStartStrategy="drop")
model2 = als2.fit(training)
# Evaluate the model by computing the RMSE on the test data
predictions2 = model2.transform(test)
evaluator2 = RegressionEvaluator(metricName="rmse", labelCol="rating",
                                predictionCol="prediction")
rmse = evaluator2.evaluate(predictions2)
print("Root-mean-square error = " + str(rmse))

Root-mean-square error = 1.0638611684649135
