In [66]:
import findspark
findspark.init()

from pyspark.sql import SparkSession
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS
from pyspark.sql.functions import col
from pyspark.ml.tuning import TrainValidationSplit, ParamGridBuilder
import pandas as pd

# Define user rating columns :
user_rating = ['userId', 'movieId', 'rating']

# Create a Spark session
# spark = SparkSession.builder.appName("MovieRecommendation").getOrCreate()

spark = SparkSession.builder \
    .appName("MovieRecommendation") \
    .getOrCreate()

# Read data from JSON file into a PySpark DataFrame :
movie_ratings_spark = spark.read.json('movies.json')

# Select relevant columns from the DataFrame :
movie_ratings_spark = movie_ratings_spark.select(user_rating)

In [67]:
movie_ratings_spark.show()

+------+-------+------+
|userId|movieId|rating|
+------+-------+------+
|   244|      1|     4|
|   298|      1|     5|
|   253|      1|     5|
|   305|      1|     5|
|     6|      1|     4|
|    62|      1|     2|
|   286|      1|     4|
|   200|      1|     5|
|   210|      1|     5|
|   303|      1|     5|
|   194|      1|     4|
|   291|      1|     5|
|   234|      1|     3|
|   299|      1|     3|
|   308|      1|     4|
|    95|      1|     5|
|    38|      1|     5|
|   102|      1|     3|
|    63|      1|     3|
|   160|      1|     4|
+------+-------+------+
only showing top 20 rows



In [68]:
# Create test and train set :
(training, test) = movie_ratings_spark.randomSplit([0.8, 0.2])
                                             
als = ALS(maxIter=10, regParam=0.01, userCol="userId", itemCol="movieId", ratingCol="rating", coldStartStrategy="drop")
model = als.fit(training)

In [69]:
# Evaluate the model by computing the Root Mean Squared Error (RMSE) on the test data :
predictions = model.transform(test)
evaluator = RegressionEvaluator(metricName="rmse", labelCol="rating", predictionCol="prediction")
rmse = evaluator.evaluate(predictions)
print(f"Root Mean Squared Error (RMSE): {rmse}")

Root Mean Squared Error (RMSE): 1.0834710745689005


In [70]:
user_id = 55
selected_user_df = movie_ratings_spark.filter(col('userId') == user_id)

user_recommendations = model.recommendForUserSubset(selected_user_df, 5)
user_recommendations.show(truncate=False)

+------+-----------------------------------------------------------------------------------------------+
|userId|recommendations                                                                                |
+------+-----------------------------------------------------------------------------------------------+
|55    |[{1643, 13.916194}, {874, 13.537904}, {1107, 12.0650835}, {904, 11.6401415}, {1176, 11.632163}]|
+------+-----------------------------------------------------------------------------------------------+



In [75]:
# Save the ALS model
model_path = "./best_model"
# model.save(model_path)

# # Load the ALS model later
# loaded_model = ALS.load(model_path)

model.write().save(model_path)

In [78]:
from pyspark.sql import SparkSession
from pyspark.ml.recommendation import ALSModel

# Create a Spark session
spark = SparkSession.builder.appName("ModelUsage").getOrCreate()

# Specify the path where the ALS model is saved
model_path = "./best_model"

# Load the ALS model
loaded_model = ALSModel.load(model_path)


user_id = 55
selected_user_df = movie_ratings_spark.filter(col('userId') == user_id)

selected_user_df.show()

# user_recommendations = model.recommendForUserSubset(selected_user_df, 5)
# user_recommendations.show(truncate=False)


+------+-------+------+
|userId|movieId|rating|
+------+-------+------+
|    55|      7|     3|
|    55|     22|     5|
|    55|     50|     4|
|    55|     56|     4|
|    55|     79|     5|
|    55|     89|     5|
|    55|    117|     3|
|    55|    118|     5|
|    55|    121|     3|
|    55|    144|     5|
|    55|    174|     4|
|    55|    181|     4|
|    55|    254|     2|
|    55|    257|     3|
|    55|    273|     5|
|    55|    405|     1|
|    55|    597|     2|
|    55|    678|     3|
|    55|    685|     1|
|    55|   1016|     1|
+------+-------+------+
only showing top 20 rows



## All Code :

In [64]:
import findspark
findspark.init()

from pyspark.sql import SparkSession
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS
from pyspark.sql.functions import col
from pyspark.ml.tuning import TrainValidationSplit, ParamGridBuilder
import pandas as pd

# Define user rating columns :
user_rating = ['userId', 'movieId', 'rating']

# Create a Spark session
# spark = SparkSession.builder.appName("MovieRecommendation").getOrCreate()

spark = SparkSession.builder \
    .appName("MovieRecommendation") \
    .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer") \
    .getOrCreate()

# Read data from JSON file into a PySpark DataFrame :
movie_ratings_spark = spark.read.json('movies.json')

# Select relevant columns from the DataFrame :
movie_ratings_spark = movie_ratings_spark.select(user_rating)

# Create test and train set :
(training, test) = movie_ratings_spark.randomSplit([0.8, 0.2])
                                             
als = ALS(maxIter=10, regParam=0.01, userCol="userId", itemCol="movieId", ratingCol="rating", coldStartStrategy="drop")

model = als.fit(training)

# Evaluate the model by computing the Root Mean Squared Error (RMSE) on the test data :
predictions = model.transform(test)
evaluator = RegressionEvaluator(metricName="rmse", labelCol="rating", predictionCol="prediction")
rmse = evaluator.evaluate(predictions)
print(f"Root Mean Squared Error (RMSE): {rmse}")

user_id = 55
selected_user_df = movie_ratings_spark.filter(col('userId') == user_id)

user_recommendations = model.recommendForUserSubset(selected_user_df, 5)
user_recommendations.show(truncate=False)

model_path = "./best_model"

# Save the model
model.write().save(model_path)

# Load the model later
loaded_model = ALS.load(model_path)

Root Mean Squared Error (RMSE): 1.0837593838261739
+------+-----------------------------------------------------------------------------------------+
|userId|recommendations                                                                          |
+------+-----------------------------------------------------------------------------------------+
|55    |[{353, 8.178904}, {589, 8.003287}, {1022, 7.9188027}, {522, 7.9180393}, {466, 7.7695637}]|
+------+-----------------------------------------------------------------------------------------+



Py4JJavaError: An error occurred while calling o7369.load.
: java.lang.NoSuchMethodException: org.apache.spark.ml.recommendation.ALSModel.<init>(java.lang.String)
	at java.lang.Class.getConstructor0(Class.java:3082)
	at java.lang.Class.getConstructor(Class.java:1825)
	at org.apache.spark.ml.util.DefaultParamsReader.load(ReadWrite.scala:468)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.ClientServerConnection.waitForCommands(ClientServerConnection.java:182)
	at py4j.ClientServerConnection.run(ClientServerConnection.java:106)
	at java.lang.Thread.run(Thread.java:748)
