In [1]:
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS
from pyspark.sql import Row
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, FloatType,LongType, IntegerType

In [2]:
# Obtain (or reuse) the Spark session for this notebook.
spark = (
    SparkSession.builder
    .appName("MovieRecommendation")
    .getOrCreate()
)

# Explicit schema for the ratings records: four nullable columns, in file order.
# Declaring the types up front avoids schema inference on read.
movieSchema = StructType(
    [
        StructField("userId", IntegerType(), True),
        StructField("movieId", IntegerType(), True),
        StructField("rating", FloatType(), True),
        StructField("timestamp", LongType(), True),
    ]
)


In [3]:
# Load the "::"-delimited ratings file using the explicit schema defined above.
# NOTE(review): a multi-character separator presumably requires Spark 3.0+ —
# confirm against the cluster's Spark version.
ratings = (
    spark.read
    .schema(movieSchema)
    .option("sep", "::")
    .csv("./ratings.dat")
)

In [4]:
# Split the ratings 60/40 into training and testing sets.
# A fixed seed makes the split repeatable, so the metric reported further
# down can be reproduced after "Restart Kernel -> Run All".
training, testing = ratings.randomSplit([0.6, 0.4], seed=42)

In [5]:
# Configure the ALS collaborative-filtering estimator.
# - coldStartStrategy="drop" discards rows whose prediction would be NaN
#   (users/items absent from the training split), so the evaluation metric
#   stays finite.
# - seed pins the random initialisation of the factor matrices so that the
#   fitted model, and therefore the reported error, is reproducible.
als = ALS(
    maxIter=8,
    regParam=0.01,
    userCol="userId",
    itemCol="movieId",
    ratingCol="rating",
    coldStartStrategy="drop",
    seed=42,
)

# Fit the factorisation on the training split only.
model = als.fit(training)

In [6]:
# Score the held-out ratings with the fitted model.
predictions = model.transform(testing)

# The evaluator compares the "rating" label with the "prediction" column
# and reports root-mean-square error.
evaluator = RegressionEvaluator(
    labelCol="rating",
    predictionCol="prediction",
    metricName="rmse",
)
rmse = evaluator.evaluate(predictions)

# Squaring the RMSE gives the mean-square error, which is what is reported.
print(f"Mean-Square-Error: {rmse * rmse}")

Mean-Square-Error: 0.8623139964250223
