In [1]:
from pyspark.sql import SparkSession
# Reuse the active Spark session if there is one, otherwise start a new one
spark = (
    SparkSession.builder
    .getOrCreate()
)
# SparkContext is the RDD entry point used by the cells below (sc.textFile)
sc = spark.sparkContext
# Last expression of the cell: shows the Spark version this notebook ran against
sc.version

'3.3.0'

In [2]:
from pyspark.mllib.recommendation import Rating
from pyspark.mllib.recommendation import ALS

In [8]:
# Read the ratings file as an RDD holding one string per line of text
data = sc.textFile('ml_ratings.txt')
# Break every line into its '::'-separated fields (all still strings here)
ratings = data.map(lambda row: row.split('::'))

In [9]:
# Turn the string fields into typed Rating(user, product, rating) records
ratings_final = ratings.map(
    lambda fields: Rating(
        user=int(fields[0]),
        product=int(fields[1]),
        rating=float(fields[2]),
    )
)

# Hold out roughly 20% of the ratings for evaluation; the seed is fixed on the split
training_data, test_data = ratings_final.randomSplit(weights=[0.8, 0.2], seed=42)

In [10]:
# Fit an ALS matrix-factorisation model on the training split.
# rank=5 is the number of latent factors, iterations=5 the number of ALS passes.
# The seed pins the random initialisation of the factors: without it the model,
# the predictions and the reported error differ on every run, even though the
# train/test split itself is already seeded.
model = ALS.train(training_data, rank=5, iterations=5, seed=42)

# Keep only the (user, product) key of each held-out record so the true
# rating is not handed to the model
testdata_no_rating = test_data.map(lambda r: (r.user, r.product))

# Score the held-out (user, product) pairs with the trained model
predictions = model.predictAll(testdata_no_rating)

In [11]:

# Peek at five predictions instead of collecting the whole RDD to the driver
predictions.take(num=5)

[Rating(user=12, product=84, rating=1.456569179183237),
 Rating(user=20, product=84, rating=0.8070410357049052),
 Rating(user=6, product=84, rating=0.9351724925626016),
 Rating(user=24, product=96, rating=3.217497187079341),
 Rating(user=12, product=96, rating=-0.91870471898241)]

In [12]:
# Key the held-out ratings by (user, product).
# Only the test split is used here: joining against every rating would push the
# training rows through the shuffle for nothing, and any (user, product) key that
# occurs in both splits would leak a training rating into the "test" error.
rates = test_data.map(lambda r: ((r[0], r[1]), r[2]))

# Key the predicted ratings the same way so the two RDDs can be joined
preds = predictions.map(lambda r: ((r[0], r[1]), r[2]))

# Inner join -> ((user, product), (actual, predicted)); keys without a
# prediction simply drop out of the result
rates_and_preds = rates.join(preds)

# Mean squared error over the joined (actual, predicted) pairs
MSE = rates_and_preds.map(lambda r: (r[1][0] - r[1][1])**2).mean()
print("Mean Squared Error of the model for the test data = {:.2f}".format(MSE))

Mean Squared Error of the model for the test data = 2.29
