In [0]:
import os

from pyspark.sql import SparkSession

# --- Spark session / MongoDB connection --------------------------------------
# The MongoDB URI embeds a username and password, so it is read from the
# environment instead of being stored in the notebook.
# NOTE(review): the credentials that used to be hardcoded on this line have been
# exposed to anyone with access to the notebook history and should be rotated.
connectionString = os.environ.get("MONGODB_URI")
if not connectionString:
    raise RuntimeError(
        "Set the MONGODB_URI environment variable to the MongoDB connection "
        "string before running this notebook."
    )

spark = (
    SparkSession.builder
    .config('spark.mongodb.input.uri', connectionString)
    .config('spark.mongodb.output.uri', connectionString)
    .config('spark.jars.packages', 'org.mongodb.spark:mongo-spark-connector_2.11:2.3.1')
    .getOrCreate()
)

# Later cells refer to the SparkContext as `sc`. Bind it explicitly so the
# notebook does not rely on the runtime pre-defining that name.
sc = spark.sparkContext


def _load_ratings(collection, pipeline):
    """Read one collection of the `movielens` database as a DataFrame.

    Args:
        collection: name of the MongoDB collection to read.
        pipeline: aggregation pipeline (as a JSON-like string) passed to the
            MongoDB Spark connector.

    Returns:
        A DataFrame restricted to the columns (userId, movieId, rating),
        in that order.
    """
    return (
        spark.read.format("com.mongodb.spark.sql.DefaultSource")
        .option("uri", connectionString)
        .option("database", "movielens")
        .option("collection", collection)
        .option("pipeline", pipeline)
        .load()
        .select('userId', 'movieId', 'rating')
    )


# --- Reading from MongoDB -----------------------------------------------------
# Both pipelines drop the `_id` and `timestamp` fields on the MongoDB side.
pipe_ratings = "{'$project': {'_id': 0, 'timestamp': 0}}"
ratingsDF = _load_ratings("ratings_small", pipe_ratings)

pipe_ratings_complete = "{'$project': {'_id': 0, 'timestamp': 0}}"
ratings_completeDF = _load_ratings("ratings", pipe_ratings_complete)

# --- DataFrame -> RDD ----------------------------------------------------------
# Convert each Row into a plain (userId, movieId, rating) tuple and cache the
# result, since the RDDs are reused by the following cells.
ratingsRDD = ratingsDF.rdd.map(lambda x: (x[0], x[1], x[2])).cache()
ratings_completeRDD = ratings_completeDF.rdd.map(lambda x: (x[0], x[1], x[2])).cache()

# Counting the RDDs (rather than the DataFrames) also materializes the caches,
# so MongoDB is not scanned a second time by the training cells.
print("Numero ratings small caricati: %s" % (ratingsRDD.count()))
print("Numero ratings completi caricati: %s" % (ratings_completeRDD.count()))

In [0]:
################ ALS HYPERPARAMETER TUNING (on the small ratings dataset, 100 K) ################

from pyspark.mllib.recommendation import ALS
import math

# Fixed seed for the dataset split and for ALS initialisation, so that the
# grid search gives repeatable results across runs on the same input.
SEED = 42


def _keyed_ratings(rdd):
    """Turn (user, movie, rating) tuples into ((user, movie), rating) pairs.

    The key is cast to ints and the rating to float so that it can be joined
    with the pairs built from the model's predictions.
    """
    return rdd.map(lambda r: ((int(r[0]), int(r[1])), float(r[2])))


def _rmse(rates_preds):
    """Root-mean-square error over ((user, movie), (actual, predicted)) pairs."""
    return math.sqrt(rates_preds.map(lambda r: (r[1][0] - r[1][1]) ** 2).mean())


# Dataset split: 60% training, 20% validation, 20% test.
trainingRDD_GT, validationRDD_GT, testRDD_GT = ratingsRDD.randomSplit([6, 2, 2], seed=SEED)
# (user, movie) pairs without the rating: the input expected by predictAll.
validationRDD = validationRDD_GT.map(lambda x: (x[0], x[1]))
testRDD = testRDD_GT.map(lambda x: (x[0], x[1]))

# The keyed validation ground truth does not depend on the hyperparameters,
# so it is built once and cached instead of being rebuilt for every model.
validation_truth = _keyed_ratings(validationRDD_GT).cache()

iterations = [3, 5, 10]
reg_param = 0.1
ranks = [4, 8, 16]
min_error = float('inf')
best_rank = -1
best_iteration = -1

# Grid search: train one ALS model per (iterations, rank) combination and keep
# the combination with the lowest RMSE on the validation set.
for iteration in iterations:
    for rank in ranks:
        model = ALS.train(trainingRDD_GT, rank, iterations=iteration,
                          lambda_=reg_param, seed=SEED)
        predictions = model.predictAll(validationRDD).map(lambda r: ((r[0], r[1]), r[2]))
        # Inner join: pairs for which the model returned no prediction are
        # left out of the error.
        ratesPreds = validation_truth.join(predictions)
        error = _rmse(ratesPreds)
        print('Rank %s, Iteration %s -> RMSE = %s' % (rank, iteration, error))
        if error < min_error:
            min_error = error
            best_rank = rank
            best_iteration = iteration

print('Il modello migliore è stato addestrato con: rank %s e iteration %s' % (best_rank, best_iteration))

# Generalisation error on the test set: retrain with the best hyperparameters
# on training + validation data, then evaluate on the held-out test split.
# RDD.union is used so this cell does not need a global `sc`.
training_validationRDD = trainingRDD_GT.union(validationRDD_GT)
model = ALS.train(training_validationRDD, best_rank, iterations=best_iteration,
                  lambda_=reg_param, seed=SEED)
predictions = model.predictAll(testRDD).map(lambda r: ((r[0], r[1]), r[2]))
ratesPreds = _keyed_ratings(testRDD_GT).join(predictions)
error = _rmse(ratesPreds)

print('Errore di generalizzazione sui dati di test utilizzando il modello migliore: RMSE = %s' % (error))

In [0]:
###################### FINAL MODEL TRAINING (on the complete dataset, 27M) ######################
import os

# Fixed seed for the split and for ALS initialisation, so that the reported
# test error is repeatable across runs on the same input.
FINAL_MODEL_SEED = 42

# Dataset split: 75% training, 25% test.
completeTrainingRDD_GT, completeTestRDD_GT = ratings_completeRDD.randomSplit(
    [7.5, 2.5], seed=FINAL_MODEL_SEED
)
# (user, movie) pairs without the rating: the input expected by predictAll.
completeTestRDD = completeTestRDD_GT.map(lambda x: (x[0], x[1]))

# Train the final model with the hyperparameters selected by the tuning cell
# (best_rank, best_iteration and reg_param are defined there).
complete_model = ALS.train(
    completeTrainingRDD_GT,
    best_rank,
    iterations=best_iteration,
    lambda_=reg_param,
    seed=FINAL_MODEL_SEED,
)

# Generalisation error on the test set: join the actual ratings with the
# predicted ones on (user, movie) and compute the RMSE over the matched pairs.
complete_predictions = complete_model.predictAll(completeTestRDD).map(lambda r: ((r[0], r[1]), r[2]))
complete_ratesPreds = completeTestRDD_GT.map(lambda r: ((int(r[0]), int(r[1])), float(r[2]))).join(complete_predictions)
complete_error = math.sqrt(complete_ratesPreds.map(lambda r: (r[1][0] - r[1][1])**2).mean())

print('Errore sui dati di test:  RMSE = %s' % (complete_error))

# Persist the trained model under /models/movie_lens.
# NOTE(review): the path has no scheme, so whether this lands on local disk or
# on a distributed filesystem presumably depends on Spark's default filesystem
# configuration - confirm for the target cluster.
# NOTE(review): per the MLlib Saveable docs the save raises if the directory
# already exists, so remove or rename it before re-running this cell.
model_path = os.path.join('/', 'models', 'movie_lens')
# The SparkContext is taken from the session so this cell does not need a
# global `sc`.
complete_model.save(spark.sparkContext, model_path)