In [0]:
###################### IMPORT DEI DATI ###########################

import pymongo
import pandas as pd
from pyspark.sql import SparkSession

# Open the MongoDB connection.
# The connection string (user, password, host) is read from the MONGODB_URI
# environment variable so that no secret is stored in the notebook, e.g.
#   MONGODB_URI=mongodb://<user>:<password>@<host>
# A missing variable raises KeyError here instead of failing later on first use.
# NOTE(review): a password was previously committed in this cell; it should be
# rotated on the server.
import os

# 27017 is passed as the port argument of MongoClient.
client = pymongo.MongoClient(os.environ["MONGODB_URI"], 27017)
db = client['movielens']

# Locations of the source CSV files, grouped by dataset family.

# Movie catalogue
csvMovies = "dbfs:/FileStore/tables/movielens/movies.csv"

# User ratings: full file and reduced ("small") file
csvRatings = "dbfs:/FileStore/tables/movielens/ratings.csv"
csvRatingsSmall = "dbfs:/FileStore/tables/movielens/ratings_small.csv"

# User-applied tags
csvTags = "dbfs:/FileStore/tables/movielens/tags.csv"

# Tag genome: tag definitions and per-movie scores
csvGenomeTags = "dbfs:/FileStore/tables/movielens/genome_tags.csv"
csvGenomeScores = "dbfs:/FileStore/tables/movielens/genome_scores.csv"

# Collection handles on the `movielens` database, one per dataset.
# Subscript access by name is used instead of attribute access; both return
# the collection with that name.
movies_collection = db["movies"]
ratings_collection = db["ratings"]
ratings_small_collection = db["ratings_small"]
tags_collection = db["tags"]
genome_scores_collection = db["genome_scores"]
genome_tags_collection = db["genome_tags"]

# Load the CSV files into RDDs.
def _read_csv_rdd(path):
    """Read the CSV file at `path` and return its rows as an RDD.

    The first line is treated as a header and column types are inferred.
    `spark` is not defined in this cell; presumably it is the session object
    provided by the notebook runtime -- TODO confirm.
    """
    return (
        spark.read
        .option("inferSchema", "True")
        .option("header", "True")
        .csv(path)
        .rdd
    )


datiMoviesRDD = _read_csv_rdd(csvMovies)
# Each ratings RDD reads the file its name refers to. These two sources were
# previously crossed, which sent the small file to the `ratings` collection
# and the full file to `ratings_small`.
datiRatingsRDD = _read_csv_rdd(csvRatings)
datiRatingsSmallRDD = _read_csv_rdd(csvRatingsSmall)
datiTagsRDD = _read_csv_rdd(csvTags)
datiGenomeScoresRDD = _read_csv_rdd(csvGenomeScores)
datiGenomeTagsRDD = _read_csv_rdd(csvGenomeTags)

# Data manipulation: reshape every movie row before it is stored.
def _reshape_movie_row(row):
    """Return (first column as str, second column, third column split on "|").

    The three positions are later paired with the keys movieId, title, genres.
    """
    movie_id, title, genres = row[0], row[1], row[2]
    return (str(movie_id), title, genres.split("|"))


# NOTE(review): movieId is converted to a string for movies only; the other
# datasets keep whatever type schema inference assigns. The modeling cell
# joins movies to tags on movieId, and that comparison is presumably
# type-sensitive -- confirm the join actually matches.
datiMoviesRDD = datiMoviesRDD.map(_reshape_movie_row)

# Load into MongoDB: field names for the documents of each dataset.
# Each list is paired positionally with the columns of a row (via zip), so
# the order of the names matters.

# movies
movieKeys = ["movieId", "title", "genres"]

# ratings (used for both the full and the small ratings datasets)
ratingsKeys = ["userId", "movieId", "rating", "timestamp"]

# tags
tagsKeys = ["userId", "movieId", "tag", "timestamp"]

# tag genome
genomeScoresKeys = ["movieId", "tagId", "relevance"]
genomeTagsKeys = ["tagId", "tag"]

def _rows_to_documents(keys, rdd):
    """Return one dict per row of `rdd`, pairing `keys` with the row values.

    Keys and values are matched positionally with zip, so surplus keys or
    surplus columns are dropped without error.
    collect() brings every row of the RDD back to the driver, so the whole
    dataset must fit in driver memory.
    """
    return [dict(zip(keys, row)) for row in rdd.collect()]


moviesList = _rows_to_documents(movieKeys, datiMoviesRDD)
ratingsList = _rows_to_documents(ratingsKeys, datiRatingsRDD)
ratingsSmallList = _rows_to_documents(ratingsKeys, datiRatingsSmallRDD)
tagsList = _rows_to_documents(tagsKeys, datiTagsRDD)
# The genome datasets use their own key lists. They were previously zipped
# with tagsKeys, which stored the values under the wrong field names
# (for example the movie id under "userId").
genomeScoresList = _rows_to_documents(genomeScoresKeys, datiGenomeScoresRDD)
genomeTagsList = _rows_to_documents(genomeTagsKeys, datiGenomeTagsRDD)
 
 
# Bulk-insert each document list into its collection, one call per dataset,
# in this order: movies, ratings, ratings_small, tags, genome_scores,
# genome_tags.
for target_collection, documents in (
    (movies_collection, moviesList),
    (ratings_collection, ratingsList),
    (ratings_small_collection, ratingsSmallList),
    (tags_collection, tagsList),
    (genome_scores_collection, genomeScoresList),
    (genome_tags_collection, genomeTagsList),
):
    target_collection.insert_many(documents)

In [0]:
############################# DATA MODELING ####################################
import pymongo

# Open the MongoDB connection.
# The connection string (user, password, host) is read from the MONGODB_URI
# environment variable so that no secret is stored in the notebook, e.g.
#   MONGODB_URI=mongodb://<user>:<password>@<host>
# A missing variable raises KeyError here instead of failing later on first use.
# NOTE(review): a password was previously committed in this cell; it should be
# rotated on the server.
import os

# 27017 is passed as the port argument of MongoClient.
client = pymongo.MongoClient(os.environ["MONGODB_URI"], 27017)
db = client['movielens']

# Embed tags into movie documents and write the result to "movies_2".
# NOTE(review): the join is on movieId; the import cell stores movies.movieId
# as a string while tags.movieId keeps its inferred type -- confirm that the
# two sides actually compare equal, otherwise every "tags" array stays empty.

# Attach, as the array field "tags", the tag documents sharing the movie's movieId.
embed_tags_stage = {
    '$lookup': {
        'from': 'tags',
        'localField': 'movieId',
        'foreignField': 'movieId',
        'as': 'tags'
    }
}

# Exclude movieId from the embedded tag documents.
drop_embedded_movie_id_stage = {
    '$project': {
        'tags.movieId': 0
    }
}

# Write the pipeline output to the "movies_2" collection.
write_movies_2_stage = {
    '$out': "movies_2"
}

db.movies.aggregate([
    embed_tags_stage,
    drop_embedded_movie_id_stage,
    write_movies_2_stage,
])

# Add tagId and relevance fields to the tags collection; the output is
# written to "tags_relevance".

# 1. Lower-case the tag text before it is matched against genome_tags.tag.
lowercase_tag_stage = {"$addFields": {"tag": {"$toLower": "$tag"}}}

# 2. Find the genome_tags documents whose tag equals the lower-cased tag.
match_genome_tag_stage = {
    "$lookup": {
        'from': 'genome_tags',
        'localField': 'tag',
        'foreignField': 'tag',
        'as': 'tag_doc'
    }
}

# 3. Keep the tagId of the first match (index 0), then drop the helper array.
take_tag_id_stage = {
    "$addFields": {"tagId": {"$arrayElemAt": ["$tag_doc.tagId", 0]}}
}
drop_tag_doc_stage = {"$project": {"tag_doc": 0}}

# 4. Find the genome_scores documents with the same movieId and tagId.
same_movie_and_tag = {
    "$and": [
        {"$eq": ["$movieId", "$$movieId"]},
        {"$eq": ["$tagId", "$$tagId"]}
    ]
}
match_relevance_stage = {
    "$lookup": {
        'from': 'genome_scores',
        'let': {'movieId': '$movieId', 'tagId': '$tagId'},
        'pipeline': [
            {"$match": {"$expr": same_movie_and_tag}}
        ],
        'as': 'relevance'
    }
}

# 5. Replace the array of matches with the relevance of the first one (index 0).
take_relevance_stage = {
    "$addFields": {"relevance": {"$arrayElemAt": ["$relevance.relevance", 0]}}
}

# 6. Write the pipeline output to the "tags_relevance" collection.
write_tags_relevance_stage = {"$out": "tags_relevance"}

result = db.tags.aggregate([
    lowercase_tag_stage,
    match_genome_tag_stage,
    take_tag_id_stage,
    drop_tag_doc_stage,
    match_relevance_stage,
    take_relevance_stage,
    write_tags_relevance_stage,
])