In [3]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [4]:
import sys
sys.path.append('../src')

from database.mongo import Mongo
from pytorch_common.util import LoggerBuilder

In [5]:
# Configure logging through the project's LoggerBuilder, using its console option.
console_builder = LoggerBuilder().on_console()
console_builder.build()

1. Import data into the **movies** MongoDB database:

In [6]:
# Root folder for all datasets, relative to the notebook's working directory.
DATASETS_PATH   = '../datasets'
# Sub-folders for the two dataset sources used below.
MOVIE_LENS_PATH = DATASETS_PATH + '/ml-25m'
TMDB_PATH       = DATASETS_PATH + '/tmdb'

In [9]:
# CSV files to import, expressed as paths under the dataset folders defined above.
MOVIE_LENS_FILES = [
    f'{MOVIE_LENS_PATH}/{file_name}'
    for file_name in ('ratings.csv', 'movies.csv', 'links.csv', 'tags.csv')
]
TMDB_FILES = [f'{TMDB_PATH}/movies_metadata.csv']

In [None]:
# Import the MovieLens CSV files (ratings, movies, links, tags); the aggregation
# steps below read from the collections with those names.
# The file list must be passed by keyword: a positional argument after
# `database=` is a SyntaxError. TMDB_FILES is imported by the next cell.
Mongo.import_csv(database = 'movies', file_paths = MOVIE_LENS_FILES)

In [15]:
# Import the TMDB metadata CSV into the 'movies' database.
Mongo.import_csv(
    file_paths = TMDB_FILES,
    database   = 'movies'
)

2022-04-15 13:41:08,469 - ERROR - b'2022-04-15T13:41:07.070-0300\tconnected to: mongodb://localhost/\n2022-04-15T13:41:08.466-0300\t45466 document(s) imported successfully. 0 document(s) failed to import.\n'


2. Convert `imdb_id` from a `tt`-prefixed string to a number (result written to `movies_metadata_v2`):

In [None]:
// Rewrite imdb_id as a long taken from the text after the "tt" separator and
// store the resulting documents in movies_metadata_v2.
db.movies_metadata.aggregate([
    // Skip documents whose imdb_id is the empty string or the number 0.
    { $match: { imdb_id: { $nin: ["", 0] } } },
    {
        $addFields: {
            imdb_id: {
                $toLong: [
                    // Element 1 of the split is the segment following the first "tt".
                    { $arrayElemAt: [ { $split: ["$imdb_id", "tt"] }, 1 ] }
                ]
            }
        }
    },
    { $out: "movies_metadata_v2" }
]);

3. Add indexes to both the links and movies_metadata_v2 collections:

In [None]:
// links: unique index on movieId.
db.links.createIndex(
    { movieId: 1 },
    { name: 'movieId_unique_index', unique: true }
);

// movies_metadata_v2: non-unique index on imdb_id.
db.movies_metadata_v2.createIndex(
    { imdb_id: 1 },
    { name: 'imdb_id_multiple_index', unique: false }
);

4. Add movies_metadata_v2 features (matched by imdb id) to the movies collection, writing the result to movies_v2:

In [None]:
// Build movies_v2: each movie is joined with links (to obtain its tmdb/imdb ids)
// and then with movies_metadata_v2 (matched on imdb_id). Movies with no match in
// either lookup are dropped.
db.getCollection('movies').aggregate([
    {
        $lookup:
          {
            from: "links",
            foreignField: "movieId",
            localField: "movieId",
            as: "links"
          }
    },
    // Keep only movies that matched at least one links document.
    { $match: { links: { $exists: true, $not: { $size: 0 } } } },
    {
        $project: {
            id: "$movieId",
            tmdb_id: { $arrayElemAt: ["$links.tmdbId", 0] },
            imdb_id: { $arrayElemAt: ["$links.imdbId", 0] },
            // Text before the first "(", trimmed so that a title of the form
            // "Name (1995)" yields "Name" rather than "Name " with a trailing space.
            title: {
                $trim: {
                    input: { $arrayElemAt: [ { $split: ["$title", "("] }, 0 ] }
                }
            },
            // NOTE: a release_year field used to be derived here from the segment
            // after "(", but the final $project never listed it, so it never
            // reached movies_v2; the dead computation has been removed.
            genres: { $split: ["$genres", "|"] }
        }
    },
    {
        $lookup:
          {
            from: "movies_metadata_v2",
            foreignField: "imdb_id",
            localField: "imdb_id",
            as: "movies_metadata"
          }
    },
    // Keep only movies that matched at least one metadata document.
    { $match: { movies_metadata: { $exists: true, $not: { $size: 0 } } } },
    {
        // Copy the selected metadata fields from the first matched document.
        $project: {
            id: 1,
            tmdb_id: 1,
            imdb_id: 1,
            title: 1,
            genres: 1,
            for_adults: { $arrayElemAt: ["$movies_metadata.adult", 0] },
            budget: { $arrayElemAt: ["$movies_metadata.budget", 0] },
            original_language: { $arrayElemAt: ["$movies_metadata.original_language", 0] },
            overview: { $arrayElemAt: ["$movies_metadata.overview", 0] },
            poster: { $arrayElemAt: ["$movies_metadata.poster_path", 0] },
            release: { $arrayElemAt: ["$movies_metadata.release_date", 0] },
            popularity: { $arrayElemAt: ["$movies_metadata.popularity", 0] },
            vote_mean: { $arrayElemAt: ["$movies_metadata.vote_average", 0] },
            vote_count: { $arrayElemAt: ["$movies_metadata.vote_count", 0] }
        }
    },
    { $out: "movies_v2" }
]);

5. Group tags per (user, movie) pair:

In [None]:
// Collapse the tags collection to one document per (userId, movieId) pair and
// write the result to tags_v2.
db.tags.aggregate([
    // Collect every tag of the pair, lower-cased; $push keeps duplicates.
    {
        $group: {
            _id: { user_id: "$userId", movie_id: "$movieId" },
            tags: { $push: { $toLower: "$tag" } }
        }
    },
    // Flatten the compound _id and add a "<user_id>_<movie_id>" string key.
    {
        $project: {
            _id: 0,
            user_id: "$_id.user_id",
            movie_id: "$_id.movie_id",
            user_movie_id: {
                $concat: [
                    { $toString: "$_id.user_id" },
                    "_",
                    { $toString: "$_id.movie_id" }
                ]
            },
            tags: 1
        }
    },
    { $out: "tags_v2" }
]);

6. Create a unique index on user_movie_id in the new tags_v2 collection:

In [None]:
// tags_v2: unique index on the user_movie_id key.
db.tags_v2.createIndex(
    { user_movie_id: 1 },
    { name: 'id_unique_index', unique: true }
);

7. Add the user_movie_id field to a new ratings_v2 collection and create a unique index on it:

In [None]:
// Copy ratings into ratings_v2 with snake_case id fields plus a
// "<userId>_<movieId>" string key.
db.ratings.aggregate([
    {
        $project: {
            user_id: "$userId",
            movie_id: "$movieId",
            user_movie_id: {
                $concat: [
                    { $toString: "$userId" },
                    "_",
                    { $toString: "$movieId" }
                ]
            },
            rating: 1,
            timestamp: 1
        }
    },
    { $out: "ratings_v2" }
]);

In [None]:
// ratings_v2: unique index on the user_movie_id key.
db.ratings_v2.createIndex(
    { user_movie_id: 1 },
    { name: 'id_unique_index', unique: true }
);

8. Join the ratings_v2 and tags_v2 collections by user_movie_id into a new ratings_tags_v1 collection:

In [None]:
// Join ratings_v2 with tags_v2 on user_movie_id and write the rated-and-tagged
// pairs to ratings_tags_v1.
db.ratings_v2.aggregate([
    {
        $lookup: {
            from: "tags_v2",
            localField: "user_movie_id",
            foreignField: "user_movie_id",
            as: "tags_v2"
        }
    },
    // Drop ratings that have no matching tags_v2 document.
    { $match: { tags_v2: { $exists: true, $not: { $size: 0 } } } },
    {
        $project: {
            user_id: 1,
            movie_id: 1,
            rating: 1,
            timestamp: 1,
            // Array holding the tags array of each matched tags_v2 document.
            tags: "$tags_v2.tags"
        }
    },
    {
        $addFields: {
            // Replace _id with the "<user_id>_<movie_id>" string.
            _id: {
                $concat: [
                    { $toString: "$user_id" },
                    "_",
                    { $toString: "$movie_id" }
                ]
            },
            // Merge the nested tag arrays into a single set of tags.
            tags: {
                $reduce: {
                    input: "$tags",
                    initialValue: [],
                    in: { $setUnion: ["$$value", "$$this"] }
                }
            }
        }
    },
    { $out: "ratings_tags_v1" }
]);

9. Add a tags field to the movies_v2 collection (result written to movies_v3):

In [None]:
// movies_v2: unique index on id, the local field of the lookup that follows.
db.getCollection('movies_v2').createIndex(
    { 'id': 1 }, 
    { unique: true, name: 'id_unique_index' }
)
// tags_v2: index on movie_id, the foreign field of the lookup that follows.
// tags_v2 holds one document per (user_id, movie_id) pair, so the same movie_id
// appears for several users and a unique index cannot be built on it.
// The index also needs its own name: 'id_unique_index' is already taken on
// tags_v2 by the user_movie_id index.
db.getCollection('tags_v2').createIndex(
    { 'movie_id': 1 }, 
    { unique: false, name: 'movie_id_index' }
)

In [None]:
// Attach to each movie the union of all tags found for it in tags_v2 and write
// the result to movies_v3, keyed by the movie id.
db.movies_v2.aggregate([
    {
        $lookup: {
            from: "tags_v2",
            localField: "id",
            foreignField: "movie_id",
            as: "tags_v2"
        }
    },
    // Drop movies that have no tags_v2 document.
    { $match: { tags_v2: { $exists: true, $not: { $size: 0 } } } },
    {
        $addFields: {
            // Merge the tags arrays of all matched documents into one set.
            tags: {
                $reduce: {
                    input: "$tags_v2.tags",
                    initialValue: [],
                    in: { $setUnion: ["$$value", "$$this"] }
                }
            }
        }
    },
    // The joined documents are no longer needed once tags is computed.
    { $unset: "tags_v2" },
    // Use the movie id as _id, then remove the redundant id field.
    { $addFields: { _id: "$id" } },
    { $unset: "id" },
    { $out: "movies_v3" }
]);

10. Export the datasets:

In [None]:
# Export the movies_v3 collection of the 'movies' database, passing DATASETS_PATH as the target path.
Mongo.export_to_json(
    collections=['movies_v3'],
    path=DATASETS_PATH,
    database='movies'
)

In [10]:
# Export the ratings_tags_v1 collection of the 'movies' database, passing DATASETS_PATH as the target path.
Mongo.export_to_json(
    collections=['ratings_tags_v1'],
    path=DATASETS_PATH,
    database='movies'
)

2022-04-16 11:21:33,739 - ERROR - b'2022-04-16T11:21:31.332-0300\tconnected to: mongodb://localhost/\n2022-04-16T11:21:32.333-0300\t[##########..............]  movies.ratings_tags_v1  88000/210725  (41.8%)\n2022-04-16T11:21:33.333-0300\t[####################....]  movies.ratings_tags_v1  176000/210725  (83.5%)\n2022-04-16T11:21:33.737-0300\t[########################]  movies.ratings_tags_v1  210725/210725  (100.0%)\n2022-04-16T11:21:33.737-0300\texported 210725 records\n'
