# Models: Genre content-based filtering

In [1]:
# Enable IPython's autoreload extension in mode 2: imported modules are
# reloaded before each cell executes, so edits to the local library code are
# picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
# Project layout, expressed relative to the kernel's working directory.
BASE_PATH = '../..'

# Directory appended to sys.path so the project's own packages can be imported.
LIB_PATH = BASE_PATH + '/lib'

# Directory the dataset is loaded from and the generated embedding file is written to.
DATASET_PATH = BASE_PATH + '/datasets'

In [3]:
import sys
sys.path.append(LIB_PATH)

import numpy as np
import pandas as pd

import data.dataset as ds

import util as ut

import recommender as rc

from database.chromadb import RepositoryFactory

import pytorch_common.util as pu

2023-10-07 17:11:36.147379: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-10-07 17:11:37.032403: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:996] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2023-10-07 17:11:37.043336: W tensorflow/core/common_runtime/gpu/gpu_device.cc:1956] Cannot dlopen some GPU libraries. Please make sure the missing libraries mentioned above are installed properly if you would like to use GPU. Follow the guide at https://www.tensorflow.org/install/gpu for how to download and setup the required libraries for your 

<Figure size 640x480 with 0 Axes>

## Setup

In [5]:
# Configure console logging through pytorch_common's LoggerBuilder. The built
# logger is the cell's last expression, so its repr is shown as the cell output.
pu.LoggerBuilder().on_console().build()

<RootLogger root (INFO)>

## Dataset loading

In [10]:
def _rated_from_2004(df):
    """Keep only the rows whose `user_movie_rating_year` is 2004 or later."""
    return df.query('user_movie_rating_year >= 2004')


# Build the MovieLens/TMDB dataset from DATASET_PATH, handing the year filter
# to the factory as `filter_fn`.
dataset = ds.MovieLensTMDBDatasetFactory.from_path(DATASET_PATH, filter_fn=_rated_from_2004)

In [11]:
# Count movie_id entries per user and display the three users with the
# largest counts (the `movie_id` column of the result holds the count).
(
    dataset.data
    .groupby(['user_id'])['movie_id']
    .count()
    .reset_index()
    .sort_values(by='movie_id', ascending=False)
    .head(3)
)

Unnamed: 0,user_id,movie_id
5019,62199,3688
520,6550,1935
5653,70092,1745


In [12]:
# Derive an embedding table from the 'movie_genres' list column, keyed by
# 'user_id'; the '(no genres listed)' placeholder is excluded.
# NOTE(review): the file name used when this frame is saved suggests the
# embedding is a per-user genre frequency vector — confirm against
# ut.embedding_from_list_col.
df = ut.embedding_from_list_col(
    dataset.data, 
    'user_id', 
    'movie_genres', 
    exclude=['(no genres listed)']
)

In [13]:
# Persist the embedding frame as JSON under DATASET_PATH; the repository
# created later in this notebook reads it back from this same path.
ut.save_df(df, f'{DATASET_PATH}/user-genre-freq-embedding.json')

In [14]:
# Settings for the embedding repository: where the saved embeddings live and
# which columns hold the ids, the metadata and the vectors.
repository_options = dict(
    name='user-genre-freq-embedding',
    file_path=f'{DATASET_PATH}/user-genre-freq-embedding.json',
    metadata_cols=['user_id'],
    embedding_col='movie_genres_embedding',
    id_col='user_id',
)

# Create the repository through the factory imported from database.chromadb.
repository = RepositoryFactory().create(**repository_options)


Insert Embeddings:   0%|          | 0/13017 [00:00<?, ?it/s]



In [15]:
# Content-based filtering recommender built on the embedding repository and
# the loaded dataset. It identifies itself as 'genres-cb-recommender' in the
# outputs of the cells below.
recommender = rc.EmbCBFilteringRecommender('genres', repository, dataset)

In [16]:
# User 6550 has the second-largest movie_id count (1935) in the per-user
# ranking computed earlier in this notebook.
recommender.recommend(6550).show()


Recommender: genres-cb-recommender



Unnamed: 0,Rating,Movies,movie_genres,movie_release_year
0,5.0,,"[Action, Sci-Fi]",2017
93,5.0,,[Comedy],2014
98,5.0,,"[Adventure, Animation, Drama]",2014
117,5.0,,"[Action, Adventure, Thriller, IMAX]",2012
222,5.0,,"[Animation, Children, Comedy]",2012


In [17]:
# User 62199 has the largest movie_id count (3688) in the per-user ranking
# computed earlier in this notebook.
recommender.recommend(62199).show()


Recommender: genres-cb-recommender



Unnamed: 0,Rating,Movies,movie_genres,movie_release_year
0,5.0,,"[Action, Drama, War]",1967
20,5.0,,"[Comedy, Drama]",1967
23,5.0,,"[Action, Adventure]",1961
65,5.0,,[Drama],1957
76,5.0,,[Drama],1952


In [18]:
# User 70092 has the third-largest movie_id count (1745) in the per-user
# ranking computed earlier in this notebook.
recommender.recommend(70092).show()


Recommender: genres-cb-recommender



Unnamed: 0,Rating,Movies,movie_genres,movie_release_year
0,5.0,,"[Comedy, Documentary]",2005
26,5.0,,[Documentary],2003
29,5.0,,"[Drama, Fantasy, Romance]",1987
62,5.0,,"[Drama, War]",1985
99,5.0,,"[Drama, Fantasy, Mystery]",1982
