# Model: Genre-based content-based recommender

In [1]:
# Enable IPython's autoreload extension; mode 2 reloads all imported modules
# before each cell runs, so edits to the project library are picked up without
# restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
# Project root, expressed relative to the notebook's working directory.
BASE_PATH = '../..'

# Folder appended to sys.path so the project-local modules can be imported.
LIB_PATH = f'{BASE_PATH}/lib'

# Folder the dataset is read from and the recommendation results are saved to.
DATASET_PATH = f'{BASE_PATH}/datasets'

In [3]:
import sys
sys.path.append(LIB_PATH)

import numpy as np
import pandas as pd

import data.dataset as ds

import util as ut

import recommender as rc

from database.chromadb import RepositoryFactory

import pytorch_common.util as pu

2023-10-21 14:25:22.897153: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-10-21 14:25:23.737425: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:996] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2023-10-21 14:25:23.750805: W tensorflow/core/common_runtime/gpu/gpu_device.cc:1956] Cannot dlopen some GPU libraries. Please make sure the missing libraries mentioned above are installed properly if you would like to use GPU. Follow the guide at https://www.tensorflow.org/install/gpu for how to download and setup the required libraries for your 

<Figure size 640x480 with 0 Axes>

## Setup

In [4]:
# Show up to 400 characters per cell when rendering frames.
# The fully qualified option name is used on purpose: abbreviated names are
# resolved by pattern matching and can become ambiguous if pandas adds a
# similarly named option.
pd.set_option('display.max_colwidth', 400)

In [5]:
# Build a logger with console output through pytorch_common's LoggerBuilder.
# The value displayed for this cell is the root logger at INFO level;
# presumably this is what makes the logging.info(...) messages of later cells
# visible — confirm against pytorch_common.
pu.LoggerBuilder().on_console().build()

<RootLogger root (INFO)>

## Load dataset

In [6]:
# Create the dataset from the files under DATASET_PATH.
# filter_fn keeps only rows with user_movie_rating_year >= 2004; presumably the
# factory applies it to the loaded frame — confirm in ds.MovieLensTMDBDatasetFactory.
dataset = ds.MovieLensTMDBDatasetFactory.from_path(
    DATASET_PATH,
    filter_fn = lambda df: df.query('user_movie_rating_year >= 2004')
)

In [7]:
import logging

class ListTypeColumnContenBasedRecommender:
    """Content-based recommender driven by a list-valued item column (e.g. genres).

    ``fit`` builds two feature frames with project helpers from ``ut``:

    * a per-user frame (``ut.embedding_from_list_col``) holding one weight per
      feature column, and
    * a per-item frame (``ut.get_one_hot_from_list_col``) holding one indicator
      per feature column.

    ``recommend`` scores every item for a user as the sum, over all feature
    columns, of ``item value * user weight`` and returns the items whose score
    is strictly positive.

    NOTE(review): the exact content of both frames is defined by the ``ut``
    helpers, which are not visible here. This class only assumes that each
    frame has its id column plus the same set of numeric feature columns.
    """

    def __init__(
        self,
        user_id_col  = 'user_id',
        item_id_col  = 'movie_id',
        emb_col      = 'movie_genres',
        exclude      = None
    ):
        """Configure column names; no data is processed until ``fit``.

        Args:
            user_id_col: Name of the user id column.
            item_id_col: Name of the item id column.
            emb_col: Name of the list-valued column used to build features.
            exclude: Values forwarded to the ``ut`` helpers as their exclusion
                argument. ``None`` (default) means ``['(no genres listed)']``.
        """
        self.__user_id_col      = user_id_col
        self.__item_id_col      = item_id_col
        # A fresh list per instance: a list literal in the signature would be
        # shared by every instance created with the default.
        self.__exclude          = ['(no genres listed)'] if exclude is None else exclude
        self.__source_emb_col   = emb_col
        self.__user_features_df = None
        self.__item_features_df = None
        self.__target_emb_col   = []


    def fit(self, df):
        """Build user and item feature frames from ``df``.

        User features are rebuilt on every call. Item features (and the list of
        feature columns) are built only on the first call and reused afterwards.
        NOTE(review): confirm this reuse is intended when refitting on other data.

        Args:
            df: Frame containing the user id, item id and list-valued columns.

        Returns:
            ``self``, to allow call chaining.
        """
        self.__user_features_df = ut.embedding_from_list_col(df,   self.__user_id_col, self.__source_emb_col, self.__exclude, as_list=False)
        logging.info(f'User profiles fitted...{self.__user_features_df.shape}')

        if self.__item_features_df is None:
            self.__item_features_df = ut.get_one_hot_from_list_col(df, self.__item_id_col, self.__source_emb_col, self.__exclude, as_list=False)
            # Keep the frame's own column order (instead of a set difference)
            # so the feature order, and therefore float summation, is
            # deterministic between runs.
            self.__target_emb_col   = [c for c in self.__item_features_df.columns if c != self.__item_id_col]
            logging.info(f'Item profiles fitted...{self.__item_features_df.shape}')

        return self


    def __user_emb(self, user_id):
        """Return the rows of the user feature frame that belong to ``user_id``."""
        return self.__user_features_df[self.__user_features_df[self.__user_id_col] == user_id]


    def __score(self, weights, user_id, sort):
        """Score every item against one user's feature weights.

        Args:
            weights: Series indexed by feature column name with the user's weights.
            user_id: Value written into the user id column of the result.
            sort: When true, order the result by descending score.

        Returns:
            Frame with columns ``[user_id_col, item_id_col, 'score']`` holding
            only items with ``score > 0``; the item frame's index is preserved.
        """
        item_df = self.__item_features_df

        # One vectorised multiply replaces a Python-level apply per column;
        # `mul` aligns the weights with the columns by name.
        scores = item_df[self.__target_emb_col].mul(weights, axis='columns').sum(axis=1)

        result_df = item_df[[self.__item_id_col]].copy()
        result_df['score'] = scores
        result_df.insert(0, self.__user_id_col, user_id)
        result_df = result_df[result_df['score'] > 0]
        return result_df.sort_values(['score'], ascending=False) if sort else result_df


    def recommend(self, user_id, k=10):
        """Recommend items for ``user_id``.

        Args:
            user_id: Id of the user to recommend for.
            k: Number of top-scored items to return. ``None`` returns every
                item with a positive score, unsorted.

        Returns:
            Frame with columns ``[user_id_col, item_id_col, 'score']``. It is
            empty when the user has no profile (a warning is logged).

        Raises:
            RuntimeError: If called before ``fit``.
        """
        if self.__user_features_df is None or self.__item_features_df is None:
            raise RuntimeError('Model is not fitted. Call fit() before recommend().')

        user_emb = self.__user_emb(user_id)

        if user_emb.shape[0] == 0:
            logging.warning(f'Not found user profile for {user_id} user id.')
            return pd.DataFrame(columns=[self.__user_id_col, self.__item_id_col, 'score'])

        # Only the first matching profile row is used.
        weights = user_emb[self.__target_emb_col].iloc[0]

        result_df = self.__score(weights, user_id, k is not None)

        # Test against None explicitly so that k=0 yields zero rows instead of
        # falling through to "return everything".
        return result_df if k is None else result_df.head(k)

In [8]:
# Instantiate the recommender; the values spelled out here are the same ones
# the constructor uses when no arguments are given.
model = ListTypeColumnContenBasedRecommender(
    user_id_col  = 'user_id',
    item_id_col  = 'movie_id',
    emb_col      = 'movie_genres',
    exclude      = ['(no genres listed)']
)

# Build user and item feature frames from the loaded dataset.
model.fit(dataset.data)

2023-10-21 14:25:51,524 - INFO - User profiles fitted...(13000, 20)
2023-10-21 14:25:57,177 - INFO - Item profiles fitted...(18608, 20)


<__main__.ListTypeColumnContenBasedRecommender at 0x7f63a8686ef0>

In [11]:
# Spot check: top recommendations (default k=10) for one example user id.
model.recommend(user_id=62199)

Unnamed: 0,user_id,movie_id,score
104132,62199,81132,0.761548
160430,62199,83266,0.704128
154303,62199,4719,0.700035
166424,62199,64645,0.680505
165050,62199,4956,0.675126
179047,62199,27781,0.665068
69006,62199,7235,0.66156
185795,62199,117646,0.649164
166782,62199,6902,0.629634
141348,62199,27344,0.629166


In [12]:
def concat_user_recs(users_rec):
    """Concatenate several column-oriented dicts into one.

    Each element of ``users_rec`` maps a key to a list of values. The result
    maps every key seen to the concatenation of its lists, in input order.
    Empty dicts contribute nothing.

    The input dicts and their lists are never modified: every list in the
    result is newly created here.

    Args:
        users_rec: Iterable of ``{key: list}`` dicts.

    Returns:
        A new ``{key: list}`` dict with the concatenated values.
    """
    result = {}

    for user_rec in users_rec:
        for key, values in user_rec.items():
            # setdefault creates a fresh list on first sight of a key, so the
            # caller's list is copied from rather than aliased and extended.
            result.setdefault(key, []).extend(values)

    return result

In [13]:
def rec_fn(user_id):
    """Return all recommendations for ``user_id`` as a ``{column: list}`` dict.

    Calls the notebook-level ``model`` with ``k=None`` (no top-k cut) and
    converts the resulting frame to a dict of column lists.
    """
    recommendations = model.recommend(user_id, k=None)
    return recommendations.to_dict('list')

In [14]:
# Run rec_fn once for every distinct user id in the dataset through the
# project's ParallelExecutor; each entry of `params` is the argument list of
# one call.
# NOTE(review): fallback_result={} is presumably what the executor returns for
# a call that fails — confirm in ut.ParallelExecutor.
parallel = ut.ParallelExecutor()

result = parallel(
    rec_fn, 
    params          = [[u] for u in dataset.data.user_id.unique()],
    fallback_result = {}
)

2023-10-21 14:37:46,259 - INFO - 0:11:14.71


In [15]:
# Merge the per-user dicts into a single column-oriented dict and build one
# frame holding every (user, item, score) recommendation.
result_df = pd.DataFrame.from_dict(concat_user_recs(result))
# Displaying the frame shows pandas' truncated head/tail preview.
result_df

Unnamed: 0,user_id,movie_id,score
0,791,1,0.731343
1,791,2355,0.604478
2,791,3114,0.731343
3,791,4306,0.768657
4,791,4886,0.731343
...,...,...,...
157227159,74712,171917,0.500000
157227160,74712,167360,0.500000
157227161,74712,174399,0.500000
157227162,74712,174505,0.500000


In [16]:
# Persist all recommendations under DATASET_PATH.
# NOTE(review): the on-disk format is decided by ut.save_df (not visible here);
# the .json extension suggests JSON — confirm.
ut.save_df(result_df, f'{DATASET_PATH}/cb-genre-result.json')