# Recommender ChatBot: Collaborative Filtering recommender using user embeddings from chromadb

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
BASE_PATH    = '../..'
LIB_PATH     = f'{BASE_PATH}/lib'
API_PATH     = f'{BASE_PATH}/chat-bot-api'

In [3]:
import sys
sys.path.append(LIB_PATH)
sys.path.append(API_PATH)


import torch
import pytorch_common
import pytorch_common.util as pu
import util as ut
import os
import numpy as np
import logging

## Setup

In [4]:
pu.LoggerBuilder().on_console().build()

<RootLogger root (INFO)>

In [5]:
import pytorch_common
pytorch_common.__version__

'0.3.8'

In [6]:
torch.__version__

'2.1.2+cu118'

In [7]:
pu.set_device_name('gpu')

pu.get_device(), torch.cuda.is_available()

(device(type='cuda', index=0), True)

In [8]:
ut.set_seed(42)

In [9]:
os.environ['TMP_PATH']         = f'{BASE_PATH}/tmp'
os.environ['DATASET_PATH']     = f'{BASE_PATH}/datasets'
os.environ['WEIGHTS_PATH']     = f'{BASE_PATH}/weights'
os.environ['METRICS_PATH']     = f'{BASE_PATH}/metrics'
os.environ['MONGODB_URL']      = 'mongodb://0.0.0.0:27017'
os.environ['MONGODB_DATABASE'] = 'chatbot'
os.environ['CHROMA_HOST']      = '0.0.0.0'
os.environ['CHROMA_PORT']      = '9090'


from app_context import AppContext

2024-02-07 20:51:34.367960: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
2024-02-07 20:51:35.151907: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:996] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2024-02-07 20:51:35.163035: W tensorflow/core/common_runtime/gpu/gpu_device.cc:1956] Cannot dlopen some GPU libraries. Please make sure the missing libraries mentioned above are installed properly if you would like to use GPU. Follow the guide at https://www.tensorflow.org/install/gpu for how to download and setup the required libraries for your 

<Figure size 640x480 with 0 Axes>

## Recommender

In [10]:
ctx = AppContext()

2024-02-07 20:51:38,660 - INFO - Load pretrained SentenceTransformer: all-mpnet-base-v2
2024-02-07 20:51:39,209 - INFO - Use pytorch device: cuda
2024-02-07 20:51:39,212 - INFO - Anonymized telemetry enabled. See                     https://docs.trychroma.com/telemetry for more information.
2024-02-07 20:51:39,231 - INFO - Anonymized telemetry enabled. See                     https://docs.trychroma.com/telemetry for more information.


In [15]:
class DatabaseUserItemFilteringRecommender:
    def __init__(
        self,
        user_emb_repository,
        items_repository,
        interactions_repository,
        k_sim_users = 10
    ):
        self.__user_emb_repository     = user_emb_repository
        self.__items_repository        = items_repository
        self.__interactions_repository = interactions_repository
        self.__k_sim_users             = k_sim_users

        
    def __score(self, df, similar_users_result):
        user_distances = {id: similar_users_result.distances[idx] for idx, id in enumerate(similar_users_result.ids)}

        recommendations = df.copy()
        recommendations['score'] = recommendations[self.__user_id_col] \
            .apply(lambda r:  (1 - user_distances[r]))  * df[self.__rating_col]

        recommendations = ut.year_to_decade(recommendations, self.__release_year_col, 'decade')

        return recommendations \
            .groupby([self.__item_id_col, 'decade'])['score'] \
            .mean() \
            .reset_index() \
            .sort_values(['decade', 'score'], ascending=False)


    def __users_similarity(self, similar_users):
        user_sim = {}
        max_distance = np.max(similar_users.distances)
        for idx in range(len(similar_users.str_ids)):
            user_sim[similar_users.str_ids[idx]] = 1 - (similar_users.distances[idx]/max_distance)
        return user_sim


    async def recommend(
        self,
        user_id: int = None,
        k      : int = 100
    ):
        similar_users = self.__user_emb_repository.find_similars_by_id(user_id, limit=self.__k_sim_users)
        
        if similar_users.empty: return []

        similar_user_ids = [id for id in similar_users.str_ids if id != user_id]

        if len(similar_user_ids) ==0:
            logging.warning('Not found similar users')
            return []

        interactions = await self.__interactions_repository.find_many_by(user_id={'$in': similar_user_ids})
        
        if len(interactions) ==0:
            logging.warning('Not found sumilar users interactions')
            return []
        
        item_ids = np.unique([i.item_id for i in interactions]).tolist()
        
        if len(item_ids) ==0:
            logging.warning('Not found items')
            return []

        items = await self.__items_repository.find_many_by(item_id={'$in': item_ids})

        
        sim_by_user_id = self.__users_similarity(similar_users)
        
        sim_by_item_id = {i.item_id:sim_by_user_id[i.user_id] for i in interactions}
        
        max_rating     = np.max([item.rating for item in items])

        scored_items = []
        for item in items:
            rating_score = item.rating / max_rating
            item_score = rating_score * (sim_by_item_id[item.id] * 0.5)
            scored_items.append((item_score, item))

        ordered_items = sorted(scored_items, key=lambda x: x[0], reverse=True)

        return [{'id': item[1].id, 'score': item[0], 'rating': item[1].rating, 'title': item[1].title, 'poster': item[1].poster} for item in scored_items[:k]]

In [16]:
recommender = DatabaseUserItemFilteringRecommender(
    user_emb_repository     = ctx.users_cf_emb_repository,
    items_repository        = ctx.items_repository,
    interactions_repository = ctx.interactions_repository,
)

In [17]:
await recommender.recommend(user_id="adrianmarino@gmail.com")

[{'id': '102481',
  'score': 0.07789426138198585,
  'rating': 3.125,
  'title': 'Internship, The',
  'poster': 'http://image.tmdb.org/t/p/w500/qX3AuVJrx8flXjYnzeFUJtPfg4o.jpg'},
 {'id': '140110',
  'score': 0.09568430559390534,
  'rating': 3.8387096774,
  'title': 'The Intern',
  'poster': 'http://image.tmdb.org/t/p/w500/9UoAC9tu8kIyRy8AcJnGhnH0gOH.jpg'},
 {'id': '1887',
  'score': 0.003264470426713309,
  'rating': 3.5,
  'title': 'Almost Heroes',
  'poster': 'http://image.tmdb.org/t/p/w500/qO1cfr4UxcwQ858Nxp470QNS3v8.jpg'},
 {'id': '3355',
  'score': 0.0,
  'rating': 3.2272727273,
  'title': 'Ninth Gate, The',
  'poster': 'http://image.tmdb.org/t/p/w500/iFDtmA8Bg9zE4NcsjcmGfqFN01H.jpg'},
 {'id': '49272',
  'score': 0.09616417244937328,
  'rating': 3.7933333333,
  'title': 'Casino Royale',
  'poster': 'http://image.tmdb.org/t/p/w500/lMrxYKKhd4lqRzwUHAy5gcx9PSO.jpg'},
 {'id': '55768',
  'score': 0.0722498835192765,
  'rating': 2.85,
  'title': 'Bee Movie',
  'poster': 'http://image.tmdb