# Recommender ChatBot: Collaborative Filtering recommender using user embeddings from chromadb

In [1]:
# Enable IPython's autoreload extension (mode 2) so edits to imported
# project modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
# Repository root relative to this notebook, and the two project folders
# that the imports cell appends to sys.path.
BASE_PATH = '../..'
LIB_PATH  = BASE_PATH + '/lib'
API_PATH  = BASE_PATH + '/chat-bot-api'

In [3]:
import sys
sys.path.append(LIB_PATH)
sys.path.append(API_PATH)


import torch
import pytorch_common
import pytorch_common.util as pu
import util as ut
import os
import numpy as np
import logging

import pandas as pd
from IPython.core.display import HTML
from recommender import RecommenderResult, to_image_html
import random

2024-02-18 22:03:29.912305: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
2024-02-18 22:03:30.681097: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:996] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2024-02-18 22:03:30.691584: W tensorflow/core/common_runtime/gpu/gpu_device.cc:1956] Cannot dlopen some GPU libraries. Please make sure the missing libraries mentioned above are installed properly if you would like to use GPU. Follow the guide at https://www.tensorflow.org/install/gpu for how to download and setup the required libraries for your 

<Figure size 640x480 with 0 Axes>

## Setup

In [4]:
# Build a logger with console output through pytorch_common's LoggerBuilder;
# the captured cell output shows the root logger at INFO level.
pu.LoggerBuilder().on_console().build()

<RootLogger root (INFO)>

In [5]:
# Display the installed pytorch_common version.
# NOTE(review): pytorch_common is already imported in the imports cell, so
# this re-import is redundant.
import pytorch_common
pytorch_common.__version__

'0.3.8'

In [6]:
# Display the installed PyTorch version.
torch.__version__

'2.1.2+cu118'

In [7]:
# Ask pytorch_common to use the GPU device.
pu.set_device_name('gpu')

# Display the resolved device together with CUDA availability.
pu.get_device(), torch.cuda.is_available()

(device(type='cuda', index=0), True)

In [8]:
# Seed with 42 through the project's util helper.
# NOTE(review): presumably this seeds every RNG used below (including the
# stdlib `random` module used by the recommender) - confirm in util.set_seed.
ut.set_seed(42)

In [9]:
# Process-wide configuration: working folders under BASE_PATH plus the
# MongoDB and Chroma connection settings.
# NOTE(review): presumably these variables are read by app_context - confirm.
os.environ.update({
    'TMP_PATH'         : f'{BASE_PATH}/tmp',
    'DATASET_PATH'     : f'{BASE_PATH}/datasets',
    'WEIGHTS_PATH'     : f'{BASE_PATH}/weights',
    'METRICS_PATH'     : f'{BASE_PATH}/metrics',
    'MONGODB_URL'      : 'mongodb://0.0.0.0:27017',
    'MONGODB_DATABASE' : 'chatbot',
    'CHROMA_HOST'      : '0.0.0.0',
    'CHROMA_PORT'      : '9090',
})


# Imported only after the environment variables above have been set.
from app_context import AppContext

## Recommender

In [10]:
# Application context; the repositories handed to the recommender below
# (users_cf_emb_repository, items_repository, ...) are attributes of it.
ctx = AppContext()

2024-02-18 22:03:32,376 - INFO - Load pretrained SentenceTransformer: all-mpnet-base-v2
2024-02-18 22:03:32,943 - INFO - Use pytorch device: cuda
2024-02-18 22:03:32,944 - INFO - Anonymized telemetry enabled. See                     https://docs.trychroma.com/telemetry for more information.
2024-02-18 22:03:32,964 - INFO - Anonymized telemetry enabled. See                     https://docs.trychroma.com/telemetry for more information.


In [11]:
class DatabaseUserItemFilteringRecommenderResult(RecommenderResult):
    """Notebook-oriented result of the user/item filtering recommender.

    Holds two pandas data frames, the recommended items and the items already
    rated by the user, and renders each one as an HTML table with poster
    images.
    """

    def __init__(
        self,
        recommender_name,
        data,
        seen_items
    ):
        """
        Args:
            recommender_name: Label printed as a header by show().
            data: Data frame with the recommended items, or None.
            seen_items: Data frame with the items rated by the user, or None.
        """
        self.__recommender_name = recommender_name
        self.__data             = data
        self.__seen_items       = seen_items


    @property
    def data(self): return self.__data

    @property
    def seen(self): return self.__seen_items


    @staticmethod
    def __top_k(df, k, sort_by, sort_ascending):
        """Return a copy of the first k rows of df ordered by sort_by.

        Returns None when there is nothing to render (df is None, df is empty
        or the slice is empty). Sort columns missing from df are skipped with
        a warning instead of raising KeyError.
        """
        if df is None or df.empty:
            return None

        columns = [sort_by] if isinstance(sort_by, str) else list(sort_by)
        present = [column in df.columns for column in columns]

        if not all(present):
            missing = [column for column, found in zip(columns, present) if not found]
            logging.warning(f'Ignoring unknown sort columns: {missing}')

            # Keep a per-column ascending spec aligned with the surviving columns.
            if isinstance(sort_ascending, (list, tuple)):
                sort_ascending = [asc for asc, found in zip(sort_ascending, present) if found]

            columns = [column for column, found in zip(columns, present) if found]

        ordered = df.sort_values(by=columns, ascending=sort_ascending) if columns else df

        # Copy the slice so adding the 'image' column never writes into a view.
        top = ordered.iloc[:k].copy()

        return None if top.empty else top


    @staticmethod
    def __print_genre_counts(df):
        """Print how many of the rows in df carry each genre."""
        print(pd.Series(np.concatenate(df['genres'].apply(np.array).values)).value_counts())


    @staticmethod
    def __with_image_column(df, image_width):
        """Add an 'image' column holding the poster HTML built by to_image_html."""
        df['image'] = df.apply(
            lambda row: to_image_html(row['poster'], width=image_width, alt=row['title']),
            axis=1
        )
        return df


    def show(
        self,
        k              = 5,
        sort_by        = ['pred_user_rating', 'user_distance_weighted_rating_score'],
        sort_ascending = False,
        image_width    = 150,
    ):
        """Display the top k recommended items as an HTML table.

        Args:
            k: Maximum number of rows to display.
            sort_by: Column name(s) used to rank the items. Names that are not
                columns of the data frame are ignored with a warning.
            sort_ascending: Sort direction passed to DataFrame.sort_values.
            image_width: Poster width passed to to_image_html.
        """
        print(f'\nRecommender: {self.__recommender_name}\n')

        df = self.__top_k(self.__data, k, sort_by, sort_ascending)

        if df is None:
            print('Not Found recommendations!')
            return

        self.__print_genre_counts(df)

        df = self.__with_image_column(df, image_width)

        df = df[['user_sim_weighted_rating_score', 'user_sim_weighted_pred_rating_score', 'pred_user_rating', 'user_item_sim', 'rating', 'image', 'genres']]

        # Each source column appears exactly once: a repeated dict key would
        # silently overwrite the earlier label.
        df = df.rename(columns={
            'user_sim_weighted_rating_score'      : 'User sim weighted rating score',
            'user_sim_weighted_pred_rating_score' : 'User sim weighted pred rating score',
            'pred_user_rating'                    : 'Predicted User Rating',
            'user_item_sim'                       : 'User Item Similarity',
            'rating'                              : 'Mean Rating',
            'image'                               : 'Movies',
            'genres'                              : 'Genres'
        })

        # reset_index() keeps the original row label as an 'index' column.
        df = df.reset_index()

        display(HTML(df.to_html(escape=False)))


    def show_seen(
        self,
        k              = 1000,
        sort_by        = ['rating'],
        sort_ascending = False,
        image_width    = 150,
    ):
        """Display up to k items already rated by the user as an HTML table.

        Args:
            k: Maximum number of rows to display.
            sort_by: Column name(s) used to order the items. Names that are not
                columns of the data frame are ignored with a warning.
            sort_ascending: Sort direction passed to DataFrame.sort_values.
            image_width: Poster width passed to to_image_html.
        """
        print(f'\nUser Seen items\n')

        df = self.__top_k(self.__seen_items, k, sort_by, sort_ascending)

        if df is None:
            print('Not Found items!')
            return

        self.__print_genre_counts(df)

        df = self.__with_image_column(df, image_width)

        df = df[['rating', 'image', 'genres']]

        df = df.rename(columns={
            'rating' : 'User Rating',
            'image'  : 'Movies',
            'genres' : 'Genres'
        })

        # reset_index() keeps the original row label as an 'index' column.
        df = df.reset_index()

        display(HTML(df.to_html(escape=False)))

In [12]:
class DatabaseUserItemFilteringRecommender:
    """Recommends items rated by the users most similar to a target user.

    Similar users come from a user-embedding repository. Their interactions
    are randomly sampled, and the resulting candidate items are scored by
    combining the user similarity with the item rating and with the rating
    predicted for the target user.
    """

    def __init__(
        self,
        user_emb_repository,
        items_repository,
        interactions_repository,
        pred_interactions_repository
    ):
        """
        Args:
            user_emb_repository: Provides find_similars_by_id(user_id, limit).
            items_repository: Async repository queried by item_id.
            interactions_repository: Async repository queried by user_id.
            pred_interactions_repository: Async repository queried by user_id
                and item_id, holding predicted ratings.
        """
        self.__user_emb_repository          = user_emb_repository
        self.__items_repository             = items_repository
        self.__interactions_repository      = interactions_repository
        self.__pred_interactions_repository = pred_interactions_repository


    def __users_distance(self, similar_users):
        """Map each similar user id to the distance reported for it."""
        return dict(zip(similar_users.str_ids, similar_users.distances))


    @staticmethod
    def __ratio(value, maximum):
        """Return value / maximum, or 0 when maximum is zero."""
        return value / maximum if maximum else 0


    def __select_interactions(self, interactions, percent, max_items_by_user):
        """Randomly sample interactions, keeping at most max_items_by_user per user.

        An interaction is kept when a uniform draw in [0, 1) is >= percent,
        that is, with probability (1 - percent), while the user is still under
        the cap. The result is grouped by user in first-appearance order.
        """
        interactions_by_user_id = {}
        for interaction in interactions:
            selected = interactions_by_user_id.setdefault(interaction.user_id, [])

            # One draw per interaction, even for users that already hit the cap.
            if random.random() >= percent and len(selected) < max_items_by_user:
                selected.append(interaction)

        return [
            interaction
            for selected in interactions_by_user_id.values()
            for interaction in selected
        ]


    async def recommend(
        self,
        user_id                        : int   = None,
        not_seen                       : bool  = True,
        k_sim_users                    : int   = 5,
        random_selection_items_by_user : float = 0.5,
        max_items_by_user              : int   = 5
    ):
        """Recommend items to user_id taken from its most similar users.

        Args:
            user_id: Identifier of the target user.
            not_seen: When True, items the user already interacted with are
                removed from the candidates.
            k_sim_users: Number of similar users requested from the embedding
                repository.
            random_selection_items_by_user: Threshold of the random sampling;
                each interaction is kept with probability (1 - threshold).
            max_items_by_user: Maximum interactions kept per similar user.

        Returns:
            A DatabaseUserItemFilteringRecommenderResult, or an empty list
            when no similar users, interactions or candidate items are found.
        """
        similar_users = self.__user_emb_repository.find_similars_by_id(user_id, limit=k_sim_users)

        if similar_users.empty: return []

        similar_user_ids = [sim_id for sim_id in similar_users.str_ids if sim_id != user_id]

        if len(similar_user_ids) == 0:
            logging.warning('Not found similar users')
            return []

        interactions = await self.__interactions_repository.find_many_by(user_id={'$in': similar_user_ids})

        interactions = self.__select_interactions(
            interactions,
            percent           = random_selection_items_by_user,
            max_items_by_user = max_items_by_user
        )

        if len(interactions) == 0:
            logging.warning('Not found sumilar users interactions')
            return []

        item_ids = np.unique([i.item_id for i in interactions]).tolist()

        user_interactions = await self.__interactions_repository.find_many_by(user_id=user_id)

        if not_seen:
            seen_item_ids = {i.item_id for i in user_interactions}
            item_ids = [item_id for item_id in item_ids if item_id not in seen_item_ids]

        # Every candidate may already be seen; bail out before the max()
        # computations below, which fail on empty input.
        if len(item_ids) == 0:
            logging.warning('Not found items')
            return []

        items = await self.__items_repository.find_many_by(item_id={'$in': item_ids})

        if len(items) == 0:
            logging.warning('Not found items')
            return []

        pred_interactions = await self.__pred_interactions_repository.find_many_by(
            user_id=user_id,
            item_id={'$in': item_ids}
        )
        pred_rating_by_item_id = {i.item_id: i.rating for i in pred_interactions}

        distance_by_user_id = self.__users_distance(similar_users)

        # NOTE(review): when several similar users rated the same item, the
        # distance of the last one in `interactions` wins - confirm that this
        # is intended rather than e.g. the closest user.
        distance_by_item_id = {i.item_id: distance_by_user_id[i.user_id] for i in interactions}

        max_rating = np.max([item.rating for item in items])

        # Items without a prediction count as 0, so no predictions at all
        # means a maximum of 0.
        pred_ratings    = list(pred_rating_by_item_id.values())
        max_pred_rating = np.max(pred_ratings) if pred_ratings else 0

        records = []
        for item in items:
            item_sim    = 1 - distance_by_item_id[item.id]
            pred_rating = pred_rating_by_item_id.get(item.id, 0)

            records.append({
                'id'    : item.id,
                'user_sim_weighted_rating_score'      : self.__ratio(item.rating, max_rating) * item_sim,
                'user_sim_weighted_pred_rating_score' : self.__ratio(pred_rating, max_pred_rating) * item_sim,
                'user_item_sim'                       : item_sim,
                'pred_user_rating'                    : pred_rating,
                'rating': item.rating,
                'title' : item.title,
                'poster': item.poster,
                'genres': item.genres
            })

        recommended_items = pd.DataFrame(records)

        seen_item_rating_by_id = {i.item_id: i.rating for i in user_interactions}
        seen_items = await self.__items_repository.find_many_by(
            item_id={'$in': list(seen_item_rating_by_id.keys())}
        )

        seen_items = pd.DataFrame([
            {
                'id'    : item.id,
                'rating': seen_item_rating_by_id.get(item.id, 0),
                'title' : item.title,
                'poster': item.poster,
                'genres': item.genres
            }
            for item in seen_items
        ])

        return DatabaseUserItemFilteringRecommenderResult(
            self.__class__.__name__,
            recommended_items,
            seen_items
        )

# Pendiente

- Agregar fun users por géneros.

In [13]:
# Wire the recommender to the repositories exposed by the application context.
recommender = DatabaseUserItemFilteringRecommender(
    user_emb_repository          = ctx.users_cf_emb_repository,
    items_repository             = ctx.items_repository,
    interactions_repository      = ctx.interactions_repository,
    pred_interactions_repository = ctx.pred_interactions_repository,

)

# Top-level await: recommend() is a coroutine. Up to 20 similar users are
# requested and at most 5 sampled interactions are kept per similar user.
# NOTE(review): the user id is a hard-coded e-mail address; consider moving it
# to a configuration constant before sharing the notebook.
recommendations = await recommender.recommend(
    user_id           = "adrianmarino@gmail.com",
    k_sim_users       = 20,
    max_items_by_user = 5
)

In [14]:
# Items already rated by the user, ordered by the user's rating (descending by default).
recommendations.show_seen()

Unnamed: 0,index,User Rating,Movies,Genres
0,0,5.0,,"[adventure, animation, children, comedy, fantasy]"
1,11,5.0,,"[action, adventure, sci-fi]"
2,1,5.0,,"[action, sci-fi, thriller, imax]"
3,24,5.0,,"[adventure, animation, children, comedy, fantasy]"
4,16,5.0,,"[action, adventure, animation, children, comedy]"
5,26,5.0,,"[drama, mystery, sci-fi, thriller]"
6,14,5.0,,"[adventure, animation, children, comedy, drama, fantasy]"
7,13,5.0,,"[action, adventure, sci-fi]"
8,27,5.0,,"[adventure, animation, children, comedy, fantasy]"
9,22,5.0,,"[action, adventure]"


In [18]:
# Top 10 recommendations ranked by the similarity-weighted predicted rating score.
recommendations.show(
    sort_by        = ['user_sim_weighted_pred_rating_score'],
    k              = 10
)

Unnamed: 0,index,User sim weighted pred rating score,user_sim_weighted_pred_rating_score,Predicted User Rating,User Item Similarity,Mean Rating,Movies,Genres
0,70,0.868326,0.989232,4.806043,0.989232,4.388889,,"[action, crime, drama, thriller]"
1,65,0.844945,0.986385,4.794106,0.988841,4.272401,,"[action, sci-fi, thriller]"
2,16,0.801325,0.983689,4.772671,0.990567,4.044776,,[comedy]
3,41,0.810956,0.957789,4.655716,0.988715,4.101064,,"[adventure, comedy, sci-fi]"
4,58,0.866358,0.942234,4.57359,0.990123,4.375,,[drama]
5,63,0.79157,0.940116,4.566357,0.989462,4.0,,[drama]
6,72,0.834125,0.931418,4.526951,0.988841,4.217687,,"[action, adventure, animation, drama, fantasy]"
7,7,0.787737,0.929738,4.510912,0.990567,3.97619,,[drama]
8,32,0.817596,0.929632,4.516483,0.989232,4.132479,,"[horror, sci-fi]"
9,34,0.789877,0.927072,4.503552,0.98934,3.991935,,"[crime, horror]"


In [19]:
# Top 10 recommendations ranked by the similarity-weighted item rating score.
recommendations.show(
    sort_by        = ['user_sim_weighted_rating_score'],
    k              = 10
)

Unnamed: 0,index,User sim weighted pred rating score,user_sim_weighted_pred_rating_score,Predicted User Rating,User Item Similarity,Mean Rating,Movies,Genres
0,18,0.988815,0.862319,4.19122,0.988815,5.0,,"[action, adventure, drama, fantasy]"
1,70,0.868326,0.989232,4.806043,0.989232,4.388889,,"[action, crime, drama, thriller]"
2,58,0.866358,0.942234,4.57359,0.990123,4.375,,[drama]
3,65,0.844945,0.986385,4.794106,0.988841,4.272401,,"[action, sci-fi, thriller]"
4,13,0.840493,0.92272,4.484792,0.988815,4.25,,"[comedy, drama, war]"
5,15,0.835288,0.918604,4.466183,0.988506,4.225,,"[drama, horror, romance]"
6,72,0.834125,0.931418,4.526951,0.988841,4.217687,,"[action, adventure, animation, drama, fantasy]"
7,67,0.817696,0.890232,4.326774,0.988841,4.134615,,"[adventure, animation, children, drama, sci-fi]"
8,32,0.817596,0.929632,4.516483,0.989232,4.132479,,"[horror, sci-fi]"
9,31,0.812731,0.91549,4.449297,0.988894,4.10929,,[drama]


In [20]:
# Top 10 recommendations ranked by the user/item similarity alone.
recommendations.show(
    sort_by        = ['user_item_sim'],
    k              = 10
)

Unnamed: 0,index,User sim weighted pred rating score,user_sim_weighted_pred_rating_score,Predicted User Rating,User Item Similarity,Mean Rating,Movies,Genres
0,16,0.801325,0.983689,4.772671,0.990567,4.044776,,[comedy]
1,10,0.773126,0.88033,4.271195,0.990567,3.902439,,"[drama, romance, sci-fi]"
2,21,0.764152,0.818355,3.970502,0.990567,3.857143,,[thriller]
3,7,0.787737,0.929738,4.510912,0.990567,3.97619,,[drama]
4,40,0.707231,0.78584,3.814456,0.990123,3.571429,,"[animation, children, fantasy, musical]"
5,50,0.760159,0.877912,4.26137,0.990123,3.83871,,[comedy]
6,58,0.866358,0.942234,4.57359,0.990123,4.375,,[drama]
7,22,0.793984,0.889749,4.318829,0.990123,4.009524,,"[action, sci-fi, imax]"
8,36,0.630078,0.863252,4.19021,0.990123,3.181818,,[(no genres listed)]
9,77,0.746727,0.83085,4.035209,0.989565,3.773006,,"[drama, romance]"
