# Recommender ChatBot: Collaborative Filtering recommender using user embeddings from chromadb

In [3]:
%load_ext autoreload
%autoreload 2

In [1]:
# Project locations, all relative to this notebook's directory.
# LIB_PATH and API_PATH are appended to sys.path by the import cell so that
# project modules (util, recommender, app_context, ...) can be imported.
BASE_PATH    = '../..'
LIB_PATH     = f'{BASE_PATH}/lib'
API_PATH     = f'{BASE_PATH}/chat-bot-api'

In [2]:
import sys
sys.path.append(LIB_PATH)
sys.path.append(API_PATH)


import torch
import pytorch_common
import pytorch_common.util as pu
import util as ut
import os
import numpy as np
import logging

import pandas as pd
from IPython.core.display import HTML
from recommender import RecommenderResult, to_image_html
import random

2024-02-22 18:46:42.374746: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
2024-02-22 18:46:43.142345: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:996] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2024-02-22 18:46:43.153529: W tensorflow/core/common_runtime/gpu/gpu_device.cc:1956] Cannot dlopen some GPU libraries. Please make sure the missing libraries mentioned above are installed properly if you would like to use GPU. Follow the guide at https://www.tensorflow.org/install/gpu for how to download and setup the required libraries for your 

<Figure size 640x480 with 0 Axes>

## Setup

In [4]:
# Build a console logger with pytorch_common's LoggerBuilder; being the last
# expression of the cell, the returned logger is echoed as the cell output.
pu.LoggerBuilder().on_console().build()

<RootLogger root (INFO)>

In [5]:
# Record the pytorch_common version this notebook was executed with.
# NOTE(review): pytorch_common is already imported in the imports cell, so this
# import is redundant; it is harmless and left in place.
import pytorch_common
pytorch_common.__version__

'0.3.8'

In [6]:
# Record the torch build this notebook was executed with.
torch.__version__

'2.1.2+cu118'

In [7]:
# Ask pytorch_common to use the GPU as its device.
pu.set_device_name('gpu')

# Show the device that was actually resolved together with CUDA availability,
# so a silent mismatch is visible in the cell output.
pu.get_device(), torch.cuda.is_available()

(device(type='cuda', index=0), True)

In [8]:
# Fix the seed for reproducibility.
# NOTE(review): presumably this also seeds Python's `random`, which the
# recommender below uses to sample interactions - confirm in util.set_seed.
ut.set_seed(42)

In [9]:
# Environment configuration. It is set before `app_context` is imported below.
# NOTE(review): presumably AppContext reads these variables - confirm in app_context.

# Local working directories, all under BASE_PATH.
os.environ['TMP_PATH']         = f'{BASE_PATH}/tmp'
os.environ['DATASET_PATH']     = f'{BASE_PATH}/datasets'
os.environ['WEIGHTS_PATH']     = f'{BASE_PATH}/weights'
os.environ['METRICS_PATH']     = f'{BASE_PATH}/metrics'
# MongoDB and Chroma endpoints.
# NOTE(review): 0.0.0.0 is normally a bind address; as a client target it only
# works on some platforms - consider 'localhost' / '127.0.0.1'.
os.environ['MONGODB_URL']      = 'mongodb://0.0.0.0:27017'
os.environ['MONGODB_DATABASE'] = 'chatbot'
os.environ['CHROMA_HOST']      = '0.0.0.0'
os.environ['CHROMA_PORT']      = '9090'


from app_context import AppContext

## Recommender

In [10]:
# Application context: the repositories handed to the recommender further down
# (users_cf_emb_repository, items_repository, interactions_repository,
# pred_interactions_repository) are read from this object.
ctx = AppContext()

2024-02-22 18:46:48,354 - INFO - Load pretrained SentenceTransformer: all-mpnet-base-v2
2024-02-22 18:46:48,935 - INFO - Use pytorch device: cuda
2024-02-22 18:46:48,937 - INFO - Anonymized telemetry enabled. See                     https://docs.trychroma.com/telemetry for more information.
2024-02-22 18:46:48,957 - INFO - Anonymized telemetry enabled. See                     https://docs.trychroma.com/telemetry for more information.


In [335]:
class DatabaseUserItemFilteringRecommenderResult(RecommenderResult):
    """Notebook-renderable result of DatabaseUserItemFilteringRecommender.

    Holds two pandas DataFrames (either may be None):

    * ``data``: recommended items. ``show`` reads the columns 'poster',
      'title', 'genres', 'rating', 'pred_user_rating', 'user_item_sim',
      'user_sim_weighted_rating_score' and
      'user_sim_weighted_pred_rating_score'.
    * ``seen_items``: items the user already interacted with. ``show_seen``
      reads the columns 'poster', 'title', 'genres' and 'rating'.

    'genres' cells are expected to be iterables of strings.
    Rendering goes through IPython's ``display``/``HTML``.
    """

    # Source column -> header shown by show().
    # 'norm_mean_rating' is kept from the original mapping although show()
    # never selects that column (rename ignores missing keys).
    __RECOMMENDED_LABELS = {
        'user_sim_weighted_rating_score'      : 'User sim weighted rating score',
        'user_sim_weighted_pred_rating_score' : 'User sim weighted predicted rating score',
        'pred_user_rating'                    : 'Predicted Rating',
        'user_item_sim'                       : 'User Item Similarity',
        'norm_mean_rating'                    : 'Norm Mean Rating',
        'rating'                              : 'Mean Rating',
        'image'                               : 'Poster',
        'genres'                              : 'Genres'
    }

    # Source column -> header shown by show_seen().
    __SEEN_LABELS = {
        'rating' : 'Rating',
        'image'  : 'Poster',
        'genres' : 'Genres'
    }

    def __init__(
        self,
        recommender_name,
        data,
        seen_items
    ):
        """
        Args:
            recommender_name: name rendered in the header of ``show``.
            data: DataFrame of recommended items, or None.
            seen_items: DataFrame of items already seen by the user, or None.
        """
        self.__recommender_name = recommender_name
        self.__data             = data
        self.__seen_items       = seen_items


    @property
    def data(self):
        """Recommended items exactly as passed to the constructor."""
        return self.__data

    @property
    def seen(self):
        """Seen items exactly as passed to the constructor."""
        return self.__seen_items

    def show(
        self,
        k              = 5,
        sort_by        = ['pred_user_rating', 'user_sim_weighted_rating_score'],
        sort_ascending = False,
        image_width    = 340,
        transpose      = True
    ):
        """Render the top ``k`` recommended items as an HTML table.

        Args:
            k: number of rows kept after sorting.
            sort_by: column(s) of ``data`` to sort by. The list default is
                never mutated. (The previous default referenced
                'user_distance_weighted_rating_score', a column that is not
                produced, so calling show() without sort_by raised KeyError.)
            sort_ascending: forwarded to ``DataFrame.sort_values``.
            image_width: poster width forwarded to ``to_image_html``.
            transpose: when True each item is a column and genres are rendered
                as an HTML list; otherwise each item is a row.
        """
        display(HTML(f'<h2>Recommender: {self.__recommender_name}</h2>'))

        # An empty frame is handled like "no data": there is nothing to sort or render.
        if self.__is_empty(self.__data):
            display(HTML('<h4>Not Found recommendations</h4>'))
            return

        score_columns = [
            'user_sim_weighted_rating_score',
            'user_sim_weighted_pred_rating_score',
            'pred_user_rating',
            'user_item_sim'
        ]
        if transpose:
            columns = ['image', 'genres', 'rating'] + score_columns
        else:
            columns = score_columns + ['rating', 'image', 'genres']

        self.__render(
            self.__data.sort_values(by=sort_by, ascending=sort_ascending).head(k),
            columns     = columns,
            labels      = self.__RECOMMENDED_LABELS,
            image_width = image_width,
            transpose   = transpose
        )


    def show_seen(
        self,
        k              = 1000,
        sort_by        = ['rating'],
        sort_ascending = False,
        image_width    = 340,
        transpose      = True
    ):
        """Render up to ``k`` items already seen by the user as an HTML table.

        Arguments have the same meaning as in ``show``; 'rating' here is the
        value stored in the seen-items frame.
        """
        display(HTML(f'<h2>User Seen items</h2>'))

        if self.__is_empty(self.__seen_items):
            print('Not Found items!')
            return

        if transpose:
            columns = ['image', 'genres', 'rating']
        else:
            columns = ['rating', 'image', 'genres']

        self.__render(
            self.__seen_items.sort_values(by=sort_by, ascending=sort_ascending).head(k),
            columns     = columns,
            labels      = self.__SEEN_LABELS,
            image_width = image_width,
            transpose   = transpose
        )


    @staticmethod
    def __is_empty(df):
        """True when there is no frame or the frame has no rows."""
        return df is None or len(df) == 0


    @staticmethod
    def __genres_as_html_list(genres):
        """Format an iterable of genre names as an HTML unordered list."""
        entries = ''.join(
            F'<li style="text-align:left;vertical-align:top;">{g.capitalize()}</li>'
            for g in genres
        )
        return '<ul>' + entries + '</ul>'


    def __render(self, df, columns, labels, image_width, transpose):
        """Display the tag summary and the HTML table for an already sorted/trimmed frame.

        New frames are built with ``assign`` instead of writing columns into a
        slice, which avoids pandas chained-assignment warnings and never
        touches the frame stored on the instance.
        """
        display(HTML(f'<h4><b>Tags:</b> {self.__as_tags(df, "genres")}</h4>'))

        images = [
            to_image_html(row['poster'], width=image_width, alt=row['title'])
            for _, row in df.iterrows()
        ]
        df = df.assign(image=images)[columns]

        if transpose:
            df = df.assign(genres=df['genres'].apply(self.__genres_as_html_list))
            df = df.rename(columns=labels).reset_index()
            # After reset_index the old index is a regular 'index' column; drop
            # that row once transposed so only the labelled rows remain.
            df = df.T.drop(['index'])
        else:
            df = df.rename(columns=labels).reset_index()

        # escape=False: cells hold raw HTML (<img>, <ul>) that must be rendered as markup.
        display(HTML(df.to_html(escape=False)))


    def __as_tags(self, df, column):
        """Summarise how often each tag occurs in ``df[column]`` as an HTML snippet.

        Returns an empty string when there are no tags (the previous
        np.concatenate based version raised on an empty frame).
        """
        tags       = [tag for row_tags in df[column] for tag in row_tags]
        tag_counts = pd.Series(tags, dtype=object).value_counts().to_dict()
        return ', '.join([f"<b>{tag.replace(')', '').replace('(', '')}</b>({count})" for tag, count in tag_counts.items()])

In [336]:
class DatabaseUserItemFilteringRecommender:
    """Collaborative-filtering recommender built on user embeddings.

    For a target user it looks up similar users in the embedding repository,
    samples items those users interacted with, and scores each candidate item
    by combining the user-to-user similarity (1 - distance) with the item
    rating and with the rating stored for the target user in the
    predicted-interactions repository.
    """

    def __init__(
        self,
        user_emb_repository,
        items_repository,
        interactions_repository,
        pred_interactions_repository
    ):
        """
        Args:
            user_emb_repository: exposes ``find_similars_by_id(user_id, limit=...)``
                returning an object with ``empty``, ``str_ids`` and ``distances``.
            items_repository: exposes awaitable ``find_many_by(item_id=...)``;
                items expose ``id``, ``rating``, ``title``, ``poster``, ``genres``.
            interactions_repository: exposes awaitable ``find_many_by(user_id=...)``;
                interactions expose ``user_id``, ``item_id``, ``rating``.
            pred_interactions_repository: same interface as
                interactions_repository, also filtered by ``item_id``.
        """
        self.__user_emb_repository          = user_emb_repository
        self.__items_repository             = items_repository
        self.__interactions_repository      = interactions_repository
        self.__pred_interactions_repository = pred_interactions_repository


    def __users_distance(self, similar_users):
        """Map every similar user id to its distance from the target user."""
        return dict(zip(similar_users.str_ids, similar_users.distances))


    def __select_interactions(self, interactions, percent, max_items_by_user):
        """Randomly sample interactions, keeping at most ``max_items_by_user`` per user.

        An interaction is kept when ``random.random() >= percent``, i.e. with
        probability ``1 - percent``.
        NOTE(review): the parameter name suggests the fraction to *keep*; both
        readings coincide for the default 0.5, so the original comparison is
        preserved - confirm the intended direction.

        The result is grouped by user, in order of first appearance.
        """
        selected_by_user_id = {}
        for interaction in interactions:
            selected = selected_by_user_id.setdefault(interaction.user_id, [])

            if random.random() >= percent and len(selected) < max_items_by_user:
                selected.append(interaction)

        return [
            interaction
            for selected in selected_by_user_id.values()
            for interaction in selected
        ]


    @staticmethod
    def __max_or_zero(values):
        """Maximum of ``values``, or 0 when empty (np.max raises on empty input)."""
        values = list(values)
        return np.max(values) if len(values) > 0 else 0


    @staticmethod
    def __ratio(value, max_value):
        """``value / max_value``, or 0 when ``max_value`` is 0 (avoids dividing by zero)."""
        return value / max_value if max_value else 0


    def __closest_distance_by_item_id(self, interactions, distance_by_user_id):
        """Map each item id to the smallest distance among the users that interacted with it.

        Keeping the minimum makes the score independent of the order in which
        interactions are returned; previously the last interaction seen for an
        item silently decided its distance.
        """
        distance_by_item_id = {}
        for interaction in interactions:
            distance = distance_by_user_id[interaction.user_id]
            current  = distance_by_item_id.get(interaction.item_id)
            if current is None or distance < current:
                distance_by_item_id[interaction.item_id] = distance
        return distance_by_item_id


    async def recommend(
        self,
        user_id                        : str   = None,
        not_seen                       : bool  = True,
        k_sim_users                    : int   = 5,
        random_selection_items_by_user : float = 0.5,
        max_items_by_user              : int   = 5
    ):
        """Recommend items taken from the interactions of users similar to ``user_id``.

        Args:
            user_id: target user id, compared against ``similar_users.str_ids``.
            not_seen: when True, items the target user already interacted with
                are removed from the candidates.
            k_sim_users: number of similar users requested from the embedding
                repository.
            random_selection_items_by_user: threshold passed as ``percent`` to
                the interaction sampling (see ``__select_interactions``).
            max_items_by_user: maximum sampled interactions per similar user.

        Returns:
            ``[]`` when there are no similar users or no sampled interactions
            (unchanged legacy behaviour). Otherwise a
            DatabaseUserItemFilteringRecommenderResult; its data is None when
            no candidate item is left (e.g. everything was already seen).
            That case used to raise ValueError from np.max on an empty list.
        """
        similar_users = self.__user_emb_repository.find_similars_by_id(user_id, limit=k_sim_users)

        if similar_users.empty: return []

        # The target user may be returned as its own neighbour; drop it.
        similar_user_ids = [uid for uid in similar_users.str_ids if uid != user_id]

        if len(similar_user_ids) == 0:
            logging.warning('Not found similar users')
            return []

        interactions = await self.__interactions_repository.find_many_by(user_id={'$in': similar_user_ids})

        interactions = self.__select_interactions(
            interactions,
            percent           = random_selection_items_by_user,
            max_items_by_user = max_items_by_user
        )

        if len(interactions) == 0:
            logging.warning('Not found sumilar users interactions')
            return []

        item_ids = np.unique([i.item_id for i in interactions]).tolist()

        user_interactions = await self.__interactions_repository.find_many_by(user_id=user_id)

        if not_seen:
            # Set membership keeps the filter linear in the number of candidates.
            seen_item_ids = {i.item_id for i in user_interactions}
            item_ids = [item_id for item_id in item_ids if item_id not in seen_item_ids]

        # Candidates can only run out after the seen filter, so check here.
        items = []
        if len(item_ids) == 0:
            logging.warning('Not found items')
        else:
            items = await self.__items_repository.find_many_by(item_id={'$in': item_ids})

        recommended_items = None
        if len(items) > 0:
            pred_interactions = await self.__pred_interactions_repository.find_many_by(
                user_id=user_id,
                item_id={'$in': item_ids}
            )
            pred_rating_by_item_id = {i.item_id: i.rating for i in pred_interactions}

            distance_by_item_id = self.__closest_distance_by_item_id(
                interactions,
                self.__users_distance(similar_users)
            )

            max_rating      = self.__max_or_zero(item.rating for item in items)
            max_pred_rating = self.__max_or_zero(pred_rating_by_item_id.values())

            rows = []
            for item in items:
                item_sim    = 1 - distance_by_item_id[item.id]
                # Items without a stored prediction count as rating 0.
                pred_rating = pred_rating_by_item_id.get(item.id, 0)

                rows.append({
                    'id'    : item.id,
                    'user_sim_weighted_rating_score'      : self.__ratio(item.rating, max_rating) * item_sim,
                    'user_sim_weighted_pred_rating_score' : self.__ratio(pred_rating, max_pred_rating) * item_sim,
                    'user_item_sim'                       : item_sim,
                    'pred_user_rating'                    : pred_rating,
                    'rating': item.rating,
                    'title' : item.title,
                    'poster': item.poster,
                    'genres': item.genres
                })

            recommended_items = pd.DataFrame(rows)

        seen_item_rating_by_id = {i.item_id: i.rating for i in user_interactions}
        seen_items = await self.__items_repository.find_many_by(
            item_id={'$in': list(seen_item_rating_by_id.keys())}
        )

        # Explicit columns keep the frame schema stable even with zero seen items.
        seen_items = pd.DataFrame(
            [
                {
                    'id'    : item.id,
                    'rating': seen_item_rating_by_id.get(item.id, 0),
                    'title' : item.title,
                    'poster': item.poster,
                    'genres': item.genres
                }
                for item in seen_items
            ],
            columns=['id', 'rating', 'title', 'poster', 'genres']
        )

        return DatabaseUserItemFilteringRecommenderResult(
            self.__class__.__name__,
            recommended_items,
            seen_items
        )

In [337]:
# Wire the recommender to the repositories exposed by the application context.
recommender = DatabaseUserItemFilteringRecommender(
    user_emb_repository          = ctx.users_cf_emb_repository,
    items_repository             = ctx.items_repository,
    interactions_repository      = ctx.interactions_repository,
    pred_interactions_repository = ctx.pred_interactions_repository,

)

# Ask for up to 20 similar users and sample at most 5 interactions from each.
# recommend() samples with `random`, so results can differ between runs unless
# the seed is fixed.
# NOTE(review): the user id is a hard-coded personal e-mail address; consider
# moving it to a config constant / redacting it before sharing the notebook.
recommendations = await recommender.recommend(
    user_id           = "adrianmarino@gmail.com",
    k_sim_users       = 20,
    max_items_by_user = 5
)

In [338]:
# Items the user already interacted with, using show_seen's defaults:
# sorted by the user's own rating, highest first, transposed layout.
recommendations.show_seen()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38
Poster,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
Genres,AdventureAnimationChildrenComedyFantasy,ActionAdventureSci-fi,ActionSci-fiThrillerImax,AdventureAnimationChildrenComedyFantasy,ActionAdventureAnimationChildrenComedy,DramaMysterySci-fiThriller,AdventureAnimationChildrenComedyDramaFantasy,ActionAdventureSci-fi,AdventureAnimationChildrenComedyFantasy,ActionAdventure,ActionCrimeDramaImax,AdventureAnimationChildrenComedyFantasyImax,ActionCrimeDramaMysterySci-fiThrillerImax,Sci-fiImax,ActionAdventureFantasyImax,DramaMysteryThriller,AdventureAnimationChildrenComedy,ActionAdventureSci-fiThrillerWar,MysteryThriller,ActionAdventureAnimationChildrenComedy,AnimationChildrenDrama,ActionSci-fi,ActionAdventureSci-fiImax,ActionAdventureSci-fi,AdventureAnimationChildrenComedyFantasy,AdventureAnimationComedy,War,ActionSci-fiThriller,ActionAdventureSci-fi,ActionAdventureSci-fi,ActionAdventureAnimationChildrenComedyFantasy,AdventureAnimationChildrenComedyFantasy,ActionAdventureSci-fiImax,ActionSci-fiImax,Comedy,ActionAdventureSci-fiThrillerImax,Comedy,ActionAdventureSci-fi,ComedyDramaRomance
Rating,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,3.0,3.0,3.0,3.0,3.0,2.0


In [339]:
# Top 10 recommendations ranked by the similarity-weighted *predicted* rating
# score (descending, since sort_ascending defaults to False).
recommendations.show(
    sort_by        = ['user_sim_weighted_pred_rating_score'],
    k              = 10,
    transpose      = True
)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
Poster,,,,,,,,,,
Genres,ComedyDramaRomance,AdventureAnimationDramaFantasySci-fi,ActionSci-fiThriller,Comedy,AdventureFantasy,CrimeMysteryThriller,Comedy,Drama,Drama,HorrorSci-fi
Mean Rating,3.0,4.0,4.272401,4.044776,4.232258,3.971429,3.956522,3.957143,3.97619,4.132479
User sim weighted rating score,0.593739,0.792099,0.845178,0.799971,0.837065,0.785916,0.782516,0.782337,0.787737,0.817596
User sim weighted predicted rating score,0.989565,0.950177,0.942475,0.938053,0.899265,0.897509,0.891985,0.888523,0.888106,0.888004
Predicted Rating,5.031342,4.828352,4.794106,4.772671,4.575248,4.563766,4.538283,4.522411,4.510912,4.516483
User Item Similarity,0.989565,0.990123,0.989114,0.988894,0.98891,0.989462,0.988894,0.988513,0.990567,0.989232


In [340]:
# Top 10 recommendations ranked by the similarity-weighted item rating score
# (descending; transpose defaults to True).
recommendations.show(
    sort_by        = ['user_sim_weighted_rating_score'],
    k              = 10
)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
Poster,,,,,,,,,,
Genres,ActionAdventureDramaFantasy,ActionSci-fiThriller,AdventureFantasy,DramaHorrorRomance,ActionAdventureSci-fi,CrimeDrama,HorrorSci-fi,Drama,Drama,ComedyMysteryThriller
Mean Rating,5.0,4.272401,4.232258,4.225,4.208738,4.143617,4.132479,4.10929,4.107317,4.068966
User sim weighted rating score,0.988815,0.845178,0.837065,0.835288,0.832078,0.819533,0.817596,0.812731,0.812145,0.805218
User sim weighted predicted rating score,0.823705,0.942475,0.899265,0.87747,0.86195,0.87572,0.888004,0.874495,0.829102,0.73315
Predicted Rating,4.19122,4.794106,4.575248,4.466183,4.387161,4.455455,4.516483,4.449297,4.219355,3.728013
User Item Similarity,0.988815,0.989114,0.98891,0.988506,0.988513,0.98891,0.989232,0.988894,0.988657,0.989462


In [341]:
# Top 10 recommendations ranked only by user-item similarity (1 - distance of
# the similar user the item came from), descending.
recommendations.show(
    sort_by        = ['user_item_sim'],
    k              = 10
)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
Poster,,,,,,,,,,
Genres,CrimeDrama,Drama,Drama,ComedyDrama,DramaRomanceSci-fi,AdventureAnimationDramaFantasySci-fi,(no genres listed),ComedyDrama,ActionSci-fiImax,ActionAdventureComedyCrime
Mean Rating,3.555556,3.97619,3.6,3.811688,3.902439,4.0,3.181818,3.333333,4.009524,3.650685
User sim weighted rating score,0.704403,0.787737,0.713208,0.755147,0.773126,0.792099,0.630078,0.660082,0.793984,0.722926
User sim weighted predicted rating score,0.7934,0.888106,0.731931,0.798283,0.84091,0.950177,0.824596,0.747846,0.849907,0.754244
Predicted Rating,4.029877,4.510912,3.717661,4.054683,4.271195,4.828352,4.19021,3.800201,4.318829,3.832714
User Item Similarity,0.990567,0.990567,0.990567,0.990567,0.990567,0.990123,0.990123,0.990123,0.990123,0.990123
