# Recommender ChatBot: Collaborative Filtering recommender using user embeddings from chromadb

In [1]:
# Enable the IPython autoreload extension in mode 2 so that edited modules
# are re-imported automatically before each cell runs.
%load_ext autoreload
%autoreload 2

In [2]:
# Project locations, all expressed relative to this notebook's directory.
BASE_PATH = '../..'
LIB_PATH = BASE_PATH + '/lib'
API_PATH = BASE_PATH + '/chat-bot-api'

In [3]:
import sys
sys.path.append(LIB_PATH)
sys.path.append(API_PATH)


import torch
import pytorch_common
import pytorch_common.util as pu
import util as ut
import os
import numpy as np
import logging

import pandas as pd
from IPython.core.display import HTML
from recommender import RecommenderResult, to_image_html
import random

2024-02-22 19:56:21.751240: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
2024-02-22 19:56:22.517541: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:996] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2024-02-22 19:56:22.528614: W tensorflow/core/common_runtime/gpu/gpu_device.cc:1956] Cannot dlopen some GPU libraries. Please make sure the missing libraries mentioned above are installed properly if you would like to use GPU. Follow the guide at https://www.tensorflow.org/install/gpu for how to download and setup the required libraries for your 

<Figure size 640x480 with 0 Axes>

## Setup

In [4]:
# Build a console logger via pytorch_common; the recorded output shows the
# call returns the root logger at INFO level.
pu.LoggerBuilder().on_console().build()

<RootLogger root (INFO)>

In [5]:
# Record the pytorch_common version used for this run.
# (The module is already imported in the notebook's imports cell, so it is not re-imported here.)
pytorch_common.__version__

'0.3.8'

In [6]:
# Record the torch version used for this run.
torch.__version__

'2.1.2+cu118'

In [7]:
# Ask pytorch_common to use the GPU, then display the resolved device together
# with CUDA availability so a CPU fallback is visible in the output.
# NOTE(review): behaviour of set_device_name('gpu') without CUDA is not shown here - confirm.
pu.set_device_name('gpu')

pu.get_device(), torch.cuda.is_available()

(device(type='cuda', index=0), True)

In [8]:
# Fix the random seed for reproducibility.
# NOTE(review): presumably this also seeds Python's `random` module, which the
# recommender's interaction sampling relies on - confirm in util.set_seed.
ut.set_seed(42)

In [9]:
# Configure the application through environment variables: local working
# directories plus the MongoDB and Chroma endpoints.
os.environ['TMP_PATH']         = f'{BASE_PATH}/tmp'
os.environ['DATASET_PATH']     = f'{BASE_PATH}/datasets'
os.environ['WEIGHTS_PATH']     = f'{BASE_PATH}/weights'
os.environ['METRICS_PATH']     = f'{BASE_PATH}/metrics'
os.environ['MONGODB_URL']      = 'mongodb://0.0.0.0:27017'
os.environ['MONGODB_DATABASE'] = 'chatbot'
os.environ['CHROMA_HOST']      = '0.0.0.0'
os.environ['CHROMA_PORT']      = '9090'


# NOTE(review): imported after the environment is set, presumably because
# app_context reads these variables - confirm before moving it to the imports cell.
from app_context import AppContext

## Recommender

In [10]:
# Build the application context; the repositories used by the recommender
# below are read from this object.
ctx = AppContext()

2024-02-22 19:56:24,237 - INFO - Load pretrained SentenceTransformer: all-mpnet-base-v2
2024-02-22 19:56:24,800 - INFO - Use pytorch device: cuda
2024-02-22 19:56:24,802 - INFO - Anonymized telemetry enabled. See                     https://docs.trychroma.com/telemetry for more information.
2024-02-22 19:56:24,827 - INFO - Anonymized telemetry enabled. See                     https://docs.trychroma.com/telemetry for more information.


In [96]:
class DatabaseUserItemFilteringRecommenderResult(RecommenderResult):
    """
    Notebook view over the output of a user-item filtering recommender.

    Wraps two pandas DataFrames (either may be None or empty):

    * ``data``: recommended items. ``show`` reads the columns ``poster``,
      ``title``, ``genres``, ``rating``, ``pred_user_rating``,
      ``user_sim_weighted_rating_score``,
      ``user_sim_weighted_pred_rating_score`` and ``user_item_sim``.
    * ``seen``: items already rated by the user. ``show_seen`` reads the
      columns ``poster``, ``title``, ``genres`` and ``rating``.

    Both ``show*`` methods render HTML through IPython's ``display``.
    """

    def __init__(
        self,
        recommender_name,
        data,
        seen_items
    ):
        self.__recommender_name = recommender_name
        self.__data             = data
        self.__seen_items       = seen_items


    @property
    def data(self): return self.__data

    @property
    def seen(self): return self.__seen_items


    def show(
        self,
        k              = 5,
        sort_by        = ['pred_user_rating', 'user_sim_weighted_rating_score'],
        sort_ascending = False,
        image_width    = 340,
        transpose      = True
    ):
        """
        Render the top ``k`` recommendations as an HTML table.

        Params:
            k: maximum number of rows to display.
            sort_by: column names used to rank the rows (never mutated here).
            sort_ascending: sort direction passed to ``DataFrame.sort_values``.
            image_width: poster width forwarded to ``to_image_html``.
            transpose: when True, one item per column with genres rendered as
                an HTML list; otherwise one item per row.
        """
        display(HTML(f'<h2>Recommendations</h2>'))

        df = self.__top_k(self.__data, k, sort_by, sort_ascending)
        if df is None:
            display(HTML('<h4>Not Found recommendations</h4>'))
            return

        display(HTML(f'<h4><b>Tags:</b> {self.__as_tags(df, "genres")}</h4>'))

        df = self.__with_poster(df, image_width)

        labels = {
            'user_sim_weighted_rating_score'      : 'User sim weighted rating score',
            'user_sim_weighted_pred_rating_score' : 'User sim weighted predicted rating score',
            'user_item_sim'                       : 'User Item Similarity',
            'pred_user_rating'                    : 'Predicted Rating',
            'rating'                              : 'Mean Rating',
            'image'                               : 'Poster',
            'genres'                              : 'Genres'
        }

        if transpose:
            df = self.__transposed(
                df,
                [
                    'image',
                    'genres',
                    'rating',
                    'pred_user_rating',
                    'user_sim_weighted_rating_score',
                    'user_sim_weighted_pred_rating_score',
                    'user_item_sim'
                ],
                labels
            )
        else:
            df = df[[
                'user_sim_weighted_rating_score',
                'user_sim_weighted_pred_rating_score',
                'pred_user_rating',
                'user_item_sim',
                'rating',
                'image',
                'genres'
            ]].rename(columns=labels).reset_index()

        display(HTML(df.to_html(escape=False)))
        display(HTML(f'<h5><b>Generated by</b> {self.__recommender_name}.</h5>'))


    def show_seen(
        self,
        k              = 1000,
        sort_by        = ['rating'],
        sort_ascending = False,
        image_width    = 340,
        transpose      = True
    ):
        """
        Render up to ``k`` items already rated by the user as an HTML table.

        Parameters have the same meaning as in ``show``.
        """
        display(HTML(f'<h2>User Seen items</h2>'))

        df = self.__top_k(self.__seen_items, k, sort_by, sort_ascending)
        if df is None:
            print('Not Found items!')
            return

        display(HTML(f'<h4><b>Tags:</b> {self.__as_tags(df, "genres")}</h4>'))

        df = self.__with_poster(df, image_width)

        labels = {
            'image'  : 'Poster',
            'genres' : 'Genres',
            'rating' : 'Rating'
        }

        if transpose:
            df = self.__transposed(df, ['image', 'genres', 'rating'], labels)
        else:
            df = df[['rating', 'image', 'genres']].rename(columns=labels).reset_index()

        display(HTML(df.to_html(escape=False)))


    def __top_k(self, df, k, sort_by, sort_ascending):
        """Return an independent copy of the first k sorted rows, or None when there is nothing to show."""
        if df is None or df.empty:
            return None

        # Copy so that the columns added later never write into a slice of the wrapped frame.
        top = df.sort_values(by=sort_by, ascending=sort_ascending)[:k].copy()

        return None if top.empty else top


    def __with_poster(self, df, image_width):
        """Add an 'image' column holding the poster HTML built from each row's 'poster' and 'title'."""
        df['image'] = df.apply(
            lambda row: to_image_html(row['poster'], width=image_width, alt=row['title']),
            axis=1
        )
        return df


    def __transposed(self, df, columns, labels):
        """Select columns, render genres as an HTML list, relabel and transpose (one item per column)."""
        df = df[columns].copy()

        df['genres'] = df['genres'].apply(
            lambda genres: '<ul>' + ''.join([F'<li style="text-align:left;vertical-align:top;">{g.capitalize()}</li>' for g in genres]) + '</ul>'
        )

        df = df.rename(columns=labels).reset_index()

        # The original index is only needed to keep the row order; drop it from the view.
        return df.T.drop(['index'])


    def __as_tags(self, df, column):
        """Summarise the list-valued column as '<b>tag</b>(count)' pairs ordered by frequency."""
        tag_counts = pd.Series(np.concatenate(df[column].apply(np.array).values)).value_counts().to_dict()
        return ', '.join([f"<b>{tag.replace(')', '').replace('(', '')}</b>({count})" for tag, count in tag_counts.items()])

In [97]:
class DatabaseUserItemFilteringRecommender:
    """
    Collaborative-filtering recommender backed by repositories.

    Looks up the users closest to a target user through the user-embedding
    repository, samples items those users interacted with and scores each
    item by ``(1 - neighbour distance)`` times a normalized rating.
    """

    def __init__(
        self,
        user_emb_repository,
        items_repository,
        interactions_repository,
        pred_interactions_repository
    ):
        self.__user_emb_repository          = user_emb_repository
        self.__items_repository             = items_repository
        self.__interactions_repository      = interactions_repository
        self.__pred_interactions_repository = pred_interactions_repository


    def __users_distance(self, similar_users):
        """Map each similar user id to its distance from the target user."""
        return dict(zip(similar_users.str_ids, similar_users.distances))


    def __select_interactions(self, interactions, percent, max_items_by_user):
        """
        Randomly sample interactions, keeping at most ``max_items_by_user`` per user.

        An interaction is kept when ``random.random() >= percent``, i.e. with
        probability ``1 - percent``.
        NOTE(review): the caller's parameter name suggests ``percent`` is the
        fraction to keep; the two only coincide at 0.5 - confirm the intent.
        """
        interactions_by_user_id = {}
        for i in interactions:
            inters = interactions_by_user_id.setdefault(i.user_id, [])

            if random.random() >= percent and len(inters) < max_items_by_user:
                inters.append(i)

        return [i for inters in interactions_by_user_id.values() for i in inters]


    def __empty_result(self):
        """Result returned when nothing can be recommended; its show methods handle None frames."""
        return DatabaseUserItemFilteringRecommenderResult(
            self.__class__.__name__,
            None,
            None
        )


    def __score_items(self, items, distance_by_item_id, pred_rating_by_item_id):
        """
        Build one record per item with its similarity-weighted scores.

        Ratings are normalized by the maximum over ``items`` and predicted
        ratings by the maximum over ``pred_rating_by_item_id``. A missing
        prediction counts as 0, and an empty or zero maximum yields a
        normalized value of 0 instead of raising or dividing by zero.
        """
        ratings      = [item.rating for item in items]
        pred_ratings = list(pred_rating_by_item_id.values())

        # np.max raises ValueError on an empty sequence, so guard both cases.
        max_rating      = np.max(ratings)      if ratings      else 0
        max_pred_rating = np.max(pred_ratings) if pred_ratings else 0

        records = []
        for item in items:
            item_sim    = (1 - distance_by_item_id[item.id])
            pred_rating = pred_rating_by_item_id.get(item.id, 0)

            norm_rating      = item.rating / max_rating      if max_rating != 0      else 0
            norm_pred_rating = pred_rating / max_pred_rating if max_pred_rating != 0 else 0

            records.append({
                'id'    : item.id,
                'user_sim_weighted_rating_score'      : norm_rating * item_sim,
                'user_sim_weighted_pred_rating_score' : norm_pred_rating * item_sim,
                'user_item_sim'                       : item_sim,
                'pred_user_rating'                    : pred_rating,
                'rating': item.rating,
                'title' : item.title,
                'poster': item.poster,
                'genres': item.genres
            })

        return records


    async def recommend(
        self,
        user_id                        : str   = None,
        not_seen                       : bool  = True,
        k_sim_users                    : int   = 5,
        random_selection_items_by_user : float = 0.5,
        max_items_by_user              : int   = 5
    ):
        """
        Recommend items that users similar to ``user_id`` interacted with.

        Params:
            user_id: id of the target user (compared against the string ids
                returned by the embedding repository).
            not_seen: when True, items the user already interacted with are excluded.
            k_sim_users: number of neighbours requested from the embedding repository.
            random_selection_items_by_user: sampling threshold forwarded to the
                interaction sampler (see ``__select_interactions``).
            max_items_by_user: maximum sampled interactions per neighbour.

        Returns:
            A ``DatabaseUserItemFilteringRecommenderResult``. When no neighbour,
            interaction or candidate item is found, its data and seen frames are None.
        """
        similar_users = self.__user_emb_repository.find_similars_by_id(user_id, limit=k_sim_users)

        if similar_users.empty:
            return self.__empty_result()

        similar_user_ids = [id for id in similar_users.str_ids if id != user_id]

        if len(similar_user_ids) == 0:
            logging.warning('Not found similar users')
            return self.__empty_result()

        interactions = await self.__interactions_repository.find_many_by(user_id={'$in': similar_user_ids})

        interactions = self.__select_interactions(
            interactions,
            percent           = random_selection_items_by_user,
            max_items_by_user = max_items_by_user
        )

        if len(interactions) == 0:
            logging.warning('Not found sumilar users interactions')
            return self.__empty_result()

        item_ids = np.unique([i.item_id for i in interactions]).tolist()

        user_interactions = await self.__interactions_repository.find_many_by(user_id=user_id)

        if not_seen:
            # Set membership keeps the filter linear in the number of candidates.
            seen_item_ids = {i.item_id for i in user_interactions}
            item_ids = [item_id for item_id in item_ids if item_id not in seen_item_ids]

        # Checked after the seen filter: every candidate may have been removed by it.
        if len(item_ids) == 0:
            logging.warning('Not found items')
            return self.__empty_result()

        items = list(await self.__items_repository.find_many_by(item_id={'$in': item_ids}))

        if len(items) == 0:
            logging.warning('Not found items')
            return self.__empty_result()

        pred_interactions = await self.__pred_interactions_repository.find_many_by(
            user_id=user_id,
            item_id={'$in': item_ids}
        )
        pred_rating_by_item_id = {i.item_id: i.rating for i in pred_interactions}

        distance_by_user_id = self.__users_distance(similar_users)

        # NOTE(review): when several neighbours share an item, the last interaction
        # iterated decides its distance - confirm whether the closest neighbour should win.
        distance_by_item_id = {i.item_id: distance_by_user_id[i.user_id] for i in interactions}

        recommended_items = pd.DataFrame(
            self.__score_items(items, distance_by_item_id, pred_rating_by_item_id)
        )

        seen_item_rating_by_id = {i.item_id: i.rating for i in user_interactions}
        seen_items = await self.__items_repository.find_many_by(
            item_id={'$in': list(seen_item_rating_by_id.keys())}
        )

        seen_items = pd.DataFrame([
            {
                'id'    : item.id,
                'rating': seen_item_rating_by_id.get(item.id, 0),
                'title' : item.title,
                'poster': item.poster,
                'genres': item.genres
            }
            for item in seen_items
        ])

        return DatabaseUserItemFilteringRecommenderResult(
            self.__class__.__name__,
            recommended_items,
            seen_items
        )

In [98]:
recommender = DatabaseUserItemFilteringRecommender(
    user_emb_repository          = ctx.users_cf_emb_repository,
    items_repository             = ctx.items_repository,
    interactions_repository      = ctx.interactions_repository,
    pred_interactions_repository = ctx.pred_interactions_repository,

)

recommendations = await recommender.recommend(
    user_id           = "adrianmarino@gmail.com",
    k_sim_users       = 20,
    max_items_by_user = 5,
    not_seen          = True
)

In [99]:
# Items the target user has already rated, sorted by the user's rating (method defaults).
recommendations.show_seen()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38
Poster,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
Genres,AdventureAnimationChildrenComedyFantasy,ActionAdventureSci-fi,ActionSci-fiThrillerImax,AdventureAnimationChildrenComedyFantasy,ActionAdventureAnimationChildrenComedy,DramaMysterySci-fiThriller,AdventureAnimationChildrenComedyDramaFantasy,ActionAdventureSci-fi,AdventureAnimationChildrenComedyFantasy,ActionAdventure,ActionCrimeDramaImax,AdventureAnimationChildrenComedyFantasyImax,ActionCrimeDramaMysterySci-fiThrillerImax,Sci-fiImax,ActionAdventureFantasyImax,DramaMysteryThriller,AdventureAnimationChildrenComedy,ActionAdventureSci-fiThrillerWar,MysteryThriller,ActionAdventureAnimationChildrenComedy,AnimationChildrenDrama,ActionSci-fi,ActionAdventureSci-fiImax,ActionAdventureSci-fi,AdventureAnimationChildrenComedyFantasy,AdventureAnimationComedy,War,ActionSci-fiThriller,ActionAdventureSci-fi,ActionAdventureSci-fi,ActionAdventureAnimationChildrenComedyFantasy,AdventureAnimationChildrenComedyFantasy,ActionAdventureSci-fiImax,ActionSci-fiImax,Comedy,ActionAdventureSci-fiThrillerImax,Comedy,ActionAdventureSci-fi,ComedyDramaRomance
Rating,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,3.0,3.0,3.0,3.0,3.0,2.0


In [100]:
# Top 10 recommendations ranked by the similarity-weighted predicted rating.
recommendations.show(k=10, sort_by=['user_sim_weighted_pred_rating_score'])

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
Poster,,,,,,,,,,
Genres,DramaWar,ActionSci-fiThriller,Comedy,AdventureComedySci-fi,ActionAdventureAnimationDramaFantasy,HorrorSci-fi,Thriller,ActionComedySci-fi,ComedyDramaWar,DramaHorrorRomance
Mean Rating,3.864865,4.272401,4.044776,4.101064,4.217687,4.132479,3.964286,3.786517,4.25,4.225
Predicted Rating,4.854262,4.794106,4.772671,4.655716,4.526951,4.516483,4.507064,4.494987,4.484792,4.466183
User sim weighted rating score,0.850376,0.939087,0.888857,0.901674,0.926805,0.90844,0.871099,0.831952,0.933881,0.928098
User sim weighted predicted rating score,0.990123,0.976856,0.972273,0.948918,0.922166,0.920398,0.918091,0.915537,0.913554,0.909479
User Item Similarity,0.990123,0.989114,0.988894,0.989386,0.988841,0.989232,0.988815,0.988715,0.988815,0.988506


### Notes
* Movies seen by similar users weighted by user predicted rating.
* Ordered by **User sim weighted predicted rating score**.
* **User sim weighted predicted rating score** = similar_user_similarity (0..1) * user_predicted_rating (normalized to 0..1)

In [101]:
# Top 10 recommendations ranked by the similarity-weighted mean rating.
recommendations.show(k=10, sort_by=['user_sim_weighted_rating_score'])

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
Poster,,,,,,,,,,
Genres,ComedyDramaRomance,Drama,ActionSci-fiThriller,Comedy,ComedyDramaWar,DramaHorrorRomance,ActionAdventureAnimationDramaFantasy,ActionAdventureSci-fi,CrimeDrama,HorrorSci-fi
Mean Rating,4.5,4.310345,4.272401,4.25,4.25,4.225,4.217687,4.208738,4.143617,4.132479
Predicted Rating,4.365303,4.462064,4.794106,4.009136,4.484792,4.466183,4.526951,4.387161,4.455455,4.516483
User sim weighted rating score,0.98934,0.946852,0.939087,0.934492,0.933881,0.928098,0.926805,0.924532,0.910592,0.90844
User sim weighted predicted rating score,0.889686,0.908647,0.976856,0.817197,0.913554,0.909479,0.922166,0.893394,0.907665,0.920398
User Item Similarity,0.98934,0.988513,0.989114,0.989462,0.988815,0.988506,0.988841,0.988513,0.98891,0.989232


### Notes
* Movies seen by similar users weighted by movie mean rating.
* Ordered by **User sim weighted rating score**.
* **User sim weighted rating score** = similar_user_similarity (0..1) * mean_movie_rating (normalized to 0..1)

In [102]:
# Top 10 recommendations ranked only by the similarity of the user each item came from.
recommendations.show(k=10, sort_by=['user_item_sim'])

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
Poster,,,,,,,,,,
Genres,CrimeDrama,DramaMysteryThriller,Thriller,ComedyDrama,Sci-fi,ActionAdventureComedyCrime,ActionSci-fiImax,DramaWar,ComedyDrama,Drama
Mean Rating,3.555556,4.025,3.857143,4.119658,4.037037,3.650685,4.009524,3.864865,3.333333,3.772727
Predicted Rating,4.029877,4.216056,3.970502,4.372216,4.248519,3.832714,4.318829,4.854262,3.800201,4.229575
User sim weighted rating score,0.78267,0.886007,0.849058,0.906844,0.888259,0.803251,0.882205,0.850376,0.733425,0.829636
User sim weighted predicted rating score,0.822342,0.860334,0.810226,0.8922,0.86657,0.781758,0.880911,0.990123,0.775126,0.86222
User Item Similarity,0.990567,0.990567,0.990567,0.990567,0.990123,0.990123,0.990123,0.990123,0.990123,0.989565


### Notes
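* Movies seen by similar users.
* Ordered by **User Item Similarity** (the similarity of the similar user each movie was taken from).
* A sample of movies is taken from each similar user; each movie is assigned that user's similarity, and the movies are ordered by that similarity.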