# Recommender ChatBot: Load items and interactions to API

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import os

# Repository-relative locations used throughout the notebook.
BASE_PATH    = '../..'
LIB_PATH     = f'{BASE_PATH}/lib'
API_PATH     = f'{BASE_PATH}/chat-bot-api'
DATASET_PATH = f'{BASE_PATH}/datasets'
ITEMS_PATH   = f'{DATASET_PATH}/chatbot-api-movies.json'


# Environment variables exported before the project modules are imported.
os.environ.update({
    'TMP_PATH'               : f'{BASE_PATH}/tmp',
    'DATASET_PATH'           : f'{BASE_PATH}/datasets',
    'WEIGHTS_PATH'           : f'{BASE_PATH}/weights',
    'METRICS_PATH'           : f'{BASE_PATH}/metrics',
    'MONGODB_URL'            : 'mongodb://0.0.0.0:27017',
    'MONGODB_DATABASE'       : 'chatbot',
    'CHROMA_HOST'            : '0.0.0.0',
    'CHROMA_PORT'            : '9090',
    'TOKENIZERS_PARALLELISM' : 'false'
})

In [3]:
import logging
import os
import random
import sys
from abc import ABC, abstractmethod
from dataclasses import dataclass
from datetime import datetime

sys.path.append(LIB_PATH)
sys.path.append(API_PATH)

import pandas as pd

import util as ut

import torch
import data as dt
import data.dataset as ds

from rest import RecChatBotV1ApiClient

from services import ItemSimQuery

from app_context import AppContext

import pytorch_common.util as pu

from rest import TMDBApiClient

from IPython.core.display import HTML

2024-02-25 16:20:13.550170: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
2024-02-25 16:20:14.334349: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:996] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2024-02-25 16:20:14.344792: W tensorflow/core/common_runtime/gpu/gpu_device.cc:1956] Cannot dlopen some GPU libraries. Please make sure the missing libraries mentioned above are installed properly if you would like to use GPU. Follow the guide at https://www.tensorflow.org/install/gpu for how to download and setup the required libraries for your 

<Figure size 640x480 with 0 Axes>

# Setup

In [4]:
# Configure logging through pytorch_common's LoggerBuilder with console output enabled.
# The value returned by build() is this cell's displayed output.
pu.LoggerBuilder().on_console().build()

<RootLogger root (INFO)>

# Common Functions and Classes

In [43]:
def cos_sim(a, b, k=1):
    """Return the cosine similarity between `a` and `b`.

    The value is one minus the distance reported by strsimpy's `Cosine`,
    which is constructed with `k` forwarded as its `k` argument.
    """
    from strsimpy import Cosine

    distance = Cosine(k=k).distance(a, b)
    return 1 - distance

In [6]:
def is_empty(value):
    """Tell whether `value` carries no content.

    Empty means: `None`, a zero-length `str`, or a `list` that is either
    empty or made up exclusively of empty values (checked recursively).
    Every other value, including other container types, is not empty.
    """
    if value is None:
        return True

    kind = type(value)

    if kind == str:
        return len(value) == 0

    if kind == list:
        # all() over an empty list is True, which also covers the bare [] case.
        return all(is_empty(element) for element in value)

    return False

In [7]:
def items_to_df(models):
    """Build a DataFrame with one row per item model.

    Each row copies the model's `id`, `title`, `release`, `genres`, `rating`
    and `poster` attributes into columns of the same name, in that order.
    """
    columns = ('id', 'title', 'release', 'genres', 'rating', 'poster')

    rows = [
        {column: getattr(model, column) for column in columns}
        for model in models
    ]

    return pd.DataFrame(rows)


def ints_to_df(models):
    """Build a DataFrame with one row per interaction model.

    Columns are `item_id`, `user_id` and `rating`, each taken from the model
    attribute of the same name.
    """
    return pd.DataFrame([
        {
            # Each column reads the attribute with the matching name; the two
            # ids were previously crossed (item_id <- user_id and vice versa).
            'item_id' : model.item_id,
            'user_id' : model.user_id,
            'rating'  : model.rating
        }
        for model in models
    ])

In [8]:
async def append_popularity_score(items, interactions):
    """Attach vote counts and a popularity score to `items`.

    Parameters:
        items:        DataFrame with at least `id` and `rating` columns.
        interactions: DataFrame with at least `item_id` and `user_id` columns.

    Returns:
        A new DataFrame holding the items that have at least one interaction
        (inner merge on `id`), with two extra columns:
          - `votes`: number of interaction rows for the item.
          - `popularity_score`: `rating * votes / max(votes)`.
        Rows are sorted by `popularity_score`, highest first.

    The function is a coroutine only so callers can `await` it; it performs no
    asynchronous work itself.
    """
    item_votes = interactions \
        .groupby(['item_id'])['user_id'] \
        .size() \
        .reset_index(name='votes') \
        .rename(columns={'item_id': 'id'})

    # Cast so the merge key has the same type as items['id']
    # (presumably a string id - confirm against the item model).
    item_votes['id'] = item_votes['id'].astype(str)

    items = item_votes.merge(items, on='id')

    # Scale votes into (0, 1] so the most voted item keeps its full rating.
    items['votes_norm']       = items['votes'] / items['votes'].max()
    items['popularity_score'] = items['rating'] * items['votes_norm']
    items = items.drop(['votes_norm'], axis=1)

    return items.sort_values(
        by        = ['popularity_score'],
        ascending = False
    )

In [9]:
def to_image_html(path, width=300, alt='Not Found Image'):
    """Return an `<img>` tag for `path`.

    Parameters:
        path:  image location placed in the `src` attribute.
        width: value of the `width` attribute.
        alt:   alternative text shown when the image cannot be loaded.

    The `alt` attribute is quoted: an unquoted value ends at the first space,
    so the default text would otherwise be cut down to `Not`. `path` and `alt`
    are HTML-escaped so a quote inside either cannot break out of its attribute.
    """
    from html import escape

    src      = escape(str(path), quote=True)
    alt_text = escape(str(alt),  quote=True)

    return f'<img src="{src}" width="{width}" alt="{alt_text}" >'

def show(df):
    """Display `df` in the notebook as an HTML table without escaping cell contents."""
    table_html = df.to_html(escape=False)
    display(HTML(table_html))

In [10]:
async def search_items_by_content(
    interactions,
    content,
    min_release      = None,
    min_rating       = None,
    order_popularity = True,
    limit            = 10_000
):
    """Search items by `content` and return them as a DataFrame.

    Builds an `ItemSimQuery` from `content`, `min_rating`, `min_release` and
    `limit`, runs it through `ctx.item_service.find_similars_by` (`ctx` is the
    notebook-level application context) and converts the hits with
    `items_to_df`.

    When `order_popularity` is true the result is passed through
    `append_popularity_score` together with `interactions`. The `poster`
    column is replaced by `<img>` markup, and the index is reset, which moves
    the previous index into a column.
    """
    query = (
        ItemSimQuery()
            .contains(content)
            .rating_gte(min_rating)
            .release_gte(min_release)
            .limit_eq(limit)
    )

    # The second element of the service response is not used here.
    found, _ = await ctx.item_service.find_similars_by(query)

    result = items_to_df(found)

    if order_popularity:
        result = await append_popularity_score(result, interactions)

    result['poster'] = result['poster'].apply(to_image_html)

    return result.reset_index()

# Load Dataset

In [11]:
def to_tensor(obs, device, columns):
    """Select `columns` from `obs` and return them as a tensor on `device`.

    A DataFrame selection is converted to its underlying `.values` first;
    any other selection is handed to `torch.tensor` unchanged.
    """
    selected = obs[columns]
    values   = selected.values if type(selected) == pd.DataFrame else selected
    return torch.tensor(values).to(device)

def features_fn(obs, device):
    """Tensor with the `user_seq` and `movie_seq` columns of `obs` on `device`."""
    return to_tensor(obs, device, ['user_seq', 'movie_seq'])


def target_fn(obs, device):
    """Tensor with the `user_movie_rating` column of `obs` on `device`."""
    return to_tensor(obs, device, ['user_movie_rating'])

In [12]:
# Build the dataset from the files under DATASET_PATH, handing the factory the
# feature/target transforms defined above and a CPU device.
dataset = ds.MovieLensTMDBDatasetFactory.from_path(
    path             = DATASET_PATH,
    transform        = features_fn,
    target_transform = target_fn,
    device           = torch.device('cpu'),
    # Keep only rows whose 'user_movie_rating_year' is 2004 or later.
    filter_fn        = lambda df: df[(df['user_movie_rating_year'] >= 2004)]
)

In [13]:
# Build the item catalogue only when the cached JSON does not exist yet.
if not os.path.exists(ITEMS_PATH):
    items = dataset \
        .data[['movie_id', 'movie_title', 'movie_genres', 'movie_overview', 'movie_release_year', 'movie_imdb_id', 'user_movie_rating']]

    # Mean user rating per movie, computed before the per-interaction rows are collapsed.
    item_mean_rating = items.groupby(['movie_id'])['user_movie_rating'].mean().reset_index()

    # One row per movie; the per-interaction rating is replaced by the mean merged back below.
    items = items.drop_duplicates(subset=['movie_id']).drop(columns=['user_movie_rating'])
    items = items.merge(item_mean_rating, on='movie_id')

## Populate item posters

In [14]:
# Resolve a poster URL for every movie title, only when the cached JSON is missing.
if not os.path.exists(ITEMS_PATH):
    client = TMDBApiClient()

    # NOTE(review): `pb` is never updated inside this block, so the progress
    # bar does not advance - confirm whether ParallelExecutor should drive it.
    with dt.progress_bar(items.shape[0], title='Processing') as pb:
        def resolve_url(title):
            """Return `(title, poster)` for `title`, as looked up by the TMDB client."""
            return (title, client.find_first_poster_by(title))

        # One single-element argument list per call.
        params = [[title] for title in items['movie_title']]

        poster_results = ut.ParallelExecutor()(
            resolve_url,
            params = params,
            fallback_result = None
        )

        # Despite its name the mapping is keyed by movie title.
        # Entries equal to the `None` fallback (presumably produced for calls
        # that failed - verify against ParallelExecutor) are skipped, since
        # indexing them would raise.
        poster_by_id = {
            result[0]: result[1]
            for result in poster_results
            if result is not None
        }

In [15]:
# Attach the resolved posters and cache the catalogue, only when the cache is missing.
if not os.path.exists(ITEMS_PATH):
    # Titles without a resolved poster get None instead of raising KeyError.
    items['poster'] = items['movie_title'].apply(lambda title: poster_by_id.get(title))

    # Write to the same path the guard checks, so the cache is found on the next run.
    ut.save_df(items, ITEMS_PATH)

## Load built items

In [177]:
# Read the item catalogue back from its cached JSON file.
items = ut.load_df(ITEMS_PATH)

In [178]:
# Lift pandas' column-width limit so the poster URLs are shown in full,
# then preview the first catalogue rows.
pd.set_option('display.max_colwidth', None)
items[['movie_title', 'poster']].head()

Unnamed: 0,movie_title,poster
0,Toy Story,http://image.tmdb.org/t/p/w500/uXDfjJbdP4ijW5hWSBrPrlKpxab.jpg
1,Clueless,http://image.tmdb.org/t/p/w500/8AwVTcgpTnmeOs4TdTWqcFDXEsA.jpg
2,While You Were Sleeping,http://image.tmdb.org/t/p/w500/qNGO3ETcNwlWqK2kNRpbJSJRlos.jpg
3,Forrest Gump,http://image.tmdb.org/t/p/w500/arw2vcBveWOVZr6pxd9XTd1TdQa.jpg
4,Pretty Woman,http://image.tmdb.org/t/p/w500/hVHUfT801LQATGd26VPzhorIYza.jpg


In [179]:
# Interaction table: keep the user/movie/rating/timestamp columns of the dataset
# and rename them to the shorter names used by the rest of the notebook.
interactions = dataset \
    .data[['user_id', 'movie_id', 'user_movie_rating', 'user_movie_rating_timestamp']] \
    .rename(columns={
        'user_movie_rating'           : 'rating',
        'movie_id'                    : 'item_id',
        'user_movie_rating_timestamp' : 'timestamp'
    })

interactions.head()

Unnamed: 0,user_id,item_id,rating,timestamp
0,100538,1,4,2019-01-01 21:11:09
496,100538,39,3,2019-01-03 11:58:44
8785,100538,339,3,2019-01-04 08:18:24
9194,100538,356,4,2019-01-03 10:37:28
14189,100538,597,4,2019-01-03 11:55:57


## Populate missing data using TMDB Api

In [180]:
@dataclass(frozen=True)
class MovieDataSeeker:
    """Picks, from pre-fetched candidate records, the data that matches a movie title.

    A candidate matches when its `title` or `original_title` equals the wanted
    title (ignoring case and surrounding whitespace) or when the cosine
    similarity of either one exceeds its threshold.
    """
    # Candidate records indexed by movie id; `data[id]` is iterated as a
    # sequence of mappings with at least 'title' and 'original_title'.
    # NOTE(review): annotated as DataFrame, but used like a dict of lists - confirm.
    data                   : pd.DataFrame
    # Similarity a candidate's `title` must exceed to count as a match.
    min_title_sim          : float        = 0.98
    # Similarity a candidate's `original_title` must exceed to count as a match.
    min_original_title_sim : float        = 0.98


    def search_many_by(self, title_by_id, fields=[]):
        """Run `search_by` for every `(id, title)` pair of `title_by_id`.

        Returns a dict `id -> movie data` holding only the ids for which
        `search_by` produced a non-empty result.
        """
        movies = {}
        with dt.progress_bar(len(title_by_id), title='Fetching many movies from TMDB') as pb:

            for id, title in title_by_id.items():
                movie = self.search_by(id, title, fields)
                if movie:
                    movies[id] = movie
                pb.update()

        return movies


    def search_by(self, id, title, fields=[]):
        """Collect the requested `fields` from the candidates of `id` matching `title`.

        Returns a dict with the non-empty requested fields (list values are
        lower-cased strings) plus `id`, `title_sim` and `original_title_sim`.
        The dict is empty when no candidate matches or no requested field has
        a value. When several candidates match, later ones overwrite the
        fields set by earlier ones.
        """
        movie_data = {}

        for movie in self.data[id]:
            title_sim          = cos_sim(title, movie['title'])
            original_title_sim = cos_sim(title, movie['original_title'])

            wanted   = title.strip().lower()
            title_eq = movie['title'].strip().lower() == wanted or movie['original_title'].strip().lower() == wanted

            # Each similarity is checked against its own threshold.
            title_match = (
                title_sim > self.min_title_sim
                or original_title_sim > self.min_original_title_sim
            )

            if not (title_eq or title_match):
                continue

            logging.debug(f'title sim: {title_sim}, original title sim: {original_title_sim}')

            for field in fields:
                value = movie[field]
                if is_empty(value):
                    continue

                movie_data['id'] = id

                if type(value) == list:
                    movie_data[field] = [str(v).lower() for v in value]
                else:
                    movie_data[field] = value

                movie_data['title_sim']          = title_sim
                movie_data['original_title_sim'] = original_title_sim

        return movie_data

In [181]:
# Catalogue titles (whitespace-stripped) indexed by movie id.
item_title_by_id = {
    movie_id: title.strip()
    for movie_id, title in ut.to_dict(items, key='movie_id', value='movie_title').items()
}

# Fields requested from TMDB for every movie.
fields = [
    'title',
    'original_title',
    'poster_url',
    'overview',
    'genres',
    'release_date'
]

# Reuse the raw TMDB data saved by a previous run; otherwise fetch it and save it.
if os.path.exists(f'{DATASET_PATH}/tmdb-api-raw-data.dt'):
    data = ut.Picket.load(f'{DATASET_PATH}/tmdb-api-raw-data.dt')
else:
    client = TMDBApiClient()

    data = client.parallel_find_many_movies_by(
        item_title_by_id,
        fields = fields
    )
    ut.Picket.save(f'{DATASET_PATH}/tmdb-api-raw-data.dt', data)

In [182]:
# Match every catalogue title against the TMDB candidates held in `data`.
# MovieDataSeeker.search_by only reads the `data` it was constructed with.
movie_seeker = MovieDataSeeker(data)

movies = movie_seeker.search_many_by(
    item_title_by_id,
    fields = fields
)

Fetching many movies from TMDB:   0%|          | 0/18608 [00:00<?, ?it/s]

In [183]:
# Persist the matched movie data under DATASET_PATH
# (ut.Picket is presumably a pickle wrapper - confirm).
ut.Picket.save(f'{DATASET_PATH}/movies.dt', movies)

In [184]:
# Reload the matched movie data from the file written by the previous cell.
movies = ut.Picket.load(f'{DATASET_PATH}/movies.dt')

In [185]:
# Number of movies with matched TMDB data next to the catalogue's shape.
len(movies), items.shape

(16149, (18608, 8))

In [186]:
def resolve_missing_data(movies, row, column, field, missing_codition):
    """Return the value to store in `row[column]`, back-filled from `movies` when missing.

    Parameters:
        movies:           mapping `movie id -> movie data` (a dict of fields, or None).
        row:              record exposing `row['movie_id']` and `row[column]`.
        column:           name of the column being completed.
        field:            key of the replacement value inside the movie data.
        missing_codition: predicate that receives the current value and returns
                          True when it must be replaced.

    Returns:
        `movies[movie_id][field]` when the current value is missing and that
        field exists; otherwise the current value unchanged.

    Only the arguments are consulted: the lookup uses a single key on `movies`
    and does not depend on any notebook-level variable.
    """
    current = row[column]

    if not missing_codition(current):
        return current

    movie = movies.get(row['movie_id'])

    if movie is None or field not in movie:
        return current

    return movie[field]

In [187]:
# Items whose overview is the 'No overview found.' placeholder or an empty string (before back-filling).
incomplete_items = items[((items['movie_overview'] == 'No overview found.') | (items['movie_overview'] == ''))]
incomplete_items.shape

(127, 8)

In [188]:
def overview_missing_codition(value):
    """True when the overview is empty or the 'No overview found.' placeholder."""
    return is_empty(value) or value == 'No overview found.'


# Back-fill missing overviews from the matched TMDB data.
items['movie_overview'] = items.apply(
    lambda row: resolve_missing_data(
        movies,
        row,
        column           = 'movie_overview',
        field            = 'overview',
        missing_codition = overview_missing_codition
    ),
    axis = 1
)

In [189]:
# Same overview check as before, re-run after back-filling to see how many items remain incomplete.
incomplete_items = items[((items['movie_overview'] == 'No overview found.') | (items['movie_overview'] == ''))]
incomplete_items.shape

(31, 8)

In [190]:
# Items whose genres contain the '(no genres listed)' placeholder (before back-filling).
incomplete_items = items[items['movie_genres'].apply(lambda x: '(no genres listed)' in x)]
incomplete_items.shape

(286, 8)

In [191]:
def genres_missing_codition(value):
    """True when the only genre entry is the '(no genres listed)' placeholder."""
    return len(value) == 1 and value[0].strip() == '(no genres listed)'


# Back-fill placeholder genres from the matched TMDB data.
items['movie_genres'] = items.apply(
    lambda row: resolve_missing_data(
        movies,
        row,
        column           = 'movie_genres',
        field            = 'genres',
        missing_codition = genres_missing_codition
    ),
    axis = 1
)

In [192]:
# Same placeholder-genre check, re-run after back-filling.
incomplete_items = items[items['movie_genres'].apply(lambda x: '(no genres listed)' in x)]
incomplete_items.shape

(41, 8)

In [193]:
# Items that list exactly one genre (before enriching them from TMDB).
incomplete_items = items[items['movie_genres'].apply(lambda x: len(x) == 1)]
incomplete_items.shape

(6581, 8)

In [194]:
def min_genres_missing_codition(value):
    """True when exactly one genre is listed."""
    return len(value) == 1


# Replace single-genre lists with the genres found in the matched TMDB data.
items['movie_genres'] = items.apply(
    lambda row: resolve_missing_data(
        movies,
        row,
        column           = 'movie_genres',
        field            = 'genres',
        missing_codition = min_genres_missing_codition
    ),
    axis = 1
)

In [195]:
# Same single-genre check, re-run after enriching genres from TMDB.
incomplete_items = items[items['movie_genres'].apply(lambda x: len(x) == 1)]
incomplete_items.shape

(4050, 8)

## Upload items and interactions to RecChatBotAPI

In [196]:
# Application context; other cells reach `item_service` and `interaction_service` through it.
ctx = AppContext()

2024-02-25 16:55:20,590 - INFO - Load pretrained SentenceTransformer: all-mpnet-base-v2
2024-02-25 16:55:21,104 - INFO - Use pytorch device: cuda
2024-02-25 16:55:21,105 - INFO - Anonymized telemetry enabled. See                     https://docs.trychroma.com/telemetry for more information.
2024-02-25 16:55:21,106 - INFO - Anonymized telemetry enabled. See                     https://docs.trychroma.com/telemetry for more information.


In [197]:
# REST client used below to upload items and interactions to the API.
client = RecChatBotV1ApiClient()

In [198]:
# Upload the completed item catalogue to the API.
client.add_items(items)

2024-02-25 16:55:22,927 - INFO - Page Size: 1000
2024-02-25 16:55:32,073 - INFO - Page: 1/19, Size: 1000
2024-02-25 16:55:37,674 - INFO - Page: 2/19, Size: 1000
2024-02-25 16:55:44,024 - INFO - Page: 3/19, Size: 1000
2024-02-25 16:55:49,590 - INFO - Page: 4/19, Size: 1000
2024-02-25 16:55:55,104 - INFO - Page: 5/19, Size: 1000
2024-02-25 16:56:00,822 - INFO - Page: 6/19, Size: 1000
2024-02-25 16:56:06,506 - INFO - Page: 7/19, Size: 1000
2024-02-25 16:56:13,517 - INFO - Page: 8/19, Size: 1000
2024-02-25 16:56:19,246 - INFO - Page: 9/19, Size: 1000
2024-02-25 16:56:25,004 - INFO - Page: 10/19, Size: 1000
2024-02-25 16:56:32,249 - INFO - Page: 11/19, Size: 1000
2024-02-25 16:56:38,229 - INFO - Page: 12/19, Size: 1000
2024-02-25 16:56:44,253 - INFO - Page: 13/19, Size: 1000
2024-02-25 16:56:51,213 - INFO - Page: 14/19, Size: 1000
2024-02-25 16:56:57,110 - INFO - Page: 15/19, Size: 1000
2024-02-25 16:57:02,816 - INFO - Page: 16/19, Size: 1000
2024-02-25 16:57:08,655 - INFO - Page: 17/19, Si

[]

In [None]:
# Size of the interaction table about to be uploaded.
interactions.shape

In [None]:
# Upload the dataset interactions to the API.
client.add_interactions(interactions)

In [None]:
# Read every interaction back through the interaction service and check how many there are.
api_interactions = ints_to_df(await ctx.interaction_service.find_all())
api_interactions.shape

## Fun user interactions

In [None]:
# One entry per synthetic user: (content query, maximum number of items to retrieve).
searchs = [
    ('pixar animated movie for children', 200),
    ('science fiction, action', 100),
    ('war weapons', 70),
    ('iron man, x-men, spider man, bat man, flash, avengers, ant-man, hulk, guardians of the galaxy, marvel, green lantern, superman, watchmen, thor, deadpool, wonder woman, strange, justice League, captain america, logans, kick ass, John Wick', 150),
    ('sci-fi, action, future', 130),
    ('Mission Impossible, spies, bourne identity, 007', 70),
    ('comedy movies', 200),
    ('horror', 100),
    # Was misspelled 'thiller', which weakens a content-based query.
    ('thriller, suspense', 150),
    ('love, romance', 150),
    ('time travel', 100),
    ('dead, zombies, post apocalyptic', 100),
    ('software', 100),
    ('hackers', 100)
]

In [None]:
# Ad-hoc check of the content search with a single query.
# NOTE(review): this rebinds `items`, which until here held the item catalogue.
items = await search_items_by_content(
    interactions, 
    'hackers',
    min_rating  = 3, 
    limit       = 50,
    min_release = 1990,
    order_popularity = False
)


# show(items)

In [None]:
fun_user_interactions = []
max_user_id = interactions['user_id'].max()

for search in searchs:    
    items = await search_items_by_content(
        interactions, 
        search[0],
        min_rating  = 3, 
        limit       = search[1],
        min_release = 1990,
        order_popularity = False
    )    
    
    items = items.rename(columns={'id': 'item_id'})
    items['rating'] = items['item_id'].apply(lambda x: 4 if random.random() > 0.5 else 5)
    items['user_id'] = max_user_id
    items = items[['item_id', 'user_id', 'rating']]
    
    max_user_id +=1
    
    fun_user_interactions.append(items)

fun_user_interactions = pd.concat(fun_user_interactions)
fun_user_interactions['timestamp'] = datetime.now()

In [None]:
# Inspect the generated synthetic interactions.
fun_user_interactions

In [None]:
# Upload the synthetic users' interactions to the API.
client.add_interactions(fun_user_interactions)

In [None]:
# Read every interaction back again to confirm the count after uploading the synthetic users.
api_interactions = ints_to_df(await ctx.interaction_service.find_all())
api_interactions.shape