# Recommender ChatBot: Load items and interactions to API

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import os

# Repository-relative locations. LIB_PATH and API_PATH are appended to sys.path
# in the import cell so the project packages can be imported.
BASE_PATH    = '../..'
LIB_PATH     = f'{BASE_PATH}/lib'
API_PATH     = f'{BASE_PATH}/chat-bot-api'
DATASET_PATH = f'{BASE_PATH}/datasets'
# Cached movie catalog; the catalog-building cells only run when this file is missing.
ITEMS_PATH   = f'{DATASET_PATH}/chatbot-api-movies.json'


# Environment configuration, set before the project modules are imported
# (presumably those modules read these values — TODO confirm which ones do).
os.environ['TMP_PATH']               = f'{BASE_PATH}/tmp'
os.environ['DATASET_PATH']           = f'{BASE_PATH}/datasets'
os.environ['WEIGHTS_PATH']           = f'{BASE_PATH}/weights'
os.environ['METRICS_PATH']           = f'{BASE_PATH}/metrics'
os.environ['MONGODB_URL']            = 'mongodb://0.0.0.0:27017'
os.environ['MONGODB_DATABASE']       = 'chatbot'
os.environ['CHROMA_HOST']            = '0.0.0.0'
os.environ['CHROMA_PORT']            = '9090'
os.environ['TOKENIZERS_PARALLELISM'] = 'false'

In [208]:
import sys
sys.path.append(LIB_PATH)
sys.path.append(API_PATH)

import pandas as pd

import util as ut

import torch
import data as dt
import data.dataset as ds

import os

from rest import RecChatBotV1ApiClient

from services import ItemSimQuery

from app_context import AppContext

import pytorch_common.util as pu
from datetime import datetime

# Setup

In [4]:
# Build the logger with console output enabled (LoggerBuilder comes from pytorch_common.util).
pu.LoggerBuilder().on_console().build()

<RootLogger root (INFO)>

# Common Functions and Classes

In [5]:
def fetch_items_data(df, id_col='movie_id', title_col='movie_title'):
    """Look up TMDB search results for every row of `df`.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding at least the `id_col` and `title_col` columns.
    id_col : str
        Column whose value becomes the key of the returned mapping.
    title_col : str
        Column whose value is passed to `TMDBApiClient.find_movies_by`.

    Returns
    -------
    dict
        Maps each row's `id_col` value to whatever `find_movies_by` returned
        for that row's title. Empty when `df` has no rows.
    """
    from rest import TMDBApiClient

    # One client is enough for the whole scan; the poster-fetching cell of this
    # notebook shares a single client across all of its lookups as well.
    client = TMDBApiClient()

    return {
        row[id_col]: client.find_movies_by(row[title_col])
        for _, row in df.iterrows()
    }

def resolve_overview(data, row):
    """Pick an overview for `row` from previously fetched search results.

    The candidates stored under `row['movie_id']` are scanned in order. Once a
    candidate's `title` or `original_title` equals `row['movie_title']`
    (compared case-insensitively, ignoring surrounding whitespace), the first
    non-empty `overview` found at or after that candidate is returned.

    Parameters
    ----------
    data : dict
        Maps movie ids to a list of candidate dicts with `title`,
        `original_title` and `overview` entries.
    row : mapping
        Must provide `movie_id`, `movie_title` and `movie_overview`.

    Returns
    -------
    The resolved overview, or `row['movie_overview']` when the id is unknown,
    no candidate title matches, or no matching candidate carries an overview.
    """
    candidates = data.get(row['movie_id']) or []
    if len(candidates) == 0:
        return row['movie_overview']

    wanted_title = row['movie_title'].strip().lower()

    def same_title(candidate_title):
        # Missing / null titles never match instead of raising.
        return isinstance(candidate_title, str) and candidate_title.strip().lower() == wanted_title

    title_matched = False
    for movie in candidates:
        if not title_matched:
            title_matched = same_title(movie.get('title')) or same_title(movie.get('original_title'))

        # A null overview is treated like an empty one.
        overview = movie.get('overview') or ''
        if title_matched and len(overview) > 0:
            return overview

    return row['movie_overview']

In [6]:
def items_to_df(models):
    """Flatten item models into a DataFrame.

    Each model contributes one row with the columns `id`, `title`, `release`,
    `genres`, `rating` and `poster`, read from the attributes of the same name.
    """
    records = []
    for model in models:
        records.append({
            'id'          : model.id,
            'title'       : model.title,
            'release'     : model.release,
            'genres'      : model.genres,
            'rating'      : model.rating,
            'poster'      : model.poster
        })

    return pd.DataFrame(records)


def ints_to_df(models):
    """Flatten interaction models into a DataFrame.

    Each model contributes one row with the columns `item_id`, `user_id` and
    `rating`, each read from the model attribute of the same name.
    """
    return pd.DataFrame([
        {
            # Each column takes the attribute of the same name (the two ids
            # were previously crossed).
            'item_id' : model.item_id,
            'user_id' : model.user_id,
            'rating'  : model.rating
        }
        for model in models
    ])

In [129]:
async def append_popularity_score(items, interactions):
    """Attach vote counts and a popularity score to `items`.

    Parameters
    ----------
    items : pandas.DataFrame
        Needs an `id` column (merge key) and a `rating` column.
    interactions : pandas.DataFrame
        Needs `item_id` and `user_id` columns; one row per interaction.

    Returns
    -------
    pandas.DataFrame
        The items that have at least one interaction (inner merge), with two
        extra columns: `votes` (interaction count per item) and
        `popularity_score` (`rating` times `votes / max(votes)`), sorted by
        `popularity_score` in descending order.

    Notes
    -----
    Declared `async` only so callers can `await` it; nothing is awaited inside.
    """
    item_votes = interactions \
        .groupby(['item_id'])['user_id'] \
        .size() \
        .reset_index(name='votes') \
        .rename(columns={'item_id': 'id'})

    # Cast the key to str so it can be merged with `items['id']`
    # (presumably string ids — TODO confirm against the item model).
    item_votes['id'] = item_votes['id'].astype(str)

    items = item_votes.merge(items, on='id')

    # Votes are scaled by the most-voted item, so the score is at most `rating`.
    items['votes_norm']       = items['votes'] / items['votes'].max()
    items['popularity_score'] = items['rating'] * items['votes_norm']
    items = items.drop(['votes_norm'], axis=1)

    return items.sort_values(
        by        = ['popularity_score'],
        ascending = False
    )

In [130]:
from abc import ABC, abstractmethod
from IPython.core.display import HTML

def to_image_html(path, width=300, alt='Not Found Image'):
    """Return an `<img>` tag pointing at `path`.

    Parameters
    ----------
    path : image URL or path placed in the `src` attribute.
    width : value of the `width` attribute.
    alt : alternative text shown when the image cannot be loaded.
    """
    # The alt text is quoted: an unquoted attribute value ends at the first
    # space, which would cut the default text down to "Not".
    return f'<img src="{path}" width="{width}" alt="{alt}" >'

def show(df):
    """Display `df` as an HTML table, leaving cell contents unescaped so embedded tags render."""
    table_markup = df.to_html(escape=False)
    display(HTML(table_markup))

In [131]:
async def search_items_by_content(
    interactions,
    content,
    min_release      = None,
    min_rating       = None,
    order_popularity = True,
    limit            = 10_000
):
    """Search items similar to `content` and return them as a DataFrame.

    The query is sent through the notebook-level `ctx.item_service`, so `ctx`
    must exist before this coroutine is awaited.

    Parameters
    ----------
    interactions : pandas.DataFrame
        Only used when `order_popularity` is true, to compute vote counts.
    content : text handed to `ItemSimQuery.contains`.
    min_release, min_rating : values handed to `release_gte` / `rating_gte`.
    order_popularity : when true, the result goes through
        `append_popularity_score` (adds score columns and sorts by them).
    limit : value handed to `limit_eq`.

    Returns
    -------
    pandas.DataFrame
        Items with the `poster` column turned into `<img>` markup. The index
        is reset without `drop`, so the previous index is kept as a column.
    """
    query = ItemSimQuery() \
        .contains(content) \
        .rating_gte(min_rating) \
        .release_gte(min_release) \
        .limit_eq(limit)

    # The second element of the service response is not used here.
    found_models, _ = await ctx.item_service.find_similars_by(query)

    result = items_to_df(found_models)

    if order_popularity:
        result = await append_popularity_score(result, interactions)

    result['poster'] = result['poster'].apply(to_image_html)

    return result.reset_index()

# Load Dataset

In [11]:
def to_tensor(obs, device, columns):
    """Select `columns` from `obs` and return them as a tensor on `device`.

    A DataFrame selection is converted to its underlying array first; any
    other selection is passed to `torch.tensor` unchanged.
    """
    selected = obs[columns]
    payload = selected.values if type(selected) == pd.DataFrame else selected
    return torch.tensor(payload).to(device)

def features_fn(obs, device):
    """Feature transform: the `user_seq` and `movie_seq` columns as a tensor on `device`."""
    return to_tensor(obs, device, ['user_seq', 'movie_seq'])


def target_fn(obs, device):
    """Target transform: the `user_movie_rating` column as a tensor on `device`."""
    return to_tensor(obs, device, ['user_movie_rating'])

In [12]:
# Build the MovieLens/TMDB dataset from DATASET_PATH on CPU. `filter_fn` keeps
# only the rows whose `user_movie_rating_year` is 2004 or later.
dataset = ds.MovieLensTMDBDatasetFactory.from_path(
    path             = DATASET_PATH,
    transform        = features_fn,
    target_transform = target_fn,
    device           = torch.device('cpu'),
    filter_fn        = lambda df: df[(df['user_movie_rating_year'] >= 2004)]
)

In [14]:
# Build the movie catalog from the dataset, only when no cached catalog file exists.
if not os.path.exists(ITEMS_PATH):
    items = dataset \
        .data[['movie_id', 'movie_title', 'movie_genres', 'movie_overview', 'movie_release_year', 'movie_imdb_id', 'user_movie_rating']]

    # Mean user rating per movie, computed before the per-user rows are collapsed.
    item_mean_rating = items.groupby(['movie_id'])['user_movie_rating'].mean().reset_index()

    # One row per movie; the per-user rating column is replaced by the mean rating.
    items = items.drop_duplicates(subset=['movie_id']).drop(columns=['user_movie_rating'])
    items = items.merge(item_mean_rating, on='movie_id')

## Populate items poster

In [15]:
# Resolve a poster for every movie title, only when no cached catalog file exists.
if not os.path.exists(ITEMS_PATH):
    from rest import TMDBApiClient
    # NOTE(review): `logging` is not used anywhere in this cell.
    import logging

    # A single client is shared by every lookup below.
    client = TMDBApiClient()

    # NOTE(review): `pb` is never referenced inside the block; confirm whether
    # the progress bar advances without being updated explicitly.
    with dt.progress_bar(items.shape[0], title='Processing') as pb:
        def resolve_url(title):
            # Returns the title alongside its poster so results can be keyed afterwards.
            return (title, client.find_poster_by(title))

        # One single-element argument list per catalog row.
        params = [[row['movie_title']] for _, row in items.iterrows()]

        # NOTE(review): presumably `fallback_result` is what a failed call yields;
        # if so, a None entry would make `item[0]` below raise — confirm.
        poster_by_id = ut.ParallelExecutor()(
            resolve_url,
            params = params,
            fallback_result = None
        )

        # Despite the name, this mapping is keyed by movie title, not by id.
        poster_by_id = {item[0]: item[1] for item in poster_by_id}

In [16]:
# Attach the resolved posters and cache the catalog, only when no cached file exists.
if not os.path.exists(ITEMS_PATH):
    # `poster_by_id` is keyed by movie title.
    items['poster'] = items['movie_title'].apply(lambda title: poster_by_id[title])

    # Write to the same path the guard checks, so the two cannot drift apart.
    ut.save_df(items, ITEMS_PATH)

## Load built items

In [17]:
# Load the cached catalog from the path that guards the build cells.
items = ut.load_df(ITEMS_PATH)

In [18]:
# Lift the column-width limit so the poster URLs are shown in full.
# Note: this changes the pandas display option for the rest of the session.
pd.set_option('display.max_colwidth', None)
items[['movie_title', 'poster']].head()

Unnamed: 0,movie_title,poster
0,Toy Story,http://image.tmdb.org/t/p/w500/uXDfjJbdP4ijW5hWSBrPrlKpxab.jpg
1,Clueless,http://image.tmdb.org/t/p/w500/8AwVTcgpTnmeOs4TdTWqcFDXEsA.jpg
2,While You Were Sleeping,http://image.tmdb.org/t/p/w500/qNGO3ETcNwlWqK2kNRpbJSJRlos.jpg
3,Forrest Gump,http://image.tmdb.org/t/p/w500/arw2vcBveWOVZr6pxd9XTd1TdQa.jpg
4,Pretty Woman,http://image.tmdb.org/t/p/w500/hVHUfT801LQATGd26VPzhorIYza.jpg


In [19]:
# Extract the user/movie rating rows from the dataset and rename the columns to
# the names used for interactions in the rest of the notebook:
# `item_id`, `rating` and `timestamp` (`user_id` is kept as is).
interactions = dataset \
    .data[['user_id', 'movie_id', 'user_movie_rating', 'user_movie_rating_timestamp']] \
    .rename(columns={
        'user_movie_rating'           : 'rating',
        'movie_id'                    : 'item_id',
        'user_movie_rating_timestamp' : 'timestamp'
    })

interactions.head()

Unnamed: 0,user_id,item_id,rating,timestamp
0,100538,1,4,2019-01-01 21:11:09
496,100538,39,3,2019-01-03 11:58:44
8785,100538,339,3,2019-01-04 08:18:24
9194,100538,356,4,2019-01-03 10:37:28
14189,100538,597,4,2019-01-03 11:55:57


In [20]:
# Catalog rows whose overview is either the 'No overview found.' placeholder or empty.
incomplete_items = items[((items['movie_overview'] == 'No overview found.') | (items['movie_overview'] == ''))]
incomplete_items.shape

(127, 8)

In [21]:
# Query TMDB only for the rows that lack an overview; `data` maps movie id to search results.
data = fetch_items_data(incomplete_items)

# Rows whose id is absent from `data` keep their current overview (see resolve_overview).
# Note: this overwrites `items['movie_overview']` in place.
items['movie_overview'] = items.apply(lambda row: resolve_overview(data, row), axis=1) 

In [22]:
# Re-count the rows still lacking an overview after the TMDB lookup.
incomplete_items = items[((items['movie_overview'] == 'No overview found.') | (items['movie_overview'] == ''))]
incomplete_items.shape

(31, 8)

## Upload items and interactions to RecChatBotAPI

In [23]:
# Application context; `search_items_by_content` reads this notebook-level name,
# and the verification cells use `ctx.interaction_service`.
ctx = AppContext()

2024-02-18 18:51:09,431 - INFO - Load pretrained SentenceTransformer: all-mpnet-base-v2
2024-02-18 18:51:09,982 - INFO - Use pytorch device: cuda
2024-02-18 18:51:09,984 - INFO - Anonymized telemetry enabled. See                     https://docs.trychroma.com/telemetry for more information.
2024-02-18 18:51:10,003 - INFO - Anonymized telemetry enabled. See                     https://docs.trychroma.com/telemetry for more information.


In [24]:
# REST client for the chatbot API. Note: `client` is also the name bound to the
# TMDB client in the poster-fetching cell; from here on it refers to this client.
client = RecChatBotV1ApiClient()

In [25]:
# Upload the movie catalog through the API client (the log below shows it is sent in pages).
client.add_items(items)

2024-02-18 18:51:10,081 - INFO - Page Size: 1000
2024-02-18 18:51:15,886 - INFO - Page: 1/19, Size: 1000
2024-02-18 18:51:21,472 - INFO - Page: 2/19, Size: 1000
2024-02-18 18:51:27,154 - INFO - Page: 3/19, Size: 1000
2024-02-18 18:51:33,478 - INFO - Page: 4/19, Size: 1000
2024-02-18 18:51:39,135 - INFO - Page: 5/19, Size: 1000
2024-02-18 18:51:44,942 - INFO - Page: 6/19, Size: 1000
2024-02-18 18:51:50,614 - INFO - Page: 7/19, Size: 1000
2024-02-18 18:51:56,303 - INFO - Page: 8/19, Size: 1000
2024-02-18 18:52:02,727 - INFO - Page: 9/19, Size: 1000
2024-02-18 18:52:08,501 - INFO - Page: 10/19, Size: 1000
2024-02-18 18:52:14,448 - INFO - Page: 11/19, Size: 1000
2024-02-18 18:52:21,250 - INFO - Page: 12/19, Size: 1000
2024-02-18 18:52:27,344 - INFO - Page: 13/19, Size: 1000
2024-02-18 18:52:33,220 - INFO - Page: 14/19, Size: 1000
2024-02-18 18:52:39,905 - INFO - Page: 15/19, Size: 1000
2024-02-18 18:52:45,691 - INFO - Page: 16/19, Size: 1000
2024-02-18 18:52:51,663 - INFO - Page: 17/19, Si

[]

In [26]:
# Number of interaction rows about to be uploaded.
interactions.shape

(194273, 4)

In [27]:
# Upload the rating interactions through the API client (sent in pages, per the log below).
client.add_interactions(interactions)

2024-02-18 18:53:01,506 - INFO - Page Size: 5000
2024-02-18 18:53:01,783 - INFO - Page: 1/39, Size: 5000
2024-02-18 18:53:02,226 - INFO - Page: 2/39, Size: 5000
2024-02-18 18:53:02,500 - INFO - Page: 3/39, Size: 5000
2024-02-18 18:53:02,778 - INFO - Page: 4/39, Size: 5000
2024-02-18 18:53:03,040 - INFO - Page: 5/39, Size: 5000
2024-02-18 18:53:03,316 - INFO - Page: 6/39, Size: 5000
2024-02-18 18:53:03,582 - INFO - Page: 7/39, Size: 5000
2024-02-18 18:53:03,850 - INFO - Page: 8/39, Size: 5000
2024-02-18 18:53:04,138 - INFO - Page: 9/39, Size: 5000
2024-02-18 18:53:04,430 - INFO - Page: 10/39, Size: 5000
2024-02-18 18:53:04,708 - INFO - Page: 11/39, Size: 5000
2024-02-18 18:53:05,155 - INFO - Page: 12/39, Size: 5000
2024-02-18 18:53:05,425 - INFO - Page: 13/39, Size: 5000
2024-02-18 18:53:05,696 - INFO - Page: 14/39, Size: 5000
2024-02-18 18:53:05,957 - INFO - Page: 15/39, Size: 5000
2024-02-18 18:53:06,240 - INFO - Page: 16/39, Size: 5000
2024-02-18 18:53:06,509 - INFO - Page: 17/39, Si

[]

In [28]:
# Read every interaction back through the service layer; the row count should
# match the number of rows uploaded.
api_interactions = ints_to_df(await ctx.interaction_service.find_all())
api_interactions.shape

(194273, 3)

## Fun user interactions

In [200]:
# One entry per synthetic user: (search text, maximum number of items to fetch).
# The generation loop passes the first element as the search content and the
# second as the result limit.
# NOTE(review): 'thiller' looks like a typo for 'thriller'; left untouched
# because changing the query text changes the search results.
searchs = [
    ('pixar animated movie for children', 200),
    ('science fiction, action', 100),
    ('war weapons', 70),
    ('iron man, x-men, spider man, bat man, flash, avengers, ant-man, hulk, guardians of the galaxy, marvel, green lantern, superman, watchmen, thor, deadpool, wonder woman, strange, justice League, captain america, logans, kick ass, John Wick', 150),
    ('sci-fi, action, future', 130),
    ('Mission Impossible, spies, bourne identity, 007', 70),
    ('comedy movies', 200),
    ('horror', 100),
    ('thiller, suspense', 150),
    ('love, romance', 150),
    ('time travel', 100),
    ('dead, zombies, post apocalyptic', 100),
    ('software', 100),
    ('hackers', 100)
]

In [179]:
# Manual spot check of a single search.
# NOTE(review): this rebinds `items`, which until now held the movie catalog.
items = await search_items_by_content(
    interactions, 
    'hackers',
    min_rating  = 3, 
    limit       = 50,
    min_release = 1990,
    order_popularity = False
)


# show(items)

In [210]:
fun_user_interactions = []
max_user_id = interactions['user_id'].max()

for search in searchs:    
    items = await search_items_by_content(
        interactions, 
        search[0],
        min_rating  = 3, 
        limit       = search[1],
        min_release = 1990,
        order_popularity = False
    )    
    
    items = items.rename(columns={'id': 'item_id'})
    items['rating'] = items['item_id'].apply(lambda x: 4 if random.random() > 0.5 else 5)
    items['user_id'] = max_user_id
    items = items[['item_id', 'user_id', 'rating']]
    
    max_user_id +=1
    
    fun_user_interactions.append(items)

fun_user_interactions = pd.concat(fun_user_interactions)
fun_user_interactions['timestamp'] = datetime.now()

In [211]:
# Inspect the generated synthetic interactions before uploading them.
fun_user_interactions

Unnamed: 0,item_id,user_id,rating,timestamp
0,1,162521,5,2024-02-18 20:34:01.572898
1,101262,162521,4,2024-02-18 20:34:01.572898
2,105468,162521,4,2024-02-18 20:34:01.572898
3,106022,162521,5,2024-02-18 20:34:01.572898
4,106423,162521,5,2024-02-18 20:34:01.572898
...,...,...,...,...
65,8628,162534,5,2024-02-18 20:34:01.572898
66,89337,162534,4,2024-02-18 20:34:01.572898
67,92475,162534,4,2024-02-18 20:34:01.572898
68,94130,162534,4,2024-02-18 20:34:01.572898


In [212]:
# Upload the synthetic interactions through the API client.
client.add_interactions(fun_user_interactions)

2024-02-18 20:34:28,890 - INFO - Page Size: 5000
2024-02-18 20:34:28,988 - INFO - Page: 1/1, Size: 1158


[]

In [213]:
# Read every interaction back again; the count should have grown by the number
# of synthetic rows uploaded above.
api_interactions = ints_to_df(await ctx.interaction_service.find_all())
api_interactions.shape

(195431, 3)