# Models: Movie Tags Sentence Transformer

In [1]:
# Enable IPython's autoreload extension in mode 2, so imported modules are
# reloaded before each cell runs and edits to them are picked up without a
# kernel restart.
%load_ext autoreload
%autoreload 2
# Repository root, relative to this notebook.
# NOTE(review): the same value is assigned again in the configuration cell
# that follows, so this assignment is redundant.
BASE_PATH='../../..'

In [2]:
# Notebook configuration. Every path is derived from BASE_PATH (the repository
# root relative to this notebook), so relocating the notebook only requires
# changing that one value.
BASE_PATH        = '../../..'
# Directory appended to sys.path so the project-local modules can be imported.
LIB_PATH         = f'{BASE_PATH}/lib'
# Directory the dataset is loaded from and the application context is built on.
DATASET_PATH     = f'{BASE_PATH}/datasets'
# Movie attribute to embed; the source column is named movie_<FIELD>.
FIELD            = 'tags'
# Output JSON holding the movie table with its embedding vectors. It is built
# from DATASET_PATH rather than re-spelling '<BASE_PATH>/datasets', so the two
# cannot drift apart; the resulting value is unchanged.
EMBEDDING_PATH   = f'{DATASET_PATH}/movie_{FIELD}_embedding_bert.json'
# Collection names handed to AppContextFactory in the evaluation section.
COLLECTION_NAMES = [FIELD]

In [3]:
# Extend the module search path with LIB_PATH. This has to run before the
# project-local imports further down (presumably model, data, util and context
# live under that directory — confirm).
import sys
sys.path.append(LIB_PATH)

# Third-party numeric / deep-learning stack.
import numpy as np
import pandas as pd
import torch

# pytorch_common helpers; the cells below access them through the `pu` alias.
import pytorch_common.util as pu
from pytorch_common.util import set_device_name, \
                                get_device, \
                                LoggerBuilder

# Project modules, imported by bare name via the sys.path entry added above.
import model as ml
import data as dt
import data.dataset as ds

# NOTE(review): `data as dt` is imported a second time here; the repeat is
# redundant with the import a few lines above.
import data.plot as pl
import data as dt

import util as ut

from context import AppContextFactory

# Sentence embedding model wrapper used in the model-definition section.
from sentence_transformers import SentenceTransformer

2023-05-02 20:07:48.683323: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


<Figure size 640x480 with 0 Axes>

## Setup

In [4]:
# Configure logging through pytorch_common's LoggerBuilder (console handler,
# judging by the method name). build() is the last expression of the cell, so
# the logger it returns is what the cell displays.
pu.LoggerBuilder().on_console().build()

<RootLogger root (INFO)>

In [5]:
# Tell pytorch_common to use the device registered under the name 'gpu'.
pu.set_device_name('gpu')

# Environment sanity check, displayed as a tuple: resolved device, whether CUDA
# is available, the torch version, and the CUDA architectures this torch build
# was compiled for.
# NOTE(review): the saved output lists only three values for this four-element
# tuple, so it looks stale — re-run the cell to refresh it.
pu.get_device(), torch.cuda.is_available(), torch.__version__, torch.cuda.get_arch_list()

(device(type='cuda', index=0), True, '1.11.0')

In [6]:
# Fix the random seed so repeated runs are comparable. Which generators
# ut.set_seed covers (random / numpy / torch) is not visible from this
# notebook — TODO confirm.
ut.set_seed(42)

## Carga de dataset

In [7]:
# Load the MovieLens + TMDB data found under DATASET_PATH. The result is
# consumed with .pipe(...) in the next cell, so it is presumably a pandas
# DataFrame — confirm against the loader.
dataset = ds.MovieLensTMDBDataLoader.df_from_path(DATASET_PATH)

Select the movie tags column (plus identifying metadata) and add a new column with its curated tokens:

In [8]:
# Source columns to keep: identifying metadata plus the movie_<FIELD> column
# whose text gets embedded.
columns = [
    'movie_id',
    'movie_release_year',
    'movie_imdb_id',
    'movie_title',
    f'movie_{FIELD}',
]

# Build the per-movie table: keep the selected columns, de-duplicate on
# movie_id, drop the movie_ prefix from the column names, then run the FIELD
# column through dt.join_str_list and dt.tokenize (the latter is what yields
# the <FIELD>_tokens column visible in the info() summary).
movie_data = (
    dataset
    .pipe(dt.select, columns)
    .pipe(dt.distinct, ['movie_id'])
    .pipe(dt.rename, {
        'movie_id': 'id',
        'movie_title': 'title',
        'movie_imdb_id': 'imdb_id',
        'movie_release_year': 'release_year',
        f'movie_{FIELD}': FIELD,
    })
    .pipe(dt.join_str_list, FIELD)
    .pipe(dt.tokenize, FIELD)
    .pipe(dt.reset_index)
)

# Quick schema / null-count check of the resulting table.
movie_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17611 entries, 0 to 17610
Data columns (total 6 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   id            17611 non-null  int64 
 1   release_year  17611 non-null  int64 
 2   imdb_id       17611 non-null  int64 
 3   title         17611 non-null  string
 4   tags          17611 non-null  object
 5   tags_tokens   17611 non-null  object
dtypes: int64(3), object(2), string(1)
memory usage: 825.6+ KB


## Definición del modelo

In [9]:
# Load the pretrained sentence-transformers checkpoint used to embed the FIELD
# text of each movie.
# NOTE(review): EMBEDDING_PATH ends in "_bert" although this checkpoint is a
# RoBERTa model; the file name may be misleading.
model = SentenceTransformer('all-roberta-large-v1')

2023-05-02 20:07:51,845 - INFO - Load pretrained SentenceTransformer: all-roberta-large-v1
2023-05-02 20:07:53,743 - INFO - Use pytorch device: cuda


## Generación de embeddings

In [11]:
# Encode the FIELD text of every movie. The single-column frame is flattened
# to a 1-D array so the model receives one entry per row.
embeddings = model.encode(movie_data[[FIELD]].to_numpy().ravel())

# Attach the vectors to the movie table (the resulting column layout is decided
# by dt.append_emb_vectors) and persist the table as JSON at EMBEDDING_PATH.
movie_data = dt.append_emb_vectors(movie_data, embeddings, FIELD)
movie_data.to_json(EMBEDDING_PATH)

# Drop the notebook's reference to the encoder; none of the cells shown after
# this one use it.
del model

Batches:   0%|          | 0/551 [00:00<?, ?it/s]

## Evaluación

In [12]:
# Build the application context from the datasets directory and the list of
# collection names (here just [FIELD]).
# NOTE(review): the log emitted by this cell shows all-MiniLM-L6-v2 being
# loaded, not the all-roberta-large-v1 model used above — confirm which
# embeddings the recommender actually relies on.
ctx = AppContextFactory(DATASET_PATH, COLLECTION_NAMES)

2023-05-02 20:08:47,061 - INFO - Anonymized telemetry enabled. See https://docs.trychroma.com/telemetry for more information.
2023-05-02 20:08:47,062 - INFO - Running Chroma using direct local API.
2023-05-02 20:08:47,073 - INFO - Successfully imported ClickHouse Connect C data optimizations
2023-05-02 20:08:47,073 - INFO - Successfully import ClickHouse Connect C/Numpy optimizations
2023-05-02 20:08:47,076 - INFO - Using python library for writing JSON byte strings
2023-05-02 20:08:48,003 - INFO - Load pretrained SentenceTransformer: all-MiniLM-L6-v2
2023-05-02 20:08:48,132 - INFO - Use pytorch device: cuda


In [13]:
# Ask the tag-based item recommender (built with n_sim_items=10) for k=10
# recommendations for item 1, and render the result table.
(
    ctx.tag_item_recommender(n_sim_items=10)
    .recommend(item_id=1, k=10)
    .show()
)

Unnamed: 0,Similarity,Rating,.,Recommended Movies,..,Already seen movies,Rating.1
0,0.65,3.7,We Recommend ==>,,==> Because You Saw ==>,,3.7
1,0.59,4.0,We Recommend ==>,,==> Because You Saw ==>,,3.7
2,0.58,3.9,We Recommend ==>,,==> Because You Saw ==>,,3.7
3,0.55,3.9,We Recommend ==>,,==> Because You Saw ==>,,3.7
4,0.53,3.5,We Recommend ==>,,==> Because You Saw ==>,,3.7
5,0.52,3.9,We Recommend ==>,,==> Because You Saw ==>,,3.7
6,0.5,3.8,We Recommend ==>,,==> Because You Saw ==>,,3.7
7,0.49,3.9,We Recommend ==>,,==> Because You Saw ==>,,3.7
8,0.48,4.0,We Recommend ==>,,==> Because You Saw ==>,,3.7
