# Models: Movie Tags Sentence Transformer

In [15]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [16]:
# --- Locations -------------------------------------------------------------
# Base directory (relative path) from which the other locations are derived.
BASE_PATH = '../../..'
# Directory appended to sys.path so the project's local modules can be imported.
LIB_PATH = f'{BASE_PATH}/lib'
# Directory handed to the dataset loader and to the recommender configuration.
DATASET_PATH = f'{BASE_PATH}/datasets'

# --- Embedding settings ----------------------------------------------------
# Name of the pretrained SentenceTransformer model used to encode the text.
MODEL = 'all-roberta-large-v1'
# Movie attribute to embed; the source column is f'movie_{FIELD}'.
FIELD = 'tags'

In [17]:
import sys
sys.path.append(LIB_PATH)

import torch
import pytorch_common.util as pu
from pytorch_common.util import set_device_name, \
                                get_device, \
                                LoggerBuilder

import data as dt
import data.dataset as ds

import util as ut

from sentence_transformers import SentenceTransformer

from recommender import ItemRecommenderBuilder, item_rec_sys_cfg

## Setup

In [18]:
# Configure logging with console output through pytorch_common's LoggerBuilder.
# The value returned by build() is the cell's last expression, so its repr is
# displayed as the cell output.
# NOTE(review): every log line in the outputs of later cells appears twice;
# presumably this cell was executed more than once in the same kernel and each
# run attached another console handler — confirm against LoggerBuilder.
pu.LoggerBuilder().on_console().build()

<RootLogger root (INFO)>

In [19]:
# Select the compute device by name through pytorch_common.
pu.set_device_name('gpu')

# Display the resolved device together with the CUDA / PyTorch environment,
# so the run conditions are recorded alongside the results.
(
    pu.get_device(),
    torch.cuda.is_available(),
    torch.__version__,
    torch.cuda.get_arch_list(),
)

(device(type='cuda', index=0),
 True,
 '1.11.0',
 ['sm_37',
  'sm_50',
  'sm_60',
  'sm_61',
  'sm_70',
  'sm_75',
  'sm_80',
  'sm_86',
  'compute_37'])

In [20]:
# Fix the random seed through the project's util helper.
# NOTE(review): presumably this seeds every RNG the pipeline relies on
# (python / numpy / torch) — confirm in util.set_seed.
ut.set_seed(42)

In [21]:
# Build the item-recommender configuration for this (FIELD, MODEL) pair.
# Later cells read REC_SYS_CFG.file_path (where the embeddings are written)
# and REC_SYS_CFG.name (the key used to request the recommender).
REC_SYS_CFG = item_rec_sys_cfg(DATASET_PATH, FIELD, MODEL)

2023-05-04 15:06:16,034 - INFO - Cfg:

embedding_col: tags_embedding
file_path: ../../../datasets/tags-all-roberta-large-v1.json
metadata_cols:
- tags
- release_year
- imdb_id
- tags_tokens
- title
name: tags-all-roberta-large-v1

2023-05-04 15:06:16,034 - INFO - Cfg:

embedding_col: tags_embedding
file_path: ../../../datasets/tags-all-roberta-large-v1.json
metadata_cols:
- tags
- release_year
- imdb_id
- tags_tokens
- title
name: tags-all-roberta-large-v1



## Preprocesamiento

In [22]:
# Load the MovieLens/TMDB data from DATASET_PATH.
dataset = ds.MovieLensTMDBDataLoader.df_from_path(DATASET_PATH)

# Source columns kept for the item catalogue; the last one depends on FIELD.
columns = [
    'movie_id',
    'movie_release_year',
    'movie_imdb_id',
    'movie_title',
    f'movie_{FIELD}',
]

# One pipeline from the raw frame to the per-movie table: select the columns,
# de-duplicate on movie_id, drop the 'movie_' prefix from the column names,
# then run the project's text helpers on FIELD.
# NOTE(review): the exact output of dt.join_str_list / dt.tokenize is defined
# in the project's data module and is not visible here.
movie_data = (
    dataset
    .pipe(dt.select, columns)
    .pipe(dt.distinct, ['movie_id'])
    .pipe(dt.rename, {
        'movie_id': 'id',
        'movie_title': 'title',
        'movie_imdb_id': 'imdb_id',
        'movie_release_year': 'release_year',
        f'movie_{FIELD}': FIELD,
    })
    .pipe(dt.join_str_list, FIELD)
    .pipe(dt.tokenize, FIELD)
    .pipe(dt.reset_index)
)

## Generación de embeddings

In [23]:
# Encode the FIELD text of every movie with the pretrained sentence-transformer,
# attach the vectors to the frame and persist the result at REC_SYS_CFG.file_path.
model = SentenceTransformer(MODEL)

# Select the column as a Series and hand encode() a plain Python list, instead
# of reshaping a one-column 2-D array.
embeddings = model.encode(movie_data[FIELD].tolist())

# NOTE(review): movie_data is rebound to its own transformation, so re-running
# this cell feeds an already-augmented frame to dt.append_emb_vectors — confirm
# that the helper tolerates this.
movie_data = movie_data.pipe(dt.append_emb_vectors, embeddings, FIELD)

movie_data.to_json(REC_SYS_CFG.file_path)

# Dropping the reference is not enough to give GPU memory back: PyTorch's
# caching allocator keeps the freed blocks reserved. Release them explicitly so
# the following cells start with that memory available.
del model
if torch.cuda.is_available():
    torch.cuda.empty_cache()

2023-05-04 15:06:19,121 - INFO - Load pretrained SentenceTransformer: all-roberta-large-v1
2023-05-04 15:06:19,121 - INFO - Load pretrained SentenceTransformer: all-roberta-large-v1
2023-05-04 15:06:21,111 - INFO - Use pytorch device: cuda
2023-05-04 15:06:21,111 - INFO - Use pytorch device: cuda


Batches:   0%|          | 0/551 [00:00<?, ?it/s]

## Evaluación

In [24]:
# Create the recommender builder from the dataset location and the list of
# recommender configurations (here only the one defined by REC_SYS_CFG).
# NOTE(review): presumably this reads the embeddings file written at
# REC_SYS_CFG.file_path by the previous section — confirm in ItemRecommenderBuilder.
builder = ItemRecommenderBuilder(DATASET_PATH, [REC_SYS_CFG])

In [25]:
# Sanity check: request the recommender registered under REC_SYS_CFG.name,
# ask for k=10 recommendations for item 1 and render the result.
(
    builder
    .item_recommender(REC_SYS_CFG.name, n_sim_items=10)
    .recommend(item_id=1, k=10)
    .show()
)

2023-05-04 15:07:03,479 - INFO - Found 1 items by ids: [1].
2023-05-04 15:07:03,479 - INFO - Found 1 items by ids: [1].
2023-05-04 15:07:03,482 - INFO - Found 10 similar to 1 item.
2023-05-04 15:07:03,482 - INFO - Found 10 similar to 1 item.


Item Recommender: tags-all-roberta-large-v1



Unnamed: 0,Similarity,Rating,.,Recommended Movies,..,Already seen movies,Rating.1
0,0.65,3.7,We Recommend ==>,,==> Because You Saw ==>,,3.7
1,0.59,4.0,We Recommend ==>,,==> Because You Saw ==>,,3.7
2,0.58,3.9,We Recommend ==>,,==> Because You Saw ==>,,3.7
3,0.55,3.9,We Recommend ==>,,==> Because You Saw ==>,,3.7
4,0.53,3.5,We Recommend ==>,,==> Because You Saw ==>,,3.7
5,0.52,3.9,We Recommend ==>,,==> Because You Saw ==>,,3.7
6,0.5,3.8,We Recommend ==>,,==> Because You Saw ==>,,3.7
7,0.49,3.9,We Recommend ==>,,==> Because You Saw ==>,,3.7
8,0.48,4.0,We Recommend ==>,,==> Because You Saw ==>,,3.7
