# Models: Movie Overview Sentence Transformer

In [1]:
# Enable IPython's autoreload extension in mode 2: imported modules are
# reloaded before each cell runs, so edits to local library code take effect
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
# Base directory: three levels above the notebook's working directory.
BASE_PATH = '../../..'

# Project library folder (appended to sys.path by the import cell) and the
# folder the datasets are read from.
LIB_PATH = f'{BASE_PATH}/lib'
DATASET_PATH = f'{BASE_PATH}/datasets'

# Movie text field to embed, and the pretrained SentenceTransformer checkpoint
# used to embed it.
FIELD = 'overview'
MODEL = 'all-mpnet-base-v2'

In [3]:
import sys
sys.path.append(LIB_PATH)

import torch
import pytorch_common.util as pu
from pytorch_common.util import set_device_name, \
                                get_device, \
                                LoggerBuilder

import data as dt
import data.dataset as ds

import util as ut

from sentence_transformers import SentenceTransformer

from recommender import ItemRecommenderBuilder, item_rec_sys_cfg

2023-05-03 22:58:09.008863: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
Using embedded DuckDB without persistence: data will be transient


<Figure size 640x480 with 0 Axes>

## Setup

In [4]:
# Build the logger through the pytorch_common builder; the built logger is the
# cell's displayed value. `on_console` presumably attaches a console handler
# (project helper, not visible here).
console_logger_builder = pu.LoggerBuilder().on_console()
console_logger_builder.build()

<RootLogger root (INFO)>

In [5]:
# Ask pytorch_common to use the GPU for subsequent work.
pu.set_device_name('gpu')

# Environment report: resolved device, CUDA availability, torch version and
# the CUDA architectures this torch build was compiled for.
(
    pu.get_device(),
    torch.cuda.is_available(),
    torch.__version__,
    torch.cuda.get_arch_list(),
)

(device(type='cuda', index=0),
 True,
 '1.11.0',
 ['sm_37',
  'sm_50',
  'sm_60',
  'sm_61',
  'sm_70',
  'sm_75',
  'sm_80',
  'sm_86',
  'compute_37'])

In [6]:
# Fix the random seed through the project helper so runs are repeatable.
SEED = 42
ut.set_seed(SEED)

In [7]:
# Recommender configuration for this dataset / text field / model combination.
# Later cells read its `file_path` and `name` attributes.
REC_SYS_CFG = item_rec_sys_cfg(
    DATASET_PATH,
    FIELD,
    MODEL,
)

## Preprocesamiento

In [8]:
# Load the MovieLens/TMDB frame from DATASET_PATH.
dataset = ds.MovieLensTMDBDataLoader.df_from_path(DATASET_PATH)

# Source column that holds the text to embed (e.g. 'movie_overview').
text_column = f'movie_{FIELD}'

columns = ['movie_id', 'movie_release_year', 'movie_imdb_id', 'movie_title', text_column]

# Strip the 'movie_' prefix from every selected column.
short_names = {
    'movie_id': 'id',
    'movie_title': 'title',
    'movie_imdb_id': 'imdb_id',
    'movie_release_year': 'release_year',
    text_column: FIELD,
}

# Select -> de-duplicate by movie id -> rename -> tokenize the text field ->
# reset the index. The `dt.*` steps are project helpers; presumably `distinct`
# keeps one row per 'movie_id' (confirm against lib/data).
movie_data = (
    dataset
    .pipe(dt.select, columns)
    .pipe(dt.distinct, ['movie_id'])
    .pipe(dt.rename, short_names)
    .pipe(dt.tokenize, FIELD)
    .pipe(dt.reset_index)
)

## Generación de embeddings

In [9]:
# Embed the FIELD text of every movie with the pretrained SentenceTransformer
# named by MODEL, attach the vectors to the frame and write it as JSON to
# REC_SYS_CFG.file_path (presumably where the recommender reads it back;
# confirm against lib/recommender).
#
# NOTE(review): this cell rebinds `movie_data`; re-running it without first
# re-running the preprocessing cell feeds an already-augmented frame back into
# `dt.append_emb_vectors`.
model = SentenceTransformer(MODEL)

# One text per row, passed as a flat 1-D array.
embeddings = model.encode(movie_data[FIELD].values)

# Invariant: the encoder must return exactly one vector per input row.
assert len(embeddings) == len(movie_data), 'expected exactly one embedding per movie'

movie_data = movie_data.pipe(dt.append_emb_vectors, embeddings, FIELD)

movie_data.to_json(REC_SYS_CFG.file_path)

# `del` only drops the Python reference. Also hand the caching allocator's
# unused blocks back to the GPU, because the next cell loads another
# SentenceTransformer on the same device. No-op when CUDA was never initialised.
del model
torch.cuda.empty_cache()

2023-05-03 22:58:12,773 - INFO - Load pretrained SentenceTransformer: all-mpnet-base-v2
2023-05-03 22:58:13,458 - INFO - Use pytorch device: cuda


Batches:   0%|          | 0/551 [00:00<?, ?it/s]

## Evaluación

In [10]:
# Build the recommender factory for the configurations under evaluation.
# NOTE(review): the log emitted by this call reports loading
# 'all-MiniLM-L6-v2', not MODEL ('all-mpnet-base-v2'). Confirm the builder is
# not using a different model from the one that produced the stored embeddings.
rec_sys_cfgs = [REC_SYS_CFG]
builder = ItemRecommenderBuilder(DATASET_PATH, rec_sys_cfgs)

2023-05-03 22:58:34,427 - INFO - Load pretrained SentenceTransformer: all-MiniLM-L6-v2
2023-05-03 22:58:34,548 - INFO - Use pytorch device: cuda


In [11]:
# Sanity check: build the recommender registered under REC_SYS_CFG.name, ask
# it for k=10 recommendations for item 1 and render the result.
item_recommender = builder.item_recommender(REC_SYS_CFG.name, n_sim_items=10)
recommendations = item_recommender.recommend(item_id=1, k=10)
recommendations.show()

Unnamed: 0,Similarity,Rating,.,Recommended Movies,..,Already seen movies,Rating.1
0,0.67,3.9,We Recommend ==>,,==> Because You Saw ==>,,3.7
1,0.63,3.6,We Recommend ==>,,==> Because You Saw ==>,,3.7
2,-0.04,2.6,We Recommend ==>,,==> Because You Saw ==>,,3.7
3,-0.04,3.4,We Recommend ==>,,==> Because You Saw ==>,,3.7
4,-0.05,3.6,We Recommend ==>,,==> Because You Saw ==>,,3.7
5,-0.06,3.0,We Recommend ==>,,==> Because You Saw ==>,,3.7
6,-0.07,3.8,We Recommend ==>,,==> Because You Saw ==>,,3.7
7,-0.1,4.0,We Recommend ==>,,==> Because You Saw ==>,,3.7
8,-0.1,3.0,We Recommend ==>,,==> Because You Saw ==>,,3.7
