# Models: Movie Overview Sentence Transformer

In [1]:
# Enable IPython's autoreload extension in mode 2: imported modules are
# reloaded before each cell runs, so edits to modules imported below are
# picked up without restarting the kernel.
# (Comments stay on their own lines: text after a line magic is parsed as
# part of the magic's arguments.)
%load_ext autoreload
%autoreload 2

In [2]:
# --- What to embed, and with which pretrained model -------------------------
# Text column to encode; also used to build the source column name
# (`movie_<FIELD>`) and the output file name.
FIELD = 'overview'
# Name of the pretrained sentence-transformers model handed to SentenceTransformer.
MODEL = 'all-mpnet-base-v2'

# --- Locations (relative to this notebook) ----------------------------------
BASE_PATH = '../../..'
# Appended to sys.path in the imports cell so the project modules resolve.
LIB_PATH = f'{BASE_PATH}/lib'
# Directory handed to the dataset loader and to AppContextFactory.
DATASET_PATH = f'{BASE_PATH}/datasets'
# JSON file the embedding cell writes the enriched movie frame to.
EMBEDDING_PATH = f'{DATASET_PATH}/movie_{FIELD}_embedding_bert.json'

# --- Evaluation --------------------------------------------------------------
# Collection names passed to AppContextFactory.
COLLECTION_NAMES = [FIELD]

In [3]:
import sys
sys.path.append(LIB_PATH)

import numpy as np
import pandas as pd
import torch

import pytorch_common.util as pu
from pytorch_common.util import set_device_name, \
                                get_device, \
                                LoggerBuilder

import model as ml
import data as dt
import data.dataset as ds

import data.plot as pl
import data as dt

import util as ut

from context import AppContextFactory

from sentence_transformers import SentenceTransformer

2023-05-02 20:19:53.899078: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


<Figure size 640x480 with 0 Axes>

## Setup

In [4]:
# Build a logger through pytorch_common's LoggerBuilder with console output.
# The built logger is the cell's value; the recorded output shows it is the
# root logger at INFO level.
pu.LoggerBuilder().on_console().build()

<RootLogger root (INFO)>

In [5]:
# Ask pytorch_common for the 'gpu' device; the recorded output shows
# pu.get_device() resolving to cuda:0 afterwards.
pu.set_device_name('gpu')

# Echo the runtime environment as the cell's value: selected device, CUDA
# availability, torch version and the CUDA architectures torch reports.
(
    pu.get_device(),
    torch.cuda.is_available(),
    torch.__version__,
    torch.cuda.get_arch_list(),
)

(device(type='cuda', index=0),
 True,
 '1.11.0',
 ['sm_37',
  'sm_50',
  'sm_60',
  'sm_61',
  'sm_70',
  'sm_75',
  'sm_80',
  'sm_86',
  'compute_37'])

In [6]:
# Fix the random seed through the project's util helper.
# NOTE(review): which RNGs (python / numpy / torch) get seeded is decided
# inside ut.set_seed, which is not visible here — confirm.
ut.set_seed(42)

## Carga de dataset

In [7]:
# Load the MovieLens/TMDB data found under DATASET_PATH via the project
# loader. The result is consumed below through `.pipe(...)`, i.e. it is
# used as a pandas-style frame.
dataset = ds.MovieLensTMDBDataLoader.df_from_path(DATASET_PATH)

Select each movie's overview and add a new column with its curated tokens:

In [8]:
# Source columns kept from the raw dataset; the last one is the text field
# configured through FIELD.
columns = [
    'movie_id',
    'movie_release_year',
    'movie_imdb_id',
    'movie_title',
    f'movie_{FIELD}',
]

# Build the per-movie frame with the project's `dt` helpers, applied in order:
# select -> distinct on movie_id -> rename (drops the `movie_` prefix) ->
# tokenize FIELD -> reset_index.
# The recorded output shows the result has an extra `overview_tokens` column
# and a fresh RangeIndex.
movie_data = (
    dataset
    .pipe(dt.select, columns)
    .pipe(dt.distinct, ['movie_id'])
    .pipe(
        dt.rename,
        {
            'movie_id': 'id',
            'movie_title': 'title',
            'movie_imdb_id': 'imdb_id',
            'movie_release_year': 'release_year',
            f'movie_{FIELD}': FIELD,
        },
    )
    .pipe(dt.tokenize, FIELD)
    .pipe(dt.reset_index)
)

# Summarise the resulting columns, dtypes and non-null counts.
movie_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17611 entries, 0 to 17610
Data columns (total 6 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   id               17611 non-null  int64 
 1   release_year     17611 non-null  int64 
 2   imdb_id          17611 non-null  int64 
 3   title            17611 non-null  string
 4   overview         17611 non-null  string
 5   overview_tokens  17611 non-null  object
dtypes: int64(3), object(1), string(2)
memory usage: 825.6+ KB


## Definición del modelo

In [9]:
# Load the pretrained sentence-transformers model named by MODEL and place
# it on the device returned by pu.get_device().
model = SentenceTransformer(MODEL, device=pu.get_device())

2023-05-02 20:19:57,885 - INFO - Load pretrained SentenceTransformer: all-mpnet-base-v2


## Generación de embeddings

In [10]:
# Encode every movie's FIELD text with the sentence-transformer. The column
# is passed as a flat 1-D array holding one string per movie.
embeddings = model.encode(movie_data[FIELD].to_numpy())

# Attach the vectors to the frame through the project helper; the recorded
# output shows a new `overview_embedding` column after this step.
movie_data = movie_data.pipe(dt.append_emb_vectors, embeddings, FIELD)

# Persist the enriched frame as JSON at EMBEDDING_PATH, then summarise it.
movie_data.to_json(EMBEDDING_PATH)
movie_data.info()

Batches:   0%|          | 0/551 [00:00<?, ?it/s]

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17611 entries, 0 to 17610
Data columns (total 7 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   id                  17611 non-null  int64 
 1   release_year        17611 non-null  int64 
 2   imdb_id             17611 non-null  int64 
 3   title               17611 non-null  string
 4   overview            17611 non-null  string
 5   overview_tokens     17611 non-null  object
 6   overview_embedding  17611 non-null  object
dtypes: int64(3), object(2), string(2)
memory usage: 963.2+ KB


## Evaluación

In [11]:
# Build the application context from the dataset directory and the
# collection names configured at the top of the notebook.
# NOTE(review): the recorded log for this cell shows a pretrained
# `all-MiniLM-L6-v2` SentenceTransformer being loaded, not MODEL — confirm
# which embeddings the context's collections actually use.
ctx = AppContextFactory(DATASET_PATH, COLLECTION_NAMES)

2023-05-02 20:20:20,220 - INFO - Anonymized telemetry enabled. See https://docs.trychroma.com/telemetry for more information.
2023-05-02 20:20:20,221 - INFO - Running Chroma using direct local API.
2023-05-02 20:20:20,231 - INFO - Successfully imported ClickHouse Connect C data optimizations
2023-05-02 20:20:20,232 - INFO - Successfully import ClickHouse Connect C/Numpy optimizations
2023-05-02 20:20:20,234 - INFO - Using python library for writing JSON byte strings
2023-05-02 20:20:20,942 - INFO - Load pretrained SentenceTransformer: all-MiniLM-L6-v2
2023-05-02 20:20:21,079 - INFO - Use pytorch device: cuda


In [12]:
# Ask the overview-based item recommender (built with n_sim_items=10) for
# k=5 recommendations for item_id=1, and render the result.
(
    ctx.overview_item_recommender(n_sim_items=10)
    .recommend(item_id=1, k=5)
    .show()
)

Unnamed: 0,Similarity,Rating,.,Recommended Movies,..,Already seen movies,Rating.1
0,0.67,3.9,We Recommend ==>,,==> Because You Saw ==>,,3.7
1,0.63,3.6,We Recommend ==>,,==> Because You Saw ==>,,3.7
2,-0.04,2.6,We Recommend ==>,,==> Because You Saw ==>,,3.7
3,-0.04,3.4,We Recommend ==>,,==> Because You Saw ==>,,3.7
4,-0.05,3.6,We Recommend ==>,,==> Because You Saw ==>,,3.7
