# Models: Movie Genres Sentence Transformer

In [1]:
# Reload imported modules automatically before each cell runs, so edits to the
# project's library code are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
# Notebook configuration. Every path is derived from BASE_PATH.
BASE_PATH        = '../../..'
LIB_PATH         = f'{BASE_PATH}/lib'
DATASET_PATH     = f'{BASE_PATH}/datasets'
# Movie attribute whose text gets embedded; also used as the collection name.
FIELD            = 'genres'
# Built on DATASET_PATH (instead of repeating the 'datasets' folder) so the
# embedding file always lives next to the dataset it was computed from.
EMBEDDING_PATH   = f'{DATASET_PATH}/movie_{FIELD}_embedding_bert.json'
COLLECTION_NAMES = [FIELD]

In [3]:
import sys
# Make the project's lib folder importable. Reuse LIB_PATH from the config cell
# and skip the append when the entry is already present, so re-running this
# cell does not pile up duplicate sys.path entries.
if LIB_PATH not in sys.path:
    sys.path.append(LIB_PATH)

import numpy as np
import pandas as pd
import torch

import pytorch_common.util as pu
from pytorch_common.util import set_device_name, \
                                get_device, \
                                LoggerBuilder

import model as ml
import data as dt
import data.dataset as ds

import data.plot as pl
import data as dt

import util as ut

from context import AppContextFactory

2023-05-02 19:57:25.365354: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


<Figure size 640x480 with 0 Axes>

## Setup

In [4]:
# Configure console logging; the built logger is displayed as the cell output.
LoggerBuilder().on_console().build()

<RootLogger root (INFO)>

In [5]:
# Request the GPU, then display the device returned by get_device(),
# whether CUDA is available, and the installed torch version.
set_device_name('gpu')

get_device(), torch.cuda.is_available(), torch.__version__

(device(type='cuda', index=0), True, '1.11.0')

In [6]:
# Fix the random seed through the project helper, presumably for reproducible
# runs (which generators it seeds is decided inside util.set_seed — TODO confirm).
ut.set_seed(42)

## Carga de dataset

In [7]:
# Load the MovieLens/TMDB data found under DATASET_PATH using the project loader.
dataset = ds.MovieLensTMDBDataLoader.df_from_path(DATASET_PATH)

Select the movie genres column (plus identifying metadata) and add a new curated tokens column:

In [8]:
# Source columns to keep; the movie_<FIELD> column carries the text to embed.
columns = ['movie_id', 'movie_release_year', 'movie_imdb_id', 'movie_title', f'movie_{FIELD}']

# Keep the selected columns, de-duplicate on movie_id (via dt.distinct), drop
# the 'movie_' prefix from the column names, then pass FIELD through the
# join_str_list and tokenize helpers before resetting the index.
movie_data = (
    dataset
    .pipe(dt.select, columns)
    .pipe(dt.distinct, ['movie_id'])
    .pipe(dt.rename, {
        'movie_id': 'id',
        'movie_title': 'title',
        'movie_imdb_id': 'imdb_id',
        'movie_release_year': 'release_year',
        f'movie_{FIELD}': FIELD,
    })
    .pipe(dt.join_str_list, FIELD)
    .pipe(dt.tokenize, FIELD)
    .pipe(dt.reset_index)
)

movie_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17611 entries, 0 to 17610
Data columns (total 6 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   id             17611 non-null  int64 
 1   release_year   17611 non-null  int64 
 2   imdb_id        17611 non-null  int64 
 3   title          17611 non-null  string
 4   genres         17611 non-null  object
 5   genres_tokens  17611 non-null  object
dtypes: int64(3), object(2), string(1)
memory usage: 825.6+ KB


In [9]:
# Preview the first rows of the prepared frame.
movie_data.head()

Unnamed: 0,id,release_year,imdb_id,title,genres,genres_tokens
0,1,1995,114709,Toy Story,Adventure Animation Children Comedy Fantasy,adventure animation children comedy fantasy
1,2355,1998,120623,"Bug's Life, A",Adventure Animation Children Comedy,adventure animation children comedy
2,3114,1999,120363,Toy Story 2,Adventure Animation Children Comedy Fantasy,adventure animation children comedy fantasy
3,4306,2001,126029,Shrek,Adventure Animation Children Comedy Fantasy Ro...,adventure animation children comedy fantasy ro...
4,4886,2001,198781,"Monsters, Inc.",Adventure Animation Children Comedy Fantasy,adventure animation children comedy fantasy


## Definición del modelo

In [10]:
from sentence_transformers import SentenceTransformer

# Pretrained checkpoint to load, named once so it is easy to spot and swap.
# NOTE(review): EMBEDDING_PATH ends in "_bert.json" while this checkpoint name
# says RoBERTa — confirm the output file name is intended.
MODEL_NAME = 'all-roberta-large-v1'

model = SentenceTransformer(MODEL_NAME)

2023-05-02 19:57:27,294 - INFO - Load pretrained SentenceTransformer: all-roberta-large-v1
2023-05-02 19:57:29,186 - INFO - Use pytorch device: cuda


## Generación de embeddings

In [11]:
# Encode the FIELD text of every movie with the sentence-transformer model.
embeddings = model.encode(movie_data[FIELD].to_numpy())

# Attach the vectors through the project helper (the frame gains a
# '<FIELD>_embedding' column, as the info() output shows).
movie_data = dt.append_emb_vectors(movie_data, embeddings, FIELD)

# Persist the enriched frame as JSON and summarize its columns.
movie_data.to_json(EMBEDDING_PATH)
movie_data.info()

Batches:   0%|          | 0/551 [00:00<?, ?it/s]

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17611 entries, 0 to 17610
Data columns (total 7 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   id                17611 non-null  int64 
 1   release_year      17611 non-null  int64 
 2   imdb_id           17611 non-null  int64 
 3   title             17611 non-null  string
 4   genres            17611 non-null  object
 5   genres_tokens     17611 non-null  object
 6   genres_embedding  17611 non-null  object
dtypes: int64(3), object(3), string(1)
memory usage: 963.2+ KB


## Evaluación

In [12]:
# Build the application context for the dataset folder and the FIELD collection.
# NOTE(review): this cell's log shows Chroma starting and the 'all-MiniLM-L6-v2'
# model being loaded, not the checkpoint used to generate the embeddings in this
# notebook — confirm which embeddings the recommender actually queries.
ctx = AppContextFactory(DATASET_PATH, COLLECTION_NAMES)

2023-05-02 19:57:38,110 - INFO - Anonymized telemetry enabled. See https://docs.trychroma.com/telemetry for more information.
2023-05-02 19:57:38,110 - INFO - Running Chroma using direct local API.
2023-05-02 19:57:38,120 - INFO - Successfully imported ClickHouse Connect C data optimizations
2023-05-02 19:57:38,120 - INFO - Successfully import ClickHouse Connect C/Numpy optimizations
2023-05-02 19:57:38,123 - INFO - Using python library for writing JSON byte strings
2023-05-02 19:57:39,025 - INFO - Load pretrained SentenceTransformer: all-MiniLM-L6-v2
2023-05-02 19:57:39,153 - INFO - Use pytorch device: cuda


In [18]:
# Ask the genre-based recommender (n_sim_items=20) for k=10 results for
# item_id=1 (Toy Story in the preview table) and render them.
(
    ctx.genre_item_recommender(n_sim_items=20)
    .recommend(item_id=1, k=10)
    .show()
)

Unnamed: 0,Similarity,Rating,.,Recommended Movies,..,Already seen movies,Rating.1
0,1.0,3.6,We Recommend ==>,,==> Because You Saw ==>,,3.7
1,1.0,3.8,We Recommend ==>,,==> Because You Saw ==>,,3.7
2,1.0,3.4,We Recommend ==>,,==> Because You Saw ==>,,3.7
3,1.0,2.7,We Recommend ==>,,==> Because You Saw ==>,,3.7
4,1.0,1.8,We Recommend ==>,,==> Because You Saw ==>,,3.7
5,1.0,1.5,We Recommend ==>,,==> Because You Saw ==>,,3.7
6,1.0,2.5,We Recommend ==>,,==> Because You Saw ==>,,3.7
7,1.0,3.2,We Recommend ==>,,==> Because You Saw ==>,,3.7
8,1.0,3.0,We Recommend ==>,,==> Because You Saw ==>,,3.7
9,1.0,3.0,We Recommend ==>,,==> Because You Saw ==>,,3.7
