# Models: Movie Title Sentence Transformer

In [1]:
# Enable IPython's autoreload extension in mode 2 so edited modules are
# re-imported before each cell runs.
%load_ext autoreload
%autoreload 2
# Relative path prefix used below to locate the project's lib/ and datasets/ folders.
BASE_PATH='../../..'

In [2]:
import sys
sys.path.append(f'{BASE_PATH}/lib')

import numpy as np
import pandas as pd
import torch

import pytorch_common.util as pu
from pytorch_common.util import set_device_name, \
                                get_device, \
                                LoggerBuilder

import model as ml
import data as dt
import data.dataset as ds

import data.plot as pl
import data as dt

import random

import recommender as rc

## Setup

In [3]:
# Build the pytorch_common logger with console output enabled
# (presumably the source of the INFO lines recorded further down — TODO confirm).
pu.LoggerBuilder().on_console().build()

In [4]:
# Tell pytorch_common to use the device named 'gpu'.
pu.set_device_name('gpu')

In [5]:
# Display the device object pytorch_common currently returns.
pu.get_device()

In [6]:
# Device handles reused below: an explicit CPU device, and whatever device
# pytorch_common returns (expected to be the GPU given the name set earlier).
cpu = torch.device("cpu")
gpu = pu.get_device()

In [7]:
# Sanity check: display whether PyTorch can see a CUDA device.
torch.cuda.is_available()

In [8]:
# Display the PyTorch version used for this run.
torch.__version__

In [9]:
def set_seed(value):
    """Seed every random number generator this notebook relies on.

    Parameters
    ----------
    value : int
        Seed passed unchanged to Python's ``random``, NumPy's legacy global
        generator and PyTorch.
    """
    # Same order as always: stdlib, then NumPy, then PyTorch.
    for seeder in (random.seed, np.random.seed, torch.manual_seed):
        seeder(value)

In [10]:
# Fix the seed of random, NumPy and PyTorch so random draws are repeatable.
set_seed(42)

In [11]:
# Movie text field to embed; also used to build column names such as f'movie_{FIELD}'.
FIELD = 'title'
# JSON file the movies table (with embeddings) is written to and later read back from.
EMBEDDING_PATH = f'{BASE_PATH}/datasets/movie_{FIELD}_embedding_bert.json'

## Carga de dataset

In [41]:
def to_tensor(obs, device, columns):
    """Select ``columns`` from ``obs`` and return them as a tensor on ``device``.

    Parameters
    ----------
    obs : object indexable by ``columns``
        Typically a pandas DataFrame (a batch of rows).
    device : torch.device or str
        Device the resulting tensor is moved to.
    columns : label or list of labels
        Passed as-is to ``obs[...]``.

    Returns
    -------
    torch.Tensor
        Tensor built from the selected data, placed on ``device``.
    """
    data = obs[columns]
    # isinstance instead of an exact type comparison, so DataFrame subclasses
    # are unwrapped to their ndarray as well before reaching torch.tensor.
    if isinstance(data, pd.DataFrame):
        data = data.values
    return torch.tensor(data).to(device)

def transform_fn(obs, device):
    """Convert the f'movie_{FIELD}' column of ``obs`` into a tensor on ``device``."""
    return to_tensor(obs, device, [f'movie_{FIELD}'])


def rated_between_2005_and_2019(df):
    """Keep only rows whose rating year lies in the closed range 2005..2019."""
    years = df['user_movie_rating_year']
    return df[(years >= 2005) & (years <= 2019)]


# The same transform is used for both features and targets.
dataset = ds.MovieLensTMDBDatasetFactory.from_path(
    path=f'{BASE_PATH}/datasets',
    transform=transform_fn,
    target_transform=transform_fn,
    device=cpu,
    filter_fn=rated_between_2005_and_2019,
)
dataset.info

<class 'pandas.core.frame.DataFrame'>
Int64Index: 191540 entries, 0 to 191539
Data columns (total 16 columns):
 #   Column                       Non-Null Count   Dtype         
---  ------                       --------------   -----         
 0   user_id                      191540 non-null  int64         
 1   user_seq                     191540 non-null  int64         
 2   user_movie_tags              191540 non-null  object        
 3   user_movie_rating            191540 non-null  int64         
 4   user_movie_rating_timestamp  191540 non-null  datetime64[ns]
 5   user_movie_rating_year       191540 non-null  int64         
 6   movie_id                     191540 non-null  int64         
 7   movie_seq                    191540 non-null  int64         
 8   movie_title                  191540 non-null  string        
 9   movie_genres                 191540 non-null  object        
 10  movie_for_adults             191540 non-null  bool          
 11  movie_original_language   

Select one row per movie with the configured text field (`FIELD`, here the title) and add a tokenized `title_tokens` column:

In [42]:
# Columns taken from the ratings table: identifiers, release year and the text field.
columns = ['movie_id', 'movie_release_year', 'movie_imdb_id', f'movie_{FIELD}']

# Mapping from the original 'movie_'-prefixed column names to the shorter ones.
renamed_columns = {
    'movie_id': 'id',
    'movie_title': 'title',
    'movie_imdb_id': 'imdb_id',
    'movie_release_year': 'release_year',
    f'movie_{FIELD}': FIELD
}

# One row per movie_id, renamed, with tokens derived from FIELD and a fresh index.
movie_data = (
    dataset.data
    .pipe(dt.select, columns)
    .pipe(dt.distinct, ['movie_id'])
    .pipe(dt.rename, renamed_columns)
    .pipe(dt.tokenize, FIELD)
    .pipe(dt.reset_index)
)

movie_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18515 entries, 0 to 18514
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   id            18515 non-null  int64 
 1   release_year  18515 non-null  int64 
 2   imdb_id       18515 non-null  int64 
 3   title         18515 non-null  string
 4   title_tokens  18515 non-null  object
dtypes: int64(3), object(1), string(1)
memory usage: 723.4+ KB


## Definición del modelo

In [43]:
from sentence_transformers import SentenceTransformer

# Other pretrained checkpoint names kept for reference:
# 'all-MiniLM-L6-v2'
# 'bert-base-nli-mean-tokens'

# `movie_data` is already built by the preceding cell; this cell used to carry a
# verbatim copy of that pipeline, which only recomputed the same frame and
# printed its info a second time. It now just loads the sentence encoder.
sbert_model = SentenceTransformer('all-mpnet-base-v2')

2022-07-28 23:28:24,435 - INFO - Load pretrained SentenceTransformer: all-mpnet-base-v2
2022-07-28 23:28:24,435 - INFO - Load pretrained SentenceTransformer: all-mpnet-base-v2
2022-07-28 23:28:24,972 - INFO - Use pytorch device: cuda
2022-07-28 23:28:24,972 - INFO - Use pytorch device: cuda


## Generación de embeddings

In [44]:
# Flatten the single-column selection into a 1-D array of texts and encode it in one call.
texts = movie_data[[FIELD]].values.ravel()
embeddings = sbert_model.encode(texts)

# Attach the vectors to the movies table; note this rebinds `movie_data`.
movie_data = dt.append_emb_vectors(movie_data, embeddings, FIELD)

# Persist the table so the evaluation section can reload it from EMBEDDING_PATH.
movie_data.to_json(EMBEDDING_PATH)
movie_data.info()

Batches:   0%|          | 0/579 [00:00<?, ?it/s]

Ignored unknown kwarg option direction
Ignored unknown kwarg option direction
Ignored unknown kwarg option direction
Ignored unknown kwarg option direction
Ignored unknown kwarg option direction
Ignored unknown kwarg option direction
Ignored unknown kwarg option direction
Ignored unknown kwarg option direction
Ignored unknown kwarg option direction
Ignored unknown kwarg option direction
Ignored unknown kwarg option direction
Ignored unknown kwarg option direction
Ignored unknown kwarg option direction
Ignored unknown kwarg option direction
Ignored unknown kwarg option direction
Ignored unknown kwarg option direction
Ignored unknown kwarg option direction
Ignored unknown kwarg option direction
Ignored unknown kwarg option direction
Ignored unknown kwarg option direction
Ignored unknown kwarg option direction
Ignored unknown kwarg option direction
Ignored unknown kwarg option direction
Ignored unknown kwarg option direction
Ignored unknown kwarg option direction
Ignored unknown kwarg opt

## Evaluación

In [45]:
# Display the text field the embeddings were computed from.
FIELD

In [46]:
# Display the file the embeddings are loaded from below.
EMBEDDING_PATH

In [12]:
# Reload the persisted movies table and build the recommender over its
# f'{FIELD}_embedding' column on the device returned by pytorch_common.
embedding_df = pd.read_json(EMBEDDING_PATH)
recommender = rc.DistanceMatrixRecommender(
    df=embedding_df,
    column=f'{FIELD}_embedding',
    device=pu.get_device(),
)

Building Distances Matrix:   0%|          | 0/18515 [00:00<?, ?it/s]

In [13]:
# Ask for k=5 recommendations for the item at index 0 and render them.
# In the recorded output the query item itself comes back first, at distance 0.
result = recommender.recommend(item_index=0, k=5)
result.show()


Recommender: title
Item


Unnamed: 0,id,title,imdb_id,image
0,1,Toy Story,114709,


Recommendations


Unnamed: 0,index,distance,id,title,imdb_id,image
0,0,0.0,1,Toy Story,114709,
1,2,0.066865,3114,Toy Story 2,120363,
2,19,0.080091,78499,Toy Story 3,435761,
3,9675,0.186109,120474,Toy Story That Time Forgot,3473654,
4,8306,0.20797,106022,Toy Story of Terror,2446040,
