# Models: Movie Title Sentence Transformer

In [27]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [28]:
import sys
sys.path.append('../lib')

import numpy as np
import pandas as pd
import torch

import pytorch_common.util as pu
from pytorch_common.util import set_device_name, \
                                get_device, \
                                LoggerBuilder

import model as ml
import data as dt
import data.dataset as ds

import data.plot as pl
import data as dt

import random

import recommender as rc

## Setup

In [29]:
# Build the pytorch_common logger with console output enabled.
# NOTE(review): the log lines in the model-loading output further down appear
# twice each, which may mean this cell was executed more than once in the same
# kernel and registered a second console handler — TODO confirm.
pu.LoggerBuilder().on_console().build()

In [30]:
pu.set_device_name('gpu')

In [31]:
pu.get_device()

In [32]:
# Explicit CPU device handle (passed to the dataset factory below).
cpu = torch.device("cpu")
# Device reported by pytorch_common; the device name was set to 'gpu' above.
gpu = pu.get_device()

In [33]:
torch.cuda.is_available()

In [34]:
torch.__version__

In [35]:
def set_seed(value):
    """Seed the Python, NumPy and PyTorch random generators with `value`."""
    # Same order as before: stdlib `random`, then NumPy, then PyTorch.
    for seed_rng in (random.seed, np.random.seed, torch.manual_seed):
        seed_rng(value)

In [36]:
set_seed(42)

In [37]:
# Movie attribute to embed; used below to build the `movie_<FIELD>` column
# name, the `<FIELD>_embedding` column name and the output file name.
FIELD = 'title'
# JSON file the movie data (including its embedding column) is written to
# and later read back from.
EMBEDDING_PATH = f'../datasets/movie_{FIELD}_embedding_bert.json'

## Carga de dataset

In [38]:
def to_tensor(obs, device, columns):
    """Select `columns` from `obs` and return the selection as a tensor on `device`.

    Args:
        obs: Object indexable with `columns` (typically a pandas DataFrame
            or a single row of one).
        device: Torch device the resulting tensor is moved to.
        columns: Key(s) used to index `obs`.

    Returns:
        A `torch.Tensor` built from the selected data, placed on `device`.
    """
    data = obs[columns]
    # Use isinstance rather than an exact type comparison so DataFrame
    # subclasses are also unwrapped to their underlying array.
    if isinstance(data, pd.DataFrame):
        data = data.values
    return torch.tensor(data).to(device)

def transform_fn(obs, device):
    """Turn the `movie_<FIELD>` column of `obs` into a tensor on `device`."""
    return to_tensor(obs, device, [f'movie_{FIELD}'])

# Build the dataset on CPU, keeping only ratings whose year lies between 2005
# and 2019 (both bounds inclusive). The same callable is used for both the
# feature transform and the target transform.
dataset = ds.MovieLensTMDBDatasetFactory.from_path(
    transform=transform_fn,
    target_transform=transform_fn,
    device=cpu,
    filter_fn=lambda df: df[df['user_movie_rating_year'].between(2005, 2019)],
)
dataset.info

<class 'pandas.core.frame.DataFrame'>
Int64Index: 191540 entries, 0 to 191539
Data columns (total 15 columns):
 #   Column                       Non-Null Count   Dtype         
---  ------                       --------------   -----         
 0   user_id                      191540 non-null  int64         
 1   user_seq                     191540 non-null  int64         
 2   user_movie_tags              191540 non-null  object        
 3   user_movie_rating            191540 non-null  int64         
 4   user_movie_rating_timestamp  191540 non-null  datetime64[ns]
 5   user_movie_rating_year       191540 non-null  int64         
 6   movie_id                     191540 non-null  int64         
 7   movie_seq                    191540 non-null  int64         
 8   movie_title                  191540 non-null  string        
 9   movie_genres                 191540 non-null  object        
 10  movie_for_adults             191540 non-null  bool          
 11  movie_original_language   

Select each movie's `FIELD` text (here, the title) and add a new curated tokens column:

In [40]:
# Columns taken from the raw dataset: movie id, release year and the text
# field that will be embedded.
columns = ['movie_id', 'movie_release_year', f'movie_{FIELD}']

# One row per movie with shortened column names plus a tokens column.
# Note: with FIELD == 'title' the last two rename entries are the same
# key/value pair ('movie_title' -> 'title').
movie_data = (
    dataset.data
    .pipe(dt.select, columns)
    .pipe(dt.distinct, ['movie_id'])
    .pipe(dt.rename, {
        'movie_id': 'id',
        'movie_title': 'title',
        f'movie_{FIELD}': FIELD
    })
    .pipe(dt.tokenize, FIELD)
    .pipe(dt.reset_index)
)

movie_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18515 entries, 0 to 18514
Data columns (total 4 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   id                  18515 non-null  int64 
 1   movie_release_year  18515 non-null  int64 
 2   title               18515 non-null  string
 3   title_tokens        18515 non-null  object
dtypes: int64(2), object(1), string(1)
memory usage: 578.7+ KB


## Definición del modelo

In [41]:
from sentence_transformers import SentenceTransformer

# Other pretrained model names kept for reference (presumably alternatives
# that were tried — TODO confirm):
# 'all-MiniLM-L6-v2'
# 'bert-base-nli-mean-tokens'

# Load the pretrained 'all-mpnet-base-v2' sentence-transformer model.
sbert_model = SentenceTransformer('all-mpnet-base-v2')

2022-07-25 22:11:37,175 - INFO - Load pretrained SentenceTransformer: all-mpnet-base-v2
2022-07-25 22:11:37,175 - INFO - Load pretrained SentenceTransformer: all-mpnet-base-v2
2022-07-25 22:11:37,659 - INFO - Use pytorch device: cuda
2022-07-25 22:11:37,659 - INFO - Use pytorch device: cuda


## Generación de embeddings

In [42]:
# Encode every movie's FIELD text with the sentence-transformer model; the
# column is handed over as a flat 1-D array of strings.
embeddings = sbert_model.encode(movie_data[FIELD].to_numpy())

# Attach the vectors to the frame (the output below shows the resulting
# `title_embedding` column).
movie_data = dt.append_emb_vectors(movie_data, embeddings, FIELD)

# Persist the frame so the evaluation section can read it back from disk.
movie_data.to_json(EMBEDDING_PATH)
movie_data.info()

Batches:   0%|          | 0/579 [00:00<?, ?it/s]

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18515 entries, 0 to 18514
Data columns (total 5 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   id                  18515 non-null  int64 
 1   movie_release_year  18515 non-null  int64 
 2   title               18515 non-null  string
 3   title_tokens        18515 non-null  object
 4   title_embedding     18515 non-null  object
dtypes: int64(2), object(2), string(1)
memory usage: 723.4+ KB


## Evaluación

In [43]:
FIELD

In [44]:
EMBEDDING_PATH

In [45]:
# Build a distance-matrix recommender from the embeddings persisted at
# EMBEDDING_PATH, using the `<FIELD>_embedding` column and the device
# reported by pytorch_common.
recommender = rc.DistanceMatrixRecommender(
    df      = pd.read_json(EMBEDDING_PATH),
    column  = f'{FIELD}_embedding', 
    device  = get_device()
)

Building Distances Matrix:   0%|          | 0/18515 [00:00<?, ?it/s]

In [46]:
# Ask for k=50 recommendations for the item at index 0 ("Toy Story" in the
# output below) and display them.
result = recommender.recommend(item_index=0, k=50)
result.show()


Recommender: title
Item


Unnamed: 0,id,title
0,1,Toy Story


Recommendations


Unnamed: 0,index,distance,id,title,title.1
0,0,0.0,1,Toy Story,Toy Story
1,2,0.066865,3114,Toy Story 2,Toy Story 2
2,19,0.080091,78499,Toy Story 3,Toy Story 3
3,9675,0.186109,120474,Toy Story That Time Forgot,Toy Story That Time Forgot
4,8306,0.20797,106022,Toy Story of Terror,Toy Story of Terror
5,1625,0.249158,5843,Toy Soldiers,Toy Soldiers
6,25,0.255685,108932,The Lego Movie,The Lego Movie
7,9001,0.305101,3086,Babes in Toyland,Babes in Toyland
8,5139,0.305126,2017,Babes in Toyland,Babes in Toyland
9,46,0.313763,356,Forrest Gump,Forrest Gump
