# Models: Movie Tags Sentence Transformer

In [1]:
# Enable IPython's autoreload extension in mode 2: imported modules are
# reloaded before each cell executes, so edits to the packages under ../lib
# are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import sys
sys.path.append('../lib')

import numpy as np
import pandas as pd
import torch

import pytorch_common.util as pu
from pytorch_common.util import set_device_name, \
                                get_device, \
                                LoggerBuilder

import model as ml
import data as dt
import data.dataset as ds

import data.plot as pl
import data as dt

import random

import recommender as rc

## Setup

In [3]:
# Build the logger through pytorch_common's LoggerBuilder with console output
# enabled (on_console).
pu.LoggerBuilder().on_console().build()

In [4]:
# Register 'gpu' as the device name in pytorch_common.
# NOTE(review): presumably pu.get_device() resolves against this name — confirm.
pu.set_device_name('gpu')

In [5]:
# Display the device object returned by pytorch_common.
pu.get_device()

In [6]:
# Device handles reused by later cells: an explicit CPU device and whatever
# device pytorch_common returns.
cpu = torch.device("cpu")
gpu = pu.get_device()

In [7]:
# Sanity check: display whether PyTorch reports CUDA as available.
torch.cuda.is_available()

In [8]:
# Record the PyTorch version used for this run.
torch.__version__

In [9]:
def set_seed(value):
    """Seed the Python, NumPy and PyTorch random number generators.

    Args:
        value: Seed handed unchanged to ``random.seed``, ``np.random.seed``
            and ``torch.manual_seed``, in that order.
    """
    seeders = (random.seed, np.random.seed, torch.manual_seed)
    for apply_seed in seeders:
        apply_seed(value)

In [10]:
# Seed every random number generator once, before any data or model work.
set_seed(42)

In [11]:
# Movie text field this notebook embeds; also used to build column names
# such as f'movie_{FIELD}' and f'{FIELD}_embedding'.
FIELD = 'tags'
# JSON file the movie table with embeddings is written to and read back from.
# NOTE(review): the file name ends in "_bert" while the model loaded further
# down is 'all-mpnet-base-v2' — confirm the name is intentional.
EMBEDDING_PATH = f'../datasets/movie_{FIELD}_embedding_bert.json'

## Carga de dataset

In [12]:
def to_tensor(obs, device, columns):
    """Build a tensor on ``device`` from the selected ``columns`` of ``obs``.

    Args:
        obs: Object indexable by ``columns`` (for example a pandas DataFrame
            or a single-row Series).
        device: Target handed to ``Tensor.to``.
        columns: Column label, or list of labels, to select from ``obs``.

    Returns:
        A ``torch.Tensor`` with the selected values, moved to ``device``.
    """
    data = obs[columns]
    # Convert every pandas container (not only DataFrame) to plain values
    # before handing it to torch.tensor, so the result never depends on how a
    # Series is indexed (label-based vs positional).
    if isinstance(data, (pd.DataFrame, pd.Series)):
        data = data.to_numpy()
        # torch.tensor cannot consume object-dtype arrays; nested Python lists
        # keep working for columns that hold sequences of numbers.
        if data.dtype == object:
            data = data.tolist()
    return torch.tensor(data).to(device)

def transform_fn(obs, device):
    """Convert the `movie_<FIELD>` column of `obs` into a tensor on `device`."""
    return to_tensor(obs, device, [f'movie_{FIELD}'])


def rated_between_2005_and_2019(df):
    """Keep only the rows whose rating year lies in the range 2005..2019."""
    rating_year = df['user_movie_rating_year']
    return df[(rating_year >= 2005) & (rating_year <= 2019)]


# The same transform is used for both the features and the target.
dataset = ds.MovieLensTMDBDatasetFactory.from_path(
    transform=transform_fn,
    target_transform=transform_fn,
    device=cpu,
    filter_fn=rated_between_2005_and_2019,
)
dataset.info

<class 'pandas.core.frame.DataFrame'>
Int64Index: 191540 entries, 0 to 191539
Data columns (total 15 columns):
 #   Column                       Non-Null Count   Dtype         
---  ------                       --------------   -----         
 0   user_id                      191540 non-null  int64         
 1   user_seq                     191540 non-null  int64         
 2   user_movie_tags              191540 non-null  object        
 3   user_movie_rating            191540 non-null  int64         
 4   user_movie_rating_timestamp  191540 non-null  datetime64[ns]
 5   user_movie_rating_year       191540 non-null  int64         
 6   movie_id                     191540 non-null  int64         
 7   movie_seq                    191540 non-null  int64         
 8   movie_title                  191540 non-null  string        
 9   movie_genres                 191540 non-null  object        
 10  movie_for_adults             191540 non-null  bool          
 11  movie_original_language   

Select the movie tags and add a new curated tokens column:

In [14]:
columns = ['movie_id', 'movie_release_year', 'movie_title', f'movie_{FIELD}']

# Shorter column names for the per-movie table.
column_renames = {
    'movie_id': 'id',
    'movie_title': 'title',
    f'movie_{FIELD}': FIELD,
}

# One row per movie: keep the selected columns, drop repeated movie ids,
# rename, then run the `dt` text helpers on FIELD.
# NOTE(review): join_str_list / tokenize are project helpers; presumably they
# flatten the tag list into one string and add a `<FIELD>_tokens` column — confirm.
movie_data = (
    dataset.data
    .pipe(dt.select, columns)
    .pipe(dt.distinct, ['movie_id'])
    .pipe(dt.rename, column_renames)
    .pipe(dt.join_str_list, FIELD)
    .pipe(dt.tokenize, FIELD)
    .pipe(dt.reset_index)
)

movie_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18515 entries, 0 to 18514
Data columns (total 5 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   id                  18515 non-null  int64 
 1   movie_release_year  18515 non-null  int64 
 2   title               18515 non-null  string
 3   tags                18515 non-null  object
 4   tags_tokens         18515 non-null  object
dtypes: int64(2), object(2), string(1)
memory usage: 723.4+ KB


In [17]:
# Preview the first rows of the per-movie table.
movie_data.head()

Unnamed: 0,id,movie_release_year,title,tags,tags_tokens
0,1,1995,Toy Story,2009 reissue in stereoscopic 3-d 3d 55 movies ...,reissue stereoscopic d 3d movies kid entertain...
1,2355,1998,"Bug's Life, A",acting animated animation ant ant-hill ants av...,acting animated animation ant ant hill ants av...
2,3114,1999,Toy Story 2,2009 reissue in stereoscopic 3-d 3d abandonmen...,reissue stereoscopic d 3d abandonment airplane...
3,4306,2001,Shrek,3 adventure almost favorite andrew adamson ani...,adventure favorite andrew adamson animated ani...
4,4886,2001,"Monsters, Inc.",3 andrew stanton animated animation bechdel te...,andrew stanton animated animation bechdel test...


## Definición del modelo

In [18]:
from sentence_transformers import SentenceTransformer

# Pretrained checkpoint used to embed the text.
# Other checkpoint names the author left commented out:
#   'all-MiniLM-L6-v2'
#   'bert-base-nli-mean-tokens'
SBERT_MODEL_NAME = 'all-mpnet-base-v2'

sbert_model = SentenceTransformer(SBERT_MODEL_NAME)

2022-07-25 22:22:36,824 - INFO - Load pretrained SentenceTransformer: all-mpnet-base-v2
2022-07-25 22:22:37,402 - INFO - Use pytorch device: cuda


## Generación de embeddings

In [19]:
# Encode the FIELD text of every movie as a flat 1-D array of values.
# NOTE(review): this encodes the raw FIELD column, not the `<FIELD>_tokens`
# column built earlier — confirm that is intended.
field_texts = movie_data[FIELD].to_numpy()
embeddings = sbert_model.encode(field_texts)

# Attach the vectors to the movie table via the project helper.
movie_data = dt.append_emb_vectors(movie_data, embeddings, FIELD)

# Persist the table so the evaluation step can load it from disk.
movie_data.to_json(EMBEDDING_PATH)
movie_data.info()

Batches:   0%|          | 0/579 [00:00<?, ?it/s]

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18515 entries, 0 to 18514
Data columns (total 6 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   id                  18515 non-null  int64 
 1   movie_release_year  18515 non-null  int64 
 2   title               18515 non-null  string
 3   tags                18515 non-null  object
 4   tags_tokens         18515 non-null  object
 5   tags_embedding      18515 non-null  object
dtypes: int64(2), object(3), string(1)
memory usage: 868.0+ KB


## Evaluación

In [20]:
# Echo the field being evaluated.
FIELD

In [21]:
# Echo the embeddings file that the recommender will load.
EMBEDDING_PATH

In [22]:
# Load the persisted movie table (with its embedding column) back from disk.
embedding_df = pd.read_json(EMBEDDING_PATH)

# Build the recommender over the `<FIELD>_embedding` column.
recommender = rc.DistanceMatrixRecommender(
    df=embedding_df,
    column=f'{FIELD}_embedding',
    device=get_device(),
)

Building Distances Matrix:   0%|          | 0/18515 [00:00<?, ?it/s]

In [23]:
# Request k=50 recommendations for the item at index 0 and render the result.
result = recommender.recommend(item_index=0, k=50)
result.show()


Recommender: tags
Item


Unnamed: 0,id,title
0,1,Toy Story


Recommendations


Unnamed: 0,index,distance,id,title,tags
0,0,0.0,1,Toy Story,2009 reissue in stereoscopic 3-d 3d 55 movies ...
1,2,0.134306,3114,Toy Story 2,2009 reissue in stereoscopic 3-d 3d abandonmen...
2,4,0.166468,4886,"Monsters, Inc.",3 andrew stanton animated animation bechdel te...
3,566,0.166835,6377,Finding Nemo,55 movies every kid should see--entertainment ...
4,2854,0.172565,596,Pinocchio,(s)vcd 2d animation 55 movies every kid should...
5,25,0.176772,108932,The Lego Movie,55 movies every kid should see--entertainment ...
6,16,0.187256,60069,WALL·E,55 movies every kid should see--entertainment ...
7,2516,0.18781,364,"Lion King, The",2d animation 55 movies every kid should see--e...
8,2037,0.195428,72226,Fantastic Mr. Fox,55 movies every kid should see--entertainment ...
9,1368,0.203766,2987,Who Framed Roger Rabbit?,*reps* 2d animation 55 movies every kid should...
