# Models: Movie Title Sparse Ao

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import sys
sys.path.append('../lib')

import numpy as np
import pandas as pd
import torch

import pytorch_common.util as pu
from pytorch_common.util import set_device_name, \
                                get_device, \
                                LoggerBuilder

import model as ml
import data as dt
import data.dataset as ds

import data.plot as pl
import data as dt

import random

import recommender as rc

## Setup

In [3]:
# Build a logger through pytorch_common's LoggerBuilder with the console option enabled.
pu.LoggerBuilder().on_console().build()

In [4]:
# Select 'gpu' as the device name in pytorch_common
# (presumably what later pu.get_device() calls resolve to — confirm in pytorch_common).
pu.set_device_name('gpu')

In [5]:
# Display the device object pytorch_common currently returns.
pu.get_device()

In [6]:
# Device handles used by the rest of the notebook: an explicit CPU device and
# whatever device pytorch_common resolves (evaluated in that order).
cpu, gpu = torch.device("cpu"), pu.get_device()

In [7]:
# Sanity check: report whether PyTorch can use CUDA in this environment.
torch.cuda.is_available()

In [8]:
# Display the installed PyTorch version string.
torch.__version__

In [9]:
def set_seed(value):
    """Seed Python's ``random``, NumPy's global RNG and PyTorch with ``value``.

    The three generators are seeded in that order; nothing is returned.
    """
    for seed_rng in (random.seed, np.random.seed, torch.manual_seed):
        seed_rng(value)

In [10]:
# Seed every RNG handled by set_seed with the fixed value 42.
set_seed(42)

In [11]:
# Name of the movie text field this notebook works on; it is interpolated into
# the `movie_<FIELD>` column name and into the embeddings file name.
FIELD = 'overview'
# JSON file the embedding table is written to and later read back from.
# NOTE(review): the file name says "bert", but the model loaded further down is
# SentenceTransformer('all-mpnet-base-v2') — confirm the naming is intentional.
EMBEDDING_PATH = f'../datasets/movie_{FIELD}_embedding_bert.json'

## Carga de dataset

In [12]:
def to_tensor(obs, device, columns):
    """Select ``columns`` from ``obs`` and return the values as a tensor on ``device``.

    Parameters
    ----------
    obs
        Any object supporting ``obs[columns]`` (e.g. a DataFrame, a single-row
        Series, or a mapping).
    device
        Target device passed to ``Tensor.to``.
    columns
        A label, or a list of labels, used to index ``obs``.

    Returns
    -------
    torch.Tensor
        Tensor built from the selected data and moved to ``device``. For pandas
        selections the dtype follows the underlying array's dtype.
    """
    data = obs[columns]
    # Unwrap any pandas container before handing it to torch. `isinstance` also
    # covers DataFrame subclasses, and including Series matters when `obs` is a
    # single row: `obs[columns]` is then a Series indexed by column labels, and
    # torch would otherwise read it through integer `__getitem__`, which pandas
    # treats as a label lookup.
    if isinstance(data, (pd.DataFrame, pd.Series)):
        data = data.values
    return torch.tensor(data).to(device)

def transform_fn(obs, device):
    """Turn the ``movie_<FIELD>`` column of ``obs`` into a tensor on ``device``."""
    return to_tensor(obs, device, [f'movie_{FIELD}'])


def rated_between_2005_and_2019(df):
    """Keep the rows whose rating year lies in the closed range [2005, 2019]."""
    rating_year = df['user_movie_rating_year']
    return df[(rating_year >= 2005) & (rating_year <= 2019)]


# NOTE(review): the same transform is passed for both `transform` and
# `target_transform` — confirm that is intended.
dataset = ds.MovieLensTMDBDatasetFactory.from_path(
    device           = cpu,
    transform        = transform_fn,
    target_transform = transform_fn,
    filter_fn        = rated_between_2005_and_2019,
)
dataset.info

<class 'pandas.core.frame.DataFrame'>
Int64Index: 191540 entries, 0 to 191539
Data columns (total 15 columns):
 #   Column                       Non-Null Count   Dtype         
---  ------                       --------------   -----         
 0   user_id                      191540 non-null  int64         
 1   user_seq                     191540 non-null  int64         
 2   user_movie_tags              191540 non-null  object        
 3   user_movie_rating            191540 non-null  int64         
 4   user_movie_rating_timestamp  191540 non-null  datetime64[ns]
 5   user_movie_rating_year       191540 non-null  int64         
 6   movie_id                     191540 non-null  int64         
 7   movie_seq                    191540 non-null  int64         
 8   movie_title                  191540 non-null  string        
 9   movie_genres                 191540 non-null  object        
 10  movie_for_adults             191540 non-null  bool          
 11  movie_original_language   

Select each movie's overview and add a new column of curated tokens:

In [13]:
columns = ['movie_id', 'movie_release_year', 'movie_title', f'movie_{FIELD}']

# Shorter column names used for the per-movie table.
short_names = {
    'movie_id': 'id',
    'movie_title': 'title',
    f'movie_{FIELD}': FIELD,
}

# Steps, in order: keep the selected columns, de-duplicate on movie_id, rename,
# tokenize FIELD (the printed info shows a `<FIELD>_tokens` column afterwards)
# and reset the index. The helpers live in the project's `data` module.
movie_data = (
    dataset.data
    .pipe(dt.select, columns)
    .pipe(dt.distinct, ['movie_id'])
    .pipe(dt.rename, short_names)
    .pipe(dt.tokenize, FIELD)
    .pipe(dt.reset_index)
)

movie_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18515 entries, 0 to 18514
Data columns (total 5 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   id                  18515 non-null  int64 
 1   movie_release_year  18515 non-null  int64 
 2   title               18515 non-null  string
 3   overview            18515 non-null  string
 4   overview_tokens     18515 non-null  object
dtypes: int64(2), object(1), string(2)
memory usage: 723.4+ KB


## Definición del modelo

In [20]:
from sentence_transformers import SentenceTransformer

# Other pretrained checkpoint names kept for reference
# (presumably alternatives that were tried — confirm before relying on them):
# 'all-MiniLM-L6-v2'
# 'bert-base-nli-mean-tokens'

# Load the pretrained 'all-mpnet-base-v2' sentence-embedding model.
sbert_model = SentenceTransformer('all-mpnet-base-v2')

2022-07-25 22:04:48,502 - INFO - Load pretrained SentenceTransformer: all-mpnet-base-v2


Downloading:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/571 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/116 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/39.3k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/438M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/239 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/363 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/13.1k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/349 [00:00<?, ?B/s]

2022-07-25 22:06:06,968 - INFO - Use pytorch device: cuda


## Generación de embeddings

In [21]:
# Encode every movie's FIELD text as a flat 1-D array of strings.
embeddings = sbert_model.encode(movie_data[FIELD].to_numpy())

# Attach the vectors to the movie table (the printed info shows a
# `<FIELD>_embedding` column afterwards).
movie_data = dt.append_emb_vectors(movie_data, embeddings, FIELD)

# Persist the table so the evaluation section can read it back from disk.
movie_data.to_json(EMBEDDING_PATH)
movie_data.info()

Batches:   0%|          | 0/579 [00:00<?, ?it/s]

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18515 entries, 0 to 18514
Data columns (total 6 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   id                  18515 non-null  int64 
 1   movie_release_year  18515 non-null  int64 
 2   title               18515 non-null  string
 3   overview            18515 non-null  string
 4   overview_tokens     18515 non-null  object
 5   overview_embedding  18515 non-null  object
dtypes: int64(2), object(2), string(2)
memory usage: 868.0+ KB


## Evaluación

In [26]:
# Echo the field name currently in use.
FIELD

In [25]:
# Echo the path the embeddings are read from below.
EMBEDDING_PATH

In [23]:
# Read the persisted embedding table back from disk instead of reusing the
# in-memory frame, then build the recommender over its `<FIELD>_embedding` column.
embedded_movies = pd.read_json(EMBEDDING_PATH)

recommender = rc.DistanceMatrixRecommender(
    df     = embedded_movies,
    column = f'{FIELD}_embedding',
    device = get_device(),
)

Building Distances Matrix:   0%|          | 0/18515 [00:00<?, ?it/s]

In [24]:
# Request k=50 recommendations for the item at index 0 and render the result
# (the output lists items by increasing distance, starting with the item itself).
result = recommender.recommend(item_index=0, k=50)
result.show()


Recommender: overview
Item


Unnamed: 0,id,title
0,1,Toy Story


Recommendations


Unnamed: 0,index,distance,id,title,overview
0,0,0.0,1,Toy Story,"Led by Woody, Andy's toys live happily in his ..."
1,19,0.166989,78499,Toy Story 3,"Woody, Buzz, and the rest of Andy's toys haven..."
2,2,0.182994,3114,Toy Story 2,"Andy heads off to Cowboy Camp, leaving his toy..."
3,1213,0.517923,1993,Child's Play 3,It's been eight years since the events in the ...
4,117,0.518093,6936,Elf,When young Buddy falls into Santa's gift sack ...
5,2854,0.525664,596,Pinocchio,Lonely toymaker Geppetto has his wishes answer...
6,9665,0.529668,115875,Toy Story Toons: Hawaiian Vacation,The toys throw Ken and Barbie a Hawaiian vacat...
7,2499,0.533991,103048,"Kings of Summer, The","Joe Toy, on the verge of adolescence, finds hi..."
8,8306,0.549968,106022,Toy Story of Terror,What starts out as a fun road trip for the Toy...
9,12618,0.551404,73042,Alvin and the Chipmunks: The Squeakquel,"Pop sensations Alvin, Simon and Theodore end u..."
