Reference: https://github.com/alineberry/my-movie-recommender/blob/master/notebooks/movie_similarity/autoencoder.ipynb

# Models: Movie Title Sparse Autoencoder

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import sys
sys.path.append('../lib')

import numpy as np
import pandas as pd
from bunch import Bunch

import torch
from torch.utils.data import DataLoader
from torch.optim import Adam

import pytorch_common.util as pu
from pytorch_common.modules.fn import Fn
from pytorch_common.callbacks import SaveBestModel
from pytorch_common.callbacks.output import Logger

from pytorch_common.util import set_device_name, \
                                get_device, \
                                LoggerBuilder

import model as ml
import data as dt
import data.dataset as ds

import data.plot as pl
import data as dtjo

import logging
import random

import recommender as rc

## Setup

In [3]:
# Build a logger through pytorch_common's LoggerBuilder with console output enabled
# (presumably this is what makes the per-epoch training records visible — TODO confirm).
pu.LoggerBuilder().on_console().build()

In [4]:
# Tell pytorch_common which device name to use
# (presumably this is what later get_device() calls resolve — TODO confirm).
pu.set_device_name('gpu')

In [5]:
pu.get_device()

In [6]:
# Explicit CPU device handle; used when loading the dataset and when generating embeddings.
cpu = torch.device("cpu")
# Device reported by pytorch_common.
# NOTE(review): `gpu` is not referenced in the visible cells; get_device() is called directly instead.
gpu = pu.get_device()

In [7]:
torch.cuda.is_available()

In [8]:
torch.__version__

In [9]:
def set_seed(value):
    """Seed Python's `random`, NumPy's global RNG and PyTorch with the same value."""
    seeders = (random.seed, np.random.seed, torch.manual_seed)
    for seed_rng in seeders:
        seed_rng(value)

In [10]:
# Seed the RNGs before any data loading or training happens.
set_seed(42)

In [11]:
# Movie attribute this notebook embeds; it parameterizes column names and output paths.
FIELD = 'title'
# Where the trained auto-encoder state_dict is written (torch.save).
WEIGHTS_PATH   = f'../weights/{FIELD}-tf-idf-sparse-auto-encoder.pt'
# Where the movie table with its embedding column is written (DataFrame.to_json).
EMBEDDING_PATH = f'../datasets/movie_{FIELD}_embedding.json'

## Carga de dataset

In [12]:
def to_tensor(obs, device, columns):
    """Select `columns` from `obs` and return the values as a tensor on `device`.

    Args:
        obs: Object indexable by `columns`, e.g. a pandas DataFrame or a
            single row (Series).
        device: Target torch device for the resulting tensor.
        columns: Column label or list of labels to select.

    Returns:
        torch.Tensor built from the selected values, moved to `device`.
    """
    data = obs[columns]
    # isinstance (instead of an exact type comparison) also accepts DataFrame
    # subclasses. A Series is unwrapped too, so the tensor is built from the
    # underlying array rather than through element-wise label lookups.
    if isinstance(data, (pd.DataFrame, pd.Series)):
        data = data.to_numpy()
    return torch.tensor(data).to(device)

def transform_fn(obs, device):
    """Convert the `movie_{FIELD}` column of an observation into a tensor on `device`."""
    return to_tensor(obs, device, [f'movie_{FIELD}'])


dataset = ds.MovieLensTMDBDatasetFactory.from_path(
    transform=transform_fn,
    target_transform=transform_fn,
    device=cpu,
    # Keep only ratings made from 2005 through 2019 (both bounds inclusive).
    filter_fn=lambda df: df[df['user_movie_rating_year'].between(2005, 2019)],
)
dataset.info

<class 'pandas.core.frame.DataFrame'>
Int64Index: 191540 entries, 0 to 191539
Data columns (total 16 columns):
 #   Column                       Non-Null Count   Dtype         
---  ------                       --------------   -----         
 0   user_id                      191540 non-null  int64         
 1   user_seq                     191540 non-null  int64         
 2   user_movie_tags              191540 non-null  object        
 3   user_movie_rating            191540 non-null  int64         
 4   user_movie_rating_timestamp  191540 non-null  datetime64[ns]
 5   user_movie_rating_year       191540 non-null  int64         
 6   movie_id                     191540 non-null  int64         
 7   movie_seq                    191540 non-null  int64         
 8   movie_title                  191540 non-null  string        
 9   movie_genres                 191540 non-null  object        
 10  movie_for_adults             191540 non-null  bool          
 11  movie_original_language   

Select the movie `title` column (one row per movie) and add a tokenized `title_tokens` column:

In [13]:
# Columns kept from the ratings dataset: identifiers, release year and the text field to embed.
columns = ['movie_id', 'movie_release_year', 'movie_imdb_id', f'movie_{FIELD}']

# Select, de-duplicate by movie_id, shorten the column names and tokenize FIELD.
movie_data = (
    dataset.data
    .pipe(dt.select, columns)
    .pipe(dt.distinct, ['movie_id'])
    .pipe(dt.rename, {
        'movie_id': 'id',
        'movie_title': 'title',
        'movie_imdb_id': 'imdb_id',
        'movie_release_year': 'release_year',
        f'movie_{FIELD}': FIELD,
    })
    .pipe(dt.tokenize, FIELD)
    .pipe(dt.reset_index)
)

movie_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18515 entries, 0 to 18514
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   id            18515 non-null  int64 
 1   release_year  18515 non-null  int64 
 2   imdb_id       18515 non-null  int64 
 3   title         18515 non-null  string
 4   title_tokens  18515 non-null  object
dtypes: int64(3), object(1), string(1)
memory usage: 723.4+ KB


In [14]:
# Build the TF-IDF matrix from the `{FIELD}_tokens` column.
# `toarray()` is called on it later, so it is presumably a scipy sparse matrix — TODO confirm.
tfidf = movie_data.pipe(dt.tf_idf, f'{FIELD}_tokens')

# Matrix shape; shape[1] is later stored as params['sequence_size'].
tfidf.shape

## Definición del modelo

In [15]:
def train(auto_encoder, tfidf, params):
    """Fit `auto_encoder` on the TF-IDF matrix.

    Args:
        auto_encoder: Model exposing `encoder` and `decoder` attributes whose
            `parameters()` are optimized separately.
        tfidf: TF-IDF matrix, wrapped in `ds.TfIdfDataset` for batching.
        params: Settings object providing `batch_size`, `n_workers`,
            `epochs` and `lr`.
    """
    loader = DataLoader(
        ds.TfIdfDataset(tfidf),
        batch_size=params.batch_size,
        shuffle=True,
        num_workers=params.n_workers,
        pin_memory=True,
    )
    trainer = ml.AutoEncoderTrainer(auto_encoder)

    # NOTE(review): 'elementwise_mean' is the legacy PyTorch spelling of 'mean';
    # confirm what ml.MSELossFn accepts before modernizing the string.
    loss_fn = ml.MSELossFn(reduction='elementwise_mean')

    # Encoder and decoder each get their own Adam optimizer with the same learning rate.
    encoder_optimizer = Adam(auto_encoder.encoder.parameters(), lr=params.lr)
    decoder_optimizer = Adam(auto_encoder.decoder.parameters(), lr=params.lr)

    # Log the 'time', 'epoch' and 'train_loss' fields through the Logger callback.
    callbacks = [Logger(['time', 'epoch', 'train_loss'])]

    trainer.fit(
        loader,
        loss_fn=loss_fn,
        epochs=params.epochs,
        encoder_optimizer=encoder_optimizer,
        decoder_optimizer=decoder_optimizer,
        callbacks=callbacks,
    )

## Entrenamiento

In [16]:
# Hyper-parameters and run settings, accessible as attributes (params.lr, params.epochs, ...).
params = Bunch(
    lr=0.01,
    epochs=20,
    n_workers=24,
    batch_size=128,
    # Width of the TF-IDF matrix.
    sequence_size=tfidf.shape[1],
    intermediate_size=5000,
    encoded_size=1000,
    experiment_name=f'{FIELD}-tf-idf-sparse-auto-encoder',
    device=get_device(),
)

In [17]:
# Build the auto-encoder from the three sizes in `params`, then move it to the active device.
auto_encoder = ml.AutoEncoder(params.sequence_size, params.intermediate_size, params.encoded_size)
auto_encoder = auto_encoder.to(get_device())
print(auto_encoder)

In [18]:
# Run training; the log output below shows one record per epoch with time and train_loss.
train(auto_encoder, tfidf, params)

2022-07-29 17:48:22,250 - INFO - {'time': '0:00:02.74', 'epoch': 1, 'train_loss': 0.13096719898026565}
2022-07-29 17:48:24,481 - INFO - {'time': '0:00:02.23', 'epoch': 2, 'train_loss': 0.04372454859830182}
2022-07-29 17:48:26,735 - INFO - {'time': '0:00:02.25', 'epoch': 3, 'train_loss': 0.024771234174740725}
2022-07-29 17:48:29,015 - INFO - {'time': '0:00:02.28', 'epoch': 4, 'train_loss': 0.017537143190616163}
2022-07-29 17:48:31,279 - INFO - {'time': '0:00:02.26', 'epoch': 5, 'train_loss': 0.008161570674514977}
2022-07-29 17:48:33,611 - INFO - {'time': '0:00:02.33', 'epoch': 6, 'train_loss': 0.004262984971162574}
2022-07-29 17:48:35,918 - INFO - {'time': '0:00:02.31', 'epoch': 7, 'train_loss': 0.002888988139878573}
2022-07-29 17:48:38,184 - INFO - {'time': '0:00:02.27', 'epoch': 8, 'train_loss': 0.0021343948528299043}
2022-07-29 17:48:40,435 - INFO - {'time': '0:00:02.25', 'epoch': 9, 'train_loss': 0.0016794321946157463}
2022-07-29 17:48:42,706 - INFO - {'time': '0:00:02.27', 'epoch':

In [19]:
# Persist the model's state_dict (not the module object itself) to WEIGHTS_PATH.
torch.save(auto_encoder.state_dict(), WEIGHTS_PATH)

## Generación de embeddings

In [20]:
# Inference only: move the model to the CPU and switch it to eval mode so
# train-only layers (e.g. dropout, if the model has any) are disabled.
auto_encoder.to(cpu).eval()

# No gradients are needed to produce embeddings; disabling autograd avoids
# building a graph over the whole dense TF-IDF matrix.
with torch.no_grad():
    embedding = auto_encoder.encode_from_batch(torch.tensor(tfidf.toarray()))
embedding.shape

In [21]:
# Attach the embedding vectors to the movie table via dt.append_emb_vectors.
movie_data = dt.append_emb_vectors(movie_data, embedding, FIELD)

# Write the table to EMBEDDING_PATH as JSON, then show its schema.
movie_data.to_json(EMBEDDING_PATH)
movie_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18515 entries, 0 to 18514
Data columns (total 6 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   id               18515 non-null  int64 
 1   release_year     18515 non-null  int64 
 2   imdb_id          18515 non-null  int64 
 3   title            18515 non-null  string
 4   title_tokens     18515 non-null  object
 5   title_embedding  18515 non-null  object
dtypes: int64(3), object(2), string(1)
memory usage: 868.0+ KB


## Evaluación

In [22]:
# Reload the movie/embedding table from the JSON file at EMBEDDING_PATH.
df = pd.read_json(EMBEDDING_PATH)

In [23]:
# Build a recommender over the `{FIELD}_embedding` column of the reloaded table.
# The progress bar it prints reads "Building Distances Matrix"; presumably pairwise
# distances between all rows are computed on the given device — TODO confirm.
recommender = rc.DistanceMatrixRecommender(
    df,
    column  = f'{FIELD}_embedding', 
    device  = get_device()
)

Building Distances Matrix:   0%|          | 0/18515 [00:00<?, ?it/s]

In [24]:
# Ask for recommendations for the item at index 0 ("Toy Story" in the rendered output)
# and display the item together with its recommendations.
result = recommender.recommend(item_index=0)
result.show()


Recommender: title
Item


Unnamed: 0,title,image
0,Toy Story,


Recommendations


Unnamed: 0,index,distance,id,title,image
0,0,0.0,1,Toy Story,
1,2,0.0,3114,Toy Story 2,
2,19,0.0,78499,Toy Story 3,
3,9019,0.027277,4189,"Greatest Story Ever Told, The",
4,7627,0.033286,93287,"Big Year, The",
