Reference: https://github.com/alineberry/my-movie-recommender/blob/master/notebooks/movie_similarity/autoencoder.ipynb

# Models: Movie Overview Sparse Autoencoder

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import sys
sys.path.append('../lib')

import numpy as np
import pandas as pd
from bunch import Bunch

import torch
from torch.utils.data import DataLoader
from torch.optim import Adam

import pytorch_common.util as pu
from pytorch_common.modules.fn import Fn
from pytorch_common.callbacks import SaveBestModel
from pytorch_common.callbacks.output import Logger

from pytorch_common.util import set_device_name, \
                                get_device, \
                                LoggerBuilder

import model as ml
import data as dt
import data.dataset as ds

import data.plot as pl
import data as dtjo

import logging
import random

import recommender as rc

## Setup

In [3]:
# Build a logger through pytorch_common's LoggerBuilder with console output enabled.
pu.LoggerBuilder().on_console().build()

In [4]:
# Tell pytorch_common which device name to use for this run.
# NOTE(review): 'gpu' is hard-coded — confirm how pytorch_common behaves on a machine without CUDA.
pu.set_device_name('gpu')

In [5]:
# Display the device object that pytorch_common resolves for the configured device name.
pu.get_device()

In [6]:
# Device handles for the rest of the notebook.
# `cpu` is passed to the dataset factory and used when generating embeddings.
cpu = torch.device("cpu")
# `gpu` holds whatever device pytorch_common resolved; it is not referenced in the visible cells.
gpu = pu.get_device()

In [7]:
# Sanity check: report whether PyTorch can see a CUDA device.
torch.cuda.is_available()

In [8]:
# Display the installed PyTorch version for reproducibility.
torch.__version__

In [9]:
def set_seed(value):
    """Seed Python's `random`, NumPy and PyTorch generators with the same `value`."""
    # Same order as before: stdlib first, then NumPy, then PyTorch.
    for seed_fn in (random.seed, np.random.seed, torch.manual_seed):
        seed_fn(value)

In [10]:
# Seed all random generators with a fixed value before any data loading or training.
set_seed(42)

In [11]:
# Name of the movie text field this notebook builds embeddings for.
FIELD = 'overview'
# File the trained auto-encoder's state_dict is saved to.
WEIGHTS_PATH   = f'../weights/{FIELD}-tf-idf-sparse-auto-encoder.pt'
# JSON file the movie table with embedding vectors is written to and later re-read from.
EMBEDDING_PATH = f'../datasets/movie_{FIELD}_embedding.json'

## Carga de dataset

In [12]:
def to_tensor(obs, device, columns):
    """Convert the selected columns of an observation into a tensor on `device`.

    Args:
        obs: Object indexable by `columns`, e.g. a pandas DataFrame or a row Series.
        device: Target torch device for the resulting tensor.
        columns: Column label(s) used to index `obs`.

    Returns:
        torch.Tensor holding the selected values, moved to `device`.
    """
    data = obs[columns]
    # isinstance also matches DataFrame subclasses. A Series is unwrapped as well,
    # so torch receives a plain array instead of a label-indexed container.
    if isinstance(data, (pd.DataFrame, pd.Series)):
        data = data.to_numpy()
    return torch.tensor(data).to(device)

def transform_fn(obs, device):
    """Turn the `movie_<FIELD>` column of an observation into a tensor on `device`."""
    return to_tensor(obs, device, [f'movie_{FIELD}'])


def rated_between_1990_and_2019(df):
    """Keep only rows whose rating year lies in the inclusive range 1990-2019."""
    return df[df['user_movie_rating_year'].between(1990, 2019)]


# The same transform is used for both features and targets.
dataset = ds.MovieLensTMDBDatasetFactory.from_path(
    transform=transform_fn,
    target_transform=transform_fn,
    device=cpu,
    filter_fn=rated_between_1990_and_2019,
)
dataset.info

<class 'pandas.core.frame.DataFrame'>
Int64Index: 199791 entries, 0 to 199790
Data columns (total 15 columns):
 #   Column                       Non-Null Count   Dtype         
---  ------                       --------------   -----         
 0   user_id                      199791 non-null  int64         
 1   user_seq                     199791 non-null  int64         
 2   user_movie_tags              199791 non-null  object        
 3   user_movie_rating            199791 non-null  int64         
 4   user_movie_rating_timestamp  199791 non-null  datetime64[ns]
 5   user_movie_rating_year       199791 non-null  int64         
 6   movie_id                     199791 non-null  int64         
 7   movie_seq                    199791 non-null  int64         
 8   movie_title                  199791 non-null  string        
 9   movie_genres                 199791 non-null  object        
 10  movie_for_adults             199791 non-null  bool          
 11  movie_original_language   

Select each movie's overview and add a new column with its curated tokens:

In [13]:
# Columns taken from the ratings dataset to build the per-movie table.
columns = ['movie_id', 'movie_title', f'movie_{FIELD}']

# Shorter column names used from here on.
renamed_columns = {
    'movie_id': 'id',
    'movie_title': 'title',
    f'movie_{FIELD}': FIELD,
}

# Select, de-duplicate by movie, rename, tokenize the text field, then reset the index.
movie_data = (
    dataset.data
    .pipe(dt.select, columns)
    .pipe(dt.distinct, ['movie_id'])
    .pipe(dt.rename, renamed_columns)
    .pipe(dt.tokenize, FIELD)
    .pipe(dt.reset_index)
)

movie_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18703 entries, 0 to 18702
Data columns (total 4 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   id               18703 non-null  int64 
 1   title            18703 non-null  string
 2   overview         18703 non-null  string
 3   overview_tokens  18703 non-null  object
dtypes: int64(1), object(1), string(2)
memory usage: 584.6+ KB


In [14]:
# Compute the TF-IDF matrix from the token column of the per-movie table.
tfidf = dt.tf_idf(movie_data, f'{FIELD}_tokens')

tfidf.shape

## Definición del modelo

In [15]:
def train(auto_encoder, tfidf, params):
    """Fit `auto_encoder` on the TF-IDF matrix.

    Args:
        auto_encoder: Model exposing `encoder` and `decoder` sub-modules.
        tfidf: TF-IDF matrix; it is wrapped in a `ds.TfIdfDataset` for batching.
        params: Settings object providing `batch_size`, `n_workers`, `epochs` and `lr`.
    """
    loader = DataLoader(
        dataset=ds.TfIdfDataset(tfidf),
        batch_size=params.batch_size,
        shuffle=True,
        num_workers=params.n_workers,
        pin_memory=True,
    )
    trainer = ml.AutoEncoderTrainer(auto_encoder)

    # NOTE(review): 'elementwise_mean' is a deprecated alias of 'mean' in torch.nn losses.
    # It is kept unchanged because ml.MSELossFn's handling of it is not visible here.
    loss_fn = ml.MSELossFn(reduction='elementwise_mean')

    # Encoder and decoder get separate Adam optimizers with the same learning rate.
    encoder_optimizer = Adam(auto_encoder.encoder.parameters(), lr=params.lr)
    decoder_optimizer = Adam(auto_encoder.decoder.parameters(), lr=params.lr)
    progress_logger = Logger(['time', 'epoch', 'train_loss'])

    trainer.fit(
        loader,
        loss_fn=loss_fn,
        epochs=params.epochs,
        encoder_optimizer=encoder_optimizer,
        decoder_optimizer=decoder_optimizer,
        callbacks=[progress_logger],
    )

## Entrenamiento

In [16]:
# Hyper-parameters and run settings shared by the model-building and training cells.
params = Bunch(
    lr=0.01,
    epochs=20,
    n_workers=24,
    batch_size=128,
    sequence_size=tfidf.shape[1],  # number of columns in the TF-IDF matrix
    intermediate_size=5000,
    encoded_size=1000,
    experiment_name=f'{FIELD}-tf-idf-sparse-auto-encoder',
    device=get_device(),
)

In [17]:
# Sizes passed positionally to the auto-encoder constructor, all taken from `params`.
size_args = (
    params.sequence_size,
    params.intermediate_size,
    params.encoded_size,
)

# Build the model and move it to the device resolved by pytorch_common.
auto_encoder = ml.AutoEncoder(*size_args).to(get_device())
print(auto_encoder)

In [18]:
# Run training with the settings in `params`; the Logger callback reports time, epoch and train_loss.
train(auto_encoder, tfidf, params)

2022-06-06 16:37:50,534 - INFO - {'time': '0:00:08.16', 'epoch': 1, 'train_loss': 0.13490777189026074}
2022-06-06 16:37:58,038 - INFO - {'time': '0:00:07.50', 'epoch': 2, 'train_loss': 0.04399084455656762}
2022-06-06 16:38:05,545 - INFO - {'time': '0:00:07.51', 'epoch': 3, 'train_loss': 0.023309793190232346}
2022-06-06 16:38:12,967 - INFO - {'time': '0:00:07.42', 'epoch': 4, 'train_loss': 0.009790689630598641}
2022-06-06 16:38:20,428 - INFO - {'time': '0:00:07.46', 'epoch': 5, 'train_loss': 0.005044325237434839}
2022-06-06 16:38:27,870 - INFO - {'time': '0:00:07.44', 'epoch': 6, 'train_loss': 0.003299643950802939}
2022-06-06 16:38:35,302 - INFO - {'time': '0:00:07.43', 'epoch': 7, 'train_loss': 0.0023722691733871594}
2022-06-06 16:38:42,748 - INFO - {'time': '0:00:07.45', 'epoch': 8, 'train_loss': 0.0017934185262693434}
2022-06-06 16:38:50,147 - INFO - {'time': '0:00:07.40', 'epoch': 9, 'train_loss': 0.0014219269937309786}
2022-06-06 16:38:57,633 - INFO - {'time': '0:00:07.48', 'epoch'

In [19]:
# Persist only the model's state_dict (not the whole module object) to WEIGHTS_PATH.
# NOTE(review): the target directory must already exist — confirm '../weights' is present.
torch.save(auto_encoder.state_dict(), WEIGHTS_PATH)

## Generación de embeddings

In [20]:
# Encode the full TF-IDF matrix into embeddings on the CPU.
# This is inference only: eval() turns off training-mode layer behaviour, and
# no_grad() avoids building an autograd graph for the whole dense matrix.
encoder_on_cpu = auto_encoder.to(cpu).eval()
with torch.no_grad():
    embedding = encoder_on_cpu.encode_from_batch(torch.tensor(tfidf.toarray()))
embedding.shape

In [21]:
# Attach the embedding vectors to the per-movie table, then drop the token column.
# NOTE(review): this cell rebinds `movie_data`, so re-running it on its own presumably
# fails or double-applies — re-run the cell that builds `movie_data` first.
movie_with_embedding = dt.append_emb_vectors(movie_data, embedding, FIELD)
movie_data = dt.drop(movie_with_embedding, [f'{FIELD}_tokens'])

# Write the table to disk; the evaluation section reads it back from EMBEDDING_PATH.
movie_data.to_json(EMBEDDING_PATH)
movie_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18703 entries, 0 to 18702
Data columns (total 4 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   id                  18703 non-null  int64 
 1   title               18703 non-null  string
 2   overview            18703 non-null  string
 3   overview_embedding  18703 non-null  object
dtypes: int64(1), object(1), string(2)
memory usage: 584.6+ KB


## Evaluación

In [24]:
# Reload the exported table from disk so the evaluation uses what was actually saved.
embedding_table = pd.read_json(EMBEDDING_PATH)

recommender = rc.SimilarMoviesRecommender(
    df=embedding_table,
    column=f'{FIELD}_embedding',
    device=get_device(),
)

Building Distances Matrix:   0%|          | 0/18703 [00:00<?, ?it/s]

In [25]:
# Query the recommender for the movie at index 0 (Toy Story, per the output) and show each result's distance.
recommender.similars(movie_index=0)

Unnamed: 0,distance,title,overview
0,0.0,Toy Story,"Led by Woody, Andy's toys live happily in his ..."
4555,0.083342,F.I.S.T.,Johnny Kovak joins the Teamsters trade-union i...
8722,0.177943,"Galapagos Affair: Satan Came to Eden, The",Darwin meets Hitchcock in this documentary. Di...
12927,0.111965,Arnulf Rainer,"An experimental film, the last in Peter Kubelk..."
13035,0.180909,Bad Company,Nelson Crowe is a CIA operative under the thum...
