Reference: https://github.com/alineberry/my-movie-recommender/blob/master/notebooks/movie_similarity/autoencoder.ipynb

# Models: Sparse Autoencoder

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import sys
sys.path.append('../lib')

import numpy as np
import pandas as pd
from bunch import Bunch

import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Subset
from torch.optim import SparseAdam, Adam
from sklearn.metrics import roc_auc_score

import pytorch_common.util as pu
from pytorch_common.modules.fn import Fn
from pytorch_common.callbacks import EarlyStop, \
                                     ReduceLROnPlateau, \
                                     Validation, \
                                     SaveBestModel
from pytorch_common.callbacks.output import Logger, \
                                            MetricsPlotter

from pytorch_common.util import set_device_name, \
                                get_device, \
                                LoggerBuilder

import model as ml
import data.dataset as ds

import metric as mt
import metric.discretizer as dr

import data.plot as pl
import data as dt

import logging
import random

from torchviz import make_dot

## Setup

In [3]:
pu.LoggerBuilder().on_console().build()

In [4]:
pu.set_device_name('gpu')

In [5]:
pu.get_device()

In [6]:
cpu = torch.device("cpu")

In [7]:
torch.cuda.is_available()

In [8]:
torch.__version__

In [9]:
def set_seed(value):
    random.seed(value)
    np.random.seed(value)
    torch.manual_seed(value)

In [10]:
set_seed(42)

## Carga de dataset

In [11]:
def to_tensor(obs, device, columns): 
    data = obs[columns]
    if type(data) == pd.DataFrame:
        data = data.values
    return torch.tensor(data).to(device)

features_fn = lambda obs, device: to_tensor(obs, device, ['user_seq', 'movie_seq'])
target_fn   = lambda obs, device: to_tensor(obs, device, ['user_movie_rating'])

dataset = ds.MovieLensTMDBDatasetFactory.from_path(
    transform        = features_fn,
    target_transform = target_fn,
    device           = cpu,
    filter_fn        = lambda df: df[(df['user_movie_rating_year'] >= 1990) & (df['user_movie_rating_year'] <= 2019)]
)
dataset.info

<class 'pandas.core.frame.DataFrame'>
Int64Index: 199791 entries, 0 to 199790
Data columns (total 15 columns):
 #   Column                       Non-Null Count   Dtype         
---  ------                       --------------   -----         
 0   user_id                      199791 non-null  int64         
 1   user_seq                     199791 non-null  int64         
 2   user_movie_tags              199791 non-null  object        
 3   user_movie_rating            199791 non-null  int64         
 4   user_movie_rating_timestamp  199791 non-null  datetime64[ns]
 5   user_movie_rating_year       199791 non-null  int64         
 6   movie_id                     199791 non-null  int64         
 7   movie_seq                    199791 non-null  int64         
 8   movie_title                  199791 non-null  string        
 9   movie_genres                 199791 non-null  object        
 10  movie_for_adults             199791 non-null  bool          
 11  movie_original_language   

In [12]:
train_set, eval_set = dataset.split_train_eval(split_year=2018)

2022-06-04 17:34:31,649 - INFO - Train: 84.41 % - Test: 7.40 %


## Definicion del modelo

In [13]:
from spacy.tokenizer import Tokenizer
from spacy.lang.en import English
from sklearn.feature_extraction.text import TfidfVectorizer

def is_stop_word(token):
    return not token.is_stop and not token.is_punct and not token.like_num


class TokenizerService:
    def __init__(self, nlp = English()):
        self.tokenizer = nlp.tokenizer

    def __call__(self, text):
        return [token.text for token in self.tokenizer(text) if is_stop_word(token)]

class TfIdfGenerator:
    def __init__(self, ngram_range=(1, 1), min_df=0.0001, stop_words='english'):
        self._vectorizer = TfidfVectorizer(ngram_range=ngram_range, min_df=min_df, stop_words=stop_words)
    
    def __call__(self, documents):
        return self._vectorizer.fit_transform(documents)

In [14]:
tokenizer = TokenizerService()
overviews = train_set.data.movie_overview.apply(lambda x: ' '.join(tokenizer(x)))

In [15]:
tfidf_matrix = TfIdfGenerator()(overviews)

In [16]:
tfidf_matrix.shape, overviews.shape

In [17]:
seq_size = tfidf_matrix.shape[1]
seq_size

In [18]:
from torch.utils.data import Dataset

class TfIdfDataset(Dataset):
    def __init__(self, matrix): self.matrix = matrix
    def __len__(self): return self.matrix.shape[0]
    @property
    def shape(self): return self.matrix.shape
    def __getitem__(self, idx): return self._get_row(idx), self._get_row(idx)
    def _get_row(self, idx): 
        return torch.tensor(self.matrix.getrow(idx).toarray(), dtype=torch.float).squeeze(0)

In [19]:
tfIdfDataset = TfIdfDataset(tfidf_matrix)

In [20]:
params = Bunch({
    'lr': 0.001,
    'epochs': 10,
    'n_workers': 24,
    'batch_size': 512,
    'device': get_device()
})

In [21]:
observation = tfIdfDataset[0]
observation, observation[0].shape, observation[1].shape 

In [22]:
train_set = DataLoader(tfIdfDataset, params.batch_size, num_workers=params.n_workers, pin_memory=True)

In [23]:
auto_encoder = ml.AutoEncoder(seq_size, intermediate_size=5000, encoded_size=1000)
auto_encoder.to(get_device())

In [24]:
trainer = ml.AutoEncoderTrainer(auto_encoder)

In [25]:
trainer.fit(
    train_set,
    loss_fn = ml.MSELossFn(reduction='elementwise_mean'),
    epochs  = params.epochs,
    encoder_optimizer = Adam(auto_encoder.encoder.parameters(), lr= params.lr),
    decoder_optimizer = Adam(auto_encoder.decoder.parameters(), lr= params.lr),
    callbacks=[
         Logger(['time', 'epoch', 'train_loss'])
    ]
)

2022-06-04 17:35:02,783 - INFO - {'time': '0:00:17.54', 'epoch': 1, 'train_loss': 0.19872012309955828}
2022-06-04 17:35:19,708 - INFO - {'time': '0:00:16.92', 'epoch': 2, 'train_loss': 0.1367862779082674}
2022-06-04 17:35:36,411 - INFO - {'time': '0:00:16.70', 'epoch': 3, 'train_loss': 0.09901248142123223}
2022-06-04 17:35:53,148 - INFO - {'time': '0:00:16.74', 'epoch': 4, 'train_loss': 0.07553321565642501}
2022-06-04 17:36:09,920 - INFO - {'time': '0:00:16.77', 'epoch': 5, 'train_loss': 0.06046720524177407}
2022-06-04 17:36:26,711 - INFO - {'time': '0:00:16.79', 'epoch': 6, 'train_loss': 0.050481820580634204}
2022-06-04 17:36:43,450 - INFO - {'time': '0:00:16.74', 'epoch': 7, 'train_loss': 0.0436331703638037}
2022-06-04 17:37:00,224 - INFO - {'time': '0:00:16.77', 'epoch': 8, 'train_loss': 0.036804082238990246}
2022-06-04 17:37:16,980 - INFO - {'time': '0:00:16.76', 'epoch': 9, 'train_loss': 0.03256574885524584}
2022-06-04 17:37:33,741 - INFO - {'time': '0:00:16.76', 'epoch': 10, 'tra

In [34]:
original_data, _ = tfIdfDataset[0]
original_data = torch.tensor(original_data, dtype=torch.float)

original_data = torch.tensor(original_data, dtype=torch.float).unsqueeze(0)
original_data.shape

  original_data = torch.tensor(original_data, dtype=torch.float)
  original_data = torch.tensor(original_data, dtype=torch.float).unsqueeze(0)


In [36]:
compresed_data = auto_encoder.encoded_representation(original_data)
compresed_data.shape