# Models: Movie Genres Sparse Autoencoder

[Reference](https://github.com/alineberry/my-movie-recommender/blob/master/notebooks/movie_similarity/autoencoder.ipynb)

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
# Base directory, expressed relative to where the notebook runs.
BASE_PATH = '../../..'

# Project library folder (appended to sys.path before the local imports)
# and the folder the dataset is read from.
LIB_PATH = f'{BASE_PATH}/lib'
DATASET_PATH = f'{BASE_PATH}/datasets'

# Movie attribute being embedded and the identifier of the model used for it.
FIELD = 'genres'
MODEL = 'tf-idf-sparse-auto-encoder'

# Destination of the trained auto-encoder weights.
WEIGHTS_PATH = f'{BASE_PATH}/weights/{FIELD}-{MODEL}.pt'

In [3]:
import logging
import os
import sys

sys.path.append(LIB_PATH)

import numpy as np
import pandas as pd
from bunch import Bunch

import torch
from torch.utils.data import DataLoader
from torch.optim import Adam

import pytorch_common.util as pu
from pytorch_common.modules.fn import Fn
from pytorch_common.callbacks import SaveBestModel
from pytorch_common.callbacks.output import Logger

from pytorch_common.util import set_device_name, \
                                get_device, \
                                LoggerBuilder

import model as ml
import data as dt
import data.dataset as ds

import util as ut

from recommender import ItemRecommenderBuilder, item_rec_sys_cfg

2023-05-05 18:33:40.043848: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
Using embedded DuckDB without persistence: data will be transient


<Figure size 640x480 with 0 Axes>

## Setup

In [4]:
# Configure console logging; the cell output shows the resulting root logger
# at INFO level.
pu.LoggerBuilder().on_console().build()

<RootLogger root (INFO)>

In [5]:
# Ask pytorch_common for the 'gpu' device, then keep explicit handles to both
# devices: `cpu` is used later when generating embeddings.
pu.set_device_name('gpu')

cpu = torch.device("cpu")
gpu = pu.get_device()

# Displayed for the record: active device, CUDA availability, torch version
# and the CUDA architectures this torch build supports.
(
    pu.get_device(),
    torch.cuda.is_available(),
    torch.__version__,
    torch.cuda.get_arch_list(),
)

(device(type='cuda', index=0),
 True,
 '1.11.0',
 ['sm_37',
  'sm_50',
  'sm_60',
  'sm_61',
  'sm_70',
  'sm_75',
  'sm_80',
  'sm_86',
  'compute_37'])

In [6]:
# Fix the random seed before the model is created and trained.
# NOTE(review): `set_seed` lives in the project's `util` module; confirm which
# libraries (random / numpy / torch) it actually seeds.
ut.set_seed(42)

In [7]:
# Recommender configuration for this field/model pair. The logged "Cfg" output
# shows the resolved JSON file path, the embedding column name and the
# metadata columns.
REC_SYS_CFG = item_rec_sys_cfg(DATASET_PATH, FIELD, MODEL)

2023-05-05 18:33:41,072 - INFO - Cfg:

embedding_col: genres_embedding
file_path: ../../../datasets/genres-tf-idf-sparse-auto-encoder.json
metadata_cols:
- genres
- release_year
- imdb_id
- genres_tokens
- title
name: genres-tf-idf-sparse-auto-encoder



## Carga del dataset

In [8]:
# Raw data frame read from DATASET_PATH.
dataset = ds.MovieLensTMDBDataLoader.df_from_path(DATASET_PATH)

# Movie-level columns needed downstream; the last one depends on FIELD.
columns = [
    'movie_id',
    'movie_release_year',
    'movie_imdb_id',
    'movie_title',
    f'movie_{FIELD}',
]

# Keep the selected columns, de-duplicate on movie_id, drop the 'movie_'
# prefix from the column names, then join and tokenize FIELD.
movie_data = (
    dataset
    .pipe(dt.select, columns)
    .pipe(dt.distinct, ['movie_id'])
    .pipe(dt.rename, {
        'movie_id': 'id',
        'movie_title': 'title',
        'movie_imdb_id': 'imdb_id',
        'movie_release_year': 'release_year',
        f'movie_{FIELD}': FIELD,
    })
    .pipe(dt.join_str_list, FIELD)
    .pipe(dt.tokenize, FIELD)
    .pipe(dt.reset_index)
)

In [9]:
# TF-IDF representation of the tokenized FIELD column; the displayed shape is
# (rows, vocabulary size).
tfidf = dt.tf_idf(movie_data, f'{FIELD}_tokens')

tfidf.shape

(17611, 23)

## Definición del modelo

In [10]:
def train(auto_encoder, tfidf, params):
    """Fit ``auto_encoder`` on the TF-IDF data with ``ml.AutoEncoderTrainer``.

    Args:
        auto_encoder: model exposing ``encoder`` and ``decoder`` sub-modules;
            each one gets its own Adam optimizer.
        tfidf: TF-IDF data, wrapped in ``ds.TfIdfDataset`` for batching.
        params: object providing ``batch_size``, ``n_workers``, ``epochs``
            and ``lr`` attributes.

    Returns:
        None. Nothing is returned; the caller keeps using the same
        ``auto_encoder`` object after training.
    """
    batches = DataLoader(
        dataset=ds.TfIdfDataset(tfidf),
        batch_size=params.batch_size,
        num_workers=params.n_workers,
        pin_memory=True,
        shuffle=True,
    )

    trainer = ml.AutoEncoderTrainer(auto_encoder)

    # NOTE(review): 'elementwise_mean' is the legacy spelling of 'mean' in
    # torch's built-in losses; confirm what ml.MSELossFn accepts before
    # modernizing it.
    mse_loss = ml.MSELossFn(reduction='elementwise_mean')
    n_epochs = params.epochs

    # Encoder and decoder are optimized separately with the same learning rate.
    enc_optimizer = Adam(auto_encoder.encoder.parameters(), lr=params.lr)
    dec_optimizer = Adam(auto_encoder.decoder.parameters(), lr=params.lr)

    # Logs time, epoch number and training loss.
    progress_logger = Logger(['time', 'epoch', 'train_loss'])

    trainer.fit(
        batches,
        loss_fn=mse_loss,
        epochs=n_epochs,
        encoder_optimizer=enc_optimizer,
        decoder_optimizer=dec_optimizer,
        callbacks=[progress_logger],
    )

## Entrenamiento

In [11]:
# Hyper-parameters for the run, exposed as attributes through Bunch.
params = Bunch(dict(
    # Optimisation settings read by train().
    lr=0.01,
    epochs=5,
    n_workers=24,
    batch_size=32,
    # Layer sizes: input width comes from the TF-IDF matrix, encoded_size is
    # the dimensionality of the resulting embedding.
    sequence_size=tfidf.shape[1],
    intermediate_size=500,
    encoded_size=10,
    # Bookkeeping.
    experiment_name=f'{FIELD}-tf-idf-sparse-auto-encoder',
    device=get_device(),
))

In [12]:
# Build the auto-encoder from the configured layer sizes and move it to the
# active device before training.
auto_encoder = ml.AutoEncoder(
    params.sequence_size,
    params.intermediate_size,
    params.encoded_size,
)
auto_encoder = auto_encoder.to(get_device())

print(auto_encoder)

In [13]:
# Run training; one log line per epoch reports elapsed time and train_loss.
train(auto_encoder, tfidf, params)

2023-05-05 18:33:51,776 - INFO - {'time': '0:00:01.84', 'epoch': 1, 'train_loss': 0.05191616194251715}
2023-05-05 18:33:53,181 - INFO - {'time': '0:00:01.40', 'epoch': 2, 'train_loss': 0.014652363704128729}
2023-05-05 18:33:54,617 - INFO - {'time': '0:00:01.44', 'epoch': 3, 'train_loss': 0.011727860596321672}
2023-05-05 18:33:56,034 - INFO - {'time': '0:00:01.42', 'epoch': 4, 'train_loss': 0.010482297914471146}
2023-05-05 18:33:57,534 - INFO - {'time': '0:00:01.50', 'epoch': 5, 'train_loss': 0.009861044327943111}


In [14]:
# Persist the trained weights. The target folder is created first so the save
# does not fail on a fresh checkout where `weights/` does not exist yet.
os.makedirs(os.path.dirname(WEIGHTS_PATH), exist_ok=True)
torch.save(auto_encoder.state_dict(), WEIGHTS_PATH)

## Generación de embeddings

In [15]:
# Inference only: move the model to CPU, switch it to eval mode and disable
# autograd so encoding the whole TF-IDF matrix builds no gradient graph.
auto_encoder.to(cpu).eval()
with torch.no_grad():
    embedding = auto_encoder.encode_from_batch(torch.tensor(tfidf.toarray()))
embedding.shape

torch.Size([17611, 10])

In [16]:
# Attach the embedding vectors to the movie table, then write it to the JSON
# file referenced by the recommender config.
# NOTE(review): this reassigns `movie_data`, so re-running the cell on its own
# applies append_emb_vectors to an already-augmented frame.
movie_data = dt.append_emb_vectors(movie_data, embedding, FIELD)

movie_data.to_json(REC_SYS_CFG.file_path)

In [17]:
# Preview the first rows, now including the embedding column.
movie_data.head()

Unnamed: 0,id,release_year,imdb_id,title,genres,genres_tokens,genres_embedding
0,1,1995,114709,Toy Story,Adventure Animation Children Comedy Fantasy,adventure animation children comedy fantasy,"[0.6501747, 0.48187065, 0.20276023, 0.0, 0.843..."
1,2355,1998,120623,"Bug's Life, A",Adventure Animation Children Comedy,adventure animation children comedy,"[0.81594044, 0.0, 0.6351735, 0.0, 1.0622308, 2..."
2,3114,1999,120363,Toy Story 2,Adventure Animation Children Comedy Fantasy,adventure animation children comedy fantasy,"[0.6501747, 0.48187065, 0.20276023, 0.0, 0.843..."
3,4306,2001,126029,Shrek,Adventure Animation Children Comedy Fantasy Ro...,adventure animation children comedy fantasy ro...,"[0.5222425, 1.1921929, 0.34473163, 0.0, 0.8808..."
4,4886,2001,198781,"Monsters, Inc.",Adventure Animation Children Comedy Fantasy,adventure animation children comedy fantasy,"[0.6501747, 0.48187065, 0.20276023, 0.0, 0.843..."


## Evaluación

In [18]:
# Build the item recommender(s) described by the config list; per the log,
# construction also loads a pretrained SentenceTransformer model.
builder = ItemRecommenderBuilder(DATASET_PATH, [REC_SYS_CFG])

2023-05-05 18:33:58,062 - INFO - Load pretrained SentenceTransformer: all-MiniLM-L6-v2
2023-05-05 18:33:58,229 - INFO - Use pytorch device: cuda


In [19]:
# Show the 10 items most similar to item 1 (Toy Story in the preview above)
# according to the genre embeddings.
(
    builder
    .item_recommender(REC_SYS_CFG.name, n_sim_items=10)
    .recommend(item_id=1)
    .show()
)

2023-05-05 18:34:04,527 - INFO - Found 1 items by ids: [1].
2023-05-05 18:34:04,528 - INFO - Found 10 similar to 1 item.



Item Recommender: genres-tf-idf-sparse-auto-encoder



Unnamed: 0,Similarity,Rating,.,Recommended Movies,..,Already seen movies,Rating.1
0,1.0,3.8,We Recommend ==>,,==> Because You Saw ==>,,3.7
1,1.0,3.4,We Recommend ==>,,==> Because You Saw ==>,,3.7
2,1.0,2.7,We Recommend ==>,,==> Because You Saw ==>,,3.7
3,1.0,3.4,We Recommend ==>,,==> Because You Saw ==>,,3.7
4,1.0,3.3,We Recommend ==>,,==> Because You Saw ==>,,3.7
