Reference: https://github.com/alineberry/my-movie-recommender/blob/master/notebooks/movie_similarity/autoencoder.ipynb

# Models: Movie Overview Sparse Autoencoder

In [1]:
# Reload imported modules before each cell runs, so edits to the local
# library code are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import sys
sys.path.append('../lib')

import numpy as np
import pandas as pd
from bunch import Bunch

import torch
from torch.utils.data import DataLoader
from torch.optim import Adam

import pytorch_common.util as pu
from pytorch_common.modules.fn import Fn
from pytorch_common.callbacks import SaveBestModel
from pytorch_common.callbacks.output import Logger

from pytorch_common.util import set_device_name, \
                                get_device, \
                                LoggerBuilder

import model as ml
import data as dt
import data.dataset as ds

import data.plot as pl
import data as dtjo

import logging
import random

import recommender as rc

## Setup

In [3]:
# Configure logging through pytorch_common's LoggerBuilder with console output.
pu.LoggerBuilder().on_console().build()

In [4]:
# Tell pytorch_common to use the device named 'gpu'.
# NOTE(review): behavior when no GPU is present is not visible here — confirm.
pu.set_device_name('gpu')

In [5]:
# Display the device object pytorch_common currently returns.
pu.get_device()

In [6]:
# Explicit device handles. `cpu` is used below for dataset loading and for
# embedding generation; `gpu` is whatever pytorch_common resolves.
cpu = torch.device("cpu")
gpu = pu.get_device()

In [7]:
# Check whether PyTorch can see a CUDA device.
torch.cuda.is_available()

In [8]:
# Show the PyTorch version used for this run.
torch.__version__

In [9]:
def set_seed(value):
    """Seed the Python, NumPy and PyTorch random number generators.

    Args:
        value: Integer seed passed unchanged to each generator.
    """
    # Same order as before: stdlib `random`, then NumPy, then PyTorch.
    for seed_rng in (random.seed, np.random.seed, torch.manual_seed):
        seed_rng(value)

In [10]:
# Seed the Python, NumPy and PyTorch RNGs with a fixed value.
set_seed(42)

In [11]:
# Movie text field modeled by this notebook; it parameterizes the column
# names and the output paths below.
FIELD = 'overview'
# Where the trained auto-encoder state dict is saved.
WEIGHTS_PATH   = f'../weights/{FIELD}-tf-idf-sparse-auto-encoder.pt'
# Where the movie table with its embedding column is exported as JSON.
EMBEDDING_PATH = f'../datasets/movie_{FIELD}_embedding.json'

## Carga de dataset

In [12]:
def to_tensor(obs, device, columns):
    """Convert the selected columns of ``obs`` into a tensor on ``device``.

    Args:
        obs: Object indexable by ``columns`` (e.g. a pandas DataFrame).
        device: Target device for the resulting tensor.
        columns: Column label(s) used to index ``obs``.

    Returns:
        A ``torch.Tensor`` built from the selected data and moved to ``device``.
    """
    data = obs[columns]
    # isinstance (instead of an exact type comparison) so that DataFrame
    # subclasses are unwrapped to their underlying array as well.
    if isinstance(data, pd.DataFrame):
        data = data.values
    return torch.tensor(data).to(device)

def transform_fn(obs, device):
    """Turn the ``movie_<FIELD>`` column of an observation into a tensor."""
    return to_tensor(obs, device, [f'movie_{FIELD}'])


# The same transform is used for inputs and targets. Only ratings made
# between 2005 and 2019 (both inclusive) are kept.
dataset = ds.MovieLensTMDBDatasetFactory.from_path(
    transform=transform_fn,
    target_transform=transform_fn,
    device=cpu,
    filter_fn=lambda ratings: ratings[ratings['user_movie_rating_year'].between(2005, 2019)],
)
dataset.info

<class 'pandas.core.frame.DataFrame'>
Int64Index: 191540 entries, 0 to 191539
Data columns (total 16 columns):
 #   Column                       Non-Null Count   Dtype         
---  ------                       --------------   -----         
 0   user_id                      191540 non-null  int64         
 1   user_seq                     191540 non-null  int64         
 2   user_movie_tags              191540 non-null  object        
 3   user_movie_rating            191540 non-null  int64         
 4   user_movie_rating_timestamp  191540 non-null  datetime64[ns]
 5   user_movie_rating_year       191540 non-null  int64         
 6   movie_id                     191540 non-null  int64         
 7   movie_seq                    191540 non-null  int64         
 8   movie_title                  191540 non-null  string        
 9   movie_genres                 191540 non-null  object        
 10  movie_for_adults             191540 non-null  bool          
 11  movie_original_language   

Select each movie's overview and add a new column with its curated tokens:

In [13]:
# Columns describing a movie plus the text field being modeled.
columns = ['movie_id', 'movie_release_year', 'movie_imdb_id', 'movie_title', f'movie_{FIELD}']

# Select the movie columns, de-duplicate on movie_id, shorten the column
# names and tokenize the text field (adds `<FIELD>_tokens`, per info() below).
movie_data = (
    dataset.data
    .pipe(dt.select, columns)
    .pipe(dt.distinct, ['movie_id'])
    .pipe(
        dt.rename,
        {
            'movie_id': 'id',
            'movie_title': 'title',
            'movie_imdb_id': 'imdb_id',
            'movie_release_year': 'release_year',
            f'movie_{FIELD}': FIELD,
        },
    )
    .pipe(dt.tokenize, FIELD)
    .pipe(dt.reset_index)
)

movie_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18515 entries, 0 to 18514
Data columns (total 6 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   id               18515 non-null  int64 
 1   release_year     18515 non-null  int64 
 2   imdb_id          18515 non-null  int64 
 3   title            18515 non-null  string
 4   overview         18515 non-null  string
 5   overview_tokens  18515 non-null  object
dtypes: int64(3), object(1), string(2)
memory usage: 868.0+ KB


In [14]:
# Preview the movie table (pandas truncates the rendered rows).
movie_data

Unnamed: 0,id,release_year,imdb_id,title,overview,overview_tokens
0,1,1995,114709,Toy Story,"Led by Woody, Andy's toys live happily in his ...",led woody andy toys live happily room andy bir...
1,2355,1998,120623,"Bug's Life, A","On behalf of ""oppressed bugs everywhere,"" an i...",behalf oppressed bugs inventive ant named flik...
2,3114,1999,120363,Toy Story 2,"Andy heads off to Cowboy Camp, leaving his toy...",andy heads cowboy camp leaving toys devices th...
3,4306,2001,126029,Shrek,It ain't easy bein' green -- especially if you...,ai easy bein green especially likable albeit s...
4,4886,2001,198781,"Monsters, Inc.","James Sullivan and Mike Wazowski are monsters,...",james sullivan mike wazowski monsters earn liv...
...,...,...,...,...,...,...
18510,173173,2017,6772874,This Is Not What I Expected,"Lu Jin is a handsome, wealthy hotel executive ...",lu jin handsome wealthy hotel executive drive ...
18511,174399,2012,2214941,Daddy's Little Girl,After the police find Derek’s daughter brutall...,police find derek daughter brutally murdered b...
18512,174443,2016,4303202,American Wrestler: The Wizard,"In 1980, a teenage boy escapes the unrest in I...",teenage boy escapes unrest iran face hostility...
18513,174505,2016,3750238,Besetment,"After struggling to find employment, Amanda ta...",struggling find employment amanda takes hotel ...


In [15]:
# TF-IDF representation of the tokenized field.
# NOTE(review): `.toarray()` is called on this later, so it is presumably a
# SciPy sparse matrix with one row per movie — confirm in dt.tf_idf.
tfidf = movie_data.pipe(dt.tf_idf, f'{FIELD}_tokens')

tfidf.shape

## Definición del modelo

In [16]:
def train(auto_encoder, tfidf, params):
    """Fit ``auto_encoder`` on the TF-IDF matrix.

    Args:
        auto_encoder: Model exposing ``encoder`` and ``decoder`` sub-modules.
        tfidf: TF-IDF matrix, wrapped in ``ds.TfIdfDataset`` for loading.
        params: Settings object providing ``batch_size``, ``n_workers``,
            ``epochs`` and ``lr``.

    Returns:
        None. Progress is reported through the ``Logger`` callback.
    """
    loader = DataLoader(
        ds.TfIdfDataset(tfidf),
        batch_size=params.batch_size,
        shuffle=True,
        num_workers=params.n_workers,
        pin_memory=True,
    )
    trainer = ml.AutoEncoderTrainer(auto_encoder)

    # NOTE(review): if ml.MSELossFn forwards `reduction` to torch.nn.MSELoss,
    # 'elementwise_mean' is the deprecated spelling of 'mean' — confirm.
    # Encoder and decoder get separate Adam optimizers with the same learning rate.
    fit_options = dict(
        loss_fn=ml.MSELossFn(reduction='elementwise_mean'),
        epochs=params.epochs,
        encoder_optimizer=Adam(auto_encoder.encoder.parameters(), lr=params.lr),
        decoder_optimizer=Adam(auto_encoder.decoder.parameters(), lr=params.lr),
        callbacks=[Logger(['time', 'epoch', 'train_loss'])],
    )
    trainer.fit(loader, **fit_options)

## Entrenamiento

In [17]:
# Hyper-parameters and run settings, read as attributes (e.g. params.lr).
# `sequence_size` is the number of columns of the TF-IDF matrix.
params = Bunch(
    lr=0.01,
    epochs=20,
    n_workers=24,
    batch_size=128,
    sequence_size=tfidf.shape[1],
    intermediate_size=5000,
    encoded_size=1000,
    experiment_name=f'{FIELD}-tf-idf-sparse-auto-encoder',
    device=get_device(),
)

In [18]:
# Build the auto-encoder from the sequence, intermediate and encoded sizes in
# `params`, move it to the device returned by get_device(), and print its layout.
auto_encoder = ml.AutoEncoder(
    params.sequence_size, 
    params.intermediate_size, 
    params.encoded_size
).to(get_device())
print(auto_encoder)

In [19]:
# Run training; time, epoch and train_loss are logged for each epoch.
train(auto_encoder, tfidf, params)

2022-07-29 00:36:02,245 - INFO - {'time': '0:00:08.42', 'epoch': 1, 'train_loss': 0.1365766378569192}
2022-07-29 00:36:09,539 - INFO - {'time': '0:00:07.29', 'epoch': 2, 'train_loss': 0.04205020523533739}
2022-07-29 00:36:16,916 - INFO - {'time': '0:00:07.38', 'epoch': 3, 'train_loss': 0.02329098038889211}
2022-07-29 00:36:24,575 - INFO - {'time': '0:00:07.66', 'epoch': 4, 'train_loss': 0.011851948162476564}
2022-07-29 00:36:31,988 - INFO - {'time': '0:00:07.41', 'epoch': 5, 'train_loss': 0.005782420059730267}
2022-07-29 00:36:39,281 - INFO - {'time': '0:00:07.29', 'epoch': 6, 'train_loss': 0.0034741654510385004}
2022-07-29 00:36:46,602 - INFO - {'time': '0:00:07.32', 'epoch': 7, 'train_loss': 0.002455443039472247}
2022-07-29 00:36:53,950 - INFO - {'time': '0:00:07.35', 'epoch': 8, 'train_loss': 0.0018479966670917026}
2022-07-29 00:37:01,301 - INFO - {'time': '0:00:07.35', 'epoch': 9, 'train_loss': 0.0014516139322699143}
2022-07-29 00:37:08,658 - INFO - {'time': '0:00:07.36', 'epoch': 

In [20]:
# Persist the model's state dict (not the whole module object) to WEIGHTS_PATH.
torch.save(auto_encoder.state_dict(), WEIGHTS_PATH)

## Generación de embeddings

In [21]:
# Encode the whole TF-IDF matrix on the CPU. This is inference only, so
# autograd is disabled to avoid tracking gradients for the encoded output.
# NOTE(review): `.to(cpu)` moves `auto_encoder` itself to the CPU.
with torch.no_grad():
    embedding = auto_encoder.to(cpu).encode_from_batch(torch.tensor(tfidf.toarray()))
embedding.shape

In [22]:
# Attach the embedding vectors to the movie table (the info() output shows a
# new `<FIELD>_embedding` column) and export the result as JSON.
# NOTE(review): `movie_data` is reassigned from itself, so re-running this cell
# applies append_emb_vectors to the already-augmented frame.
movie_data = movie_data \
    .pipe(dt.append_emb_vectors, embedding, FIELD)

movie_data.to_json(EMBEDDING_PATH)
movie_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18515 entries, 0 to 18514
Data columns (total 7 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   id                  18515 non-null  int64 
 1   release_year        18515 non-null  int64 
 2   imdb_id             18515 non-null  int64 
 3   title               18515 non-null  string
 4   overview            18515 non-null  string
 5   overview_tokens     18515 non-null  object
 6   overview_embedding  18515 non-null  object
dtypes: int64(3), object(2), string(2)
memory usage: 1012.7+ KB


## Evaluación

In [23]:
# Reload the exported table from disk, so the evaluation uses what was written.
df = pd.read_json(EMBEDDING_PATH)

In [24]:
# Build a recommender over the `<FIELD>_embedding` column.
# NOTE(review): the progress bar ("Building Distances Matrix") suggests a
# distance matrix over all movies is computed here — confirm.
recommender = rc.DistanceMatrixRecommender(
    df,
    column  = f'{FIELD}_embedding', 
    device  = get_device()
)

Building Distances Matrix:   0%|          | 0/18515 [00:00<?, ?it/s]

In [25]:
# Recommend items for row index 0 ("Toy Story" in the output below) and render them.
result = recommender.recommend(item_index=0)
result.show()


Recommender: overview
Item


Unnamed: 0,id,title,imdb_id,image
0,1,Toy Story,114709,


Recommendations


Unnamed: 0,index,distance,id,title,imdb_id,image
0,0,0.0,1,Toy Story,114709,
1,17966,0.005908,6176,Mr. Majestyk,71866,
2,387,0.006211,106916,American Hustle,1800241,
3,2911,0.007153,7834,After the Thin Man,27260,
4,2076,0.007541,80451,Season of the Witch,69239,
