Reference: https://github.com/alineberry/my-movie-recommender/blob/master/notebooks/movie_similarity/autoencoder.ipynb

# Models: Movie Tags Sparse Autoencoder

In [1]:
# IPython autoreload in mode 2: imported modules are reloaded before each cell
# executes, so edits to the project code are picked up without a kernel restart.
%load_ext autoreload
%autoreload 2

In [2]:
import sys
sys.path.append('../lib')

import numpy as np
import pandas as pd
from bunch import Bunch

import torch
from torch.utils.data import DataLoader
from torch.optim import Adam

import pytorch_common.util as pu
from pytorch_common.modules.fn import Fn
from pytorch_common.callbacks import SaveBestModel
from pytorch_common.callbacks.output import Logger

from pytorch_common.util import set_device_name, \
                                get_device, \
                                LoggerBuilder

import model as ml
import data as dt
import data.dataset as ds

import data.plot as pl
import data as dtjo

import logging
import random

import recommender as rc

## Setup

In [3]:
# Build the pytorch_common logger with console output enabled
# (presumably the sink that the training `Logger` callback writes to -- TODO confirm).
pu.LoggerBuilder().on_console().build()

In [4]:
# Request the 'gpu' device from pytorch_common; later `get_device()` calls
# presumably resolve against this name -- TODO confirm fallback when CUDA is absent.
pu.set_device_name('gpu')

In [5]:
# Display the device object pytorch_common currently resolves to.
pu.get_device()

In [6]:
# Device handles used by later cells: `cpu` is passed to the dataset factory and
# used when encoding; `gpu` is not referenced again in the visible cells
# (they call `get_device()` directly).
cpu = torch.device("cpu")
gpu = pu.get_device()

In [7]:
# Sanity check: does this PyTorch build/environment have a usable CUDA device?
torch.cuda.is_available()

In [8]:
# Record the PyTorch version used for this run (provenance).
torch.__version__

In [9]:
def set_seed(value):
    """Seed Python's `random`, NumPy's global RNG and PyTorch with the same `value`.

    Args:
        value: Seed handed unchanged to each of the three generators.
    """
    # Same order as the three libraries are seeded one after another.
    for seed_generator in (random.seed, np.random.seed, torch.manual_seed):
        seed_generator(value)

In [10]:
# Fix the seed of every RNG handled by set_seed so repeated runs start from the same state.
set_seed(42)

In [11]:
# Movie text field this notebook works on; it is interpolated into column names
# (`movie_{FIELD}`, `{FIELD}_tokens`, `{FIELD}_embedding`) and into the paths below.
FIELD = 'tags'
# Destination of the trained auto-encoder state_dict (written with torch.save).
WEIGHTS_PATH   = f'../weights/{FIELD}-tf-idf-sparse-auto-encoder.pt'
# JSON file the movie table with embeddings is written to and later read back from.
EMBEDDING_PATH = f'../datasets/movie_{FIELD}_embedding.json'

## Carga de dataset

In [12]:
def to_tensor(obs, device, columns):
    """Select `columns` from `obs` and return the selection as a tensor on `device`.

    Args:
        obs: Any object indexable with `columns` (e.g. a pandas DataFrame).
        device: torch device the resulting tensor is moved to.
        columns: Column label, or list of labels, used to index `obs`.

    Returns:
        torch.Tensor built from the selected data and moved to `device`.
    """
    data = obs[columns]
    # isinstance instead of an exact type comparison: a DataFrame *subclass* must
    # also be unwrapped to its ndarray, otherwise torch.tensor receives the frame itself.
    if isinstance(data, pd.DataFrame):
        data = data.values
    return torch.tensor(data).to(device)

def transform_fn(obs, device):
    """Convert `obs` into a tensor holding only its `movie_{FIELD}` column, on `device`."""
    return to_tensor(obs, device, [f'movie_{FIELD}'])


# The same transform is used for inputs and targets.
dataset = ds.MovieLensTMDBDatasetFactory.from_path(
    transform=transform_fn,
    target_transform=transform_fn,
    device=cpu,
    # Keep only ratings whose year lies in [2005, 2019] (both bounds inclusive).
    filter_fn=lambda df: df[df['user_movie_rating_year'].between(2005, 2019)],
)
dataset.info

<class 'pandas.core.frame.DataFrame'>
Int64Index: 191540 entries, 0 to 191539
Data columns (total 15 columns):
 #   Column                       Non-Null Count   Dtype         
---  ------                       --------------   -----         
 0   user_id                      191540 non-null  int64         
 1   user_seq                     191540 non-null  int64         
 2   user_movie_tags              191540 non-null  object        
 3   user_movie_rating            191540 non-null  int64         
 4   user_movie_rating_timestamp  191540 non-null  datetime64[ns]
 5   user_movie_rating_year       191540 non-null  int64         
 6   movie_id                     191540 non-null  int64         
 7   movie_seq                    191540 non-null  int64         
 8   movie_title                  191540 non-null  string        
 9   movie_genres                 191540 non-null  object        
 10  movie_for_adults             191540 non-null  bool          
 11  movie_original_language   

Select each movie's tags (one row per movie) and add a new curated tokens column:

In [13]:
columns = ['movie_id', 'movie_release_year', 'movie_title', f'movie_{FIELD}']

# One row per movie with shortened column names plus a `{FIELD}_tokens` column.
# The `dt` helpers are project code; judging by the resulting schema, the release
# year is folded into the tokens column by `concat_columns` -- TODO confirm.
movie_data = (
    dataset.data
    .pipe(dt.select, columns)
    .pipe(dt.distinct, ['movie_id'])
    .pipe(
        dt.rename,
        {
            'movie_id': 'id',
            'movie_title': 'title',
            f'movie_{FIELD}': FIELD,
        },
    )
    .pipe(dt.tokenize, FIELD)
    .pipe(dt.concat_columns, f'{FIELD}_tokens', 'movie_release_year')
    .pipe(dt.reset_index)
)

movie_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18515 entries, 0 to 18514
Data columns (total 4 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   id           18515 non-null  int64 
 1   title        18515 non-null  string
 2   tags         18515 non-null  object
 3   tags_tokens  18515 non-null  object
dtypes: int64(1), object(2), string(1)
memory usage: 578.7+ KB


In [14]:
# TF-IDF representation of the `{FIELD}_tokens` column, built by the project helper.
# Later cells call `.shape` and `.toarray()` on the result.
tfidf = dt.tf_idf(movie_data, f'{FIELD}_tokens')

tfidf.shape

## Definición del modelo

In [15]:
def train(auto_encoder, tfidf, params):
    """Fit `auto_encoder` on the TF-IDF data using the project's AutoEncoderTrainer.

    Args:
        auto_encoder: Model exposing `encoder` and `decoder` sub-modules; each gets
            its own Adam optimizer.
        tfidf: TF-IDF data, wrapped in `ds.TfIdfDataset` before batching.
        params: Object providing `batch_size`, `n_workers`, `epochs` and `lr`.

    Returns:
        None. Progress (time, epoch, train_loss) is reported through the Logger callback.
    """
    loader = DataLoader(
        ds.TfIdfDataset(tfidf),
        batch_size=params.batch_size,
        shuffle=True,
        num_workers=params.n_workers,
        pin_memory=True,
    )

    trainer = ml.AutoEncoderTrainer(auto_encoder)

    # NOTE(review): 'elementwise_mean' is the deprecated spelling of 'mean' in torch's
    # own losses; left untouched because ml.MSELossFn is project code -- confirm before changing.
    loss_fn = ml.MSELossFn(reduction='elementwise_mean')

    # Encoder and decoder are optimized separately, both with the same learning rate.
    encoder_optimizer = Adam(auto_encoder.encoder.parameters(), lr=params.lr)
    decoder_optimizer = Adam(auto_encoder.decoder.parameters(), lr=params.lr)

    trainer.fit(
        loader,
        loss_fn=loss_fn,
        epochs=params.epochs,
        encoder_optimizer=encoder_optimizer,
        decoder_optimizer=decoder_optimizer,
        callbacks=[Logger(['time', 'epoch', 'train_loss'])],
    )

## Entrenamiento

In [16]:
# Hyper-parameters for this run. `sequence_size` follows the TF-IDF column count,
# so it has to be rebuilt whenever `tfidf` changes.
params = Bunch(
    lr=0.01,
    epochs=10,
    n_workers=24,
    batch_size=128,
    sequence_size=tfidf.shape[1],
    intermediate_size=5000,
    encoded_size=1000,
    experiment_name=f'{FIELD}-tf-idf-sparse-auto-encoder',
    device=get_device(),
)

In [17]:
# Build the auto-encoder from the three sizes held in `params`, then move it to
# the device selected through pytorch_common.
auto_encoder = ml.AutoEncoder(
    params.sequence_size,
    params.intermediate_size,
    params.encoded_size,
)
auto_encoder = auto_encoder.to(get_device())
print(auto_encoder)

In [18]:
# Run the training loop; one log line per epoch is emitted by the Logger callback.
train(auto_encoder, tfidf, params)

2022-07-24 11:00:32,043 - INFO - {'time': '0:00:06.77', 'epoch': 1, 'train_loss': 0.1287818957505555}
2022-07-24 11:00:37,685 - INFO - {'time': '0:00:05.64', 'epoch': 2, 'train_loss': 0.044529518831906645}
2022-07-24 11:00:43,837 - INFO - {'time': '0:00:06.15', 'epoch': 3, 'train_loss': 0.024255451245297644}
2022-07-24 11:00:49,543 - INFO - {'time': '0:00:05.70', 'epoch': 4, 'train_loss': 0.01116216016997551}
2022-07-24 11:00:55,275 - INFO - {'time': '0:00:05.73', 'epoch': 5, 'train_loss': 0.005763329376049083}
2022-07-24 11:01:00,972 - INFO - {'time': '0:00:05.70', 'epoch': 6, 'train_loss': 0.0036514652603916054}
2022-07-24 11:01:06,698 - INFO - {'time': '0:00:05.72', 'epoch': 7, 'train_loss': 0.00255082115884228}
2022-07-24 11:01:12,413 - INFO - {'time': '0:00:05.71', 'epoch': 8, 'train_loss': 0.0019261738646711255}
2022-07-24 11:01:18,302 - INFO - {'time': '0:00:05.89', 'epoch': 9, 'train_loss': 0.0015210683481639315}
2022-07-24 11:01:24,052 - INFO - {'time': '0:00:05.75', 'epoch': 

In [19]:
# Persist only the learned parameters (state_dict), not the whole module object.
# NOTE(review): the target directory of WEIGHTS_PATH must already exist.
torch.save(auto_encoder.state_dict(), WEIGHTS_PATH)

## Generación de embeddings

In [20]:
# Inference only: put the model in eval mode (so layers such as dropout or batch-norm,
# if the model has any, behave deterministically) and disable autograd so that no
# computation graph is recorded while encoding the whole dense TF-IDF matrix.
# Both steps are harmless if `encode_from_batch` already does them internally.
auto_encoder.eval()
with torch.no_grad():
    embedding = auto_encoder.to(cpu).encode_from_batch(torch.tensor(tfidf.toarray()))
embedding.shape

In [21]:
# Attach the embedding vectors and remove the intermediate tokens column.
# NOTE(review): `movie_data` is rebound here, so the `{FIELD}_tokens` column no longer
# exists afterwards; re-running this cell alone will presumably fail in `dt.drop`.
movie_data = (
    movie_data
    .pipe(dt.append_emb_vectors, embedding, FIELD)
    .pipe(dt.drop, [f'{FIELD}_tokens'])
)

movie_data.to_json(EMBEDDING_PATH)
movie_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18515 entries, 0 to 18514
Data columns (total 4 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   id              18515 non-null  int64 
 1   title           18515 non-null  string
 2   tags            18515 non-null  object
 3   tags_embedding  18515 non-null  object
dtypes: int64(1), object(2), string(1)
memory usage: 578.7+ KB


## Evaluación

In [22]:
# Reload the exported table from disk so the evaluation uses exactly what was
# written to EMBEDDING_PATH, not the in-memory frame.
df = pd.read_json(EMBEDDING_PATH)

In [23]:
# Build the project's distance-matrix recommender on the `{FIELD}_embedding` column.
# Construction computes a distance matrix over all rows (see the progress bar below).
recommender = rc.DistanceMatrixRecommender(
    df,
    column = f'{FIELD}_embedding', 
    device = get_device()
)

Building Distances Matrix:   0%|          | 0/18515 [00:00<?, ?it/s]

In [27]:
# Qualitative check: the 10 nearest movies to the item at index 0
# (Toy Story in the output below; the query item itself comes back at distance 0).
result = recommender.recommend(item_index=0, k=10)

result.show()


Recommender: tags
Item


Unnamed: 0,id,title
0,1,Toy Story


Recommendations


Unnamed: 0,index,distance,id,title,tags
0,0,0.0,1,Toy Story,2009 reissue in stereoscopic 3-d 3d 55 movies ...
1,2158,0.01092,94780,Snow White and the Huntsman,adapted from:book adventure animal:bird animal...
2,1477,0.011761,4027,"O Brother, Where Art Thou?",1930's 1930s adaptation adapted from:book adve...
3,2656,0.011954,1031,Bedknobs and Broomsticks,1970s adapted from:book angela lansbury animal...
4,185,0.012307,1270,Back to the Future,1950's 1950s 1955 1980's 1980s 1985 55 movies ...
5,1411,0.014232,3396,"Muppet Movie, The",55 movies every kid should see--entertainment ...
6,4708,0.015312,81456,Heartbeats,canadian cinematography french gay gay directo...
7,645,0.018969,1235,Harold and Maude,afi 100 (cheers) afi 100 (laughs) age differen...
8,2706,0.019076,6942,Love Actually,adultery airport alan rickman allen rickman au...
9,1410,0.019198,3363,American Graffiti,1950's 1960s 20th century afi #77 afi 100 afi ...
