# AutoEncoder Meet Collaborative Filtering

- Collaborative Filtering을 위해 user-item matrix 만들기
- AutoEncoder 모델 구조 정의하기

* Training Deep AutoEncoder 논문은 [저자 코드](https://github.com/NVIDIA/DeepRecommender) 참고

## 논문 종류
- AutoRec
- Training Deep AutoEncoder
- Variational AutoEncoder

## 1. Data Loader

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
data_path = '/content/drive/MyDrive/Colab Notebooks/RS/04-Recommender-System-with-DeepLearning/kmrd/kmr_dataset/datafile/kmrd-small'

In [3]:
import torch
from torch.utils.data import Dataset, DataLoader
import pandas as pd
import os
from sklearn.model_selection import train_test_split
import numpy as np

In [4]:
def read_data(data_path):
    df = pd.read_csv(os.path.join(data_path,'rates.csv'))[:10000]
    train_df, val_df = train_test_split(df, test_size=0.2, random_state=1234, shuffle=True)

    user_to_index = {original: idx for idx, original in enumerate(df.user.unique())}
    movie_to_index = {original: idx for idx, original in enumerate(df.movie.unique())}

    return train_df, val_df, user_to_index, movie_to_index

In [5]:
class KMRDdataset(Dataset):
    def __init__(self, df, user_to_index, movie_to_index, item_based=True):
        self.min_rating = min(df.rate)
        self.max_rating = max(df.rate)

        self.user = [user_to_index[u] for u in df.user.values]
        self.movie = [movie_to_index[m] for m in df.movie.values]
        self.rating = df.rate.values

        if item_based:
          input_tensor = torch.LongTensor([self.movie, self.user])
          self.data = torch.sparse.FloatTensor(input_tensor, torch.FloatTensor(self.rating),
                                             torch.Size([len(movie_to_index), len(user_to_index)])).to_dense()
        else:
          input_tensor = torch.LongTensor([self.user, self.movie])
          self.data = torch.sparse.FloatTensor(input_tensor, torch.FloatTensor(self.rating),
                                             torch.Size([len(user_to_index), len(movie_to_index)])).to_dense()


    def __len__(self):
      return len(self.data)
    
    def __getitem__(self, idx):
      return self.data[idx]


In [6]:
data_path = '/content/drive/MyDrive/Colab Notebooks/RS/04-Recommender-System-with-DeepLearning/kmrd/kmr_dataset/datafile/kmrd-small'
train_df, val_df, user_to_index, movie_to_index = read_data(data_path=data_path)

In [7]:
train_dataset = KMRDdataset(train_df, user_to_index, movie_to_index)
val_dataset = KMRDdataset(val_df, user_to_index, movie_to_index)

In [8]:
print(train_df.shape)
print(train_dataset.data[0].size())
print(val_df.shape)
print(val_dataset.data[0].size())

(8000, 4)
torch.Size([466])
(2000, 4)
torch.Size([466])


In [9]:
print(len(list(user_to_index.keys())))

466


In [10]:
train_dataset.data[0]

tensor([ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
         0.,  0.,  0.,  0.,  0., 10.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
         0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
         0.,  0., 27.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
         0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  8.,  0.,  0.,  0.,  0.,
         0.,  0.,  0.,  0.,  0.,  0.,  8.,  9.,  0., 10.,  0.,  9.,  0.,  0.,
         0.,  0.,  5.,  0.,  0.,  0.,  0., 10.,  0.,  0.,  0.,  9.,  0.,  0.,
         0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  9.,  0.,
         0.,  0.,  0.,  0.,  0., 10.,  0.,  1.,  0.,  0.,  0., 10.,  0.,  0.,
         0.,  0.,  0.,  0.,  9.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
         9.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
         0.,  0.,  0.,  0.,  8.,  0.,  0., 10.,  0.,  0., 10.,  0.,  0.,  0.,
         0.,  0., 10.,  0.,  0.,  0.,  0.,  0.,  9.,  0.,  0., 1

In [34]:
train_dataloader = DataLoader(train_dataset, batch_size=64, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=16, shuffle=True)

## 2. Define AutoEncoder 

In [12]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.parallel
import torch.optim as optim
import torch.utils.data
from torch.autograd import Variable
import torch.nn.init as weight_init

In [13]:
class SimpleAutoEncoder(nn.Module):
  def __init__(self, num_inputs, num_hiddens, kind='sigmoid', dropout=None):
    super(SimpleAutoEncoder, self).__init__()
    # encoder -> hidden -> decoder
    # input -> hidden -> output
    # input -> hidden : encoder
    # hidden -> output = input : decoder
    self.encoder = nn.Sequential(nn.Linear(num_inputs, num_hiddens), self.activation(kind))
    self.decoder = nn.Sequential(nn.Linear(num_hiddens, num_inputs), self.activation(kind))  

  def activation(self, kind):
    if kind == 'selu':
      return nn.SELU()
    elif kind == 'relu':
      return nn.ReLU()
    elif kind == 'relu6':
      return nn.ReLU6()
    elif kind == 'sigmoid':
      return nn.Sigmoid()
    elif kind == 'tanh':
      return nn.Tanh()
    elif kind == 'elu':
      return nn.ELU()
    elif kind == 'lrelu':
      return nn.LeakyReLU()
    elif kind == 'none':
      return input
    else:
      raise ValueError('Unknown non-linearity type')

  def forward(self, x):
    return self.decoder(self.encoder(x))

In [35]:
class DeepAutoEncoder(nn.Module):
  def __init__(self, num_hiddens, num_layers, dropout=None, nn_type='diamond'):
    super(DeepAutoEncoder, self).__init__()
    # input -> hidden -> output
    # input -> hidden(10) -> ... -> hidden(10) -> output = input
    self.encoder, self.decoder = self.generate_layers(num_hiddens, num_layers, dropout, nn_type)
  
  def forward(self, x):
    return self.decoder(self.encoder(x))
  
  def generate_layers(self, num_hiddens, num_layers, dropout=None, nn_type='diamond'):
    # hidden layers -> [50, 25, 12, 6, 12, 25, 50], [100 50 100] -> 100, 50, 60, 50 100 
    if nn_type == 'diamond':
      encoder_modules = []
      decoder_modules = []

      hidden_layers = []
      temp = num_hiddens
      for idx, x in enumerate(range(num_layers)):
        if idx == 0:
          hidden_layers.append(temp)
        else:
          hidden_layers.append(int(temp/2))
        temp = temp/2
      hidden_layers = [x for x in hidden_layers if x > 10]
      
      # encoder
      for idx, num_hidden in enumerate(hidden_layers):
        if idx < len(hidden_layers)-1:
          encoder_modules.append(nn.Linear(hidden_layers[idx], hidden_layers[idx+1], bias=True))
          encoder_modules.append(nn.Sigmoid())

      # decoder
      hidden_layers = list(reversed(hidden_layers))
      for idx, num_hidden in enumerate(hidden_layers):
        if idx < len(hidden_layers)-1:
          decoder_modules.append(nn.Linear(hidden_layers[idx], hidden_layers[idx+1], bias=True))
          decoder_modules.append(nn.Identity())

    # num_hidden = 50, num_layers = 3 ->  input_dim -> [50, 50, 50] -> output_dim = input_dim 
    elif nn_type == 'constant':
      hidden_layers = [num_hiddens] * num_layers
      for idx, enc in enumerate(hidden_layers):
        if idx < num_layers-1:
          encoder_modules.append(nn.Linear(hidden_layers[idx], hidden_layers[idx+1], bias=True))
          encoder_modules.append(nn.Sigmoid())
          decoder_modules.append(nn.Linear(hidden_layers[idx], hidden_layers[idx+1], bias=True))
          decoder_modules.append(nn.Identity())

    if dropout is not None:    
      encoder_modules = [x for y in (encoder_modules[i:i+2] + [nn.Dropout(dropout)] * (i < len(encoder_modules) - 1) 
                          for i in range(0, len(encoder_modules), 2)) for x in y]
      decoder_modules = [x for y in (decoder_modules[i:i+2] + [nn.Dropout(dropout)] * (i < len(decoder_modules) - 1)
                          for i in range(0, len(decoder_modules), 2)) for x in y]

    encoder = nn.Sequential(*encoder_modules)
    decoder = nn.Sequential(*decoder_modules)
    
    return encoder, decoder

## Train

In [36]:
num_users = len(user_to_index.keys())
num_movies = len(movie_to_index.keys())
print(num_users, num_movies)

466 532


In [51]:
model = DeepAutoEncoder(num_layers=10, num_hiddens=num_users, dropout=True, nn_type='diamond')
model

DeepAutoEncoder(
  (encoder): Sequential(
    (0): Linear(in_features=466, out_features=116, bias=True)
    (1): Sigmoid()
    (2): Dropout(p=True, inplace=False)
    (3): Linear(in_features=116, out_features=58, bias=True)
    (4): Sigmoid()
    (5): Dropout(p=True, inplace=False)
    (6): Linear(in_features=58, out_features=29, bias=True)
    (7): Sigmoid()
    (8): Dropout(p=True, inplace=False)
    (9): Linear(in_features=29, out_features=14, bias=True)
    (10): Sigmoid()
    (11): Dropout(p=True, inplace=False)
  )
  (decoder): Sequential(
    (0): Linear(in_features=14, out_features=29, bias=True)
    (1): Identity()
    (2): Dropout(p=True, inplace=False)
    (3): Linear(in_features=29, out_features=58, bias=True)
    (4): Identity()
    (5): Dropout(p=True, inplace=False)
    (6): Linear(in_features=58, out_features=116, bias=True)
    (7): Identity()
    (8): Dropout(p=True, inplace=False)
    (9): Linear(in_features=116, out_features=466, bias=True)
    (10): Identity()
    

In [43]:
model2 = SimpleAutoEncoder(num_inputs=num_users, num_hiddens=100, kind='selu')
model2

SimpleAutoEncoder(
  (encoder): Sequential(
    (0): Linear(in_features=466, out_features=100, bias=True)
    (1): SELU()
  )
  (decoder): Sequential(
    (0): Linear(in_features=100, out_features=466, bias=True)
    (1): SELU()
  )
)

In [52]:
optimizer = optim.Adam(model.parameters(), lr=1e-3)

In [53]:
def weights_init(m):
    if isinstance(m, nn.Linear):
        torch.nn.init.xavier_uniform_(m.weight)
        torch.nn.init.zeros_(m.bias)

model.apply(weights_init)

DeepAutoEncoder(
  (encoder): Sequential(
    (0): Linear(in_features=466, out_features=116, bias=True)
    (1): Sigmoid()
    (2): Dropout(p=True, inplace=False)
    (3): Linear(in_features=116, out_features=58, bias=True)
    (4): Sigmoid()
    (5): Dropout(p=True, inplace=False)
    (6): Linear(in_features=58, out_features=29, bias=True)
    (7): Sigmoid()
    (8): Dropout(p=True, inplace=False)
    (9): Linear(in_features=29, out_features=14, bias=True)
    (10): Sigmoid()
    (11): Dropout(p=True, inplace=False)
  )
  (decoder): Sequential(
    (0): Linear(in_features=14, out_features=29, bias=True)
    (1): Identity()
    (2): Dropout(p=True, inplace=False)
    (3): Linear(in_features=29, out_features=58, bias=True)
    (4): Identity()
    (5): Dropout(p=True, inplace=False)
    (6): Linear(in_features=58, out_features=116, bias=True)
    (7): Identity()
    (8): Dropout(p=True, inplace=False)
    (9): Linear(in_features=116, out_features=466, bias=True)
    (10): Identity()
    

In [54]:
train_dataset.data[0].size()

torch.Size([466])

In [55]:
# NVIDIA Recommender System 참고
def MSEloss(inputs, targets, size_average=False):
  mask = targets != 0
  num_ratings = torch.sum(mask.float())
  criterion = nn.MSELoss(reduction='sum' if not size_average else 'mean')
  return criterion(inputs * mask.float(), targets), Variable(torch.Tensor([1.0])) if size_average else num_ratings

In [66]:
model.train()
train_loss = 0
for idx, batch in enumerate(train_dataloader):
    optimizer.zero_grad()
    
    pred = model(batch)
    loss, num_ratings = MSEloss(pred, batch)    
    loss = torch.sqrt(loss / num_ratings)
    loss.backward()
    train_loss += loss.item() 
    optimizer.step()
    
    print(train_loss / (idx+1))

8.849576950073242
10.321973323822021
9.779138565063477
10.880092144012451
10.501428985595703
10.908060868581137
11.028157370431083
11.306012988090515
11.011011759440104


In [60]:
model.eval()
val_loss = 0
with torch.no_grad():
  for idx, batch in enumerate(val_dataloader):
    pred = model(batch)
    loss, num_ratings = MSEloss(pred, batch)
    loss = torch.sqrt(loss / num_ratings)
    val_loss += loss.item()

    print(val_loss/(idx+1))

8.545600891113281
8.6409330368042
8.419007778167725
8.39169156551361
8.266732788085937
8.086090405782064
8.05605663572039
7.989835739135742
7.995421091715495
7.934514713287354
7.926532181826505
7.961360335350037
8.080863549159123
8.105555977140154
8.053922080993653
8.16537618637085
8.258162330178653
8.327246718936497
8.533296083149157
8.492086553573609
8.513437452770415
8.479362487792969
8.447768895522408
8.426644285519918
8.377427406311035
8.435168229616606
8.440555219297055
8.401682342801776
8.439054653562348
8.43708709081014
8.527448777229555
8.49030926823616
8.500199115637576
8.426535676507388
