# Stacked Auto Encoder

### Setup notebook

In [None]:
# Importing the libraries
import sys, os
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.parallel
import torch.optim as optim
import torch.utils.data
from torch.autograd import Variable
import torch.nn.functional as F

from mymods.lauthom import *

### Data

In [None]:
# Look up the three raw dataset files by name.
# NOTE(review): `get_path` is not defined in this notebook; presumably it comes
# from the `mymods.lauthom` star import — confirm. What it returns or prints is
# not visible from here.
get_path('*/*', 'movies')
get_path('*/*', 'users')
get_path('*/*', 'ratings')

In [None]:
def read_file(filename, **kwargs):
    """Locate ``filename`` two directory levels below ``../../`` and load it.

    The file is searched with the glob pattern ``*/*/<filename>`` relative to
    ``../../`` (which is itself relative to the current working directory).
    When several files match, the first one in sorted path order is used so
    that the choice is reproducible.

    Args:
        filename: Name (or glob pattern) of the file to find.
        **kwargs: Forwarded unchanged to ``pandas.read_csv``.

    Returns:
        The ``pandas.DataFrame`` produced by ``pandas.read_csv``.

    Raises:
        FileNotFoundError: If no file matches the pattern.
    """
    from pathlib import Path

    pattern = '*/*/' + str(filename)
    matches = sorted(Path('../../').glob(pattern))
    if not matches:
        # An explicit error is easier to act on than the IndexError produced
        # by indexing an empty list of matches.
        raise FileNotFoundError(
            "no file matching '{}' under '../../'".format(pattern))
    return pd.read_csv(matches[0], **kwargs)

In [None]:
# Importing the dataset
# '::' is longer than one character, so pandas interprets it as a regular
# expression, which only the Python parsing engine supports. Selecting that
# engine explicitly avoids the ParserWarning emitted on the automatic fallback
# from the C engine.
movies = read_file('movies.dat', sep='::', header=None, encoding='latin-1',
                   engine='python', names=['id', 'movie', 'cat'])
users = read_file('users.dat', sep='::', header=None, encoding='latin-1',
                  engine='python', names=['id', 'sex', 'unk1', 'unk2', 'unk3'])
ratings = read_file('ratings.dat', sep='::', header=None, encoding='latin-1',
                    engine='python', names=['user_id', 'movie_id', 'rating', 'unk'])

In [None]:
# Peek at 10 randomly sampled rows of each raw table.
movies.sample(10)
users.sample(10)
ratings.sample(10)

In [None]:
# Print the pandas summary (columns, dtypes, non-null counts) of each raw table.
movies.info()
users.info()
ratings.info()

### Train test sets

In [None]:
# Look up the files of the 'u1' train/test split.
# NOTE(review): `get_path` is not defined in this notebook; presumably it comes
# from the `mymods.lauthom` star import — confirm.
get_path('*/*', 'u1')

In [None]:
# The split files are addressed by their full relative path, so they are read
# directly. Passing such a path to read_file() would prepend '*/*/' to it and
# only resolve through the '..' components when some two-level sub-directory
# happens to exist.
# The files are tab-separated and have no header row.
df_train = pd.read_csv('../../_data/ml-100k/u1.base', delimiter='\t', header=None,
                       names=['user_id', 'movie_id', 'rating', 'unk'])
df_test = pd.read_csv('../../_data/ml-100k/u1.test', delimiter='\t', header=None,
                      names=['user_id', 'movie_id', 'rating', 'unk'])

In [None]:
# Tag every row with the split it came from, so both frames can be stacked
# now and told apart again later.
df_train = df_train.assign(test=False)
df_test = df_test.assign(test=True)

In [None]:
# Stack train and test rows into one frame. ignore_index=True gives the result
# a fresh 0..n-1 index; without it the row labels of both inputs are kept and
# every label present in both frames would occur twice.
df = pd.concat([df_train, df_test], ignore_index=True)
df.sample(10)

In [None]:
# Print the pandas summary (columns, dtypes, non-null counts) of the combined frame.
df.info()

In [None]:
# Example: a random sample of the ratings that belong to one user (id 1)
user_id = 1
mask = df['user_id'].eq(user_id)
trn = df[mask]
trn.sample(10)

### Unique users and movies in both train and test set

In [None]:
# Distinct user and movie ids over the combined train + test ratings,
# followed by how many there are of each.
users, movies = (set(df[column]) for column in ('user_id', 'movie_id'))
nb_users, nb_movies = map(len, (users, movies))
nb_users, nb_movies

In [None]:
# Prepare for pivot and split
# Train and test rows of the same user must end up on different pivot rows,
# so the user id of every train row is shifted by 99000. The shift is applied
# to a temporary frame: `df` keeps its real user ids and the cell can be
# re-run without shifting the ids a second time.
keyed = df.assign(user_id=df['user_id'].where(df['test'], df['user_id'] + 99000))

# Pivot to a user x movie rating matrix for the autoencoder
# (NaN where there is no rating for that user/movie pair)
pv = keyed.pivot(index='user_id', columns='movie_id', values='rating')

# Change rating: negative/positive:
# nan: -1, 1-2: 0, 3-5: 1
mask_null = pv.isnull()
mask_3 = pv >= 3
pv = mask_3.astype(float).mask(mask_null, -1.0)

# Split train test: shifted ids are train rows, unshifted ids are test rows
pv_train = pv.loc[pv.index > 99000, :]
pv_train.index = pv_train.index - 99000  # restore the real user ids
pv_test = pv.loc[pv.index < 99000, :]

pv_train.sample(10)
pv_test.sample(10)

In [None]:
# Print the pandas summary (columns, dtypes, non-null counts) of both pivot tables.
pv_train.info()
pv_test.info()

In [None]:
# Materialise both pivot tables as integer NumPy matrices
np_train = pv_train.to_numpy().astype('int')
np_test = pv_test.to_numpy().astype('int')

np_train.shape
np_test.shape

#### Sanity check np.array & rating

In [None]:
# Count how often each encoded value occurs in each split.
# -1 is the code written for a missing rating and is reported as 'nan'.
# The 'rank test' lines count np_test; counting np_train there would simply
# repeat the train numbers.
print('rank train:', 'nan', np.sum(np_train == -1))
print('rank test:', 'nan', np.sum(np_test == -1))

for r in range(6):
    print('rank train:', r, np.sum(np_train == r))
    print('rank test:', r, np.sum(np_test == r))

### Converting the data into Torch tensors

In [None]:
# Wrap the integer matrices as tensors and convert them to 32-bit floats
training_set = torch.from_numpy(np_train).float()
test_set = torch.from_numpy(np_test).float()

In [None]:
# Show the dimensions of both tensors.
training_set.shape
test_set.shape

### Build model

In [None]:
# Creating the architecture of the Stacked Auto Encoder
class SAE(nn.Module):
    """Stacked autoencoder: three sigmoid hidden layers and a linear output.

    Args:
        n_hl1: Width of the first hidden layer.
        n_hl2: Width of the second hidden layer.
        n_hl3: Width of the third hidden layer.
        n_features: Size of the input and of the reconstructed output. When
            omitted, the notebook-level ``nb_movies`` is used, which keeps
            ``SAE(a, b, c)`` working exactly as before.
    """

    def __init__(self, n_hl1, n_hl2, n_hl3, n_features=None):
        super().__init__()

        if n_features is None:
            # Backward-compatible default: the global movie count
            n_features = nb_movies

        self.fc1 = nn.Linear(n_features, n_hl1)
        self.fc2 = nn.Linear(n_hl1, n_hl2)
        self.fc3 = nn.Linear(n_hl2, n_hl3)
        self.fc4 = nn.Linear(n_hl3, n_features)
        self.activation = nn.Sigmoid()

    def forward(self, x):
        """Return the reconstruction of ``x``.

        The three hidden layers are passed through the sigmoid activation;
        the output layer is left linear (no activation is applied to it).
        """
        x = self.activation(self.fc1(x))
        x = self.activation(self.fc2(x))
        x = self.activation(self.fc3(x))
        x = self.fc4(x)
        return x


### Compile model

In [None]:
# Define architecture: hidden layers of 128, 32 and 128 units
sae = SAE(n_hl1=128, n_hl2=32, n_hl3=128)

# define loss function: mean squared error between output and target
loss_fn = nn.MSELoss()

# define optimizer
def optimizer(name):
    """Build a fresh optimizer over the parameters of the global ``sae`` model.

    Args:
        name: One of ``'RMS'``, ``'Adam'`` or ``'SGD'``.

    Returns:
        A newly constructed ``torch.optim`` optimizer (``RMSprop``, ``Adam``
        or ``SGD``) bound to ``sae.parameters()``.

    Raises:
        KeyError: If ``name`` is not one of the supported names.
    """
    # A table of factories replaces eval() on source strings: the call is
    # ordinary, inspectable code and only the requested optimizer is built.
    factories = {
        'RMS': lambda params: optim.RMSprop(params, lr=0.01, alpha=0.99,
                                            eps=1e-08, weight_decay=0.25),
        'Adam': lambda params: optim.Adam(params, lr=0.01, eps=1e-08,
                                          weight_decay=0.5),
        'SGD': lambda params: optim.SGD(params, lr=0.01, momentum=0.95),
    }
    try:
        factory = factories[name]
    except KeyError:
        raise KeyError('unknown optimizer {!r}; expected one of {}'.format(
            name, sorted(factories))) from None
    return factory(sae.parameters())

In [None]:
# Show the first two rows of the training tensor as a list of row tensors
list(training_set)[:2]

### Train model

In [None]:
N_EPOCH = 50

# Build the optimiser once. Re-creating it for every user would throw away the
# SGD momentum buffers after each single step.
optimiser = optimizer('SGD')

for epoch in range(1, N_EPOCH + 1):
    train_loss = 0.
    s = 0.  # number of users that contributed to train_loss this epoch

    # Iterate over the rows that actually exist in the training tensor
    for user_ratings in training_set:
        inputs = user_ratings.unsqueeze(0)  # add a batch dimension: 1 x n_movies
        target = inputs.clone()             # independent copy; it never requires grad

        # Ratings are encoded as -1 (not rated), 0 (rating 1-2), 1 (rating 3-5)
        rated = target > -1
        n_rated = int(rated.sum())

        # train on users with rated movies only
        if n_rated == 0:
            continue

        output = sae(inputs)
        # Overwrite the predictions for unrated movies with the target value,
        # so those positions add nothing to the loss or to the gradient.
        output[~rated] = target[~rated]

        # compute (MSE) loss - difference between input and output.
        # MSELoss averages over all movies; rescale it to an average over the
        # rated movies only.
        loss = loss_fn(output, target)
        mean_adjust = target.numel() / float(n_rated)
        train_loss += np.sqrt(loss.item() * mean_adjust)
        s += 1.

        # Backprop loss and update the weights
        optimiser.zero_grad()
        loss.backward()
        optimiser.step()

    # max() guards the average against an epoch without any usable user
    print('epoch: {:3} loss: {:.3f}'.format(epoch, train_loss / max(s, 1.)))

In [None]:
# Show the second row (position 1) of the test ratings frame.
df_test.iloc[1,:]

In [None]:
# Run the model on the rating vector of one training user (row label i)
i = 1
np_new = pv_train.loc[i, :].to_numpy().astype('int')
np_new.shape
# Add a leading axis so the vector becomes a 1 x n row, then wrap it as a float tensor
new_data = torch.FloatTensor(np_new[np.newaxis, :])
new_data.shape
# One more leading axis: the model is fed a 1 x 1 x n tensor
new_data = Variable(new_data).unsqueeze(0)
# Strip both leading axes again to get the plain reconstructed vector
pred = sae(new_data).detach().numpy()[0, 0]

In [None]:
# Plot the input rating vector and the model output on the same axes.
# np_new += 1
# NOTE: slicing a NumPy array with [:] yields a view of the same data,
# not an independent copy.
orig = np_new[:] #.reshape(8, 210) #*255/2
pred = pred[:] #.reshape(8, 210)

import matplotlib.pyplot as plt
# IPython magic (not plain Python): render figures inline in the notebook.
%matplotlib inline

# Each return value is bound to `_` so the cell does not echo it.
# x axis: position in the vector; y axis: encoded rating / model output.
_ = plt.figure(figsize=(20,20))
_ = plt.plot(range(len(orig)), orig)
_ = plt.plot(range(len(orig)), pred)
_ = plt.show()

# _ = plt.figure(figsize=(20,20))
# _ = plt.imshow(pred)