# Auto Encoder

This notebook was created by Camille-Amaury JUGE, in order to better understand Auto Encoder principles and how they work.

(it follows the exercices proposed by Hadelin de Ponteves on Udemy : https://www.udemy.com/course/le-deep-learning-de-a-a-z/)

## Imports

In [23]:
import numpy as np
import pandas as pd
# pytorch
import torch
import torch.nn as nn
import torch.nn.parallel
import torch.optim as optim
import torch.utils.data
from torch.autograd import Variable

import sys
import csv

## Data preprocessing

same process as Boltzmann's machine (go there to see more details)

In [14]:
df_movies = pd.read_csv("ml-1m\\movies.dat", sep="::", header=None, engine="python",
                encoding="latin-1")
users = pd.read_csv("ml-1m\\users.dat", sep="::", header=None, engine="python",
                encoding="latin-1")
ratings = pd.read_csv("ml-1m\\ratings.dat", sep="::", header=None, engine="python",
                encoding="latin-1")

In [15]:
df_train = pd.read_csv("ml-100k\\u1.base", delimiter="\t", header=None)
df_test = pd.read_csv("ml-100k\\u1.test", delimiter="\t", header=None)

In [16]:
_users = list(set(np.concatenate((df_train[df_train.columns[0]].value_counts().index, 
                           df_test[df_test.columns[0]].value_counts().index), 
                          axis=0)))

In [17]:
_movies =  list(set(np.concatenate((df_train[df_train.columns[1]].value_counts().index, 
                           df_test[df_test.columns[1]].value_counts().index), 
                          axis=0)))

In [18]:
def createMatrix(df, users, movies):
    matrix = []
    movies_nb = len(movies)
    user_nb = len(users)
    df_array = np.array(df, dtype="int")
    for i,user in enumerate(users):
        filtered_movies = df_array[df_array[:,0] == user, 1]
        filtered_ratings = df_array[df_array[:,0] == user, 2]
        ratings = np.zeros(movies_nb)
        for j in range(len(filtered_movies)):
            ratings[filtered_movies[j] - 1] = filtered_ratings[j]
        matrix.append(ratings)
                      
        sys.stdout.write("\r Loading State : {} / {}".format(i+1,user_nb))
        sys.stdout.flush()
        
    return matrix

In [19]:
matrix_train = createMatrix(df_train, _users, _movies)
matrix_test = createMatrix(df_test, _users, _movies)

 Loading State : 943 / 943

In [20]:
train = torch.FloatTensor(matrix_train)
test = torch.FloatTensor(matrix_test) 

In [27]:
train.shape

torch.Size([943, 1682])

## Model

In [79]:
class SparseAutoEncoder(nn.Module):
    def __init__(self, input_dim):
        super(SparseAutoEncoder, self).__init__()
        # creating input layer
        self.fully_connected_hidden_layer_1 = nn.Linear(input_dim, 20)
        self.fully_connected_hidden_layer_2 = nn.Linear(20, 10)
        self.fully_connected_hidden_layer_3 = nn.Linear(10, 20)
        self.fully_connected_hidden_layer_4 = nn.Linear(20, input_dim)
        self.activation = nn.Sigmoid()
        self.optimizer = optim.RMSprop(self.parameters(), lr=0.01, weight_decay=0.5)
        self.loss = nn.MSELoss()
        
    def forward(self, X):
        return self.fully_connected_hidden_layer_4(
            self.activation(self.fully_connected_hidden_layer_3(
                self.activation(self.fully_connected_hidden_layer_2(
                    self.activation(self.fully_connected_hidden_layer_1(X)))))))
    
    def train_(self, X, epoch):
        self.X_train = X
        for i in range(epoch):
            print("Epoch => {}/{}".format(i+1,epoch))
            train_loss = 0
            s = 0.
            for j in range(self.X_train.shape[0]):
                batch = Variable(self.X_train[j]).unsqueeze(0)
                target = batch.clone()
                if torch.sum(target.data > 0) > 0:
                    output = self(batch)
                    target.require_grad = False
                    output[target == 0] = 0
                    temp_loss = self.loss(output, target)
                    
                    mean_corrector = self.X_train.shape[1] / (float(torch.sum(target.data > 0)) + 1e-10)
                    temp_loss.backward()
                    train_loss += np.sqrt(temp_loss.item() * mean_corrector)
                    s+=1.
                    self.optimizer.step()
            print("   => Loss : {}".format((train_loss/s)))
            
    def test_(self, X):
        test_loss = 0
        s = 0.
        sys.stdout.write("\r Processing")
        sys.stdout.flush()
        
        for j in range(self.X_train.shape[0]):
            batch = Variable(self.X_train[j]).unsqueeze(0)
            target = Variable(X[j]).unsqueeze(0)
            if torch.sum(target.data > 0) > 0:
                output = self(batch)
                target.require_grad = False
                output[target == 0] = 0
                temp_loss = self.loss(output, target)
                    
                mean_corrector = self.X_train.shape[1] / (float(torch.sum(target.data > 0)) + 1e-10)
                test_loss += np.sqrt(temp_loss.item() * mean_corrector)
                s+=1.
        sys.stdout.write("\r Test Set => Loss : {}".format((test_loss/s)))
        sys.stdout.flush()
            
    
        

In [80]:
sae = SparseAutoEncoder(train.shape[1])

In [81]:
sae.train_(train, 20)

Epoch => 1/20
   => Loss : 1.7715910420976406
Epoch => 2/20
   => Loss : 1.0966187315622766
Epoch => 3/20
   => Loss : 1.0534908873056288
Epoch => 4/20
   => Loss : 1.0380864423484002
Epoch => 5/20
   => Loss : 1.0311407656719527
Epoch => 6/20
   => Loss : 1.0265132566564796
Epoch => 7/20
   => Loss : 1.0239976540198936
Epoch => 8/20
   => Loss : 1.0220266959738937
Epoch => 9/20
   => Loss : 1.0209420041093658
Epoch => 10/20
   => Loss : 1.0196439537372004
Epoch => 11/20
   => Loss : 1.0189271599642897
Epoch => 12/20
   => Loss : 1.0183032493250952
Epoch => 13/20
   => Loss : 1.0178964247724989
Epoch => 14/20
   => Loss : 1.0173872598783607
Epoch => 15/20
   => Loss : 1.0172698467725836
Epoch => 16/20
   => Loss : 1.0166608819642282
Epoch => 17/20
   => Loss : 1.0168078470610282
Epoch => 18/20
   => Loss : 1.0165371745710432
Epoch => 19/20
   => Loss : 1.0163025495834048
Epoch => 20/20
   => Loss : 1.015942291449781


In [82]:
sae.test_(test)

 Test Set => Loss : 1.0229144248873956