<a href="https://colab.research.google.com/github/Zhangjt9317/Laidata/blob/master/autoencoder_proj1_new.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Download dataset

the dataset can be found under /content/autoencoder/data/ml-1m

In [1]:
!git clone https://github.com/tonylaioffer/autoencoder.git

Cloning into 'autoencoder'...
remote: Enumerating objects: 174, done.[K
remote: Total 174 (delta 0), reused 0 (delta 0), pack-reused 174[K
Receiving objects: 100% (174/174), 17.58 MiB | 15.30 MiB/s, done.
Resolving deltas: 100% (136/136), done.


In [2]:
from google.colab import drive
drive.mount('/content/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3Aietf%3Awg%3Aoauth%3A2.0%3Aoob&scope=email%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdocs.test%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive.photos.readonly%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fpeopleapi.readonly&response_type=code

Enter your authorization code:
··········
Mounted at /content/drive


## Define data process methods

In [0]:
import tensorflow as tf
import os


def _get_training_data(FLAGS):  
    ''' Buildind the input pipeline for training and inference using TFRecords files.
    @return data only for the training
    @return data for the inference
    '''
    '''
    I guess here it's not read the entire dataset into memory in one time, and this is
    a lazy operation, need session to activate it, so in this operation, it rules certain
    actions in series:
    -create TFRecordDataset to read files
    -map this binary TFRecord dataset to contains feature 'movie_ratings'
    -shuffle it to randomly extract 500 in buffer each time
    -repeat this action infinite times ( i guess it's would end while all data are processed)
    -from buffer get a batch of data
    -prefetch one datapoint from the batch each time in order to iterative process data
    all above are actions, thus we can see in train stage, it initilize a iterator 
    
    '''
    
    filenames = [os.path.join(FLAGS['tf_records_train_path'], f) for f in os.listdir(FLAGS['tf_records_train_path'])]
    
    dataset = tf.data.TFRecordDataset(filenames)
    #Creates a TFRecordDataset to read one or more TFRecord files.
    dataset = dataset.map(parse)
    #Maps map_func across the elements of this dataset.
    #This transformation applies map_func to each element of this dataset, and returns a new dataset containing the transformed elements,
    #in the same order as they appeared in the input
    dataset = dataset.shuffle(buffer_size=500) #Randomly shuffles a tensor along its first dimension.
                                               #buffer_size representing the number of elements from this dataset from which the new dataset will sample.
    dataset = dataset.repeat()

    dataset = dataset.batch(FLAGS['batch_size'])
    #Combines consecutive elements of this dataset into batches.
    dataset = dataset.prefetch(buffer_size=1)
    #Creates a Dataset that prefetches elements from this dataset.
    
    '''
    dataset 2 is used to validation, here called infer
    shuffle with buffer size 1 and batch with size 1 is because for validation, we only need one datapoint each time
    to get corresponding prediction
    but for train, we use batch train to speed up
    '''
    dataset2 = tf.data.TFRecordDataset(filenames)
    dataset2 = dataset2.map(parse)
    dataset2 = dataset2.shuffle(buffer_size=1)

    dataset2 = dataset2.repeat()
    dataset2 = dataset2.batch(1)
    dataset2 = dataset2.prefetch(buffer_size=1)

    return dataset, dataset2


def _get_test_data(FLAGS):
    ''' Buildind the input pipeline for test data.'''

    filenames = [os.path.join(FLAGS['tf_records_test_path'], f) for f in os.listdir(FLAGS['tf_records_test_path'])]

    dataset = tf.data.TFRecordDataset(filenames)
    dataset = dataset.map(parse)
    dataset = dataset.shuffle(buffer_size=1)
    dataset = dataset.repeat()
    dataset = dataset.batch(1)
    dataset = dataset.prefetch(buffer_size=1)

    return dataset


def parse(serialized):
    ''' Parser for the TFRecords file.'''

    features = {'movie_ratings':tf.FixedLenFeature([3952], tf.float32),  
              }
    parsed_example = tf.parse_single_example(serialized,
                                           features=features,
                                           )
    movie_ratings = tf.cast(parsed_example['movie_ratings'], tf.float32)
    
    return movie_ratings

In [4]:
filenames = [os.path.join(FLAGS['tf_records_train_path'], f) for f in os.listdir(FLAGS['tf_records_train_path'])]
dataset = tf.data.TFRecordDataset(filenames)

NameError: ignored

In [0]:
dataset.map(parse)

## Define autoencoder architecture

In [0]:

# import model_helper


def _get_bias_initializer():
    return tf.zeros_initializer()


def _get_weight_initializer():
    return tf.random_normal_initializer(mean=0.0, stddev=0.05)


class DAE: #Data Acquisition Engine
    
    def __init__(self, FLAGS):
        ''' Implementation of deep autoencoder class.'''
        
        self.FLAGS = FLAGS
        self.weight_initializer = _get_weight_initializer()
        self.bias_initializer = _get_bias_initializer()
        self.init_parameters()
        

    def init_parameters(self):
        '''Initialize networks weights and biasis.'''
        
        with tf.name_scope('weights'):
          #This context manager validates that the given values are from the same graph,  
          #makes that graph the default graph, and pushes a name scope in that graph
            self.W_1 = tf.get_variable(name='weight_1', shape=(self.FLAGS['num_v'], self.FLAGS['num_h']),
                                       initializer=self.weight_initializer) #Gets an existing variable with these parameters or create a new one
            self.W_2 = tf.get_variable(name='weight_2', shape=(self.FLAGS['num_h'], self.FLAGS['num_h']),
                                       initializer=self.weight_initializer)
            self.W_3 = tf.get_variable(name='weight_3', shape=(self.FLAGS['num_h'], self.FLAGS['num_h']),
                                       initializer=self.weight_initializer)
            self.W_4 = tf.get_variable(name='weight_4', shape=(self.FLAGS['num_h'], self.FLAGS['num_v']),
                                       initializer=self.weight_initializer)
            # self.W_5 = tf.get_variable(name='weight_5', shape=(self.FLAGS['num_h'], self.FLAGS['num_v']),
            #                            initializer=self.weight_initializer)
            
        with tf.name_scope('biases'):
            self.b1 = tf.get_variable(name='bias_1', shape=(self.FLAGS['num_h']),
                                      initializer=self.bias_initializer)
            self.b2 = tf.get_variable(name='bias_2', shape=(self.FLAGS['num_h']),
                                      initializer=self.bias_initializer)
            self.b3 = tf.get_variable(name='bias_3', shape=(self.FLAGS['num_h']),
                                      initializer=self.bias_initializer)
            # self.b4 = tf.get_variable(name='bias_3', shape=(self.FLAGS['num_h']),
            #                           initializer=self.bias_initializer)
    def _inference(self, x):
        ''' Making one forward pass. Predicting the networks outputs.
        @param x: input ratings
        
        @return : networks predictions
        '''\
        
        with tf.name_scope('inference'):
          a1 = tf.nn.relu(tf.nn.bias_add(tf.matmul(x, self.W_1),self.b1)) # sign(W1T*X+b1)
          a2 = tf.nn.relu(tf.nn.bias_add(tf.matmul(a1, self.W_2),self.b2))
          a3 = tf.nn.relu(tf.nn.bias_add(tf.matmul(a2, self.W_3),self.b3))
          a4 = tf.matmul(a3, self.W_4)
          # a4 = tf.nn.relu(tf.nn.bias_add(tf.matmul(a3, self.W_4), self.b4))   
          # a5 = tf.matmul(a3, self.W_5)
        return a4
    
    def _compute_loss(self, predictions, labels, num_labels):
        ''' Computing the Mean Squared Error loss between the input and output of the network.
            
          @param predictions: predictions of the stacked autoencoder
          @param labels: input values of the stacked autoencoder which serve as labels at the same time
          @param num_labels: number of labels !=0 in the data set to compute the mean
            
          @return mean squared error loss tf-operation
          '''
            
        with tf.name_scope('loss'):
            loss_op = tf.div(tf.reduce_sum(tf.square(tf.subtract(predictions,labels))),num_labels)
            return loss_op
          
        

    def _optimizer(self, x):
        '''Optimization of the network parameter through stochastic gradient descent.
            
            @param x: input values for the stacked autoencoder.
            
            @return: tensorflow training operation
            @return: ROOT!! mean squared error
        '''
        
        outputs = self._inference(x)

        mask = tf.where(tf.equal(x, 0.0), tf.zeros_like(x), x) # indices of 0 values in the training set
        # tf.zero_like : Creates a tensor with all elements set to zero.
        # The condition tensor acts as a mask that chooses, based on the value at each element, 
        # whether the corresponding element / row in the output should be taken from x (if true) or y (if false).
        num_train_labels = tf.cast(tf.count_nonzero(mask), dtype=tf.float32) # number of non zero values in the training set
        bool_mask = tf.cast(mask,dtype=tf.bool) # boolean mask
        outputs = tf.where(bool_mask, outputs, tf.zeros_like(outputs)) # set the output values to zero if corresponding input values are zero


        MSE_loss = self._compute_loss(outputs,x,num_train_labels)
        
        if self.FLAGS['l2_reg'] == True:
            l2_loss = tf.add_n([tf.nn.l2_loss(v) for v in tf.trainable_variables()]) # Returns all variables created with trainable=True.
            MSE_loss = MSE_loss +  self.FLAGS['lambda_'] * l2_loss
        
        train_op = tf.train.AdamOptimizer(self.FLAGS['learning_rate']).minimize(MSE_loss) #An Operation that updates the variables in var_list
        RMSE_loss = tf.sqrt(MSE_loss)

        return train_op, RMSE_loss
    
    def _validation_loss(self, x_train, x_test):
        ''' Computing the loss during the validation time.
            
          @param x_train: training data samples
          @param x_test: test data samples
            
          @return networks predictions
          @return root mean squared error loss between the predicted and actual ratings
          '''
        
        outputs = self._inference(x_train) # use training sample to make prediction
        mask = tf.where(tf.equal(x_test,0.0), tf.zeros_like(x_test), x_test) # identify the zero values in the test ste
        num_test_labels = tf.cast(tf.count_nonzero(mask),dtype=tf.float32) # count the number of non zero values
        bool_mask = tf.cast(mask,dtype=tf.bool) 
        outputs = tf.where(bool_mask, outputs, tf.zeros_like(outputs))
    
        MSE_loss = self._compute_loss(outputs, x_test, num_test_labels)
        RMSE_loss = tf.sqrt(MSE_loss)
            
        return outputs, RMSE_loss

## Train model

In [0]:
import numpy as np


def train(FLAGS):
    '''Building the graph, opening of a session and starting the training od the neural network.'''
    
    num_batches = int(FLAGS['num_samples']/FLAGS['batch_size'])

    with tf.Graph().as_default():

        train_data, train_data_infer = _get_training_data(FLAGS)
        test_data = _get_test_data(FLAGS)
        
        iter_train = train_data.make_initializable_iterator()
        #Creates a tf.data.Iterator for enumerating the elements of a dataset.
        iter_train_infer = train_data_infer.make_initializable_iterator()
        iter_test = test_data.make_initializable_iterator()
        
        x_train = iter_train.get_next() #Returns a nested structure of tf.Tensors representing the next element.
        x_train_infer = iter_train_infer.get_next()
        x_test = iter_test.get_next()

        model = DAE(FLAGS)

        train_op, train_loss_op = model._optimizer(x_train)
        pred_op, test_loss_op = model._validation_loss(x_train_infer, x_test)
       
        with tf.Session() as sess: #A class for running TensorFlow operations
            
            sess.run(tf.global_variables_initializer())
            train_loss = 0
            test_loss = 0

            for epoch in range(FLAGS['num_epoch']):
                
                sess.run(iter_train.initializer) #The returned iterator will be in an uninitialized state, 
                                                 #and you must run the iterator.initializer operation before using it
                
                for batch_nr in range(num_batches):
                    
                    _, loss_ = sess.run((train_op, train_loss_op))
                    train_loss += loss_
              
                sess.run(iter_train_infer.initializer)
                sess.run(iter_test.initializer)

                for i in range(FLAGS['num_samples']):
                    pred, loss_ = sess.run((pred_op, test_loss_op))
                    test_loss += loss_
                    
                print('epoch_nr: %i, train_loss: %.3f, test_loss: %.3f'%(epoch,(train_loss/num_batches), (test_loss/FLAGS['num_samples'])))
                train_loss = 0
                test_loss = 0

In [0]:
FLAGS = {'tf_records_train_path': '/content/autoencoder/data/ml-1m/train/',  # Path of the training data
         'tf_records_test_path': '/content/autoencoder/data/ml-1m/test/',  # Path of the test data
         'num_epoch': 100,  # Number of training epochs
         'batch_size': 16,  # Size of the training batch
         'learning_rate': 5e-4,  # Learning_Rate
         'l2_reg': False,  # L2 regularization
         'lambda_': 0.01,  # Wight decay factor
         'num_v': 3952,  # Number of visible neurons (Number of movies the users rated.
         'num_h': 128,  # Number of hidden neurons
         'num_samples': 5953}  # Number of training samples (Number of users, who gave a rating)


train(FLAGS)