# Deep Belief Network

## Import Libraries & Dataset

In [44]:
import tensorflow.compat.v1 as tf 
tf.disable_v2_behavior()

In [45]:
#import tensorflow as tf #Deep learning Library
import numpy as np #Matrix Algebra Library

In [46]:
#Getting the MNIST data provided by Tensorflow
'''
from tensorflow.examples.tutorials.mnist import input_data

#Loading in the mnist data
mnist = input_data.read_data_sets("MNIST_data/", one_hot=False)
trX, trY, teX, teY = mnist.train.images, mnist.train.labels, mnist.test.images,\
    mnist.test.labels
'''

'\nfrom tensorflow.examples.tutorials.mnist import input_data\n\n#Loading in the mnist data\nmnist = input_data.read_data_sets("MNIST_data/", one_hot=False)\ntrX, trY, teX, teY = mnist.train.images, mnist.train.labels, mnist.test.images,    mnist.test.labels\n'

In [47]:
#Getting the MNIST data provided by Tensorflow
from tensorflow import keras 
mnist = keras.datasets.mnist

#Loading in the mnist data
(trX, trY) , (teX, teY) = mnist.load_data()
print(trX.shape, trY.shape, teX.shape, teY.shape)
trX = trX.reshape(trX.shape[0], trX.shape[1] * trX.shape[2])
teX = teX.reshape(teX.shape[0], teX.shape[1] * teX.shape[2])
print(trX.shape, trY.shape, teX.shape, teY.shape)

(60000, 28, 28) (60000,) (10000, 28, 28) (10000,)
(60000, 784) (60000,) (10000, 784) (10000,)


## Restricted Boltzmann Machines

RBMs are the building blocks of deep belief nets. If you are not familiar with RBMs please check out my post describing how the RBM functions https://github.com/JosephGatto/Simplified-Restricted-Boltzmann-Machines 

In [48]:
class RBM(object):
    def __init__(self, input_size, output_size, learning_rate, batch_size):
        self.input_size = input_size #Size of the input layer
        self.output_size = output_size #Size of the hidden layer
        self.epochs = 5 #How many times we will update the weights 
        self.learning_rate = learning_rate #How big of a weight update we will perform 
        self.batch_size = batch_size #How many images will we "feature engineer" at at time 
        self.new_input_layer = None #Initalize new input layer variable for k-step contrastive divergence 
        self.new_hidden_layer = None
        self.new_test_hidden_layer = None
        
        #Here we initialize the weights and biases of our RBM
        #If you are wondering, the 0 is the mean of the distribution we are getting our random weights from. 
        #The .01 is the standard deviation.
        self.w = np.random.normal(0,.01,[input_size,output_size]) #weights
        self.hb = np.random.normal(0,.01,[output_size]) #hidden layer bias
        self.vb = np.random.normal(0,.01,[input_size]) #input layer bias (sometimes called visible layer)
        
        
        #Calculates the sigmoid probabilities of input * weights + bias
        #Here we multiply the input layer by the weights and add the bias
        #This is the phase that creates the hidden layer
    def prob_h_given_v(self, visible, w, hb):
        return tf.nn.sigmoid(tf.matmul(visible, w) + hb)
        
        #Calculates the sigmoid probabilities of input * weights + bias
        #Here we multiply the hidden layer by the weights and add the input layer bias
        #This is the reconstruction phase that recreates the original image from the hidden layer
    def prob_v_given_h(self, hidden, w, vb):
        return tf.nn.sigmoid(tf.matmul(hidden, tf.transpose(w)) + vb)
    
    #Returns new layer binary values
    #This function returns a 0 or 1 based on the sign of the probabilities passed to it
    #Our RBM will be utilizing binary features to represent the images
    #This function just converts the features we have learned into a binary representation 
    def sample_prob(self, probs):
        return tf.nn.relu(tf.sign(probs - tf.random_uniform(tf.shape(probs))))
    
    def train(self, X, teX):
        #Initalize placeholder values for graph
        #If this looks strange to you, then you have not used Tensorflow before
        _w = tf.placeholder(tf.float32, shape = [self.input_size, self.output_size])
        _vb = tf.placeholder(tf.float32, shape = [self.input_size])
        _hb = tf.placeholder(tf.float32, shape = [self.output_size])
        print(f"Shapes: _w = {_w.shape} _vb = {_vb.shape} _hb = {_hb.shape}")
        
        
        #initalize previous variables
        #we will be saving the weights of the previous and current iterations
        pre_w = np.random.normal(0,.01, size = [self.input_size,self.output_size])
        pre_vb = np.random.normal(0, .01, size = [self.input_size])
        pre_hb = np.random.normal(0, .01, size = [self.output_size])
        
        #initalize current variables
        #we will be saving the weights of the previous and current iterations
        cur_w = np.random.normal(0, .01, size = [self.input_size,self.output_size])
        cur_vb = np.random.normal(0, .01, size = [self.input_size])
        cur_hb = np.random.normal(0, .01, size = [self.output_size])
               
        #Plaecholder variable for input layer
        v0 = tf.placeholder(tf.float32, shape = [None, self.input_size])
         
        #pass probabilities of input * w + b into sample prob to get binary values of hidden layer
        h0 = self.sample_prob(self.prob_h_given_v(v0, _w, _hb ))
        
        #pass probabilities of new hidden unit * w + b into sample prob to get new reconstruction
        v1 = self.sample_prob(self.prob_v_given_h(h0, _w, _vb))
        
        #Just get the probailities of the next hidden layer. We wont need the binary values. 
        #The probabilities here help calculate the gradients during back prop 
        h1 = self.prob_h_given_v(v1, _w, _hb)
        
        
        #Contrastive Divergence
        positive_grad = tf.matmul(tf.transpose(v0), h0) #input' * hidden0
        negative_grad = tf.matmul(tf.transpose(v1), h1) #reconstruction' * hidden1
        #(pos_grad - neg_grad) / total number of input samples 
        CD = (positive_grad - negative_grad) / tf.to_float(tf.shape(v0)[0]) 
        
        #This is just the definition of contrastive divergence 
        update_w = _w + self.learning_rate * CD
        update_vb = _vb + tf.reduce_mean(v0 - v1, 0)
        update_hb = _hb + tf.reduce_mean(h0 - h1, 0)
        
        #MSE - This is our error function
        err = tf.reduce_mean(tf.square(v0 - v1))
        
        #Will hold new visible layer.
        errors = []
        hidden_units = []
        reconstruction = []
        
        test_hidden_units = []
        test_reconstruction=[]
        
        
        #The next four lines of code intitalize our Tensorflow graph and create mini batches
        #The mini batch code is from cognitive class. I love the way they did this. Just giving credit! 
        with tf.Session() as sess:
            sess.run(tf.global_variables_initializer())
            for epoch in range(self.epochs):
                for start, end in zip(range(0, len(X), self.batch_size), range(self.batch_size, len(X), self.batch_size)):
                    batch = X[start:end] #Mini batch of images taken from training data
                    
                    #Feed in batch, previous weights/bias, update weights and store them in current weights
                    cur_w = sess.run(update_w, feed_dict = {v0:batch, _w:pre_w , _vb:pre_vb, _hb:pre_hb})
                    cur_hb = sess.run(update_hb, feed_dict = {v0:batch, _w:pre_w , _vb:pre_vb, _hb:pre_hb})
                    cur_vb = sess.run(update_vb, feed_dict = {v0:batch, _w:pre_w , _vb:pre_vb, _hb:pre_hb})
                    
                    #Save weights 
                    pre_w = cur_w
                    pre_hb = cur_hb
                    pre_vb = cur_vb
                
                #At the end of each iteration, the reconstructed images are stored and the error is outputted 
                reconstruction.append(sess.run(v1, feed_dict={v0: X, _w: cur_w, _vb: cur_vb, _hb: cur_hb}))        
                print('Learning Rate: {}:  Batch Size: {}:  Hidden Layers: {}: Epoch: {}: Error: {}:'.format(self.learning_rate, self.batch_size, 
                                                                                                             self.output_size, (epoch+1),
                                                                                                            sess.run(err, feed_dict={v0: X, _w: cur_w, _vb: cur_vb, _hb: cur_hb})))
            
            #Store final reconstruction in RBM object
            self.new_input_layer = reconstruction[-1]
            
            #Store weights in RBM object
            self.w = pre_w
            self.hb = pre_hb
            self.vb = pre_vb
    
    #This is used for Contrastive Divergence.
    #This function makes the reconstruction your new input layer. 
    def rbm_output(self, X):
        input_x = tf.constant(X)
        _w = tf.constant(self.w)
        _hb = tf.constant(self.hb)
        _vb = tf.constant(self.vb)
        
        out = tf.nn.sigmoid(tf.matmul(input_x, _w) + _hb)
        with tf.Session() as sess:
            sess.run(tf.global_variables_initializer())
            return sess.run(out)
            

## Deep Belief Networks

Deep belief nets are, in my opinion, pretty easy to understand if you have a solid grasp of RBMs. Remember, the RBM's goal is to reconstruct the original image with its hidden layer representation. The deep belief network is just a chain of RBMs. Once the first RBM has completed the image reconstruction process, a new RBM is created. The input layer of the new RBM is now the hidden layer representation of the previous RBM. We now, repeat this process as many times as we like to produce a DBN. DBNs can often give you higher level feature abstractions and produce very high quality features.

Below is an example of how to use multiple RBMs to create a DBN. Here we will make 3 RBMs of hidden layer size [600 > 500 > 100]. Feel free to extract the new_hidden_layer from any of the RBMs to see how they perform as features in a lienar classifier. 

In [49]:
RBM_hidden_size = [600,500,100] #Three hidden layer sizes for our three layer DBN
learning_rate = .01 
input_size = trX.shape[1] #input layer size of original image data

rbm_list = [] #This will hold all of the RBMs used in our DBN

#Creates 3 RBMs
for layer in RBM_hidden_size:
    rbm_list.append(RBM(input_size, layer, learning_rate, 32))
    input_size = layer

In [50]:
#Initalize input layer variables 
inpX = trX                
test_inpx = teX

#This loop is the DBN. Each rbm is trained here.
#At the end of training, the hidden layer of the RBM is used as input
#For the next layer of the DBN. 
for i,rbm in enumerate(rbm_list):
    rbm_outputs = []
    rbm_test_outputs = []
    print('Input Shape: ', inpX.shape)
    print('Layer: ',(i+1))

    rbm.train(inpX, teX)
    inpX = rbm.rbm_output(inpX)
    test_inpx = rbm.rbm_output(test_inpx)
    rbm_outputs.append(inpX)
    rbm_test_outputs.append(test_inpx)

    print('Output Shape: ', inpX.shape)
    print()

Input Shape:  (60000, 784)
Layer:  1
Shapes: _w = (784, 600) _vb = (784,) _hb = (600,)
Learning Rate: 0.01:  Batch Size: 32:  Hidden Layers: 600: Epoch: 1: Error: 7217.080078125:


KeyboardInterrupt: 