# Discriminative Restricted Boltzmann Machines TensorFlow Implementation

In [1]:
import tensorflow as tf
import numpy as np
import random
import scipy.misc
from logging import getLogger
import datetime
import dateutil.tz
from datetime import date

import os
import sys
import urllib
import pprint
import tarfile

import scipy.misc

import csv
import os

In [2]:
class Model:
    def __init__(self, sess, conf, checkpoint_fname=None):
        """Build the graph, the summary ops and the saver, then try to restore weights.

        Args:
            sess: TensorFlow session used for every ``run`` call made by this model.
            conf: configuration object; the attributes read below (``num_hidden``,
                ``num_visible``, ``num_classes``, ``batch_size``, ``num_epochs``,
                ``learning_rate``, ``alpha``, ``seed``, ``model_name``,
                ``logs_dir``, ``model_type``, ``model_dir``) must exist on it.
            checkpoint_fname: optional explicit checkpoint to restore. When it is
                ``None`` the latest checkpoint recorded in ``conf.model_dir`` is
                tried instead.
        """
        self.sess = sess

        # Model input parameters
        self.num_hidden = conf.num_hidden
        self.num_visible = conf.num_visible
        self.num_classes = conf.num_classes

        # Learning hyperparameters
        self.hparams = {}
        self.hparams['batch_size'] = conf.batch_size
        self.hparams['num_epochs'] = conf.num_epochs
        self.hparams['learning_rate'] = conf.learning_rate
        ## Generative objective weight
        self.hparams['alpha'] = conf.alpha

        # Internal stuff
        self.seed = conf.seed

        # Logging and saving parameters
        self.model_name = conf.model_name
        self.logs_dir = conf.logs_dir
        self.model_type = conf.model_type
        self.model_dir = conf.model_dir

        self._build_model()

        with tf.variable_scope('summary'):
            scalar_summary_tags = ['training_accuracy', 'validation_accuracy']

            # One placeholder/op pair per tag so Python-side scalars can be
            # pushed into the event file through inject_summary().
            self.summary_placeholders = {}
            self.summary_ops = {}

            for tag in scalar_summary_tags:
                self.summary_placeholders[tag] = tf.placeholder('float32', None, name=tag)
                self.summary_ops[tag]  = tf.scalar_summary(tag, self.summary_placeholders[tag])

            # Event files are written below model_dir (logs_dir is only stored).
            self.writer = tf.train.SummaryWriter(os.path.join(self.model_dir, 'logs') , self.sess.graph)

        self.merged = tf.merge_all_summaries()

        self.saver = tf.train.Saver()

        # Initialise through the session we were given instead of relying on
        # whichever session happens to be the implicit default.
        self.sess.run(tf.initialize_all_variables())

        self._load_model_from_checkpoint(checkpoint_fname)
        
    def _load_model_from_checkpoint(self, checkpoint_fname=None):
        print(" [*] Loading checkpoints...")
        
        if checkpoint_fname is not None:
            self.saver.restore(self.sess, checkpoint_fname)
            return True        
        else:
            ckpt = tf.train.get_checkpoint_state(self.model_dir)    
            if ckpt and ckpt.model_checkpoint_path:
                ckpt_name = os.path.basename(ckpt.model_checkpoint_path)
                fname = os.path.join(self.model_dir, ckpt_name)
                self.saver.restore(self.sess, fname)
                print(" [*] Load SUCCESS: %s" % fname)
                return True
            else:
                print(" [!] Load FAILED: %s" % self.model_dir)
            return False
        
    def save_model_to_checkpoint(self, step):
        print(" [*] Saving checkpoints...")
        model_name = type(self).__name__

        if not os.path.exists(self.model_dir):
            os.makedirs(self.model_dir)
            
        return self.saver.save(self.sess, self.model_dir, global_step=step)
    
    
    def _build_model(self, with_init=True):
        """Assemble the graph under the ``self.model_name`` variable scope.

        Fills ``self.model`` with placeholders, parameters and gradient tensors
        and defines ``self.updates``, ``self.predicted_y``, ``self.ground_truth``,
        ``self.correct_prediction`` and ``self.accuracy``.

        Args:
            with_init: accepted but not used by this method.
        """
        self.model = {}
        nodes = self.model
        with tf.variable_scope(self.model_name):
            placeholders = self._create_placeholders()
            nodes['X'], nodes['Y'], nodes['learning_rate'] = placeholders

            # Initial weights are drawn from +-1/sqrt(largest layer size).
            largest_layer = np.max([self.num_hidden, self.num_classes, self.num_visible])
            init_bound = 1./np.sqrt(largest_layer)

            parameters = self._create_matrices(init_bound)
            nodes['U'], nodes['W'], nodes['b'], nodes['c'], nodes['d'] = parameters

            # Intermediate tensors shared by prediction and gradients.
            self._construct_internal_variables()

            gradients = self._define_gradients()
            nodes['d_U'], nodes['d_W'], nodes['d_b'], nodes['d_c'], nodes['d_d'] = gradients

            # Each parameter is moved along its gradient, scaled by the
            # learning rate that is fed at run time.
            self.updates = [
                nodes[param].assign_add(nodes['learning_rate'] * nodes['d_' + param])
                for param in ('U', 'W', 'b', 'c', 'd')
            ]

            self.predicted_y = tf.argmax(nodes['p_y_all_given_x'], 1)

            self.ground_truth = tf.argmax(nodes['Y'], 1)

            self.correct_prediction = tf.equal(self.predicted_y, self.ground_truth,
                                               name='correct_prediction')
            hits = tf.cast(self.correct_prediction, tf.float32)
            self.accuracy = tf.reduce_mean(hits, keep_dims=True, name='accuracy')

    
    def _construct_internal_variables(self):
        """Create the intermediate tensors used for prediction and training.

        Always adds ``Y_all_classes``, ``U_all_y``, ``WX``, ``O_all``,
        ``positive_part`` and ``p_y_all_given_x`` to ``self.model``. For the
        ``'drbm'`` and ``'hybrid'`` model types it additionally adds
        ``p_y_given_x``, ``UY``, ``O``, ``O_sigma``, ``O_sigma_all_Y`` and
        ``O_sigma_all_Y_p``, which the discriminative gradients read.
        """
        # Identity matrix, i.e. the one-hot encodings of all classes:
        # tensor of shape (num_classes x num_classes).
        # The vector of ones is built with an explicit shape and dtype; the
        # former positional `1` was interpreted as the dtype argument.
        self.model['Y_all_classes'] = tf.diag(tf.ones([self.num_classes], dtype=tf.float32),
                                              name='all_the_classes')

        # tensor of shape (num_hidden x num_classes)
        self.model['U_all_y'] = tf.matmul(self.model['U'], self.model['Y_all_classes'])

        self.model['WX'] , self.model['O_all'], self.model['positive_part'], self.model['p_y_all_given_x'] = self._give_p_all_y_given_x()

        if self.model_type == 'drbm' or self.model_type == 'hybrid':
            # Calculate p(y|x) for the label that was actually fed
            # Result : p_y_given_x (None x 1)
            self.model['p_y_given_x'] = tf.reshape(tf.reduce_sum(tf.mul(self.model['p_y_all_given_x'], self.model['Y']), 1), [-1, 1])

            # Training part

            # Calculate UY
            # Result : UY (None x num_hidden)
            self.model['UY'] = tf.matmul(self.model['Y'], tf.transpose(self.model['U']), name='UY')

            self.model['O'] = self.model['WX'] + self.model['UY']

            # O_sigma: (None x num_hidden)
            self.model['O_sigma'] = tf.sigmoid(self.model['O'])

            # O_sigma_all_Y : (None x num_hidden x num_classes)
            self.model['O_sigma_all_Y'] = tf.sigmoid(self.model['O_all'])

            # Class posteriors repeated along the hidden axis so they can weight
            # the per-class hidden activations element-wise.
            tiled_posteriors = tf.tile(tf.reshape(self.model['p_y_all_given_x'], [-1, 1, self.num_classes]), [1, self.num_hidden, 1])

            # O_sigma_all_Y_p : (None x num_hidden x num_classes)
            self.model['O_sigma_all_Y_p'] = tf.mul(self.model['O_sigma_all_Y'], tiled_posteriors)
            
    
    def _give_p_all_y_given_x(self):
        """Build p(y|x) for every class and every row of the input batch.

        Returns:
            A 4-tuple ``(WX, O, positive_part, p_y_all_given_x)`` where ``WX`` is
            (None x num_hidden), ``O`` is (None x num_hidden x num_classes) and
            the last two are (None x num_classes).
        """
        nodes = self.model
        n_hidden = self.num_hidden
        n_classes = self.num_classes

        # Input contribution to the hidden units: (None x num_hidden)
        visible_term = tf.matmul(nodes['X'], tf.transpose(nodes['W']), name='WX')

        # Add the hidden bias c to every row: (None x num_hidden)
        biased = visible_term + tf.transpose(nodes['c'])

        # Add the class contribution for each possible class by broadcasting:
        # (None x num_hidden x 1) + (1 x num_hidden x num_classes)
        per_class_activation = (tf.reshape(biased, [-1, n_hidden, 1])
                                + tf.reshape(nodes['U_all_y'], [1, n_hidden, n_classes]))

        # Class-bias term of log p(y|x): (1 x num_classes)
        class_bias_term = tf.matmul(tf.transpose(nodes['d']), nodes['Y_all_classes'])

        # Softplus summed over the hidden units: (None x num_classes)
        hidden_term = tf.reduce_sum(tf.nn.softplus(per_class_activation), 1)

        # Unnormalised log-probabilities: (None x num_classes)
        unnormalised = class_bias_term + hidden_term

        # The softmax normalises over the classes: (None x num_classes)
        posteriors = tf.nn.softmax(unnormalised)

        return visible_term, per_class_activation, unnormalised, posteriors
        
    def _define_gradients(self):
        """Build the update directions for U, W, b, c and d.

        Depending on ``self.model_type``:
          * ``'grbm'``   -- the generative gradients only,
          * ``'drbm'``   -- the discriminative gradients only (``d_b`` stays zero),
          * ``'hybrid'`` -- discriminative gradients plus ``alpha`` times the
            generative ones.

        Returns:
            Tuple ``(d_U, d_W, d_b, d_c, d_d)`` of tensors shaped like the
            corresponding parameters.
        """
        d_U = tf.zeros([self.num_hidden, self.num_classes], dtype=tf.float32, name='d_U')
        d_W = tf.zeros([self.num_hidden, self.num_visible], dtype=tf.float32, name='d_W')
        d_b = tf.zeros([self.num_visible, 1], dtype=tf.float32, name='d_b')
        d_c = tf.zeros([self.num_hidden, 1], dtype=tf.float32, name='d_c')
        d_d = tf.zeros([self.num_classes, 1], dtype=tf.float32, name='d_d')

        if self.model_type == 'grbm' or self.model_type == 'hybrid':
            # Generative gradients
            d_U_gen, d_W_gen, d_b_gen, d_c_gen, d_d_gen = self._calc_generative_grads(self.model['Y'], self.model['X'])
            if self.model_type == 'grbm':
                d_U = d_U_gen
                d_W = d_W_gen
                d_b = d_b_gen
                d_c = d_c_gen
                d_d = d_d_gen
            elif self.model_type == 'hybrid':
                d_U = d_U + self.hparams['alpha']* d_U_gen
                d_W = d_W + self.hparams['alpha'] * d_W_gen
                d_b = d_b + self.hparams['alpha'] * d_b_gen
                d_c = d_c + self.hparams['alpha'] * d_c_gen
                d_d = d_d + self.hparams['alpha'] * d_d_gen

        if self.model_type == 'drbm' or self.model_type == 'hybrid':
            # Discriminative gradients
            # Expected hidden activation under p(y|x): (None x num_hidden)
            expected_sigma = tf.reduce_sum(self.model['O_sigma_all_Y_p'], 2)

            # d_U: (num_hidden x num_classes)
            dU_left = tf.matmul(tf.transpose(self.model['O_sigma']), self.model['Y'])
            # U[j, y] only enters o_{y, j}, so the model term for column y is
            # sum_n p(y|x_n) * sigm(o_{y, j}(x_n)): reduce over the batch axis and
            # keep the class axis (not the class-summed expectation times Y).
            dU_right = tf.reduce_sum(self.model['O_sigma_all_Y_p'], 0)
            d_U_disc = tf.div(dU_left - dU_right, self.hparams['batch_size'])
            d_U_disc = tf.reshape(d_U_disc, [self.num_hidden, self.num_classes])

            # d_W : (num_hidden x num_visible)
            dW_left = tf.matmul(tf.transpose(self.model['O_sigma']), self.model['X'])
            dW_right = tf.matmul(tf.transpose(expected_sigma), self.model['X'])

            d_W_disc = tf.div(dW_left - dW_right, self.hparams['batch_size'])
            d_W_disc = tf.reshape(d_W_disc, [self.num_hidden, self.num_visible])

            # d_c : (num_hidden x 1)
            dc_left = tf.reduce_sum(self.model['O_sigma'], 0)
            dc_right = tf.reduce_sum(expected_sigma, 0)
            d_c_disc = tf.div(dc_left - dc_right, self.hparams['batch_size'])
            d_c_disc = tf.reshape(d_c_disc, [self.num_hidden, 1])

            # d_d : (num_classes x 1)
            d_d_disc = tf.div(tf.reduce_sum(self.model['Y'] - self.model['p_y_all_given_x'], 0), self.hparams['batch_size'])
            d_d_disc = tf.reshape(d_d_disc, [self.num_classes, 1])

            # The visible bias b does not appear in p(y|x), so d_b gets no
            # discriminative contribution.
            d_U = d_U + d_U_disc
            d_W = d_W + d_W_disc
            d_c = d_c + d_c_disc
            d_d = d_d + d_d_disc

        return d_U, d_W, d_b, d_c, d_d
        
                 

    def _calc_generative_grads(self, y, x):
        """Build the contrastive-divergence style gradients from one Gibbs step.

        Every gradient is the batch average of (data statistics - statistics
        after one Gibbs step). The bias gradients are averaged like the weight
        gradients so that all five directions share the same scale for any
        batch size.

        Args:
            y: label tensor, (None x num_classes).
            x: input tensor, (None x num_visible).

        Returns:
            Tuple ``(d_U, d_W, d_b, d_c, d_d)`` shaped (num_hidden x num_classes),
            (num_hidden x num_visible), (num_visible x 1), (num_hidden x 1) and
            (num_classes x 1).
        """
        y0, x0, h0, y1, x1, h1 = self._gibbs_sampling_step(y, x)

        # Turn every row into a column vector so batch_matmul yields outer products.
        h0 = tf.reshape(h0, [-1, self.num_hidden, 1])
        y0 = tf.reshape(y0, [-1, self.num_classes, 1])
        x0 = tf.reshape(x0, [-1, self.num_visible, 1])
        h1 = tf.reshape(h1, [-1, self.num_hidden, 1])
        y1 = tf.reshape(y1, [-1, self.num_classes, 1])
        x1 = tf.reshape(x1, [-1, self.num_visible, 1])

        d_U_gen = tf.reduce_mean(tf.batch_matmul(h0, y0, adj_y = True) - tf.batch_matmul(h1, y1, adj_y = True), 0)
        d_W_gen = tf.reduce_mean(tf.batch_matmul(h0, x0, adj_y = True) - tf.batch_matmul(h1, x1, adj_y = True), 0)

        # Averaged (not summed) over the batch, matching d_U_gen and d_W_gen.
        d_b_gen = tf.reduce_mean(x0 - x1, 0)
        d_c_gen = tf.reduce_mean(h0 - h1, 0)
        d_d_gen = tf.reduce_mean(y0 - y1, 0)

        return d_U_gen, d_W_gen, d_b_gen, d_c_gen, d_d_gen
        
    
    def _gibbs_sampling_step(self, y, x):
        """Run one Gibbs step starting from the fed labels and inputs.

        Args:
            y: label tensor, one row per example.
            x: input tensor, one row per example.

        Returns:
            ``(y0, x0, h0, y1, x1, h1)``: the starting labels/inputs with their
            hidden probabilities, followed by the sampled labels/inputs and the
            hidden probabilities computed from those samples.
        """
        def hidden_probabilities(labels, visible):
            # sigmoid(c + W x + U y), transposed back to one row per example
            return tf.nn.sigmoid(tf.transpose(
                self.model['c']
                + tf.matmul(self.model['W'], tf.transpose(visible))
                + tf.matmul(self.model['U'], tf.transpose(labels))))

        # Positive phase
        y0, x0 = y, x
        h0 = hidden_probabilities(y0, x0)

        # Negative phase: sample hidden states, then labels and inputs from them
        hidden_sample = self._sample_h(h0)
        y1 = self._sample_y(hidden_sample)
        x1 = self._sample_x(hidden_sample)
        h1 = hidden_probabilities(y1, x1)

        return y0, x0, h0, y1, x1, h1
    
    def _sample_prob(self, probs, size):
        """Draw independent Bernoulli samples with the given probabilities.

        The uniform noise takes its shape from ``probs`` at run time, so every
        row gets its own noise whatever the fed batch size is (previously the
        shape was fixed to ``hparams['batch_size']``).

        Args:
            probs: float tensor of probabilities.
            size: number of units per row; kept for interface compatibility,
                the shape is now read from ``probs``.

        Returns:
            float32 tensor of zeros and ones with the shape of ``probs``.
        """
        rand = tf.random_uniform(tf.shape(probs), minval=0.0, maxval=1.0, dtype=tf.float32)
        return tf.cast(rand < probs, tf.float32)

    def _sample_h(self, h_prob):            
        return self._sample_prob(h_prob, self.num_hidden)
    
    def _sample_y(self, h):
        """Sample one-hot labels given hidden states.

        Args:
            h: hidden-state tensor, one row per example (None x num_hidden).

        Returns:
            Tensor (None x num_classes) holding one one-hot label per row.
        """
        # Unnormalised log-probabilities d + U^T h, one row per example.
        logits = tf.transpose(self.model['d'] + tf.matmul(tf.transpose(self.model['U']), tf.transpose(h)))
        # tf.multinomial expects logits and normalises them itself; feeding it
        # softmax output would sample from a softmax of probabilities instead.
        sampled_class = tf.multinomial(logits, 1)
        squeezed_y = tf.squeeze(tf.one_hot(sampled_class, self.num_classes), [1])
        return tf.matmul(squeezed_y, self.model['Y_all_classes'])

    def _sample_x(self, h):
        """Sample binary visible units given hidden states ``h`` (one row per example)."""
        weights_t = tf.transpose(self.model['W'])
        # b + W^T h, computed with examples as columns
        pre_activation = self.model['b'] + tf.matmul(weights_t, tf.transpose(h))
        visible_probs = tf.nn.sigmoid(tf.transpose(pre_activation))
        return self._sample_prob(visible_probs, self.num_visible)
    
    def _create_placeholders(self):
        """Create the graph inputs.

        Returns:
            ``(X, Y, learning_rate)``: float32 placeholders of shape
            (None x num_visible) and (None x num_classes), plus a float32
            placeholder with unspecified shape for the learning rate.
        """
        inputs = tf.placeholder(tf.float32, [None, self.num_visible])
        labels = tf.placeholder(tf.float32, [None, self.num_classes])
        step_size = tf.placeholder(tf.float32)
        return inputs, labels, step_size
    
    def _create_matrices(self, m_sqrt):
        """Create the trainable parameters of the model.

        Args:
            m_sqrt: bound of the uniform initialisation; the weights are drawn
                from [-m_sqrt, m_sqrt].

        Returns:
            ``(U, W, b, c, d)`` with shapes (num_hidden x num_classes),
            (num_hidden x num_visible), (num_visible x 1), (num_hidden x 1) and
            (num_classes x 1). The three biases start at zero.
        """
        def uniform_weights(name, rows, cols):
            # NOTE(review): U and W are both seeded with self.seed -- confirm
            # that identically seeded initialisers are intended.
            initializer = tf.random_uniform_initializer(
                minval=-m_sqrt, maxval=m_sqrt, seed=self.seed, dtype=tf.float32)
            return tf.get_variable(name, [rows, cols], tf.float32, initializer, None)

        def zero_bias(name, length):
            return tf.get_variable(name, [length, 1], tf.float32,
                                   tf.zeros_initializer, None)

        U = uniform_weights('U', self.num_hidden, self.num_classes)
        W = uniform_weights('W', self.num_hidden, self.num_visible)
        b = zero_bias('b', self.num_visible)
        c = zero_bias('c', self.num_hidden)
        d = zero_bias('d', self.num_classes)

        return U, W, b, c, d
    
    def inject_summary(self, tag_dict, step):
        summary_str_lists = self.sess.run([self.summary_ops[tag] for tag in tag_dict.keys()], {
                self.summary_placeholders[tag]: value for tag, value in tag_dict.items()
            })
        
        for summary_str in summary_str_lists:
            self.writer.add_summary(summary_str, step)               
                
            
    def _get_timestamp(self):
        """Return the current local time formatted as ``YYYY_mm_dd_HH_MM_SS``."""
        local_zone = dateutil.tz.tzlocal()
        return datetime.datetime.now(local_zone).strftime('%Y_%m_%d_%H_%M_%S')
    

    def train(self, sess, data, with_update=False, debug_learning_rate=None):
        num_batches = np.shape(data.images)[0] / self.hparams['batch_size']
        
        
        accuracies = np.zeros(num_batches)
        
        for i in range(num_batches):
            x, y = data.next_batch(self.hparams['batch_size'])
            
            if with_update==True:
                if debug_learning_rate is not None:                    
                    _, accuracy = self.sess.run([self.updates, self.accuracy], feed_dict={self.model['X'] : self._binarise(x),
                                                                                          self.model['Y'] : y,
                                                                                          self.model['learning_rate'] : debug_learning_rate})
                else:
                    _, accuracy = self.sess.run([self.updates, self.accuracy], feed_dict={self.model['X'] : self._binarise(x),
                                                                                          self.model['Y'] : y,
                                                                                          self.model['learning_rate'] : model.hparams['learning_rate']})
                    
            else:
                accuracy = self.sess.run([self.accuracy], feed_dict={self.model['X'] : self._binarise(x),
                                                                     self.model['Y'] : y})
                
            accuracies[i] = accuracy[0]
        
        return np.mean(accuracies)
    
    def test(self, sess, data):
        accuracy =  self.sess.run([self.accuracy], feed_dict={self.model['X'] : self._binarise(data.images), self.model['Y'] : data.labels})
        return accuracy[0]
                                                 
    
    
    def _binarise(self, images):
        return (images > 0).astype('float32')

In [3]:
flags = tf.app.flags

# Every command-line flag as (definer, name, default, help), registered in
# the order listed.
_FLAG_SPECS = (
    # Model input parameters
    (flags.DEFINE_integer, "num_hidden", 6002, "number of hidden units"),
    (flags.DEFINE_integer, "num_visible", 28 * 28, "number of visible units"),
    (flags.DEFINE_integer, "num_classes", 10, "number of classes"),

    # Learning hyperparameters
    (flags.DEFINE_integer, "batch_size", 1, "batch size"),
    (flags.DEFINE_integer, "num_epochs", 70, "number of epochs"),
    (flags.DEFINE_float, "learning_rate", 0.005, "learning rate"),
    (flags.DEFINE_float, "alpha", 0.01, "generative objective weight"),

    # Debug
    (flags.DEFINE_string, "model_name", "my_model", "name of the model"),
    (flags.DEFINE_string, "model_type", "grbm", "type of the model : [drbm, grbm, hybrid]"),
    (flags.DEFINE_string, "model_dir", "./debug/models/", "directory of saved checkpoints"),
    (flags.DEFINE_string, "logs_dir", "./debug/logs/", "directory to save the logs"),
    (flags.DEFINE_integer, "seed", 123, "random seed for python"),
)

for _define, _flag_name, _default, _help in _FLAG_SPECS:
    _define(_flag_name, _default, _help)

# Parsed flag values; this object is what gets passed to Model as `conf`.
conf = flags.FLAGS

In [None]:
from tensorflow.examples.tutorials.mnist import input_data
# Load the MNIST splits from the 'MNIST' directory with one-hot labels.
mnist = input_data.read_data_sets('MNIST', one_hot=True)


def next_train_batch(x):
    """Return only the images of the next training batch of size ``x``."""
    return mnist.train.next_batch(x)[0]


def next_validation_batch(x):
    """Return only the images of the next validation batch of size ``x``."""
    return mnist.validation.next_batch(x)[0]


def next_test_batch(x):
    """Return only the images of the next test batch of size ``x``."""
    return mnist.test.next_batch(x)[0]

def calc_gpu_fraction(fraction_string):
    """Turn an ``"idx/num"`` string into a GPU memory fraction.

    The fraction is ``1 / (num - idx + 1)``; for example ``"1/2"`` gives 0.5
    and ``"2/2"`` gives 1.0. The computed value is also printed.

    Args:
        fraction_string: two numbers separated by a single ``/``.

    Returns:
        The fraction as a float.
    """
    idx, num = fraction_string.split('/')
    idx, num = float(idx), float(num)
    fraction = 1 / (num - idx + 1)
    # Function-call form works on Python 2 and 3 and matches the prints in Model.
    print(" [*] GPU : %.4f" % fraction)

    return fraction
# GPU options handed to every session: per-process memory fraction of 1.0.
gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=1.0)

Extracting MNIST/train-images-idx3-ubyte.gz
Extracting MNIST/train-labels-idx1-ubyte.gz
Extracting MNIST/t10k-images-idx3-ubyte.gz
Extracting MNIST/t10k-labels-idx1-ubyte.gz


In [None]:
# CSV file that receives one row per finished epoch of every configuration.
debug_results = 'debug_results.csv'

# learning_rates = [0.05, 0.01, 0.005, 0.001, 0.0005]
# num_hiddens = [100, 200, 600, 1000, 6000]
learning_rates = [0.005]
num_hiddens = [6002]

# Sweep over every (learning rate, hidden size) pair. The indices i and j come
# from enumerate so that each configuration gets its own variable scope name
# (they used to stay at 0 for every configuration).
for i, lr in enumerate(learning_rates):
    for j, nh in enumerate(num_hiddens):
        conf.learning_rate = lr
        conf.num_hidden = nh
        conf.model_name = str(conf.model_type) + '_lr_' + str(lr) + '_nh_' + str(nh)
        my_scope = str(conf.model_type) + str(i) + '_' + str(j)
        conf.model_dir = './debug/models/'+conf.model_name + '/'

        with tf.Session(config=tf.ConfigProto(allow_soft_placement=True, gpu_options=gpu_options)) as sess:
            with tf.variable_scope(my_scope) as scope:
                with tf.device('/device:GPU:0'):
                    model = Model(sess, conf)

                    epoch_train_accuracy = np.zeros(conf.num_epochs)
                    epoch_validation_accuracy = np.zeros(conf.num_epochs)
                    epoch_test_accuracy = np.zeros(conf.num_epochs)

                    # start_epoch = 16

                    for epoch in range(conf.num_epochs):
                        # The print calls below use a single pre-formatted string so
                        # they produce the same text on Python 2 and Python 3.
                        print('Epoch  %s  is starting...' % (epoch,))
                        model._load_model_from_checkpoint()

                        print('Training...')
                        avg_train_acc = model.train(sess, mnist.train, with_update=True)
                        print('Average training accuracy :  %s' % (avg_train_acc,))
                        epoch_train_accuracy[epoch] = avg_train_acc

                        model.inject_summary({'training_accuracy' : avg_train_acc}, epoch)

                        print('Validation...')
                        avg_val_acc = model.train(sess, mnist.validation, with_update=False)
                        print('Average validation accuracy :  %s' % (avg_val_acc,))
                        epoch_validation_accuracy[epoch] = avg_val_acc

                        model.inject_summary({'validation_accuracy' : avg_val_acc}, epoch)

                        # NOTE(review): as written the flag is True only when the
                        # validation accuracy rose strictly in each of the last five
                        # epochs, and it is only logged, never used to stop the loop.
                        # Confirm the intended early-stopping rule.
                        stop_training = False

                        if epoch > 5:
                            stop_training = True
                            for k in range(5):
                                if epoch_validation_accuracy[epoch - k]  <= epoch_validation_accuracy[epoch - (k+1)]:
                                    stop_training = False

                        print('Testing...')
                        test_acc = model.test(sess, mnist.test)
                        print('Testing accuracy :  %s' % (test_acc,))
                        epoch_test_accuracy[epoch] = test_acc

                        print("Saving checkpoints...")
                        save_path = model.save_model_to_checkpoint(epoch)
                        print("Checkpoint succesfully saved...")

                        # Append this epoch's results to the CSV.
                        with open(debug_results, 'a') as results_file:
                            writer = csv.writer(results_file, delimiter=',')
                            writer.writerow([conf.model_name, conf.model_type, epoch, conf.num_hidden,
                                         conf.learning_rate,
                                         avg_train_acc,
                                         avg_val_acc,
                                         test_acc, stop_training, save_path])
                                
                

 [*] Loading checkpoints...
 [!] Load FAILED: ./debug/models/grbm_lr_0.005_nh_6002/
Epoch  0  is starting...
 [*] Loading checkpoints...
 [!] Load FAILED: ./debug/models/grbm_lr_0.005_nh_6002/
Training...
