In [1]:
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

import tensorflow as tf

from scipy.io import loadmat
import os
import random
from scipy.misc import imread
os.environ["CUDA_VISIBLE_DEVICES"]="1"

In [2]:
def conv(x, filter_size, num_filters, stride, name, padding='SAME', groups=1, trainable=True):
    input_channels = int(x.get_shape()[-1])

    # Create lambda function for the convolution
    convolve = lambda x, W: tf.nn.conv2d(x, W, strides=[1, stride, stride, 1], padding=padding)

    with tf.variable_scope(name):
        # Create tf variables for the weights and biases of the conv layer
        weights = tf.get_variable('W',
                                  shape=[filter_size, filter_size, input_channels // groups, num_filters],
                                  initializer=tf.contrib.layers.xavier_initializer(),
                                  trainable=trainable)
        biases = tf.get_variable('b', shape=[num_filters], trainable=trainable, initializer=tf.zeros_initializer())

        if groups == 1:
            conv = convolve(x, weights)

        else:
            # Split input and weights and convolve them separately
            input_groups = tf.split(x, groups, axis=3)
            weight_groups = tf.split(weights, groups, axis=3)
            output_groups = [convolve(i, k) for i, k in zip(input_groups, weight_groups)]

            # Concat the convolved output together again
            conv = tf.concat(output_groups, axis=3)

        return tf.nn.relu(conv + biases)

def deconv_layer(x, filter_size, num_filters, stride, name, padding='SAME', relu=True):
    activation = None
    if relu:
        activation = tf.nn.relu
    return tf.layers.conv2d_transpose(x, num_filters, filter_size, stride, padding=padding, kernel_initializer=tf.contrib.layers.xavier_initializer(), activation=activation, name=name)
    
def fc(x, num_out, name, relu=True, trainable=True):
    num_in = int(x.get_shape()[-1])
    with tf.variable_scope(name):
        weights = tf.get_variable('W', shape=[num_in, num_out], initializer=tf.contrib.layers.xavier_initializer(), trainable=trainable)
        biases = tf.get_variable('b', [num_out], initializer=tf.zeros_initializer(), trainable=trainable)
        x = tf.matmul(x, weights) + biases
        if relu:
            x = tf.nn.relu(x) 
    return x

def lrn(x, radius, alpha, beta, name, bias=1.0):
    return tf.nn.local_response_normalization(x, depth_radius=radius, alpha=alpha, beta=beta, bias=bias, name=name)

def max_pool(x, filter_size, stride, name=None, padding='SAME'):
    return tf.nn.max_pool(x, ksize=[1, filter_size, filter_size, 1], strides=[1, stride, stride, 1], padding=padding, name=name)

def dropout(x, keep_prob):
    return tf.nn.dropout(x, keep_prob)

def vgg(input, process_input=True):
    if process_input:
        VGG_MEAN = [103.939, 116.779, 123.68]
        
        # Convert RGB to BGR and subtract mean
        red, green, blue = tf.split(input, 3, axis=3)
        input = tf.concat([
            blue - VGG_MEAN[0],
            green - VGG_MEAN[1],
            red - VGG_MEAN[2],
        ], axis=3)
        
    pool_ = lambda x: max_pool(x, 2, 2)
    conv_ = lambda x, output_depth, name: conv(x, 3, output_depth, 1, name=name)
    
    conv_1_1 = conv_(input, 64, 'conv1_1')
    conv_1_2 = conv_(conv_1_1, 64, 'conv1_2')

    pool_1 = pool_(conv_1_2)

    conv_2_1 = conv_(pool_1, 128, 'conv2_1')
    conv_2_2 = conv_(conv_2_1, 128, 'conv2_2')

    pool_2 = pool_(conv_2_2)

    conv_3_1 = conv_(pool_2, 256, 'conv3_1')
    conv_3_2 = conv_(conv_3_1, 256, 'conv3_2')
    conv_3_3 = conv_(conv_3_2, 256, 'conv3_3')

    pool_3 = pool_(conv_3_3)

    conv_4_1 = conv_(pool_3, 512, 'conv4_1')
    conv_4_2 = conv_(conv_4_1, 512, 'conv4_2')
    conv_4_3 = conv_(conv_4_2, 512, 'conv4_3')

    pool_4 = pool_(conv_4_3)

    conv_5_1 = conv_(pool_4, 512, 'conv5_1')
    conv_5_2 = conv_(conv_5_1, 512, 'conv5_2')
    conv_5_3 = conv_(conv_5_2, 512, 'conv5_3')

    pool_5 = pool_(conv_5_3)
    flattened = tf.contrib.layers.flatten(pool_5)

    fc_6 = dropout(fc(flattened, 4096, 'fc6'), 0.5)
    fc_7 = fc(fc_6, 4096, 'fc7', relu=False)
    return fc_7

def vgg_simple(input):
    pool_ = lambda x: max_pool(x, 2, 2)
    conv_ = lambda x, output_depth, name: conv(x, 3, output_depth, 1, name=name)
    
    conv_1_1 = conv_(input, 16, 'conv1_1')
    pool_1 = pool_(conv_1_1)

    conv_2_1 = conv_(pool_1, 32, 'conv2_1')
    pool_2 = pool_(conv_2_1)

    conv_3_1 = conv_(pool_2, 64, 'conv3_1')
    pool_3 = pool_(conv_3_1)

    conv_4_1 = conv_(pool_3, 64, 'conv4_1')
    pool_4 = pool_(conv_4_1)

    conv_5_1 = conv_(pool_4, 64, 'conv5_1')
    pool_5 = pool_(conv_5_1)
    
    flattened = tf.contrib.layers.flatten(pool_5)
    fc_6 = dropout(fc(flattened, 4096, 'fc6'), 0.5)
    fc_7 = fc(fc_6, 4096, 'fc7', relu=False)
    return fc_7

In [3]:
class Generator(object):
    def __init__(self):
        self.train_variables = []
        self.has_defined_layers = False
        self.has_defined_C1 = False
    
    def init_network(self, discriminator):
        self.p_t_n = tf.placeholder(tf.float32, [None, 224, 224,L])
        self.p_t = tf.placeholder(tf.float32, [None, 224, 224,L])
        self.x_t = tf.placeholder(tf.float32, [None, 224, 224, 3])
        self.x_t_n = tf.placeholder(tf.float32, [None, 224, 224, 3])
        x_t_n_predicted = self.get_output_tensor(self.p_t_n, self.p_t, self.x_t)
        mean_l2 = lambda x, y: tf.reduce_mean(tf.squared_difference(x, y))
        l2_loss = mean_l2(self.x_t_n, x_t_n_predicted)
        feat_loss = mean_l2(self.C1(self.x_t), self.C1(x_t_n_predicted))
        adv_loss = -tf.reduce_mean(tf.log(discriminator.get_output_tensor(x_t_n_predicted, self.p_t_n)))
        self.loss = 100*l2_loss + 100*feat_loss + 0.05 * adv_loss
        self.opt = tf.train.GradientDescentOptimizer(learning_rate=1e-5).minimize(self.loss, var_list=self.train_variables)
        with tf.name_scope('generator'):
            l2_loss_summ = tf.summary.scalar('l2_loss', l2_loss)
            feature_loss_summ = tf.summary.scalar('feature_loss', feat_loss)
            adversarial_loss_summ = tf.summary.scalar('adversarial_loss', adv_loss)
            loss_summ = tf.summary.scalar('loss', self.loss)
            self.summaries = tf.summary.merge([l2_loss_summ, feature_loss_summ, adversarial_loss_summ, loss_summ])
        
    def get_output_tensor(self, p_t_n, p_t, x_t):
        with tf.variable_scope('generator', reuse=self.has_defined_layers):
            p_t_n_latent = self.f_pose(p_t_n)
            latent = p_t_n_latent - self.f_pose(p_t, force_reuse=True) + self.f_img(x_t)
            output = self.f_dec(latent)
        if not self.has_defined_layers:
            self.train_variables = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope='generator')
        self.has_defined_layers = True
        return output
        
    def f_pose(self, input, force_reuse=False):
        '''
        Applies f_pose function to the input tensor to get an output. Should be similar to VGG architecture
        '''
        with tf.variable_scope('f_pose', reuse=(self.has_defined_layers or force_reuse)):
            return vgg_simple(input)
        
    def f_img(self, input):
        '''
        Applies f_img function to the input tensor to get an output. Should be exactly VGG architecture
        '''
        with tf.variable_scope('f_img', reuse=self.has_defined_layers):
            return vgg(input, process_input=True)
        
    def f_dec(self, input):
        '''
        Applies f_dec function to the input tensor to get an output.
        '''
        with tf.variable_scope('f_dec', reuse=self.has_defined_layers):
            reshaped = tf.reshape(input, shape=[tf.shape(input)[0], 1, 1, 4096])
            deconv_6_2 = deconv_layer(reshaped, 7, 128, 1, 'deconv6_2', padding='VALID')
            deconv_6_1 = deconv_layer(deconv_6_2, 3, 128, 2, 'deconv6_1')

            deconv_5_2 = deconv_layer(deconv_6_1, 3, 128, 1, 'deconv5_2')
            deconv_5_1 = deconv_layer(deconv_5_2, 3, 128, 2, 'deconv5_1')

            deconv_4_3 = deconv_layer(deconv_5_1, 3, 128, 1, 'deconv4_3')
            deconv_4_2 = deconv_layer(deconv_4_3, 3, 128, 1, 'deconv4_2')
            deconv_4_1 = deconv_layer(deconv_4_2, 3, 64, 2, 'deconv4_1')

            deconv_3_3 = deconv_layer(deconv_4_1, 3, 64, 1, 'deconv3_3')
            deconv_3_2 = deconv_layer(deconv_3_3, 3, 64, 1, 'deconv3_2')
            deconv_3_1 = deconv_layer(deconv_3_2, 3, 32, 2, 'deconv3_1')

            deconv_2_2 = deconv_layer(deconv_3_1, 3, 32, 1, 'deconv2_2')
            deconv_2_1 = deconv_layer(deconv_2_2, 3, 16, 2, 'deconv2_1')

            deconv_1_2 = deconv_layer(deconv_2_1, 3, 16, 1, 'deconv1_2')
            deconv_1_1 = deconv_layer(deconv_1_2, 3, 3, 1, 'deconv1_1')
        
        return deconv_1_1
        
    def C1(self, input):
        input = tf.image.resize_images(input, [227, 227])
        with tf.variable_scope('C1', reuse=self.has_defined_C1):
            conv1 = conv(input, 11, 96, 4, padding='VALID', name='conv1', trainable=False)
            pool1 = max_pool(conv1, 3, 2, padding='VALID', name='pool1')
            norm1 = lrn(pool1, 2, 2e-5, 0.75, name='norm1')

            conv2 = conv(norm1, 5, 256, 1, groups=2, name='conv2', trainable=False)
            pool2 = max_pool(conv2, 3, 2, padding='VALID', name='pool2')
            norm2 = lrn(pool2, 2, 2e-5, 0.75, name='norm2')

            conv3 = conv(norm2, 3, 384, 1, name='conv3', trainable=False)
            conv4 = conv(conv3, 3, 384, 1, groups=2, name='conv4', trainable=False)
            conv5 = conv(conv4, 3, 256, 1, groups=2, name='conv5', trainable=False)
        self.has_defined_C1 = True
        return conv5
    
    def init_weights(self, sess, alexnet_file, vgg_file):
        weights_dict = np.load(alexnet_file, encoding='bytes').item()
        with tf.variable_scope('C1', reuse=True):
            for layer in ['conv1', 'conv2', 'conv3', 'conv4', 'conv5']:
                with tf.variable_scope(layer):
                    W_value, b_value = weights_dict[layer]
                    W = tf.get_variable('W', trainable=False)
                    b = tf.get_variable('b', trainable=False)
                    sess.run(W.assign(W_value))
                    sess.run(b.assign(b_value))
        weights_dict = np.load(vgg_file, encoding='bytes').item()
        weights_dict = { key.decode('ascii') : value for key, value in weights_dict.items() }
        with tf.variable_scope('generator/f_img', reuse=True):
            for layer in ['conv1_1', 'conv1_2',
                          'conv2_1', 'conv2_2',
                          'conv3_1', 'conv3_2', 'conv3_3',
                          'conv4_1', 'conv4_2', 'conv4_3',
                          'conv5_1', 'conv5_2', 'conv5_3',
                          'fc6', 'fc7']:
                with tf.variable_scope(layer):
                    W_value, b_value = weights_dict[layer]
                    W = tf.get_variable('W')
                    b = tf.get_variable('b')
                    sess.run(W.assign(W_value))
                    sess.run(b.assign(b_value))
    
    def fit_batch(self,sess, p_t, p_t_n, x_t, x_t_n):
        _, loss, summaries = sess.run((self.opt, self.loss, self.summaries), feed_dict={ self.p_t : p_t, self.p_t_n : p_t_n, self.x_t : x_t, self.x_t_n : x_t_n })
        return loss, summaries

class Discriminator(object):
    def __init__(self):
        self.train_variables = []
        self.has_defined_layers = False
    
    def init_network(self, discriminator):
        self.p_t = tf.placeholder(tf.float32, [None, 224, 224,L])
        self.p_t_n = tf.placeholder(tf.float32, [None, 224, 224,L])
        self.x_t = tf.placeholder(tf.float32, [None, 224, 224, 3])
        self.x_t_n = tf.placeholder(tf.float32, shape=[None, 224, 224, 3])
        x_t_n_real = self.x_t_n
        x_t_n_pred = generator.get_output_tensor(self.p_t_n, self.p_t, self.x_t)

        real_prob = self.get_output_tensor(x_t_n_real, self.p_t_n)
        fake_prob = self.get_output_tensor(x_t_n_pred, self.p_t_n)
        real_mismatch_prob = self.get_output_tensor(self.x_t, self.p_t_n)
        
        real_loss = -tf.reduce_mean(tf.log(real_prob))
        fake_loss = -tf.reduce_mean(tf.log(1 - fake_prob))
        mismatch_loss = -tf.reduce_mean(tf.log(1 - real_mismatch_prob))
        self.loss = real_loss + 0.5 * fake_loss + 0.5 * mismatch_loss
        self.opt = tf.train.GradientDescentOptimizer(learning_rate=1e-5).minimize(self.loss, var_list=self.train_variables)
        with tf.name_scope('discriminator'):
            real_loss_summ = tf.summary.scalar('real_loss', real_loss)
            fake_loss_summ = tf.summary.scalar('fake_loss', fake_loss)
            mismatch_loss_summ = tf.summary.scalar('mismatch_loss', mismatch_loss)
            loss_summ = tf.summary.scalar('loss', self.loss)
            self.summaries = tf.summary.merge([real_loss_summ, fake_loss_summ, mismatch_loss_summ, loss_summ])
            
    def get_output_tensor(self, x, p):
        with tf.variable_scope('discriminator', reuse=self.has_defined_layers):
            with tf.variable_scope('f_img'):
                vgg_x = vgg(x)
            with tf.variable_scope('f_pose'):
                vgg_p = vgg_simple(p)
            concat = tf.concat([vgg_x, vgg_p], axis=1)
            fc8 = fc(concat, 1024, name='fc8')
            output = tf.nn.sigmoid(fc(fc8, 1, name='fc9', relu=False))
        if not self.has_defined_layers:
            self.train_variables = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope='discriminator')
        self.has_defined_layers = True
        return output
    
    def init_weights(self, sess, alexnet_file, vgg_file):
        weights_dict = np.load(vgg_file, encoding='bytes').item()
        weights_dict = { key.decode('ascii') : value for key, value in weights_dict.items() }
        with tf.variable_scope('discriminator/f_img', reuse=True):
            for layer in ['conv1_1', 'conv1_2',
                          'conv2_1', 'conv2_2',
                          'conv3_1', 'conv3_2', 'conv3_3',
                          'conv4_1', 'conv4_2', 'conv4_3',
                          'conv5_1', 'conv5_2', 'conv5_3',
                          'fc6', 'fc7']:
                with tf.variable_scope(layer):
                    W_value, b_value = weights_dict[layer]
                    W = tf.get_variable('W')
                    b = tf.get_variable('b')
                    sess.run(W.assign(W_value))
                    sess.run(b.assign(b_value))
    
    def fit_batch(self, sess, p_t, p_t_n, x_t, x_t_n):
        _, loss, summaries = sess.run((self.opt, self.loss, self.summaries), feed_dict={ self.p_t : p_t, self.p_t_n : p_t_n, self.x_t : x_t, self.x_t_n : x_t_n })
        return loss, summaries

In [4]:
# tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='')

In [5]:
def restore(generator, discriminator, checkpoint):
    var_list = generator.train_variables + discriminator.train_variables
    
#     saver = tf.train.Saver(var_list={var.name.split(':')[0].replace(, 'alexnet') : var for var in var_list})
    saver.restore(sess, checkpoint)

In [6]:
L = 13

In [7]:
tf.reset_default_graph()
sess = tf.Session()
summary_writer = tf.summary.FileWriter('summaries/', graph=sess.graph)

generator = Generator()
discriminator = Discriminator()

generator.init_network(discriminator)
discriminator.init_network(generator)

sess.run(tf.global_variables_initializer())

generator.init_weights(sess, 'models/alexnet.npy', 'models/vgg16.npy')
discriminator.init_weights(sess, 'models/alexnet.npy', 'models/vgg16.npy')

In [8]:
# Create dictionary of squat videos. 
# Key = video number, Value = list of frames (numpy array images) of video
videos = {}
for video in os.listdir('squats/'):
    videos[video] = []
    for frame in os.listdir('squats/' + str(video) + '/'):
        filename = 'squats/' + str(video) + '/' + str(frame)
        videos[video].append(imread(filename))

# Create dictionary of heatmat labels for squat videos. 
# For L = 13
# Key = video number, Value = list of stack of joints (numpy array images (224x224x13))
if L == 13:
    labels = {}
    for video in os.listdir('squats_labels_multiple/'):
        if video in videos:
            labels[video] = []
            for frame in os.listdir('squats_labels_multiple/' + str(video) + '/'):
                frame_folder = 'squats_labels_multiple/' + str(video) + '/' + str(frame) + '/'
                temp_image_stack = np.zeros((224,224,13))
                i = 0
                for filename in os.listdir(frame_folder):
                    temp_image_stack[:,:,i] = imread(frame_folder + filename)
                    i = i + 1
                labels[video].append(temp_image_stack)

# For L = 1 
# Key = video number, Value = list of heatmaps for each frame (numpy array images (224x224x1))            
elif L == 1:
    labels = {}
    for video in os.listdir('squats_labels/'):
        if video in videos:
            labels[video] = []
            for frame in os.listdir('squats_labels/' + str(video) + '/'):
                filename = 'squats_labels/' + str(video) + '/' + str(frame)
                labels[video].append(imread(filename).reshape((224,224,1)))

In [None]:
def create_minibatch(batch_size):
    frames1 = []
    heatmaps1 = []
    frames2 = []
    heatmaps2 = []
    for i in range(batch_size):
        rand_video = videos.keys()[random.randint(0,len(videos.keys())-1)]
        
        rand_int = random.randint(0,len(videos[rand_video])-1)
        frames1.append(videos[rand_video][rand_int])
        heatmaps1.append(labels[rand_video][rand_int])

        rand_int = random.randint(0,len(videos[rand_video])-1)
        frames2.append(videos[rand_video][rand_int])
        heatmaps2.append(labels[rand_video][rand_int])
    return frames1, frames2, heatmaps1, heatmaps2


In [None]:
epochs = 100
n_samples = 1000
batch_size = 2
summary_freq_iter = 10
display_step = 1

mean_gen_losses = []
mean_disc_losses = []
for epoch in range(epochs):
    total_iter = n_samples // batch_size
    total_gen_loss = 0
    total_disc_loss = 0
    for i in range(total_iter):
        f1,f2,h1,h2 = create_minibatch(batch_size)
        gen_loss, gen_summaries = generator.fit_batch(sess,h1,h2,f1,f2)
        disc_loss, disc_summaries = discriminator.fit_batch(sess,h1,h2,f1,f2)
        total_gen_loss += gen_loss
        total_disc_loss += disc_loss
        if i % summary_freq_iter == 0:
            step = epoch * n_samples + (i + 1) * batch_size
            summary_writer.add_summary(gen_summaries, step)
            summary_writer.add_summary(disc_summaries, step)
    mean_gen_loss = total_gen_loss / total_iter
    mean_disc_loss = total_disc_loss / total_iter
    mean_gen_losses.append(mean_gen_loss)
    mean_disc_losses.append(mean_disc_loss)
    if (epoch + 1) % display_step == 0:
        print('epoch %s: gen_loss=%.4f, disc_loss=%.4f' % (epoch + 1, mean_gen_loss, mean_disc_loss))

saver = tf.train.Saver()
saver.save(sess, '/media/jeffzhang/WD HDD/model/multi-labels-test3-gradient-opt-100-100-005',global_step=100)

epoch 1: gen_loss=inf, disc_loss=1.5596
epoch 2: gen_loss=1358344.4966, disc_loss=1.4132
epoch 3: gen_loss=1378886.7206, disc_loss=1.3245
