Take two of NST (neural style transfer)


In [1]:
import os
import tensorflow as tf
import cv2
import numpy as np
import time
import scipy.io
import errno


  from ._conv import register_converters as _register_converters


In [2]:
# Run configuration: every tunable option of the notebook lives in this dictionary.
# Paths are assembled component by component with os.path.join so that no
# backslash ends up inside a string literal (sequences such as '\i' and '\s'
# are invalid escapes) and the separator matches the host platform.
inputs = {
    'content_image' : os.path.join('.', 'image_input', 'not_id.jpg'),
    # string - path to content image
    'content_weight' : 100,
    # float - alpha for total loss function
    'content_layers' : ['conv4_2'],
    # list of strings - list of layers used for content loss
    'content_layers_weights' : [1.0],
    # list of floats - content layers weights for content loss
    'content_loss_function' : 1,
    # integer - 1 or 2 or 3
    'style_images' : [os.path.join('.', 'styles', '1.jpg')],
    # list of strings - list of paths to style images
    'style_images_weights' : [1.0],
    # list of floats - list of style images' weights
    'style_weight' : 10000,
    # float - beta for total loss function
    'style_layers' : ['relu1_1', 'relu2_1', 'relu3_1', 'relu4_1', 'relu5_1'],
    # list of strings - layers used for style loss
    'style_layers_weights' : [0.2, 0.2, 0.2, 0.2, 0.2],
    # list of floats - list of style layers weights
    'total_variation_weight' : .001,
    # float - total variation weight for the total loss function
    'image_max_size' : 700,
    # int - maximum dimension (width or height) of images in question
    'verbose' : True,
    # boolean - print a lot or not
    'generated_image_initialization' : 'random',
    # string from 'content', 'style', 'random'
    'noise_ratio' : 0.3,
    # float from 0.0 to 1.0 - interpolation ratio from content image to a full noise image
    'model_weights' : 'imagenet-vgg-verydeep-19.mat',
    # string - path to pre-trained model weights (.mat file)
    'pooling_type' : 'avg',
    # string either 'avg' or 'max'
    'device' : '/gpu:0',
    # string - device used to run tensorflow '/cpu:0' or '/gpu:0' or else
    'style_mask' : False,
    # boolean - use mask
    'style_masks_images' : [os.path.join('.', 'image_input', 'face_mask_inv.png')],
    # list of strings - list of paths to style mask images
    # !NB number of style_mask_images should be equal to number of styles
    'original_colors' : False,
    # boolean - use original content colors
    'color_convert_type' : 'yuv',
    # string - one of ['yuv', 'ycrcb', 'luv', 'lab']
    'optimizer_function' : 'adam',
    # string - either 'lbfgs' or 'adam'
    'learning_rate' : 3.0,
    # float - learning rate for optimization
    'max_iterations' : 500,
    # integer - number of max iterations done in image generation
}

def normalize(weights):
    """Scale a list of weights so that they sum to one.

    Args:
        weights: sequence of numbers.

    Returns:
        A new list of floats, each entry divided by the sum of ``weights``.
        When the sum is not strictly positive, a list of zeros with the
        same length is returned instead.
    """
    total = sum(weights)
    if not total > 0.:
        # Nothing sensible to scale by: fall back to all-zero weights.
        return [0.] * len(weights)
    return [float(value) / total for value in weights]
    
# Rescale every weight list in the configuration so that its entries sum to one.
inputs.update({
    weight_key: normalize(inputs[weight_key])
    for weight_key in ('style_layers_weights', 'content_layers_weights', 'style_images_weights')
})

def check_image(img):
    """Verify that an image file can be loaded by OpenCV.

    Args:
        img: path to the image file.

    Raises:
        OSError: with errno ENOENT when ``cv2.imread`` returns ``None``
            for the given path.
    """
    loaded = cv2.imread(img, cv2.IMREAD_GRAYSCALE)
    if loaded is not None:
        return
    raise OSError(errno.ENOENT, "No such file", img)
        
# Fail fast on unreadable input files before any graph is built.
check_image(inputs['content_image'])
for style_image in inputs['style_images']:
    check_image(style_image)
# Mask images are only consumed when masking is switched on, so a missing
# mask file must not block an unmasked run.
if inputs['style_mask']:
    # The masked loss pairs styles with masks via zip(), which would silently
    # drop the surplus entries if the two lists had different lengths.
    if len(inputs['style_masks_images']) != len(inputs['style_images']):
        raise ValueError(
            "number of style mask images ({}) must equal number of style images ({})".format(
                len(inputs['style_masks_images']), len(inputs['style_images'])))
    for style_mask_image in inputs['style_masks_images']:
        check_image(style_mask_image)

In [3]:
def get_content_image(content_image, image_max_size):
    """Load the content image, shrink it to fit ``image_max_size`` and preprocess it.

    Args:
        content_image: path to the content image file.
        image_max_size: largest allowed width or height in pixels.

    Returns:
        The result of ``preprocess`` applied to the (possibly resized)
        float32 image.

    Raises:
        OSError: when OpenCV cannot load the file.
    """
    img = cv2.imread(content_image, cv2.IMREAD_COLOR)
    if img is None:
        # cv2.imread signals failure by returning None rather than raising.
        raise OSError(errno.ENOENT, "No such file", content_image)
    img = img.astype(np.float32)
    h, w = img.shape[:2]
    # Shrink along the longer side, keeping the aspect ratio. The branches are
    # mutually exclusive: after a portrait resize the width is already below the limit.
    if h > w and h > image_max_size:
        # max(1, ...) keeps extreme aspect ratios from truncating to a zero-sized image.
        new_w = max(1, int((float(image_max_size) / float(h)) * w))
        img = cv2.resize(img, dsize=(new_w, image_max_size), interpolation=cv2.INTER_AREA)
    elif w > image_max_size:
        new_h = max(1, int((float(image_max_size) / float(w)) * h))
        img = cv2.resize(img, dsize=(image_max_size, new_h), interpolation=cv2.INTER_AREA)
    return preprocess(img)

def preprocess(img):
    """Convert an OpenCV-style image into the network input layout.

    The channel order is reversed (BGR to RGB), a leading batch axis is
    added, and a fixed per-channel offset is subtracted (presumably the
    VGG training-set channel means -- confirm against the weights file).

    The input array is left untouched: the previous in-place subtraction
    operated on a view and therefore modified the caller's data.

    Args:
        img: array of shape (h, w, d) with d == 3. Non-floating inputs are
            converted to float32 first.

    Returns:
        A new array of shape (1, h, w, d) with the same floating dtype
        as the (converted) input.
    """
    img = np.asarray(img)
    if not np.issubdtype(img.dtype, np.floating):
        img = img.astype(np.float32)
    # bgr to rgb
    rgb = img[..., ::-1]
    # shape (h, w, d) to (1, h, w, d)
    batched = rgb[np.newaxis, :, :, :]
    channel_offset = np.array([123.68, 116.779, 103.939]).reshape((1, 1, 1, 3))
    # Subtract out-of-place, then cast back so the result dtype matches the input.
    return (batched - channel_offset).astype(img.dtype)

def get_style_images(style_images, content_image):
    """Load every style image, resized to the content image's spatial size.

    Args:
        style_images: list of paths to style image files.
        content_image: preprocessed content image of shape (1, h, w, d);
            only its height and width are used.

    Returns:
        List of preprocessed style images, one per input path, in order.

    Raises:
        OSError: when OpenCV cannot load one of the files.
    """
    _, ch, cw, _ = content_image.shape
    style_imgs = []
    for style_fn in style_images:
        # bgr image
        img = cv2.imread(style_fn, cv2.IMREAD_COLOR)
        if img is None:
            # cv2.imread signals failure by returning None rather than raising.
            raise OSError(errno.ENOENT, "No such file", style_fn)
        img = img.astype(np.float32)
        img = cv2.resize(img, dsize=(cw, ch), interpolation=cv2.INTER_AREA)
        style_imgs.append(preprocess(img))
    return style_imgs

def get_initial_image(generated_image_initialization, content_image, style_images, noise_ratio, frame=None):
    """Choose the image the optimisation starts from.

    Args:
        generated_image_initialization: one of 'content', 'style', 'random'.
        content_image: preprocessed content image.
        style_images: list of preprocessed style images; the first one is
            used for 'style' initialisation.
        noise_ratio: blend factor for 'random'; 0.0 yields the content
            image, 1.0 yields pure noise.
        frame: unused; kept for the video modes ('prev', 'prev_warped')
            that are not implemented in this notebook.

    Returns:
        The initial image array.

    Raises:
        ValueError: for an unrecognised initialisation name (previously the
            function fell through and returned None).
    """
    if generated_image_initialization == 'content':
        return content_image
    if generated_image_initialization == 'style':
        return style_images[0]
    if generated_image_initialization == 'random':
        # Uniform noise in [-20, 20) blended with the content image.
        noise_image = np.random.uniform(-20., 20., content_image.shape).astype(np.float32)
        return noise_ratio * noise_image + (1.-noise_ratio) * content_image
    raise ValueError(
        "generated_image_initialization must be 'content', 'style' or 'random', got {!r}".format(
            generated_image_initialization))
        


In [4]:
# debug_piece
#print(inputs['content_image'])
#for style_image in inputs['style_images']:
    #print(style_image)
#for mask_image in inputs['style_masks_images']:
    #print(mask_image)
#content_image = get_content_image(inputs['content_image'], inputs['image_max_size'])
#style_images = get_style_images(inputs['style_images'], content_image)
#initial_image = get_initial_image(inputs['generated_image_initialization'], content_image, style_images, inputs['noise_ratio'])


In [5]:
def conv_layer(layer_name, layer_input, W, verbose = True):
    """Apply a stride-1, SAME-padded 2-D convolution.

    Args:
        layer_name: label used only in the verbose printout.
        layer_input: input tensor passed to ``tf.nn.conv2d``.
        W: filter tensor passed to ``tf.nn.conv2d``.
        verbose: when true, print the output and filter shapes.

    Returns:
        The convolution output tensor (no bias, no activation).
    """
    output = tf.nn.conv2d(layer_input, W, strides=[1, 1, 1, 1], padding='SAME')
    if not verbose:
        return output
    summary = '--{} | shape={} | weights_shape={}'.format(
        layer_name, output.get_shape(), W.get_shape())
    print(summary)
    return output

def relu_layer(layer_name, layer_input, b, verbose = True):
    """Add a bias to the input and apply ReLU.

    Args:
        layer_name: label used only in the verbose printout.
        layer_input: tensor to which the bias is added.
        b: bias tensor.
        verbose: when true, print the output and bias shapes.

    Returns:
        The activated tensor.
    """
    activated = tf.nn.relu(layer_input + b)
    if not verbose:
        return activated
    summary = '--{} | shape={} | bias_shape={}'.format(
        layer_name, activated.get_shape(), b.get_shape())
    print(summary)
    return activated

def pool_layer(layer_name, layer_input, verbose = True, pooling_type = 'avg'):
    """Apply 2x2, stride-2, SAME-padded pooling.

    Args:
        layer_name: label used only in the verbose printout.
        layer_input: tensor to pool.
        verbose: when true, print the output shape.
        pooling_type: 'avg' for average pooling or 'max' for max pooling.

    Returns:
        The pooled tensor.

    Raises:
        ValueError: for any other ``pooling_type`` (previously this surfaced
            as an UnboundLocalError on the result variable).
    """
    # Validate first so a bad option fails with a clear message before any op is created.
    if pooling_type not in ('avg', 'max'):
        raise ValueError("pooling_type must be 'avg' or 'max', got {!r}".format(pooling_type))
    pool_op = tf.nn.avg_pool if pooling_type == 'avg' else tf.nn.max_pool
    pool = pool_op(layer_input, ksize=[1, 2, 2, 1], strides=[1, 2, 2, 1], padding='SAME')
    if verbose:
        print('--{}   | shape={}'.format(layer_name, pool.get_shape()))
    return pool

def get_weights(vgg_layers, i):
    """Return the filter weights of layer ``i`` as a TensorFlow constant.

    Args:
        vgg_layers: the 'layers' structure loaded from the model .mat file.
        i: index of the layer within ``vgg_layers``.

    Returns:
        ``tf.constant`` wrapping the value at ``vgg_layers[i][0][0][2][0][0]``.
    """
    kernel = vgg_layers[i][0][0][2][0][0]
    return tf.constant(kernel)

def get_bias(vgg_layers, i):
    """Return the bias of layer ``i`` as a flat TensorFlow constant.

    Args:
        vgg_layers: the 'layers' structure loaded from the model .mat file.
        i: index of the layer within ``vgg_layers``.

    Returns:
        ``tf.constant`` holding the value at ``vgg_layers[i][0][0][2][0][1]``
        reshaped to one dimension.
    """
    raw_bias = vgg_layers[i][0][0][2][0][1]
    # Flatten to a 1-D vector with one entry per element.
    flat_bias = np.reshape(raw_bias, (raw_bias.size))
    return tf.constant(flat_bias)

In [6]:
def build_model(input_image, model_weights, pooling_type, verbose = True):
    """Build the VGG-19 convolutional stack on top of a trainable input variable.

    Fixes two argument-passing defects of the previous version:
    ``pooling_type`` was passed positionally into ``pool_layer``'s ``verbose``
    slot (so the configured pooling was ignored and 'avg' was always used),
    and ``verbose`` was never forwarded to the layer helpers (so they always
    printed).

    Args:
        input_image: array of shape (1, h, w, d); only its shape is used to
            size the input variable.
        model_weights: path to the pre-trained weights .mat file.
        pooling_type: 'avg' or 'max', forwarded to every pooling layer.
        verbose: when true, print progress and per-layer shapes.

    Returns:
        Dict mapping layer names ('input', 'conv1_1', 'relu1_1', ..., 'pool5')
        to their tensors.
    """
    if verbose: print('\nBUILDING VGG-19 NETWORK')
    net = {}
    _, h, w, d     = input_image.shape

    if verbose: print('loading model weights...')
    vgg_rawnet     = scipy.io.loadmat(model_weights)
    vgg_layers     = vgg_rawnet['layers'][0]
    if verbose: print('constructing layers...')
    # The input is a Variable because it is the quantity being optimised.
    net['input']   = tf.Variable(np.zeros((1, h, w, d), dtype=np.float32))

    # For each group: (conv/relu name suffix, index of that conv layer in vgg_layers).
    layer_groups = [
        (1, [('1_1', 0), ('1_2', 2)]),
        (2, [('2_1', 5), ('2_2', 7)]),
        (3, [('3_1', 10), ('3_2', 12), ('3_3', 14), ('3_4', 16)]),
        (4, [('4_1', 19), ('4_2', 21), ('4_3', 23), ('4_4', 25)]),
        (5, [('5_1', 28), ('5_2', 30), ('5_3', 32), ('5_4', 34)]),
    ]

    previous = net['input']
    for group, convs in layer_groups:
        if verbose: print('LAYER GROUP {}'.format(group))
        for suffix, layer_index in convs:
            conv_name = 'conv' + suffix
            relu_name = 'relu' + suffix
            net[conv_name] = conv_layer(conv_name, previous,
                                        W=get_weights(vgg_layers, layer_index), verbose=verbose)
            net[relu_name] = relu_layer(relu_name, net[conv_name],
                                        b=get_bias(vgg_layers, layer_index), verbose=verbose)
            previous = net[relu_name]
        pool_name = 'pool{}'.format(group)
        # Keyword arguments: positional passing is what mixed up verbose and pooling_type.
        net[pool_name] = pool_layer(pool_name, previous, verbose=verbose, pooling_type=pooling_type)
        previous = net[pool_name]

    return net

In [7]:
def get_mask_image(mask_img, width, height):
    """Load a grayscale mask, resize it and scale it so its maximum is 1.

    Args:
        mask_img: path to the mask image file.
        width: target width in pixels.
        height: target height in pixels.

    Returns:
        float32 array of shape (height, width). An all-black mask is
        returned as all zeros instead of NaNs.

    Raises:
        OSError: when OpenCV cannot load the file.
    """
    img = cv2.imread(mask_img, cv2.IMREAD_GRAYSCALE)
    if img is None:
        # cv2.imread signals failure by returning None rather than raising.
        raise OSError(errno.ENOENT, "No such file", mask_img)
    img = cv2.resize(img, dsize=(width, height), interpolation=cv2.INTER_AREA)
    img = img.astype(np.float32)
    mx = np.amax(img)
    # Guard against dividing by zero when the mask contains no non-zero pixel.
    if mx > 0:
        img /= mx
    return img

def mask_style_layer(a, x, mask_img):
    """Multiply two activation tensors element-wise by a spatial mask.

    The mask is loaded at the activations' width and height, replicated
    across the channel axis and given a leading batch axis.

    Args:
        a: activation tensor whose static shape (_, h, w, d) sizes the mask.
        x: second activation tensor, masked the same way.
        mask_img: path to the mask image file.

    Returns:
        Tuple ``(a_masked, x_masked)``.
    """
    _, height, width, depth = a.get_shape()
    mask_2d = tf.convert_to_tensor(get_mask_image(mask_img, width.value, height.value))
    # Repeat the 2-D mask once per channel: (h, w) -> (h, w, d).
    mask_hwd = tf.stack([mask_2d] * depth.value, axis=2)
    # Add the batch axis: (h, w, d) -> (1, h, w, d).
    mask_full = tf.expand_dims(mask_hwd, 0)
    masked_a = tf.multiply(a, mask_full)
    masked_x = tf.multiply(x, mask_full)
    return masked_a, masked_x

In [8]:
def style_layer_loss(a, x):
    """Squared Gram-matrix difference between two activations of one layer.

    Args:
        a: activation tensor with static shape (_, h, w, d).
        x: activation tensor of the same layer to compare against ``a``.

    Returns:
        Scalar tensor: sum of squared Gram differences scaled by
        1 / (4 * d**2 * (h*w)**2).
    """
    _, h, w, d = a.get_shape()
    area = h.value * w.value
    depth = d.value
    scale = (1./(4 * depth**2 * area**2))
    gram_a = gram_matrix(a, area, depth)
    gram_x = gram_matrix(x, area, depth)
    squared_diff = tf.reduce_sum(tf.pow((gram_x - gram_a), 2))
    return scale * squared_diff

def gram_matrix(x, area, depth):
    """Compute the Gram matrix of a feature tensor.

    Args:
        x: tensor reshaped to (area, depth).
        area: number of rows of the reshaped feature matrix.
        depth: number of columns of the reshaped feature matrix.

    Returns:
        (depth, depth) tensor equal to F^T F, where F is the reshaped input.
    """
    features = tf.reshape(x, (area, depth))
    features_t = tf.transpose(features)
    return tf.matmul(features_t, features)

def content_layer_loss(p, x, content_loss_function):
    """Scaled squared error between two activations of one layer.

    Args:
        p: activation tensor with static shape (_, h, w, d).
        x: activation tensor of the same layer to compare against ``p``.
        content_loss_function: selects the scale K applied to the summed
            squared error, with M = h*w and N = d:
            1 -> 1 / (2 * sqrt(N) * sqrt(M)),
            2 -> 1 / (N * M),
            3 -> 1 / 2.

    Returns:
        Scalar loss tensor.

    Raises:
        ValueError: when ``content_loss_function`` is not 1, 2 or 3.
    """
    # Validate up front; options 2 and 3 previously referenced an undefined
    # ``args`` object and raised NameError.
    if content_loss_function not in (1, 2, 3):
        raise ValueError(
            "content_loss_function must be 1, 2 or 3, got {!r}".format(content_loss_function))
    _, h, w, d = p.get_shape()
    M = h.value * w.value
    N = d.value
    if content_loss_function == 1:
        K = 1. / (2. * N**0.5 * M**0.5)
    elif content_loss_function == 2:
        K = 1. / (N * M)
    else:
        K = 1. / 2.
    loss = K * tf.reduce_sum(tf.pow((x - p), 2))
    return loss

In [9]:
def sum_masked_style_losses(sess, net, style_images, style_images_weights, style_masks_images, style_layers, style_layers_weights):
    """Build the weighted style loss with a mask applied per style image.

    For each (style image, weight, mask) triple the image is assigned to the
    network input, each style layer's activation is evaluated, and the masked
    layer losses are accumulated.

    Args:
        sess: TensorFlow session used to evaluate activations.
        net: dict of layer tensors; must contain 'input' and the style layers.
        style_images: list of preprocessed style images.
        style_images_weights: one weight per style image.
        style_masks_images: one mask image path per style image.
        style_layers: names of the layers used for the style loss.
        style_layers_weights: one weight per style layer.

    Returns:
        The total style loss, averaged over layers and over style images.
    """
    combined_loss = 0.0
    triples = zip(style_images, style_images_weights, style_masks_images)
    for image, image_weight, mask_path in triples:
        sess.run(net['input'].assign(image))
        image_loss = 0.0
        for layer_name, layer_weight in zip(style_layers, style_layers_weights):
            # Evaluated activation of the style image (fixed target) ...
            target = tf.convert_to_tensor(sess.run(net[layer_name]))
            # ... versus the symbolic activation that follows the input variable.
            current = net[layer_name]
            target, current = mask_style_layer(target, current, mask_path)
            image_loss += style_layer_loss(target, current) * layer_weight
        image_loss /= float(len(style_layers))
        combined_loss += (image_loss * image_weight)
    combined_loss /= float(len(style_images))
    return combined_loss

def sum_style_losses(sess, net, style_images, style_images_weights, style_layers, style_layers_weights):
    """Build the weighted style loss over all style images and layers.

    Args:
        sess: TensorFlow session used to evaluate activations.
        net: dict of layer tensors; must contain 'input' and the style layers.
        style_images: list of preprocessed style images.
        style_images_weights: one weight per style image.
        style_layers: names of the layers used for the style loss.
        style_layers_weights: one weight per style layer.

    Returns:
        The total style loss, averaged over layers and over style images.
    """
    combined_loss = 0.0
    for image, image_weight in zip(style_images, style_images_weights):
        # Feed this style image through the network to obtain its activations.
        sess.run(net['input'].assign(image))
        image_loss = 0.0
        for layer_name, layer_weight in zip(style_layers, style_layers_weights):
            # Evaluated activation of the style image (fixed target).
            target = tf.convert_to_tensor(sess.run(net[layer_name]))
            # Symbolic activation that follows the input variable.
            current = net[layer_name]
            image_loss += style_layer_loss(target, current) * layer_weight
        image_loss /= float(len(style_layers))
        combined_loss += (image_loss * image_weight)
    combined_loss /= float(len(style_images))
    return combined_loss

def sum_content_losses(sess, net, content_image, content_layers, content_layers_weights, content_loss_function):
    """Build the weighted content loss over the chosen layers.

    Args:
        sess: TensorFlow session used to evaluate activations.
        net: dict of layer tensors; must contain 'input' and the content layers.
        content_image: preprocessed content image.
        content_layers: names of the layers used for the content loss.
        content_layers_weights: one weight per content layer.
        content_loss_function: scale selector forwarded to ``content_layer_loss``.

    Returns:
        The content loss, averaged over the content layers.
    """
    sess.run(net['input'].assign(content_image))
    accumulated = 0.0
    for layer_name, layer_weight in zip(content_layers, content_layers_weights):
        # Evaluated activation of the content image (fixed target).
        target = tf.convert_to_tensor(sess.run(net[layer_name]))
        # Symbolic activation that follows the input variable.
        current = net[layer_name]
        accumulated += content_layer_loss(target, current, content_loss_function) * layer_weight
    accumulated /= float(len(content_layers))
    return accumulated



In [10]:
#model = build_model(content_image, inputs['model_weights'], inputs['pooling_type'], inputs['verbose'])
#with tf.device(inputs['device']), tf.Session() as sess:
    #L_style = sum_masked_style_losses(sess, model, style_images, inputs['style_images_weights'],
                                     #inputs['style_masks_images'], inputs['style_layers'], inputs['style_layers_weights'])
    #L_style = sum_style_losses(sess, model, style_images,
                               #inputs['style_images_weights'], inputs['style_layers'], inputs['style_layers_weights'])
    #L_content = sum_content_losses(sess, model, content_image, inputs['content_layers'], inputs['content_layers_weights'],
                                  #inputs['content_loss_function'])
    #L_tv = total_variation


In [11]:
def convert_to_original_colors(content_image, stylized_image, color_convert_type):
    """Combine the stylized image's first channel with the content image's other two.

    Both images are postprocessed, converted to the chosen colour space,
    merged (channel 1 from the stylized image, channels 2 and 3 from the
    content image), converted back and preprocessed again.

    Args:
        content_image: preprocessed content image.
        stylized_image: preprocessed stylized image.
        color_convert_type: one of 'yuv', 'ycrcb', 'luv', 'lab'.

    Returns:
        The preprocessed, recoloured image.

    Raises:
        ValueError: for an unsupported ``color_convert_type`` (previously this
            surfaced as an UnboundLocalError on the conversion code).
    """
    # Validate before touching any image so a typo fails fast and clearly.
    if color_convert_type not in ('yuv', 'ycrcb', 'luv', 'lab'):
        raise ValueError(
            "color_convert_type must be one of 'yuv', 'ycrcb', 'luv', 'lab', got {!r}".format(
                color_convert_type))
    # (forward conversion, inverse conversion) per colour space.
    conversions = {
        'yuv': (cv2.COLOR_BGR2YUV, cv2.COLOR_YUV2BGR),
        'ycrcb': (cv2.COLOR_BGR2YCR_CB, cv2.COLOR_YCR_CB2BGR),
        'luv': (cv2.COLOR_BGR2LUV, cv2.COLOR_LUV2BGR),
        'lab': (cv2.COLOR_BGR2LAB, cv2.COLOR_LAB2BGR),
    }
    cvt_type, inv_cvt_type = conversions[color_convert_type]
    content_image  = postprocess(content_image)
    stylized_image = postprocess(stylized_image)
    content_cvt = cv2.cvtColor(content_image, cvt_type)
    stylized_cvt = cv2.cvtColor(stylized_image, cvt_type)
    c1, _, _ = cv2.split(stylized_cvt)
    _, c2, c3 = cv2.split(content_cvt)
    merged = cv2.merge((c1, c2, c3))
    dst = cv2.cvtColor(merged, inv_cvt_type).astype(np.float32)
    dst = preprocess(dst)
    return dst

def postprocess(img):
    """Turn a network-layout image back into an 8-bit OpenCV-style image.

    Adds back the fixed per-channel offset, drops the batch axis, clips to
    [0, 255], casts to uint8 and reverses the channel order (RGB to BGR).

    The input array is left untouched: the previous in-place addition
    modified the caller's data.

    Args:
        img: array of shape (1, h, w, d) with d == 3. Non-floating inputs are
            converted to float32 first.

    Returns:
        uint8 array of shape (h, w, d).
    """
    img = np.asarray(img)
    if not np.issubdtype(img.dtype, np.floating):
        img = img.astype(np.float32)
    channel_offset = np.array([123.68, 116.779, 103.939]).reshape((1, 1, 1, 3))
    # Add out-of-place, then cast back to the input dtype before truncating to uint8.
    shifted = (img + channel_offset).astype(img.dtype)
    # shape (1, h, w, d) to (h, w, d)
    shifted = shifted[0]
    shifted = np.clip(shifted, 0, 255).astype('uint8')
    # rgb to bgr
    return shifted[..., ::-1]

In [12]:
def write_image(path, img):
    """Postprocess an image and write it to disk.

    Args:
        path: destination file path; passed straight to ``cv2.imwrite``.
        img: network-layout image, passed through ``postprocess`` first.

    Raises:
        OSError: when ``cv2.imwrite`` reports failure, so a finished
            optimisation run is not lost silently.
    """
    img = postprocess(img)
    # cv2.imwrite reports failure through its boolean return value.
    if not cv2.imwrite(path, img):
        raise OSError(errno.EIO, "Could not write image", path)

In [13]:
#print(L_content)

In [14]:
def stylize(content_image, style_images, initial_image, inputs, frame = None):
    """Run the style-transfer optimisation and write 'generated_image.png'.

    Builds the network, assembles the total loss (content + style + total
    variation, each scaled by its configured weight), minimises it starting
    from ``initial_image`` and writes the resulting image.

    Args:
        content_image: preprocessed content image.
        style_images: list of preprocessed style images.
        initial_image: starting point of the optimisation.
        inputs: configuration dictionary (see the configuration cell).
        frame: unused; reserved for video mode, which is not implemented here.

    Raises:
        ValueError: when ``inputs['optimizer_function']`` is neither 'lbfgs'
            nor 'adam'. Previously an unknown value skipped optimisation
            silently and wrote out whatever was last assigned to the input.
    """
    # Validate before building the (expensive) graph.
    optimizer_name = inputs['optimizer_function']
    if optimizer_name not in ('lbfgs', 'adam'):
        raise ValueError(
            "optimizer_function must be 'lbfgs' or 'adam', got {!r}".format(optimizer_name))

    with tf.device(inputs['device']), tf.Session() as sess:
        # setup network
        net = build_model(content_image, inputs['model_weights'], inputs['pooling_type'], inputs['verbose'])

        # style loss
        if inputs['style_mask']:
            L_style = sum_masked_style_losses(sess, net, style_images, inputs['style_images_weights'],
                                              inputs['style_masks_images'], inputs['style_layers'],
                                              inputs['style_layers_weights'])
        else:
            L_style = sum_style_losses(sess, net, style_images, inputs['style_images_weights'],
                                       inputs['style_layers'], inputs['style_layers_weights'])

        # content loss
        L_content = sum_content_losses(sess, net, content_image, inputs['content_layers'],
                                       inputs['content_layers_weights'], inputs['content_loss_function'])

        # denoising loss
        L_tv = tf.image.total_variation(net['input'])

        # total loss
        L_total  = inputs['content_weight'] * L_content
        L_total += inputs['style_weight']  * L_style
        L_total += inputs['total_variation_weight'] * L_tv

        # NOTE: a video temporal loss is not implemented in this notebook.

        # optimization algorithm
        if optimizer_name == 'lbfgs':
            # Forward max_iterations so the configured limit also applies to L-BFGS.
            optimizer = tf.contrib.opt.ScipyOptimizerInterface(
                L_total, method='L-BFGS-B',
                options={'maxiter': inputs['max_iterations']})
            if inputs['verbose']: print('\nMINIMIZING LOSS USING: L-BFGS OPTIMIZER')
            init_op = tf.global_variables_initializer()
            sess.run(init_op)
            # Assign after initialisation so the initializer does not overwrite the start image.
            sess.run(net['input'].assign(initial_image))
            optimizer.minimize(sess)
        else:
            optimizer = tf.train.AdamOptimizer(inputs['learning_rate'])
            if inputs['verbose']: print('\nMINIMIZING LOSS USING: ADAM OPTIMIZER')
            train_op = optimizer.minimize(L_total)
            init_op = tf.global_variables_initializer()
            sess.run(init_op)
            # Assign after initialisation so the initializer does not overwrite the start image.
            sess.run(net['input'].assign(initial_image))
            for iterations in range(inputs['max_iterations']):
                sess.run(train_op)
                if iterations % 20 == 0 and inputs['verbose']:
                    curr_loss = L_total.eval()
                    print("At iterate {}\tf=  {}".format(iterations, curr_loss))

        output_image = sess.run(net['input'])

        if inputs['original_colors']:
            output_image = convert_to_original_colors(np.copy(content_image), output_image, inputs['color_convert_type'])

        write_image('generated_image.png', output_image)


In [15]:
def render_single_image(inputs):
    """Load the images, run the stylisation in a fresh graph and report the time taken.

    Args:
        inputs: configuration dictionary (see the configuration cell).
    """
    content = get_content_image(inputs['content_image'], inputs['image_max_size'])
    styles = get_style_images(inputs['style_images'], content)
    # A dedicated graph keeps repeated runs from piling ops onto the default graph.
    with tf.Graph().as_default():
        print('\n---- RENDERING SINGLE IMAGE ----\n')
        start_image = get_initial_image(
            inputs['generated_image_initialization'],
            content,
            styles,
            inputs['noise_ratio'],
        )
        started_at = time.time()
        stylize(content, styles, start_image, inputs)
        finished_at = time.time()
        elapsed = finished_at - started_at
        print('Single image elapsed time: {}'.format(elapsed))

In [16]:
# Entry point: run the full style-transfer pipeline with the configuration above.
render_single_image(inputs)


---- RENDERING SINGLE IMAGE ----


BUILDING VGG-19 NETWORK
loading model weights...
constructing layers...
LAYER GROUP 1
--conv1_1 | shape=(1, 700, 525, 64) | weights_shape=(3, 3, 3, 64)
--relu1_1 | shape=(1, 700, 525, 64) | bias_shape=(64,)
--conv1_2 | shape=(1, 700, 525, 64) | weights_shape=(3, 3, 64, 64)
--relu1_2 | shape=(1, 700, 525, 64) | bias_shape=(64,)
--pool1   | shape=(1, 350, 263, 64)
LAYER GROUP 2
--conv2_1 | shape=(1, 350, 263, 128) | weights_shape=(3, 3, 64, 128)
--relu2_1 | shape=(1, 350, 263, 128) | bias_shape=(128,)
--conv2_2 | shape=(1, 350, 263, 128) | weights_shape=(3, 3, 128, 128)
--relu2_2 | shape=(1, 350, 263, 128) | bias_shape=(128,)
--pool2   | shape=(1, 175, 132, 128)
LAYER GROUP 3
--conv3_1 | shape=(1, 175, 132, 256) | weights_shape=(3, 3, 128, 256)
--relu3_1 | shape=(1, 175, 132, 256) | bias_shape=(256,)
--conv3_2 | shape=(1, 175, 132, 256) | weights_shape=(3, 3, 256, 256)
--relu3_2 | shape=(1, 175, 132, 256) | bias_shape=(256,)
--conv3_3 | shape=(1, 175, 