**SOW-MKI49: Neural Information Processing Systems**  
*Weeks 4 and 5: Assignment (225 points + 30 bonus points)*  
Author: Umut

In [None]:
# Group number: ...
# Student 1 name, student 1 number: ...
# Student 2 name, student 2 number: ...
# Student 3 name, student 3 number: ...

In [1]:
import os
import numpy as np
import cv2
import pickle
import random
import pandas as pd
import tensorflow as tf
from glob import glob
from time import time
import tqdm

In [2]:
epochs = 10
batch_size = 32

**WaveNet component (75 points)**

* Implement missing parts of the call method (y and z). **25 points**
* Implement residual block class. **50 points**

---
Reminder:

* One convolution layer that has 61 kernels of size 2 with no nonlinearities.
![alt text](http://i67.tinypic.com/21mgi2w.png)
![alt text](http://i67.tinypic.com/292n04y.png)
---



In [3]:
root_dir = os.path.normpath(os.path.join(os.path.dirname(os.path.realpath('__file__'))))
data_directory = os.path.join(root_dir, 'lfw-piano_roll_data')

In [4]:
with open(os.path.join('piano_rolls.p'), 'rb') as f:
    piano_rolls = pickle.load(f)

In [5]:
keys = sorted(piano_rolls.keys())

random.seed(6)
random.shuffle(keys)

test_set = dict((key, piano_rolls[key]) for key in keys[:int(0.1 * len(keys))])
training_set = dict((key, piano_rolls[key]) for key in keys[int(0.1 * len(keys)):])
training_set_keys = list(training_set.keys())

In [6]:
def residual_block(y, i):
    
    short = y
    
    split1, split2 = tf.split(y, [61, 61], axis = 1)

    y1 = tf.nn.conv2d(split1, 61, 1, dilations = [1, 1, i, 1], padding = 'VALID', data_format = 'NCHW')
    y2 = tf.nn.conv2d(split2, 61, 1, dilations = [1, 1, i, 1], padding = 'VALID', data_format = 'NCHW')

    y1_tan = tf.nn.tanh(y1)
    y2_sig = tf.nn.sigmoid(y2)

    y_mul = tf.multiply(y1_tan, y2_sig)
    
    parallel_conv1 = tf.nn.conv2d(y_mul, 61, 1, padding = 'VALID', data_format = 'NCHW')
    
    y_ = tf.add(short, parallel_conv1) #(B)
        
    if i != 32:
        fin = tf.nn.conv2d(y_mul, 512, 1, padding = 'VALID', data_format = 'NCHW')
        return tf.nn.relu(fin)
        
    elif i == 32:
        return y_

In [7]:
def _crf(k, psi_u):
    
    q = psi_u
    
    #Message passing layer
    
    y = tf.matmul(q, k)
    
    #Compatibility transform layer
    
    ctl = tf.nn.convolution(y, 2, strides = 1, padding = 'VALID', data_format = 'NCHW')
    
    #Local update and normalization layer
    
    z = tf.subtract(-psi_u, ctl)
    
    for i in range(5):
        
        if i == 4:
            return z
            
        elif i < 4:
            z = tf.nn.softmax(z, axis = 1)
    

In [14]:
def waveCRF(input_):

    #input_shape = (1, 61, 80)

    #batch_shape = (None,) + input_shape

    #input_ = tf.placeholder(tf.float32, [None, 1, 61, 80])

    paddings = tf.constant([[0, 0], [0, 0], [0, 0], [1, 0]])  # (batch_size, channels, notes, time)

    padded_input = tf.pad(input_, paddings, mode='CONSTANT', constant_values=0.0)
    
    input1 = tf.cast(padded_input, tf.float32)
    #padded_shape = input1.shape
    print(input1.shape)
    #print(padded_shape)
    #input1 = Input(padded_shape, batch_shape, tensor = padded_input)'''

    model = tf.nn.conv2d(input1, 61 * 2, (1, 2), padding = 'VALID', data_format = 'NCHW')

    res1 = residual_block(model, 1)

    res2 = residual_block(res1, 2)

    res3 = residual_block(res2, 4)

    res4 = residual_block(res3, 8)

    res5 = residual_block(res4, 16)

    res6 = residual_block(res5, 32)

    model = tf.nn.conv2d(res6, 3843, 1, padding = 'VALID')

    out1, out2 = tf.split(model, [3721, 122], axis = 1)

    out_shape1 = list(tf.int_shape(out1))

    out_shape2 = list(tf.int_shape(out2))

    wave_out1 = tf.reshape(out1, (out_shape1[0], 61, 61, out_shape1[3]))  #k

    wave_out2 = tf.reshape(out2, (out_shape2[0], 2, 61, out_shape2[3]))  #psi_u

    return _crf(wave_out1, wave_out2)

In [15]:
with tf.Session() as sess:
    tf.global_variables_initializer().run()
    coord = tf.train.Coordinator()
    threads = tf.train.start_queue_runners(coord=coord)

    for epoch in tqdm.tnrange(epochs):
        random.shuffle(training_set_keys)

        batch = ()

        for key in tqdm.tqdm_notebook(training_set_keys, leave = False):
            i = random.randint(0, training_set[key].shape[1] - 80)
            batch += (training_set[key][32 : 93, i : i + 80],)

            if len(batch) == batch_size:
                batch = np.asarray(batch)
                #batch = tf.reshape(batch, [batch_size, 1, 61, 80])
                
                Q = batch[:, :, 1:].astype('i')
                #print(batch.shape, Q.shape)
                print(batch[:, :, None, :-1].shape)
                Q_hat = waveCRF(batch[:, :, None, :-1].astype('f'))

                train_loss = tf.losses.softmax_cross_entropy(Q, logits = Q_hat)
                Optimizer = tf.train.AdamOptimizer(learning_rate=1e-4, beta1=0.9, beta2=0.999, epsilon=1e-8).minimize(train_loss, var_list = t_vars)
                _, lossV, _trainY, _predict = sess.run([Optimizer, train_loss, Q, Q_hat])
                _label = np.argmax(_trainY, axis=1)
                _accuracy = np.mean(_label == _predict)
                plot.plot('loss', lossV)
                plot.plot('train accuracy', _accuracy)


Widget Javascript not detected.  It may not be installed properly. Did you enable the widgetsnbextension? If not, then run "jupyter nbextension enable --py --sys-prefix widgetsnbextension"
Widget Javascript not detected.  It may not be installed properly. Did you enable the widgetsnbextension? If not, then run "jupyter nbextension enable --py --sys-prefix widgetsnbextension"


(32, 61, 1, 79)
(32, 61, 1, 80)



ValueError: Shape must be rank 4 but is rank 0 for 'Conv2D_1' (op: 'Conv2D') with input shapes: [32,61,1,80], [].

In [None]:
batch_loss = 

In [None]:
t_vars = tf.trainable_variables()

In [None]:
Optimizer = tf.train.AdamOptimizer(learning_rate=1e-4, beta1=0.9, beta2=0.999, epsilon=1e-8).minimize(loss, var_list = t_vars)

In [None]:
for epoch in tqdm.tnrange(epochs):
    random.shuffle(training_set_keys)

    batch = ()

    for key in tqdm.tqdm_notebook(training_set_keys, leave = False):
        i = random.randint(0, training_set[key].shape[1] - 80)
        batch += (training_set[key][32 : 93, i : i + 80],)
        
        if len(batch) == batch_size:
            batch = np.asarray(batch)
            
            Q_hat = waveCRF(batch[:, :, 1:].astype('i'))
            
            train_loss = tf.losses.softmax_cross_entropy(batch[:, :, None, :-1].astype('f'), logits = Q_hat)

In [None]:
[2 ** (i % 6) for i in range(6)]

In [None]:
class _WaveNet(ChainList):
    def __init__(self):
        links = (L.Convolution2D(61, 2 * 61, (1, 2)),)
        
        links += tuple(_ResidualBlock((1, 2 ** (i % 6))) for i in range(6))
        links += (L.Convolution2D(512, 512, 1), L.Convolution2D(512, 3843, 1))

        super(_WaveNet, self).__init__(*links)

    def __call__(self, x):
        y = (self[0](F.pad(x, ((0, 0), (0, 0), (0, 0), (1, 0)), 'constant')),)
        z = 0
        h = F.split_axis(layer(x), 2, 1)
        y = F.sigmoid(h[0]) * F.tanh(h[1])

        for i in range(1, len(self) - 2):
            y = self[i]()
            #z +=

        #y, z =

        return F.reshape(y, (y.shape[0], 61, 61, y.shape[3])), \
               F.reshape(z, (z.shape[0], 2, 61, z.shape[3]))

class _ResidualBlock(ChainList):
    def __init__(self):
        
        with self.init_scope():
            self.convolution2D_0 = L.convolution2D(in_channels, 64, (1, 1))
            self.convolution2D_1 = L.convolution2D(in_channels, 512, 1)
            
            
    def __call__(self, x):
        
        
        
        
    pass

**CRF-RNN component (50 points)**

* Implement missing parts of the call method (z). **25 points**
* Why is z not normalized in the last iteration? **25 points**

---

Reminder:

![alt text](http://i68.tinypic.com/sy6mix.png)

---

In [None]:
class _CRF(ChainList):
    def __init__(self):
        super(_CRF, self).__init__(L.ConvolutionND(1, 2, 2, 1, nobias = True))

    def __call__(self, x, y):
        #z =

        for i in range(5):
            #z =

            if i < 4:
                z = F.softmax(z)

        return z

**WaveCRF model (50 points)**

1. Implement missing parts of the call method (k, psi_u and Q_hat). **20 points**
2. Implement missing parts of the save and load methods (save and load model). **10 points**
3. Implement missing parts of the test and train methods (forward and/or backward propagate). **20 points**

In [None]:
class WaveCRF(object):
    def __init__(self):
        self.log = {('test', 'accuracy'): (), ('test', 'loss'): (), ('training', 'accuracy'): (),
                    ('training', 'loss'): ()}
        self.model = ChainList(_WaveNet(), _CRF())
        self.optimizer = optimizers.Adam(0.0002, 0.5)

        self.optimizer.setup(self.model)

    def __call__(self, x):
        #k, psi_u =
        #Q_hat =

        return F.transpose(F.reshape(Q_hat, (x.shape[0], x.shape[3], 2, 61)), (0, 2, 3, 1))

    @classmethod
    def load(cls, directory):
        self = cls()
        self.log = np.load('{}/log.npy'.format(directory))

        # Load model
        serializers.load_npz('{}/optimizer.npz'.format(directory), self.optimizer)

        return self

    def save(self, directory):
        np.save('{}/log.npy'.format(directory), self.log)
        # Save model
        serializers.save_npz('{}/optimizer.npz'.format(directory), self.optimizer)

    def test(self, Q, x):
        with chainer.using_config('train', False):
            # Forward prop
            # Forward prop

            self.log['test', 'accuracy'] += (float(F.accuracy(Q_hat, Q).data),)
            self.log['test', 'loss'] += (float(loss.data),)

    def train(self, Q, x):
        # Forward prop
        # Forward prop

        # Backprop
        # Backprop
        # Backprop

        self.log['training', 'accuracy'] += (float(F.accuracy(Q_hat, Q).data),)
        self.log['training', 'loss'] += (float(loss.data),)

In [None]:
%matplotlib inline

import IPython
import chainer
import matplotlib
import numpy
import os
import pickle
import random
import tqdm

In [None]:
batch_size = 30
epochs = 100
root = '..'

In [None]:
with open('{}/Data/piano_rolls.p'.format(root), 'rb') as f:
    piano_rolls = pickle.load(f)

keys = sorted(piano_rolls.keys())

random.seed(6)
random.shuffle(keys)

test_set = dict((key, piano_rolls[key]) for key in keys[:int(0.1 * len(keys))])
training_set = dict((key, piano_rolls[key]) for key in keys[int(0.1 * len(keys)):])
training_set_keys = list(training_set.keys())

In [None]:
waveCRF = WaveCRF()

waveCRF.model.to_gpu()

In [None]:
for epoch in tqdm.tnrange(epochs):
    random.shuffle(training_set_keys)

    batch = ()

    for key in tqdm.tqdm_notebook(training_set_keys, leave = False):
        i = random.randint(0, training_set[key].shape[1] - 80)
        batch += (training_set[key][32 : 93, i : i + 80],)

        if len(batch) == batch_size:
            batch = waveCRF.model.xp.array(batch)

            waveCRF.train(batch[:, :, 1:].astype('i'), batch[:, :, None, :-1].astype('f'))

            batch = ()

    for key in tqdm.tqdm_notebook(test_set, leave = False):
        batch = waveCRF.model.xp.array((test_set[key][32 : 93],))

        waveCRF.test(batch[:, :, 1:].astype('i'), batch[:, :, None, :-1].astype('f'))

    IPython.display.clear_output()

    for i, key in enumerate(waveCRF.log):
        matplotlib.pyplot.subplot(221 + i)
        matplotlib.pyplot.plot(numpy.array(waveCRF.log[key]).reshape(epoch + 1, -1).mean(1))
        matplotlib.pyplot.xlabel('iteration')
        matplotlib.pyplot.ylabel(key)

    matplotlib.pyplot.tight_layout()
    matplotlib.pyplot.show()
    os.makedirs('{}/Models/WaveCRF/{}'.format(root, epoch))
    waveCRF.save('{}/Models/WaveCRF/{}'.format(root, epoch))

**Test (50 points)**  

* Generate a number of samples, pick the best one and play it in the notebook. **50 points**

In [None]:
# Test

**Bonus question (30 points)**

* Discuss how you can improve the model (you can talk about different architectures or different ways to encode the inputs, etc.) **10 points**
* Discuss the assumptions behind the meanfield approximation and its shortcomings. **10 points**
* Prove that the iterative update equation (CRF-RNN component) is differentiable so that we can backpropagate through them. **10 points**