In [508]:
import tensorflow as tf
import numpy as np
import os
import wave

In [525]:
class Model(object):
    def __init__(self, architecture, files, model_path='./models/dbm.ckpt'):
        self.model_path = model_path
        self.architecture = architecture
        self.transfer_fct = tf.nn.relu
        self.files = files
        
        self.x = tf.placeholder(tf.float32, [None, architecture["input"]])
        
        self._int_params(architecture)
        self._create_loss_optimizer()
        
        init = tf.global_variables_initializer()
        
        self.tf_saver = tf.train.Saver()
        
    def load_model(self, path=None):
        self.tf_sess = tf.Session()
        self.tf_sess.run(tf.global_variables_initializer())
        self.tf_saver.restore(self.tf_sess, path if path != None else self.model_path)
        self.loaded_model = True
        
    def normalize(x, get_mu_std=False):
        mean, var = tf.nn.moments(x, axes=[0])
        if(get_mu_std):
            return (x - mu)/var, mu, var
        else:
            return (x - mu)/var

    def restore(x, mu, var):
        return (x*var + mu) - tf.reduce_min(x)
        
    def get_data_generator(file_path, frame_with, frame_count, batch_count, rand=False, get_meta=False, get_label=False):
        if(type(file_path ) == str):
            wave_read = [wave.open(file_path, "rb")]
        else:
            wave_read = []
            for fp in file_path:
                wave_read.append (wave.open(fp, "rb"))
        if(get_meta):
            meta = {
                "nframes" : wave_read[0].getnframes(),
                "nchannels" : wave_read[0].getnchannels(),
                "sampwidth" : wave_read[0].getsampwidth(),
                "framerate" : wave_read[0].getframerate()
            }
        for k in range(batch_count):
            out = []
            labels = []
            for i in range(frame_count):
                label = np.random.randint(len(wave_read)) - 1
                chosen_file = wave_read[label]
                if rand:
                    chosen_file.setpos(np.random.randint(chosen_file.getnframes() - frame_with))           
                if(get_label == True):
                    labels.append(label)
                out.append(np.fromstring(chosen_file.readframes(frame_with), np.uint16))
            yield [out, labels] if(get_label == True) else out
        for fr in wave_read:
            fr.close()
        yield meta if get_meta else None
        
    def train(self, epoch_count, batch_size, learning_rate):
        data_generator = self.get_data_generator(self.files, self.architecture['input'], batch_size, epoch_count, True, False, True)
        
        normal_x = self.normalize(self.x)
        z, z_mean, z_log_sigma_sq = self.encode(normal_x)
        reconstruct = self.decode(z)
        reconstr_loss = tf.reduce_mean(tf.pow(normal_x - reconstruct, 2))
        latent_loss = -0.001 * tf.reduce_sum(1 + z_log_sigma_sq 
                                           - tf.square(z_mean) 
                                           - tf.exp(z_log_sigma_sq), 1)
        
        class_y =  tf.placeholder(tf.float32, [None, len(self.files)])
        class_loss = 0.1 * tf.reduce_mean(
                tf.nn.sparse_softmax_cross_entropy_with_logits(logits=tf.matmul(z, self.params['class']['W']) 
                                                               + self.params['class']['b'], labels=class_y))
        
        self.cost = tf.reduce_mean(reconstr_loss + latent_loss + class_loss)

        self.optimizer = \
            tf.train.AdamOptimizer(learning_rate= learning_rate).minimize(self.cost)
        self.tf_sess = tf.Session()  
        for i in range(epoch_count):
            optimizer = self.tf_sess.run(self.optimizer, feed_dict={self.x: data_generator.__next__(), y: train_targets})
        
        data_generator.__next__()   
        self.tf_saver.save(self.tf_sess, self.model_path)
        
    def encode(self, data):
        z_mean, z_log_sigma_sq = self._forward_pass(self.params['rec'], data)
        eps = tf.random_normal((self.batch_size, self.architecture['z']), 0, 1, 
                               dtype=tf.float32)
        return tf.add(z_mean, tf.multiply(tf.sqrt(tf.exp(z_log_sigma_sq)), eps)), z_mean, z_log_sigma_sq
    
    def decode(self, data):
        return self._forward_pass(self.params['gen'], data)
            
    def _int_params(self, architecture):
        params = {
            'rec' : {
                'W' : {
                    'layers' : [],
                    'mean' : tf.Variable(xavier_init(architecture['rec'][-1], architecture['z'])),
                    'log_sigma' : tf.Variable(xavier_init(architecture['rec'][-1], architecture['z']))
                },
                'b' : {
                    'layers' : [],
                    'mean' : tf.Variable(tf.zeros([architecture['z']], dtype=tf.float32)),
                    'log_sigma' : tf.Variable(tf.zeros([architecture['z']], dtype=tf.float32))
                }
            },
            'gen' : {
                'W' : {
                    'layers' : [],
                    'mean' : tf.Variable(xavier_init(architecture['gen'][-1], architecture['z'])),
                    'log_sigma' : tf.Variable(xavier_init(architecture['gen'][-1], architecture['z']))
                },
                'b' : {
                    'layers' : [],
                    'mean' : tf.Variable(tf.zeros([architecture['input']], dtype=tf.float32)),
                    'log_sigma' : tf.Variable(tf.zeros([architecture['input']], dtype=tf.float32))
                }
            },
            'class' : {
                'W' : tf.Variable(xavier_init(architecture['z'], len(self.files))),
                'b' : tf.Variable(tf.zeros([architecture['input']], dtype=tf.float32))
            }
        }
        
        for i in range(len(architecture['rec'])):
            if(isinstance(architecture["rec"][i], list)):
                print('capsule to implement')   
            else:
                params['rec']['W']['layers'].append(tf.Variable(xavier_init(\
                    architecture["input"] if i==0 else architecture['rec'][i-1], \
                    architecture["rec"][i])))            
                params['rec']['b']['layers'].append(tf.Variable(tf.zeros([architecture["rec"][i]], dtype=tf.float32)))
             
        params['rec']['W']['mean'] = tf.Variable(xavier_init(architecture['rec'][i], architecture['z']))
        params['rec']['W']['log_sigma'] = tf.Variable(xavier_init(architecture['rec'][i], architecture['z']))
        
        for i in range(len(architecture['gen'])):
            if(isinstance(architecture["gen"][i], list)):
                print('capsule to implement')   
            else:
                params['gen']['W']['layers'].append(tf.Variable(xavier_init(\
                    architecture["z"] if i==0 else architecture['gen'][i-1], \
                    architecture["gen"][i])))            
                params['rec']['b']['layers'].append(tf.Variable(tf.zeros([architecture["gen"][i]], dtype=tf.float32)))
             
        params['rec']['W']['mean'] = tf.Variable(xavier_init(architecture['gen'][i], architecture['input']))
        params['rec']['W']['log_sigma'] = tf.Variable(xavier_init(architecture['gen'][i], architecture['input']))
        
        return params
            
    def _forward_pass(self, params, data):
        current_val = data
        for i in range(len(params['W']['layers'])):
            current_val = self.transfer_fct(tf.add(tf.matmul(current_val, params['W'][i]), 
                                           params['b']['layers'])) 

        z_mean = tf.add(tf.matmul(current_val, params['W']['mean']), params['b']['mean'])
        
        if(params['W']['log_sigma']):
            return (z_mean, tf.add(tf.matmul(layer_2, params['W']['log_sigma']), params['b']['out_log_sigma']))
        else:
            return z_mean
            
    def _create_loss_optimizer(self):
        # The loss is composed of two terms:
        # 1.) The reconstruction loss (the negative log probability
        #     of the input under the reconstructed Bernoulli distribution 
        #     induced by the decoder in the data space).
        #     This can be interpreted as the number of "nats" required
        #     for reconstructing the input when the activation in latent
        #     is given.
        # Adding 1e-10 to avoid evaluation of log(0.0)
        reconstr_loss = tf.reduce_mean(tf.pow(self.x - self.x_reconstr_mean, 2))
        # 2.) The latent loss, which is defined as the Kullback Leibler divergence 
        ##    between the distribution in latent space induced by the encoder on 
        #     the data and some prior. This acts as a kind of regularizer.
        #     This can be interpreted as the number of "nats" required
        #     for transmitting the the latent space distribution given
        #     the prior.
        latent_loss = -0.001 * tf.reduce_sum(1 + self.z_log_sigma_sq 
                                           - tf.square(self.z_mean) 
                                           - tf.exp(self.z_log_sigma_sq), 1)
        self.cost = tf.reduce_mean(reconstr_loss + latent_loss)   # average over batch

        # Use ADAM optimizer
        self.optimizer = \
            tf.train.AdamOptimizer(learning_rate=self.learning_rate).minimize(self.cost)
        
    def partial_fit(self, X):
        """Train model based on mini-batch of input data.
        
        Return cost of mini-batch.
        """
        opt, cost = self.sess.run((self.optimizer, self.cost), 
                                  feed_dict={self.x: X})
        return cost
    
    def transform(self, X):
        """Transform data by mapping it into the latent space."""
        # Note: This maps to mean of distribution, we could alternatively
        # sample from Gaussian distribution
        return self.sess.run(self.z_mean, feed_dict={self.x: X})
    
    def generate(self, z_mu=None):
        """ Generate data by sampling from latent space.
        
        If z_mu is not None, data for this point in latent space is
        generated. Otherwise, z_mu is drawn from prior in latent 
        space.        
        """
        if z_mu is None:
            z_mu = np.random.normal(size=self.network_architecture["z"])
        # Note: This maps to mean of distribution, we could alternatively
        # sample from Gaussian distribution
        return self.sess.run(self.x_reconstr_mean, 
                             feed_dict={self.z: z_mu})
    
    def reconstruct(self, X):
        """ Use VAE to reconstruct given data. """
        return self.sess.run(self.x_reconstr_mean, 
                             feed_dict={self.x: X})

In [510]:
def norm(x, get_mu_std=False):
    ar = np.array(x)
    mu = np.mean(ar)
    std = np.std(ar)
    if(get_mu_std):
        return (ar - mu)/std, mu, std
    else:
        return (ar - mu)/std

def restore(x, mu, std):
    return np.array(x)*std + mu

def get_file_frame(file_path, frame_with, frame_count, batch_count, rand=False, get_meta=False, get_label=False):
    if(type(file_path ) == str):
        wave_read = [wave.open(file_path, "rb")]
    else:
        wave_read = []
        for fp in file_path:
            wave_read.append (wave.open(fp, "rb"))
    if(get_meta):
        meta = {
            "nframes" : wave_read[0].getnframes(),
            "nchannels" : wave_read[0].getnchannels(),
            "sampwidth" : wave_read[0].getsampwidth(),
            "framerate" : wave_read[0].getframerate()
        }
    for k in range(batch_count):
        out = []
        labels = []
        for i in range(frame_count):
            label = np.random.randint(len(wave_read)) - 1
            chosen_file = wave_read[label]
            if rand:
                chosen_file.setpos(np.random.randint(chosen_file.getnframes() - frame_with))           
            if(get_label == True):
                labels.append(label)
            out.append(np.fromstring(chosen_file.readframes(frame_with), np.uint16))
        yield [out, labels] if(get_label == True) else out
    for fr in wave_read:
        fr.close()
    yield meta if get_meta else None
    
def write_wave(array, path, meta, mu=None, std=None):
    unrolled = array.ravel()
    writer = wave.open(path, "wb")
    writer.setnframes(meta["nframes"])
    if(mu != None and std!=None):
        unrolled = restore(unrolled, mu, std)
    bytes_arr = np.rint(unrolled).astype(np.uint16).tobytes()
    writer.setnchannels(meta["nchannels"])
    writer.setsampwidth(meta["sampwidth"])
    writer.setframerate(meta["framerate"])
    writer.writeframes(bytes_arr)
    writer.close()

def xavier_init(fan_in, fan_out, constant=1): 
    """ Xavier initialization of network weights"""
    # https://stackoverflow.com/questions/33640581/how-to-do-xavier-initialization-on-tensorflow
    low = -constant*np.sqrt(6.0/(fan_in + fan_out)) 
    high = constant*np.sqrt(6.0/(fan_in + fan_out))
    return tf.random_uniform((fan_in, fan_out), 
                             minval=low, maxval=high, 
                             dtype=tf.float32)

In [419]:
vnet = VNet([500, 300, 200, 100], "gauss")
vnet.files= ["01.wav", "03.wav", "04.wav", "05.wav", "06.wav", "07.wav", "11.wav" ,"13.wav", "14.wav"]
vnet.load_model()
vnet.noise_std = 0
vnet.init_vae(70)
#rbm.train(get_file_frame, 500, 0.0000015, 700, 0.002, 5)

False
init vae


In [None]:
vnet.train_vae(get_file_frame, 500, 0.00001, 1000)

In [387]:
#vnet = VNet([500, 300, 200, 100], "gauss")
#vnet.load_model()
gen = get_file_frame("01.wav", 500, 500, 1, False, True)
foo = gen.__next__()  
foo, mu, std = norm(foo, True)
res = vnet.encode(foo) 
meta = gen.__next__()
np.shape(res)

(500, 50)

In [352]:
decoded = vnet.decode(res)

In [353]:
write_wave(decoded, "ress.wav", meta, mu, std) 

In [18]:
import matplotlib.pyplot as plt
%matplotlib inline

np.random.seed(0)
tf.set_random_seed(0)

In [521]:
network_architecture = {
    "input":1000,
    "rec" : [700, 500, 300, 150],
    "gen" : [100, 200, 500, 700],
    "z" : 100
} 

In [526]:
model = Model(network_architecture, ["01.wav", "03.wav", "04.wav", "05.wav", "06.wav", "07.wav", "11.wav" ,"13.wav", "14.wav"])

AttributeError: 'Model' object has no attribute 'x_reconstr_mean'

In [423]:
gen = get_file_frame( ["01.wav", "03.wav", "04.wav", "05.wav", "06.wav", "07.wav", "11.wav" ,"13.wav", "14.wav"], 500, 500, 10000, rand=True, get_meta=False, get_label=False);
def train(network_architecture, learning_rate=0.0001,
          batch_size=500, training_epochs=10000, display_step=100):
    vae = VariationalAutoencoder(network_architecture, 
                                 learning_rate=learning_rate, 
                                 batch_size=batch_size)
    # Training cycle
    for epoch in range(training_epochs):
        avg_cost = 0.
        total_batch = int(n_samples / batch_size)
        # Loop over all batches
        
        batch_xs= norm(gen.__next__());
        # Fit training using batch data
        cost = vae.partial_fit(batch_xs)
        # Compute average loss
        avg_cost += cost / batch_size

        # Display logs per epoch step
        if epoch % display_step == 0:
            print("Epoch:", '%04d' % (epoch+1), 
                  "cost=", "{:.9f}".format(cost))
    return vae

In [424]:
vae = train(network_architecture)

Epoch: 0001 cost= 1.163512707
Epoch: 0101 cost= 1.020631671
Epoch: 0201 cost= 1.008952618
Epoch: 0301 cost= 1.000304461
Epoch: 0401 cost= 0.976029575
Epoch: 0501 cost= 0.940716863
Epoch: 0601 cost= 0.899858057
Epoch: 0701 cost= 0.852550328
Epoch: 0801 cost= 0.826211929
Epoch: 0901 cost= 0.799576521
Epoch: 1001 cost= 0.753931284
Epoch: 1101 cost= 0.752783835
Epoch: 1201 cost= 0.738080144
Epoch: 1301 cost= 0.716445208
Epoch: 1401 cost= 0.717823088
Epoch: 1501 cost= 0.681948721
Epoch: 1601 cost= 0.673137486
Epoch: 1701 cost= 0.649670243
Epoch: 1801 cost= 0.663625360
Epoch: 1901 cost= 0.661385357
Epoch: 2001 cost= 0.639850795
Epoch: 2101 cost= 0.648124099
Epoch: 2201 cost= 0.619280875
Epoch: 2301 cost= 0.621437252
Epoch: 2401 cost= 0.600855112
Epoch: 2501 cost= 0.610756814
Epoch: 2601 cost= 0.624804914
Epoch: 2701 cost= 0.598593235
Epoch: 2801 cost= 0.597748160
Epoch: 2901 cost= 0.590362430
Epoch: 3001 cost= 0.583675206
Epoch: 3101 cost= 0.615513563
Epoch: 3201 cost= 0.593134105
Epoch: 330

In [197]:
gen = get_file_frame("01.wav", 500, 500, 1, False, True)
foo = gen.__next__()  
foo, mu, std = norm(foo, True)
res = vae.reconstruct(foo)
meta = gen.__next__()
write_wave(res, "ress.wav", meta, mu, std) 

In [162]:
np.shape(foo)

(500, 500)

In [506]:
gen = get_file_frame("01.wav", 500, 500, 1, False, True)
foo1 = gen.__next__()  

In [507]:
meta = gen.__next__()
foo = np.add(foo1, 10000)
write_wave(foo, "ress.wav", meta) 

In [505]:
foo

array([[64535, -1000, -1000, ..., -1000, -1000, -1000],
       [-1000, -1000, -1000, ..., -1000, -1000, -1000],
       [-1000, 64535, 64535, ..., 64534, 64531, 64528],
       ..., 
       [64510, 64513, 64517, ..., 64502, 64502, 64500],
       [64496, 64494, 64491, ..., 64533, 64533, 64529],
       [64529, 64526, 64523, ..., 64497, 64503, 64505]], dtype=int32)