## Init and Run GMM

In [1]:
%%javascript
// Ask the notebook front end to load the 'tdb_ext/main' extension.
// NOTE(review): presumably required by the `tdb` package imported below -- confirm.
Jupyter.utils.load_extensions('tdb_ext/main')

<IPython.core.display.Javascript object>

In [2]:
#this sets the backend to jupyter/ipython that (i think) displays
#     images directly. anyway, it prevents the matplotlib framework
#     python error that is my least favorite thing eeeevvvveeeer.
%matplotlib notebook

import sys
sys.path.append("..")

import gmix_model as gmix
import numpy as np
import tdb as tdb
import tensorflow as tf
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
import gmix_sample_mixture as smpl

import os
#switch the working directory so the relative paths used below
#     (e.g. 'data/...') resolve. the location is machine specific: set the
#     SPIDER_REPO_DIR environment variable to point at your own checkout
#     instead of editing this cell; the original author's path is the default.
os.chdir(os.environ.get('SPIDER_REPO_DIR', '/Users/azane/GitRepo/spider'))

In [3]:
#read in training and test data
#     s_x/s_t come from the training file, t_x/t_t from the test file.
#     NOTE(review): the meaning of the second argument (True) is defined by
#     gmix.get_xt_from_npz and is not visible here -- confirm before changing.
s_x, s_t = gmix.get_xt_from_npz('data/spi_gmix_train.npz', True)
t_x, t_t = gmix.get_xt_from_npz('data/spi_gmix_test.npz', True)

#TEMP
#expand target dimension so variance can be happy.
#     both target arrays are multiplied in place by scaleOut; the plotting
#     cell further down multiplies its y bounds by the same scaleOut.
scaleOut = 100
s_t *= scaleOut
t_t *= scaleOut

### Note on Scaling and Variance Saturation:

   ```GaussianMixtureModel``` needs to handle the variance scaling. The question is, should it scale up from tanh and *then* calculate loss? Or should it keep everything within the tanh range, and then scale up only for outputs?
   
   If the actual range is large, then it would be more accurate and safer to scale up to the range. But if the range is small, as in the spider example, it would actually be safer to keep it at tanh. I think if it can be determined that the tanh range can accurately represent the means and variances, then it's best to keep it there. Otherwise, we might need to expand everything to a middle-man range where loss can be calculated, and then expand to the actual range on formula retrieval.

In [4]:
#create gmm with data
#draw a fresh numpy seed on every run, but keep and report it so that a
#     particular run can be reproduced later by assigning gmmSeed by hand.
#     NOTE(review): whether the model's weight initialisation is driven by
#     numpy's RNG is decided inside gmix and is not visible here -- confirm.
gmmSeed = np.random.randint(100000)
np.random.seed(gmmSeed)
print('numpy seed: %d' % gmmSeed)
gmm = gmix.GaussianMixtureModel(s_x, s_t, t_x, t_t,
                               numGaussianComponents=5, hiddenLayerSize=20,
                               learningRate=0.004) #0.005 worked for 2d

### Thoughts on Hyperparameters

  * The larger the hidden layers, the more representations of globally best solutions. Thus, a slower training rate can be afforded, as there are more routes out of local minima.
  * Small hidden layers may require a larger training rate so it can jump out of local minima.
  * Examining the mixing coefficient averages reveals whether or not some gaussian components are not being used. These should be minimized.
 
### Hyperparameters as Variables

   * We need an **intelligent learning rate**. It should make guesses as to whether it's stuck in a local minimum or homing in on a good solution. If it thinks it's stuck, it should set the learning rate high to jump out; if it's working on a good solution, it should keep the learning rate low to stay on track.
      * the loss function needs to scale with the number of samples; otherwise we'll see steeper gradients for larger sample batches.
   * The **number of gaussian components** can be selected based on how many are being used, and how much. Having this change during training would require a restructuring of the network, however, and preserving training before restructuring may be impossible.
      * in other words, this may slow down training considerably, but complexity reduction would vastly increase execution speed.
   * It may be worth spawning **a number of networks** working on the same solution. This is a good way to determine whether a **local or global** solution has been found.

# Train Step

In [15]:
%%capture
#d will be a dictionary of evaluated tensors under their standard name.
runTimes = 1000
reports = 10
assert reports>=1

gmm.learningRate=0.003
gmm.train(iterations=runTimes, testBatchSize=1000,
          trainBatchSize=5000, reportEvery=int(runTimes/reports))


# Debugging

In [6]:
def scale(x, y, low, high, i=-1):
    """Keep only the rows whose y value in column i lies within [low, high].

    x and y are 2d arrays; column i of y (the last column by default) is
    compared against the inclusive bounds, and the selected rows of both
    arrays are returned as the tuple (x, y).
    """
    column = y[:, i]
    inRange = np.logical_and(column >= low, column <= high)
    rows = np.flatnonzero(inRange)
    return x[rows], y[rows]

In [7]:
def init4d(x, y, yLow=None, yHigh=None):
    """Scatter-plot three input dimensions in 3d, coloured by one output.

    x     -- 2d array with exactly three columns (plotted as x1, x2, x3).
    y     -- 2d array with exactly one column; its values set the colour.
    yLow  -- optional inclusive lower bound on y; rows below it are dropped.
    yHigh -- optional inclusive upper bound on y; rows above it are dropped.

    Either bound may be given on its own. Returns the tuple
    (figure, scatter collection).
    """

    #TODO enable this to take two sets of x and y,
    #    so they can be compared easily.

    assert y.ndim == 2
    assert y.shape[1] == 1
    assert x.ndim == 2

    if (yLow is not None) or (yHigh is not None):
        #exclude points outside of low/high range. a missing bound is
        #    replaced by +/- infinity so culling by only one side works.
        low = -np.inf if yLow is None else yLow
        high = np.inf if yHigh is None else yHigh
        x, y = scale(x, y, low, high)

    #deliberately no np.squeeze: with a single remaining point it would
    #    drop the length-1 axis and break the rank checks below.
    x = x.transpose()
    y = y[:,0]

    assert y.ndim == 1
    assert x.ndim == 2
    assert x.shape[0] == 3

    fig = plt.figure()
    ax = fig.add_subplot(111, projection='3d') #create subplot in figure.

    #set the color map to the output dimension.
    scatPlot = ax.scatter(x[0], x[1], x[2], c=y) #create plot

    ax.set_xlabel("x1")
    ax.set_ylabel("x2")
    ax.set_zlabel("x3")

    #add colorbar label
    cb = fig.colorbar(scatPlot)
    cb.set_label("y")

    return fig, scatPlot

In [16]:
#fetch the three arrays returned by gmm.get_xmvu and draw samples from them
#     at the test inputs.
#     NOTE(review): the exact meaning of m, v and u is defined in gmix/smpl
#     and is not visible in this notebook -- confirm.
m, v, u = gmm.get_xmvu()
x, y = smpl.sample_mixture(t_x, m, v, u) #set to gmm sample
#x = t_x; y = t_t #set to actual values

#cull data to a set number of points.
#     rows are drawn WITHOUT replacement so no point is plotted twice and
#     exactly lim distinct points remain.
lim = 1000
if x.shape[0] > lim:
    randRows = np.random.choice(x.shape[0], size=lim, replace=False)
    x = x[randRows]
    y = y[randRows]

#only plot outputs inside this y window (bounds are multiplied by scaleOut,
#     like the targets). the trailing semicolon hides the returned tuple's repr.
#init4d(x, y) #alternative: plot without a y window
init4d(x, y, yLow=-0.01*scaleOut, yHigh=0.005*scaleOut);

<IPython.core.display.Javascript object>

(<matplotlib.figure.Figure at 0x17efd8ed0>,
 <mpl_toolkits.mplot3d.art3d.Path3DCollection at 0x17f46dcd0>)

In [17]:
#names of everything we want evaluated, grouped by role.
#aggregated gradients, one per weight/bias array.
gradientNames = [
    'calc_agg_grad_w1', 'calc_agg_grad_b1',
    'calc_agg_grad_w2', 'calc_agg_grad_b2',
    'calc_agg_grad_w3', 'calc_agg_grad_b3',
    ]
#the evaluated 'v' and 'm' outputs.
outputNames = ['v', 'm']
#the weight and bias arrays themselves.
parameterNames = ['w1', 'w2', 'w3', 'b1', 'b2', 'b3']

evalStr = gradientNames + outputNames + parameterNames

#d maps each requested name to its evaluated value.
d = gmm.get_evals(evalStr)

In [18]:
#save parameters of trained network for use by the spider brain.
#TODO fix variance scaling, otherwise,
#   the spider will need to rescale the output.
#each array is stored in the archive under its own name.
savedNames = ('w1', 'w2', 'w3', 'b1', 'b2', 'b3')
np.savez('spi_gmm_wb.npz', **dict((key, d[key]) for key in savedNames))

In [10]:
#%%capture
#print every aggregated gradient, each preceded by a line with its name.
for gradName in ('calc_agg_grad_w1', 'calc_agg_grad_b1',
                 'calc_agg_grad_w2', 'calc_agg_grad_b2',
                 'calc_agg_grad_w3', 'calc_agg_grad_b3'):
    print(gradName)
    print(d[gradName])

calc_agg_grad_w1
[[ -2.70116379e-05  -1.98263116e-03  -7.23154168e-04  -6.89868408e-04
    2.90593540e-04   5.06936805e-04  -1.16172724e-03  -2.83261877e-04
    1.16069292e-04  -5.16005348e-05  -6.71732705e-04   6.22277090e-04
    1.38824762e-04   1.14822132e-03   7.38940493e-04   2.30848425e-04
   -3.63264146e-04  -1.46061950e-03  -1.06387008e-04  -5.44599839e-04]
 [  7.06047285e-04  -3.66355758e-04  -1.57967489e-03  -4.08612512e-04
    8.68742936e-04   1.69277319e-03  -3.59494356e-04  -9.80224926e-04
   -1.77434122e-04  -3.97690397e-04  -5.65754308e-04   3.99136334e-04
    5.76156075e-04   7.76578789e-04   1.24315755e-03   3.31460644e-04
   -7.94072403e-04  -4.16844268e-04  -6.03228807e-04  -1.18860148e-03]
 [ -4.45586105e-04   1.43642596e-04   1.22843718e-04   1.07971391e-04
    2.89406569e-04  -6.99157128e-04   6.91376394e-04  -9.70246852e-04
   -5.43955888e-04  -1.09084696e-03   9.66408174e-04  -6.71004120e-04
    2.64565577e-04  -8.54282756e-04  -1.11522200e-03  -2.84855458e-04
 

In [11]:
#show the evaluated 'v' array.
print(d['v'])

[[ 0.3015677 ]
 [ 0.2910279 ]
 [ 0.28570849]
 ..., 
 [ 0.16120334]
 [ 0.17074935]
 [ 0.16143209]]


In [12]:
#average 'm' over the first axis, giving one value per column.
print(np.mean(d['m'], axis=0))

[ 0.09440826  0.18872203  0.23239148  0.24158886  0.24288997]


### Note on Mixing Coefficients
I have yet to see a mixing coefficient much below .1. This tells me something may be awry, and may be/is the cause of many stray points.