Alex Malz, David Mykytyn

This is a sandbox for developing an unsupervised classifier of astronomical lightcurves.

In [None]:
from collections import namedtuple
import itertools
import random
import numpy as np
import scipy.stats as sps
import scipy.optimize as spo
import matplotlib.pyplot as plt
%matplotlib inline
import corner

In [None]:
# Lightweight light-curve record: ``x`` holds the abscissa values and ``y``
# the ordinate values of the same observations.
LC = namedtuple('LC', 'x y')

May need to preprocess the data to keep things reasonable, e.g. by placing constraints on delta/stretch.

# Simulate some mock data

In [None]:
def make_gauss(scale, loc=0., amp=1., const=0.):
    """Build a Gaussian-shaped model curve.

    Parameters
    ----------
    scale : float
        Standard deviation handed to ``scipy.stats.norm``.
    loc : float, optional
        Mean handed to ``scipy.stats.norm``.
    amp : float, optional
        Multiplier applied to the normal pdf.
    const : float, optional
        Offset added after scaling.

    Returns
    -------
    callable
        Function of ``x`` returning ``amp * pdf(x) + const``.
    """
    frozen = sps.norm(loc, scale)

    def curve(x):
        return const + amp * frozen.pdf(x)

    return curve

def make_sine(period, phase=0., amp=1., const=0.):
    """Build a sinusoidal model curve that repeats every ``period`` in x.

    Parameters
    ----------
    period : float
        Distance in ``x`` after which the curve repeats; must be non-zero.
    phase : float, optional
        Phase offset in radians.
    amp : float, optional
        Amplitude of the oscillation.
    const : float, optional
        Offset added to the oscillation.

    Returns
    -------
    callable
        Function of ``x`` returning
        ``amp * sin(2 * pi * x / period + phase) + const``.
    """
    # Convert the period to an angular frequency; multiplying x by the period
    # itself would make the curve repeat every 2*pi/period instead.
    angular_freq = 2. * np.pi / period

    def curve(x):
        return amp * np.sin(angular_freq * x + phase) + const

    return curve

In [None]:
def make_cadence(x, scatter):
    """Perturb a set of observation times with independent uniform jitter.

    Parameters
    ----------
    x : array-like
        Nominal times; every consecutive gap must exceed ``scatter``.
    scatter : float
        Half-width of the jitter: each time moves by a value drawn
        uniformly from ``[-scatter, scatter)``.

    Returns
    -------
    ndarray
        ``x`` plus one independent jitter draw per element.
    """
    x = np.asarray(x)
    assert np.all(np.diff(x) > scatter), "cadence gaps must exceed scatter"
    # The shape has to go in as ``size``; passed positionally it is taken as
    # the lower bound and a single draw is broadcast over every time.
    jitter = np.random.uniform(-scatter, scatter, size=np.shape(x))
    return x + jitter

def noisify_obs(y, scatter):
    """Add zero-mean Gaussian noise of width ``scatter`` to ``y``.

    Parameters
    ----------
    y : array-like
        Noise-free values.
    scatter : float
        Standard deviation of the noise, also reported as the per-point
        uncertainty.

    Returns
    -------
    tuple
        ``(noisy_y, errs)`` where ``errs`` has the shape of ``y`` and is
        filled with ``scatter``.
    """
    noise_dist = sps.norm(0., scatter)
    noise = noise_dist.rvs(np.shape(y))
    sigma = scatter * np.ones_like(y)
    return (y + noise, sigma)

In [None]:
# Nominal cadence shared by both mock objects: one epoch every 5 units of
# time over [0, 200).
def_cadence = np.arange(0., 200., 5.)

# Gaussian-shaped mock light curve: jittered epochs, then noisy photometry.
gmodel = make_gauss(scale=10., loc=100., amp=50., const=1.)
gtimes = make_cadence(def_cadence, scatter=0.5)
gphot, gerr = noisify_obs(gmodel(gtimes), scatter=0.1)
glc = LC(x=gtimes, y=gphot)

# Sinusoidal mock light curve built the same way, with larger noise.
smodel = make_sine(period=20., phase=0., amp=5., const=5.)
stimes = make_cadence(def_cadence, scatter=0.5)
sphot, serr = noisify_obs(smodel(stimes), scatter=0.3)
slc = LC(x=stimes, y=sphot)

In [None]:
# Overlay both mock light curves with their error bars, drawing markers only
# (no connecting line).  Each LC unpacks to its (x, y) pair.
plt.errorbar(*glc, yerr=gerr, linestyle='None', marker='o')
plt.errorbar(*slc, yerr=serr, linestyle='None', marker='+')

## Permitted transformations

* shiftx
* stretchx
* shifty
* stretchy
* (cross-talk between bands)


In [None]:
def transform(lc, deltax, deltay, stretchx, stretchy):
    """Apply an independent affine map to each axis of a light curve.

    Parameters
    ----------
    lc : LC
        Input light curve; it is not modified.
    deltax, deltay : float
        Offsets added to ``x`` and ``y`` after stretching.
    stretchx, stretchy : float
        Multiplicative factors applied to ``x`` and ``y``.

    Returns
    -------
    LC
        New light curve with ``x -> stretchx * x + deltax`` and
        ``y -> stretchy * y + deltay``.
    """
    x, y = lc.x, lc.y
    return LC(x=deltax + (stretchx * x), y=deltay + (stretchy * y))

def merge(lca, lcb):
    """Pool the points of two light curves into one, sorted by ``x``.

    Parameters
    ----------
    lca, lcb : LC
        Light curves whose ``x`` and ``y`` arrays are concatenated.

    Returns
    -------
    LC
        All points of both inputs, ordered by increasing ``x``.
    """
    stacked_x, stacked_y = (
        np.concatenate(pair) for pair in ((lca.x, lcb.x), (lca.y, lcb.y))
    )
    rank = stacked_x.argsort()
    # Every point is kept; cropping to the x-range common to both inputs
    # was tried at some stage and is currently disabled.
    return LC(stacked_x[rank], stacked_y[rank])

# Reduce to summary statistics (consistency metric)

Contenders:

* periodogram -- identify periodicity and stochastic noise levels
* flux per time bins -- trends keeping bin size constant but changing bin ends
* abs/percent change in color and total flux/magnitude

find MAP/MLE of p(A = B | lc_A, lc_B)
optimize over shift/stretch params

merge (x_A, x_B) and (y_A, y_B)

Regularization is going to be really hard!

The connect-the-dots statistic is an arc length: the total length of the straight segments joining consecutive points.

could use the gaussian error bars to get probability that new hypothesis point lies on original line?

In [None]:
def connect_the_dots(lc):
    """Return the length of the polyline through the points of ``lc``.

    Consecutive points are joined, in the order they are stored, by straight
    segments whose Euclidean lengths are summed.

    Parameters
    ----------
    lc : LC
        Light curve with array attributes ``x`` and ``y``.

    Returns
    -------
    float
        Total length of all segments.
    """
    step_x = np.diff(lc.x)
    step_y = np.diff(lc.y)
    segment_lengths = np.sqrt(np.square(step_x) + np.square(step_y))
    return segment_lengths.sum()

In [None]:
# Copy of the Gaussian light curve displaced by 50 time units, reusing the
# same photometry and uncertainties (a fresh cadence / noise realisation is
# the alternative that was considered here).
gtimes2 = gtimes + 50.
gphot2, gerr2 = gphot, gerr
glc2 = LC(x=gtimes2, y=gphot2)

# Compare the two curves with their merger, both visually and by arc length.
merged_lc = merge(glc, glc2)
for curve, tag in ((glc, 'original'), (glc2, 'shifted'), (merged_lc, 'merged')):
    plt.plot(curve.x, curve.y, label=tag)
plt.legend()
print((connect_the_dots(glc), connect_the_dots(glc2), connect_the_dots(merged_lc)))

In [None]:
# Arc lengths of each curve alone and of the merger once the shifted copy is
# nudged back by half a time unit.
nudged = transform(glc2, -0.5, 0., 1., 1.)
arc_lengths = (
    connect_the_dots(glc),
    connect_the_dots(glc2),
    connect_the_dots(merge(glc, nudged)),
)
print(arc_lengths)

In [None]:
def find_max_prob(lca, lcb, ivals=(0., 0., 1., 1.)):
    """Fit the shift/stretch of ``lcb`` that best overlays it on ``lca``.

    The objective is the arc length (``connect_the_dots``) of ``lca`` merged
    with the transformed ``lcb``, minus the arc length of ``lca`` alone.  It
    is minimised with COBYLA subject to bounds built from the overall extent
    and the smallest consecutive step of both light curves.

    Parameters
    ----------
    lca : LC
        Reference light curve; left untouched.
    lcb : LC
        Light curve that gets shifted and stretched.
    ivals : sequence of 4 floats, optional
        Initial guess ``(deltax, deltay, stretchx, stretchy)``.

    Returns
    -------
    res : ndarray
        Optimised ``(deltax, deltay, stretchx, stretchy)``.
    debug : float
        Arc length of ``lca`` merged with ``lcb`` transformed by ``res``.
    """
    origa = connect_the_dots(lca)

    # Largest scale: overall extent of either curve (the x extent is taken
    # from the first and last samples, so x is presumably sorted -- confirm).
    # Smallest scale: smallest consecutive step in either curve.  Both curves
    # use the same definition of "step" here.
    xdifmax = np.max((lca.x[-1] - lca.x[0], lcb.x[-1] - lcb.x[0]))
    xdifmin = np.min((np.min(np.diff(lca.x)), np.min(np.diff(lcb.x))))
    ydifmax = np.max((np.max(lca.y) - np.min(lca.y), np.max(lcb.y) - np.min(lcb.y)))
    # NOTE(review): y steps are signed, so ydifmin is usually negative --
    # confirm whether absolute steps were intended.
    ydifmin = np.min((np.min(np.diff(lca.y)), np.min(np.diff(lcb.y))))

    # COBYLA treats a constraint as satisfied when its value is >= 0.
    def dxlim_hi(params):
        return xdifmax - params[0]

    def dxlim_lo(params):
        return params[0] - xdifmin

    def dylim_hi(params):
        return ydifmax - params[1]

    def dylim_lo(params):
        return params[1] - ydifmin

    def sxlim_hi(params):
        return xdifmax - params[2] * xdifmin

    def sxlim_lo(params):
        return params[2] * xdifmax - xdifmin

    def sylim_hi(params):
        return ydifmax - params[3] * ydifmin

    def sylim_lo(params):
        return params[3] * ydifmax - ydifmin

    def slim(params):
        # Both stretch factors must stay non-negative.
        return params[2:]

    # NOTE(review): dxlim_lo is defined but not enforced; with a positive
    # xdifmin it would rule out a zero shift -- confirm the omission.
    constraints = [dxlim_hi, dylim_hi, dylim_lo, sxlim_hi, sxlim_lo, sylim_hi, sylim_lo, slim]

    def _helper(params):
        (deltax, deltay, stretchx, stretchy) = params
        candidate = transform(lcb, deltax, deltay, stretchx, stretchy)
        return connect_the_dots(merge(lca, candidate)) - origa

    res = spo.fmin_cobyla(_helper, ivals, constraints)

    # Re-evaluate the merged arc length at the solution for inspection.
    fin = merge(lca, transform(lcb, res[0], res[1], res[2], res[3]))
    debug = connect_the_dots(fin)
    return (res, debug)

In [None]:
# Fit the transform that maps the shifted copy back onto the original curve;
# ``debug`` is the arc length of the resulting merged curve.
ans, debug = find_max_prob(lca=glc, lcb=glc2)

In [None]:
# (dx, dy, sx, sy) = ans
# print(ans)
# fin = transform(glc2, dx, dy, sx, sy)
# print(connect_the_dots(fin))
# plt.plot(glc.x, glc.y)
# plt.plot(fin.x, fin.y)
# plt.plot(merge(glc, fin).x, merge(glc, fin).y)

In [None]:
def plot_reconstruct(lca, lcb, params, truea='', trueb=''):
    """Plot a fitted transform of ``lcb`` against the reference ``lca``.

    Draws the reference curve, the untransformed hypothesis, the transformed
    hypothesis and the merger of reference and transformed hypothesis, then
    shows and closes the figure.  The transformed x values are printed.

    Parameters
    ----------
    lca : LC
        Reference light curve.
    lcb : LC
        Hypothesis light curve to transform.
    params : sequence of 4 floats
        ``(deltax, deltay, stretchx, stretchy)`` passed on to ``transform``.
    truea, trueb : str, optional
        Suffixes appended to the legend labels of ``lca`` and ``lcb``.
    """
    shift_x, shift_y, scale_x, scale_y = params
    moved = transform(lcb, shift_x, shift_y, scale_x, scale_y)
    print(moved.x)
    combined = merge(lca, moved)
    curves = (
        (lca, 'reference' + truea),
        (lcb, 'hypothetical' + trueb),
        (moved, 'transformed' + trueb),
        (combined, 'merged'),
    )
    for curve, tag in curves:
        plt.plot(curve.x, curve.y, label=tag)
    plt.title(str(params))
    plt.legend()
    plt.show()
    plt.close()

In [None]:
# Visual check of the fit between the original and shifted Gaussian curves.
plot_reconstruct(lca=glc, lcb=glc2, params=ans)

# Do this many times

In [None]:
num_obj = 10
cls_models = [make_gauss, make_sine]
cls_params = [{'scale': 10., 'loc': 100., 'amp': 50., 'const': 1.},
              {'period': 20., 'phase': 0., 'amp': 5., 'const': 5.}]
cls_wts = None # even split for now
num_cls = len(cls_models)
# will need a way to draw model params

def_cadence = np.arange(0., 200., 5.)

# Draw the true class of every object, then summarise which classes occur.
truth = np.random.choice(range(num_cls), num_obj, p=cls_wts)
ids, inds, cts = np.unique(truth, return_counts=True, return_inverse=True)

# One mock light curve per object, generated from its true class model.
lcs = []
for label in truth:
    times = make_cadence(def_cadence, 0.5)
    model = cls_models[label](**cls_params[label])
    phot, err = noisify_obs(model(times), 0.1)
    lcs.append(LC(times, phot))

# masks[c, a, b] is 1 when objects a and b are distinct members of class c.
# The outer product also copes with a class that has a single member, where
# enumerating ordered pairs yields nothing to index with.
masks = np.zeros((num_cls, num_obj, num_obj))
for i in ids:
    members = truth == i
    masks[i] = np.outer(members, members)
    np.fill_diagonal(masks[i], 0.)

In [None]:
def mini_pipeline(all_lcs):
    """Run ``find_max_prob`` on every ordered pair of light curves.

    Parameters
    ----------
    all_lcs : sequence of LC
        Light curves to compare; each one is also paired with itself.

    Returns
    -------
    tuple
        ``(params, lengths)``: ``params[i][j]`` holds the fitted transform of
        curve ``j`` onto curve ``i`` and ``lengths[i][j]`` the second value
        returned by ``find_max_prob`` for that pair.
    """
    count = len(all_lcs)
    lengths = np.empty((count, count))
    fitted = []

    for row, reference in enumerate(all_lcs):
        row_params = []
        for col, hypothesis in enumerate(all_lcs):
            best, merged_len = find_max_prob(reference, hypothesis)
            row_params.append(np.asarray(best))
            lengths[row][col] = merged_len
        fitted.append(row_params)

    return (np.array(fitted), lengths)

In [None]:
# Pairwise fits over the whole mock population: fitted transform parameters
# and merged arc lengths.
all_params, all_difs = mini_pipeline(all_lcs=lcs)

In [None]:
# check for symmetry -- really thought these would be symmetric. . .
plt.matshow(np.sum(masks, axis=0))

# Move the parameter axis to the front without touching the two object axes,
# so layered[k][i, j] is parameter k of the fit of object j onto object i.
layered = np.moveaxis(all_params, -1, 0)


def deltafunc(x):
    """Magnitude of a shift."""
    return np.abs(x)


def stretchfunc(x):
    """Fold a stretch and its inverse onto one value (elementwise minimum)."""
    return np.minimum(x, 1./x)


funcs = [deltafunc, deltafunc, stretchfunc, stretchfunc]

# Every panel now shares the same (row, column) orientation, so the shift
# and stretch maps can be compared against each other directly.
for i in range(4):
    plt.matshow(funcs[i](layered[i]))
    plt.plot([0, num_obj-1], [0, num_obj-1], color='k')

# Cluster in the space of summary statistics

kdtree (and more)

In [None]:
# Histogram the merged arc lengths of same-class pairs (one histogram per
# class) and of all remaining pairs.
global_mask = np.zeros((num_obj, num_obj), dtype=bool)
for i in range(num_cls):
    class_mask = masks[i].astype(bool)
    global_mask = np.logical_or(global_mask, class_mask)
    # Select the class pairs instead of zeroing the others, which would add
    # a spurious spike at zero to the histogram.
    plt.hist(all_difs[class_mask], alpha=0.25, label=str(i))
# Use the full union mask: everything that no class mask marks.
# NOTE(review): if the class masks leave the diagonal unset, self-pairs are
# counted here as well -- confirm that is wanted.
plt.hist(all_difs[~global_mask], alpha=0.25, label='no match')
plt.legend()

In [None]:
# Flatten every pair into one row of fitted parameters; the row count is
# inferred so this keeps working when the number of objects changes.
corner.corner(all_params.reshape(-1, all_params.shape[-1]))

In [None]:
def listerize(data, masks):
    """Split ``data`` into one flat selection per class mask plus a remainder.

    For every entry of ``masks`` the elements of ``data`` selected by that
    mask are gathered into a 1-D array; the elements selected by none of the
    masks are gathered separately.

    Parameters
    ----------
    data : array-like
        Values to split.
    masks : array-like
        One mask per class, stacked along the first axis; non-zero entries
        mark members of the class.

    Returns
    -------
    tuple
        ``(layers, others)``: ``layers`` is a list with one compressed array
        per class and ``others`` the compressed array of unmarked elements.
    """
    datashape = np.shape(data)  # not used below
    # Running union of the class masks, started as an all-False mask with the
    # shape of a single class mask.
    global_mask = np.ma.make_mask_none(np.shape(masks)[1:])
    layers = []
    for i in range(len(masks)):# per class
        # NOTE(review): np.ma.make_mask shrinks an all-False input to
        # ``nomask`` by default -- confirm an empty class is handled.
        one_mask = np.ma.make_mask(masks[i])
        # A masked-array mask flags the entries to hide, hence the inversion
        # to keep only the members of this class.
        # NOTE(review): the mask gains a leading axis here; this presumably
        # anticipates data with an extra leading dimension -- confirm the
        # shapes agree for the data actually passed in.
        layer = np.ma.array(data, mask=np.ma.logical_not(one_mask)[np.newaxis])#data * masks[i][np.newaxis]
        global_mask = np.ma.mask_or(global_mask, one_mask)
        layers.append(layer.compressed())
        
    global_mask = np.ma.make_mask(global_mask)
    # Hiding everything marked by any class leaves the unmatched elements.
    others = np.ma.array(data, mask=global_mask[np.newaxis]).compressed()#data * ~global_mask[np.newaxis]
    return(layers, others)

In [None]:
# Split the pairwise results into same-class selections and the remainder.
# NOTE(review): the corner plot further down labels its axes with the four
# transform parameters, whereas this passes the merged arc lengths --
# confirm that all_difs is the intended input.
per_class, mismatch = listerize(data=all_difs, masks=masks)

In [None]:
def density_estimation(m1, m2):
    """Evaluate a 2-D Gaussian KDE of ``(m1, m2)`` on a 100 x 100 grid.

    Parameters
    ----------
    m1, m2 : sequence of float
        Coordinates of the samples along the first and second axis.

    Returns
    -------
    tuple
        ``(X, Y, Z)``: grid coordinates spanning the range of ``m1`` and
        ``m2``, and the estimated density at every grid point.
    """
    grid_x, grid_y = np.mgrid[min(m1):max(m1):100j, min(m2):max(m2):100j]
    samples = np.vstack([m1, m2])
    kde = sps.gaussian_kde(samples)
    query = np.vstack([grid_x.ravel(), grid_y.ravel()])
    density = kde(query).T.reshape(grid_x.shape)
    return grid_x, grid_y, density

def mycorner(data, keys, colors, maps, lims=None, pre_densities=None, filename='plot.pdf'):
    """Draw a lower-triangular grid of histograms and scatter plots.

    Diagonal panels hold a step histogram of one component and off-diagonal
    panels a scatter of two components, overlaid for every entry of ``data``.

    Parameters
    ----------
    data : sequence
        One dataset per overlay; component ``i`` is read as
        ``datum[i].data``.
    keys : sequence of str
        Axis labels; their number sets the grid size.
    colors : sequence
        One matplotlib colour per dataset.
    maps, lims, pre_densities, filename
        Accepted but not used by the active code (they belong to the
        disabled density-contour and save-to-file paths).

    Returns
    -------
    None
    """
    n_keys = len(keys)
    side = n_keys * 5
    figure = plt.figure(figsize=(side, side))
    # Only the lower triangle of the grid gets axes, row by row.
    panels = []
    for row in range(n_keys):
        panels.append([figure.add_subplot(n_keys, n_keys, n_keys * row + col + 1)
                       for col in range(row + 1)])
    cells = [(row, col) for row in range(n_keys) for col in range(row + 1)]

    for k in range(len(data)):
        datum = data[k]
        npoints = len(datum)  # only consulted by the disabled contour path
        for row, col in cells:
            panel = panels[row][col]
            if row != col:
                # Density contours were an alternative for mid-sized samples;
                # a plain scatter is always used at present.
                panel.scatter(datum[row].data, datum[col].data, color=colors[k], alpha=0.5)
                panel.set_xlabel(keys[row])
                panel.set_ylabel(keys[col])
            else:
                panel.hist(datum[row].data, histtype='step', linewidth=2, alpha=0.5, color=colors[k])
                panel.set_xlabel(keys[row])
    return
# replace with 2d histogram for speed

In [None]:
# Corner-style overview: the two same-class selections and the mismatches.
mycorner(
    data=[per_class[0], per_class[1], mismatch],
    keys=['deltax', 'deltay', 'stretchx', 'stretchy'],
    colors=['r', 'g', 'b'],
    maps=['Reds', 'Greens', 'Blues'],
)

In [None]:
# Show the fitted reconstruction for every ordered pair of objects, tagging
# each curve with its true class.
for i, j in itertools.product(range(num_obj), repeat=2):
    plot_reconstruct(lcs[i], lcs[j], all_params[i][j], truea=str(truth[i]), trueb=str(truth[j]))

# Other ideas

pairwise combinations/comparisons?