In [None]:
import pathlib

import astropy.coordinates as coord
import astropy.table as at
import astropy.units as u
import corner
import matplotlib as mpl
import matplotlib.pyplot as plt
%matplotlib inline
import numpy as np
from tqdm import tqdm, trange

from scipy.spatial import cKDTree
from sklearn.decomposition import IncrementalPCA
from sklearn.neighbors import KernelDensity

from schlummernd.data import load_data, Features, Labels
from schlummernd.plot import colored_corner

In [None]:
# plot_path = conf.plot_path / 'neighborhoods'
# plot_path.mkdir(exist_ok=True)

In [None]:
# Random number generator with a fixed, hard-coded seed (it is not read from a
# config file in this notebook) so the random subset drawn below is reproducible
rng = np.random.default_rng(seed=42)

### Select a subset of stars to use for defining the neighborhoods:

In [None]:
# Load the parent sample, restricted to the TEFF / LOGG / M_H ranges below
# (presumably a bound of None leaves that side open - confirm in load_data).
g_all = load_data(
    filters=dict(
        TEFF=(2500, 10000),
        LOGG=(-1, 6),
        M_H=(-3, None)
    )
)
# Keep a single row per APOGEE_ID: np.unique(..., return_index=True) gives the
# index of the first occurrence of each unique ID.
g_all = g_all[np.unique(g_all.APOGEE_ID, return_index=True)[1]]
print(len(g_all))

# Subset used to define the neighborhoods: SNR >= 200 and ruwe <= 1.2
# (assuming filter bounds are inclusive - TODO confirm).
g = g_all.filter(SNR=(200, None), ruwe=(None, 1.2))
# TODO: this number (size) should be configurable
# Cap the request at the number of stars that survived the cuts: drawing
# without replacement raises a ValueError if size > len(g).
subset_size = min(16384, len(g))
idx = rng.choice(len(g), size=subset_size, replace=False)
g = g[idx]
print(len(g))

In [None]:
# Spectroscopic HR diagram: the full sample as a log-scaled 2D histogram,
# with the selected subset over-plotted as individual points.
teff_logg_bins = (
    np.linspace(3000, 9000, 128),
    np.linspace(-0.5, 5.75, 128),
)
teff_edges, logg_edges = teff_logg_bins

fig, ax = plt.subplots(figsize=(6, 6))

ax.hist2d(
    g_all.TEFF, g_all.LOGG,
    bins=teff_logg_bins,
    norm=mpl.colors.LogNorm(),
    cmap='Greys',
)
ax.plot(
    g.TEFF, g.LOGG,
    ls='none', marker='o', mew=0, ms=2.,
    color='tab:blue', alpha=0.75,
)

# Limits are passed as (max, min) so that both axes are drawn inverted.
ax.set_xlim(teff_edges.max(), teff_edges.min())
ax.set_ylim(logg_edges.max(), logg_edges.min())

ax.set_xlabel('TEFF')
ax.set_ylabel('LOGG')

fig.tight_layout()
# fig.savefig(plot_path / 'subset-logg-teff.png', dpi=200)

In [None]:
# Two broad-band colors of the full sample, each scaled by 0.1.
# NOTE(review): `other_features` is not referenced again in the visible part
# of this notebook - confirm whether it is still needed.
other_features = {
    r"$G_{\rm BP}-G_{\rm RP}$": 0.1 * (g_all.phot_bp_mean_mag - g_all.phot_rp_mean_mag).value,
    r"$G_{\rm BP}-H$": 0.1 * (g_all.phot_bp_mean_mag.value - g_all.H)
}

# Feature objects for the full sample (f_all) and for the selected subset (f),
# built with the same settings so that their .X matrices share columns.
# presumably n_bp / n_rp set how many BP / RP terms are used - verify against
# Features.from_gaiadata
f_all = Features.from_gaiadata(g_all, n_bp=5, n_rp=5)
f = Features.from_gaiadata(g, n_bp=5, n_rp=5)
# f = f_all[idx]

In [None]:
# tmp = parent.mask_spec_pixels(global_spec_mask)
# parent_X = tmp.get_neighborhood_X(conf.neighborhood_color_names)

In [None]:
# Corner plot of the first three feature columns of the subset, with
# statistic='count' and no scatter points (presumably binned number counts -
# confirm against schlummernd.plot.colored_corner)
_ = colored_corner(f.X[:, :3], scatter=False, statistic='count')

In [None]:
# First feature column against TEFF for the subset stars.
# A marker is required here: with ls='none' and matplotlib's default of no
# marker, the original call drew nothing at all.
fig, ax = plt.subplots(figsize=(6, 6))
ax.plot(
    # g.bp[:, 1]/g.bp[:, 0],
    f.X[:, 0],
    g.TEFF,
    ls='none', marker='o', mew=0, ms=2., alpha=0.75
)
ax.set_xlabel('feature 0')
ax.set_ylabel('TEFF');

In [None]:
# Corner plot of the first five feature columns of the subset, colored by TEFF
# with a colorbar (the trailing comment keeps optional color limits at hand)
_ = colored_corner(f.X[:, :5], scatter=False, color_by=g.TEFF, add_colorbar=True) # , vmin=4000, vmax=6000)

In [None]:
# Corner plot of every feature column of the subset. Each axis is limited to
# the 5th-95th percentile of its column (NaNs ignored) so that outliers do not
# set the plotting range.
feature_ranges = [np.nanpercentile(column, [5, 95]) for column in f.X.T]
_ = corner.corner(
    f.X,
    range=feature_ranges,
    plot_contours=False,
    plot_density=False,
    alpha=0.5,
)

Run PCA on the neighborhood node features and project the subset feature matrix onto the PCA basis:

In [None]:
# Number of principal components kept when projecting the feature matrix;
# this is also the dimensionality of the space the neighborhoods are built in.
# TODO: magic number
neighborhood_pca_components = 6

In [None]:
# First-pass PCA on the subset features. The projection is only used to find
# outliers; `pca` is refit on the cleaned subset in the next code cell.
pca = IncrementalPCA(n_components=neighborhood_pca_components,
                     batch_size=1024)  # TODO: magic number
# Divide each component by its singular value so all components end up on a
# comparable scale.
projected_X = pca.fit_transform(f.X) / pca.singular_values_

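This hacky step removes extreme outliers (stars more than 5σ from the mean in any PCA component) and then refits the PCA on the remaining stars. Only run it after the cells above: it overwrites `pca`, so the notebook must be run top to bottom!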

In [None]:
# Flag every star that lies more than 5 standard deviations from the mean in
# at least one projected component.
mean = projected_X.mean(axis=0)
std = projected_X.std(axis=0)
bad_mask = (np.abs(projected_X - mean) > 5*std).any(axis=1)

# The "neighborhood nodes" are the subset stars that survive the clipping.
keep_mask = ~bad_mask
neighborhood_node_X = f.X[keep_mask]
neighborhood_node_g = g[keep_mask]

# Refit the PCA on the cleaned subset. This rebinds `pca`; every later cell
# uses this second fit.
pca = IncrementalPCA(n_components=neighborhood_pca_components,
                     batch_size=1024)
node_projected_X = pca.fit_transform(neighborhood_node_X) / pca.singular_values_

In [None]:
# One panel per fitted principal component, showing its weights across the
# feature columns.
n_comp = pca.n_components_
# Ceil division, so an odd number of components still gets a panel for the
# last one (floor division dropped it, and gave 0 rows for one component).
n_rows = -(-n_comp // 2)
fig, axes = plt.subplots(n_rows, 2,
                         figsize=(16, 12), sharex=True,
                         squeeze=False)

for i, ax in enumerate(axes.flat):
    if i >= n_comp:
        # spare panel when the number of components is odd
        ax.set_visible(False)
        continue
    ax.plot(pca.components_[i])
    ax.set_ylabel(f'PCA component {i}')

fig.tight_layout()

In [None]:
# Cumulative fraction of variance explained by the retained components; the
# horizontal line marks 90%.
cumulative_variance = np.cumsum(pca.explained_variance_ratio_)
print(cumulative_variance[-1])
plt.plot(cumulative_variance)
plt.ylim(0.8, 1)
plt.axhline(0.9, color='tab:blue', alpha=0.5, zorder=-10)

### Plot the spectroscopic parameters, colored by PCA component

In [None]:
# One TEFF-LOGG panel per PCA component, with the neighborhood-node stars
# colored by their value of that component.
# Use the fitted attribute (n_components_) throughout, as the other cells do.
n_comp = pca.n_components_
# Square grid: 3x3 as before for up to 9 components, larger only when needed.
n_side = max(3, int(np.ceil(np.sqrt(n_comp))))
fig, axes = plt.subplots(n_side, n_side,
                         figsize=(10, 10),
                         sharex=True, sharey=True)

for i, ax in enumerate(axes.flat):
    if i >= n_comp:
        # hide the unused panels
        ax.set_visible(False)
        continue

    ax.scatter(
        neighborhood_node_g.TEFF,
        neighborhood_node_g.LOGG,
        c=node_projected_X[:, i], s=6
    )
    # Label in the upper-left corner (both axes are inverted below)
    ax.text(teff_logg_bins[0].max() - 100,
            teff_logg_bins[1].min() + 0.1,
            f'PCA feature {i}', va='top', ha='left')

# All panels share x and y, so setting the inverted limits on one panel is
# enough; do it on an explicit panel rather than on a leftover loop variable.
axes.flat[0].set_xlim(teff_logg_bins[0].max(),
                      teff_logg_bins[0].min())
axes.flat[0].set_ylim(teff_logg_bins[1].max(),
                      teff_logg_bins[1].min())

fig.tight_layout()
# fig.savefig(plot_path / 'neighborhood-logg-teff-pca.png', dpi=200)
# fig.savefig(plot_path / 'neighborhood-logg-teff-pca.png', dpi=200)

In [None]:
def simple_corner(X, labels=None, color_by=None, axes=None,
                  colorbar=False, **style):
    """Plot all pairs of columns of ``X`` on a lower-triangular grid of panels.

    For ``K`` columns the grid is ``(K - 1) x (K - 1)``. Panel ``(i, j)`` with
    ``j <= i`` shows column ``j`` on the x axis against column ``i + 1`` on
    the y axis; panels above the diagonal are hidden. No 1D histograms are
    drawn.

    Parameters
    ----------
    X : numpy.ndarray, shape (n_samples, n_columns)
        Data to plot, one column per dimension.
    labels : sequence of str, optional
        One axis label per column of ``X``.
    color_by : array-like, optional
        If given, points are drawn with ``Axes.scatter`` and colored by these
        values (passed as ``c`` unless ``c`` is already in ``style``).
        Otherwise ``Axes.plot`` is used with markers only.
    axes : matplotlib Axes or numpy.ndarray of Axes, optional
        Existing panels to draw into, indexed as ``axes[i, j]``. A single
        ``Axes`` is accepted for the two-column case. A new figure is created
        when omitted.
    colorbar : bool, optional
        Add a colorbar for the last drawn scatter. Only has an effect when
        ``color_by`` is given.
    **style
        Extra keyword arguments forwarded to ``Axes.plot`` / ``Axes.scatter``
        (for example ``vmin`` / ``vmax`` when ``color_by`` is used).

    Returns
    -------
    list
        ``[fig, axes]``, or ``[fig, axes, cb]`` when a colorbar was added.
        ``axes`` is always a 2D array.

    Raises
    ------
    ValueError
        If ``X`` has more columns than rows (most likely a transposed input),
        or fewer than two columns (no pairs to plot).
    """
    if X.shape[1] > X.shape[0]:
        raise ValueError("I don't believe you")

    nside = X.shape[1] - 1
    if nside < 1:
        # Fail early with a clear message instead of letting matplotlib
        # reject a 0 x 0 subplot grid.
        raise ValueError(
            "simple_corner needs at least 2 columns to plot pairs, "
            f"got {X.shape[1]}")

    if color_by is None:
        plotfunc = 'plot'
        style.setdefault('marker', 'o')
        # Accept both the long and the short spelling of each line property,
        # normalizing to the short one.
        style.setdefault('mew', style.pop('markeredgewidth', 0))
        style.setdefault('ls', style.pop('linestyle', 'none'))
        style.setdefault('ms', style.pop('markersize', 2.))
    else:
        plotfunc = 'scatter'
        style.setdefault('marker', 'o')
        style.setdefault('lw', style.pop('linewidth', 0))
        style.setdefault('s', 5)
        style.setdefault('c', color_by)

    # Some magic numbers for pretty axis layout.
    # Stolen from corner.py!
    # NOTE(review): K counts columns, while the grid only has K - 1 panels per
    # side, so the figure is one panel larger than strictly needed. Kept as is
    # to preserve the existing figure sizes.
    K = X.shape[1]
    factor = 2.0  # size of one side of one panel
    lbdim = 0.5 * factor  # size of left/bottom margin
    trdim = 0.2 * factor  # size of top/right margin
    whspace = 0.05  # w/hspace size
    plotdim = factor * K + factor * (K - 1.0) * whspace
    dim = lbdim + plotdim + trdim

    if axes is None:
        # squeeze=False: always get a 2D array back, even for a 1 x 1 grid
        fig, axes = plt.subplots(nside, nside,
                                 figsize=(dim, dim),  # (3*nside, 3*nside),
                                 sharex='col', sharey='row',
                                 constrained_layout=True,
                                 squeeze=False)
    else:
        # Wrap a bare Axes *before* touching `.flat`, which only exists on
        # arrays.
        if not isinstance(axes, np.ndarray):
            axes = np.array([[axes]])
        fig = axes.flat[0].figure

    cs = None
    for i in range(nside):
        for j in range(nside):
            ax = axes[i, j]
            if i < j:
                ax.set_visible(False)
            else:
                cs = getattr(ax, plotfunc)(X[:, j], X[:, i+1], **style)

    if labels is not None:
        # y labels on the first column, x labels on the bottom row
        for i in range(nside):
            axes[i, 0].set_ylabel(labels[i+1])

        for j in range(nside):
            axes[-1, j].set_xlabel(labels[j])

    return_stuff = [fig, axes]

    if colorbar and color_by is not None and cs is not None:
        cb = fig.colorbar(cs, ax=axes)
        return_stuff.append(cb)

    return return_stuff


In [None]:
# Quantity to color by -> (vmin, vmax) limits of its color scale
things = {
    'TEFF': (3000, 6500),
    'LOGG': (0.5, 5.5),
    'M_H': (-2, 0.5),
    'AK_WISE': (0, 1)
}
pca_labels = [f'PCA {i}' for i in range(pca.n_components_)]

# One corner plot of the projected node features per quantity
for name, (vmin, vmax) in things.items():
    fig, axes, cb = simple_corner(
        node_projected_X,
        labels=pca_labels,
        color_by=neighborhood_node_g[name],
        colorbar=True,
        vmin=vmin,
        vmax=vmax,
    )
    cb.ax.set_aspect(40)
    axes.flat[0].set_title(f'color: {name}')

    # fig.savefig(plot_path / f'neighborhood-pca-{name}.png', dpi=200)
    # plt.close(fig)

### Now use the sample to define the neighborhoods:

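We do this by estimating the local density in projected feature space with a kernel density estimate fit to the cleaned subset, and then building neighborhoods around the highest-density stars first.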

In [None]:
# The "parent" sample is the full (deduplicated, filtered) sample; this is its
# feature matrix, with rows in the same order as g_all
parent_X = f_all.X

In [None]:
# Kernel density estimate in the projected space, fit to the cleaned subset.
# Bandwidth heuristic: mean per-component standard deviation, scaled by
# sqrt(number of dimensions) / 3.
n_dim = node_projected_X.shape[1]
per_component_std = np.std(node_projected_X, axis=0)
bw = np.mean(per_component_std) * np.sqrt(n_dim) / 3
kde = KernelDensity(kernel='epanechnikov', bandwidth=bw)
_ = kde.fit(node_projected_X)

In [None]:
# Project the whole parent sample onto the PCA basis fit on the cleaned
# subset, applying the same per-component scaling as for the subset. The
# result is stored as float32.
n_parent = parent_X.shape[0]
parent_projected_X = np.zeros((n_parent, node_projected_X.shape[1]),
                              dtype=np.float32)

# Work through the rows in (up to) 31 contiguous chunks rather than in a
# single call. The edges run from 0 to n_parent inclusive, so the chunks
# cover every row and no clamping of the last edge is needed.
vals = np.linspace(0, n_parent, 32).astype(int)
for i1, i2 in zip(vals[:-1], vals[1:]):
    if i2 <= i1:
        # With fewer rows than chunks some chunks are empty; skip them
        # instead of handing an empty array to pca.transform.
        continue

    # Scale in full precision, then round once when storing into float32.
    parent_projected_X[i1:i2] = (
        pca.transform(parent_X[i1:i2]) / pca.singular_values_
    )

In [None]:
# The "training" sample is the whole parent sample (same array, not a copy).
training_projected_X = parent_projected_X
# KernelDensity.score_samples returns the log-density at each point; with the
# finite-support Epanechnikov kernel, stars far from every subset star may
# come out as -inf (NOTE(review): confirm this is acceptable for the
# density ordering used below).
training_dens = kde.score_samples(training_projected_X)
# one density value per training star
assert training_projected_X.shape[0] == training_dens.shape[0]

In [None]:
# Maps a row of the training matrix to a row of the parent sample. It is the
# identity here because the training sample is the whole parent sample.
training_sample_idx = np.arange(parent_projected_X.shape[0])

In [None]:
# NOTE(review): min_neighborhood_size is not used by this cell.
min_neighborhood_size = 256
max_neighborhood_size = 4096

n_training = training_projected_X.shape[0]
# Never ask the tree for more neighbors than it holds: cKDTree.query pads
# missing neighbors with the out-of-range index n, which would break the
# fancy indexing below.
neighborhood_size = min(max_neighborhood_size, n_training)

# Visit stars from highest to lowest estimated density
sort_idx = training_dens.argsort()[::-1]
tree = cKDTree(training_projected_X)

neighborhoods = []
# in_block[k] is True once training star k has been placed in a neighborhood.
# A boolean mask makes the "already used?" test O(1); searching an array that
# was re-concatenated on every iteration made the loop quadratic.
in_block = np.zeros(n_training, dtype=bool)
for i in tqdm(sort_idx):
    # Skip if a star already appears in a block
    if in_block[i]:
        continue

    _, results = tree.query(
        training_projected_X[i],
        k=neighborhood_size
    )
    results = np.atleast_1d(results)  # k == 1 returns a scalar index

    # Later cells take element 0 of each neighborhood as its "stoop", so make
    # sure the query star itself comes first, even if another star sits at
    # exactly the same position and was returned ahead of it.
    if results[0] != i:
        results = np.concatenate(([i], results[results != i]))[:neighborhood_size]

    neighborhood_idx = training_sample_idx[results]
    neighborhoods.append(neighborhood_idx)
    in_block[results] = True

# Flat list of every star placed in a neighborhood (duplicates included), kept
# under its old name in case later code expects it.
if neighborhoods:
    all_block_stars = np.concatenate(neighborhoods).astype(np.int32)
else:
    all_block_stars = np.array([], dtype=np.int32)

len(neighborhoods)

In [None]:
# The first entry of each neighborhood is its "stoop": the star the
# neighborhood was built around.
stoop_idx = np.array([hood[0] for hood in neighborhoods])
stoop_g = g_all[stoop_idx]
stoop_projected_X = parent_projected_X[stoop_idx]
# No star may be the stoop of more than one neighborhood
assert len(stoop_g['APOGEE_ID']) == np.unique(stoop_g['APOGEE_ID']).size

In [None]:
# Where the stoops fall in the spectroscopic HR diagram
teff_edges, logg_edges = teff_logg_bins

fig, ax = plt.subplots(figsize=(6, 6))
ax.scatter(stoop_g['TEFF'], stoop_g['LOGG'])

# (max, min) order: both axes are drawn inverted
ax.set_xlim(teff_edges.max(), teff_edges.min())
ax.set_ylim(logg_edges.max(), logg_edges.min())

ax.set_title('stoops')
ax.set_xlabel('TEFF')
ax.set_ylabel('LOGG')

fig.tight_layout()

This makes plots of the stars in the first 10 neighborhoods (the stoop in blue, the other neighborhood stars in orange):

In [None]:
# neighbor_plot_path = plot_path / 'neighborhoods'
# neighbor_plot_path.mkdir(exist_ok=True)

# for name in tqdm(neighbor_plot_path.rglob('*.png')):
#     if name.exists():
#         name.unlink()

# Axis limits per quantity; TEFF and LOGG are given as (high, low) so those
# axes are drawn inverted.
lims = {
    'TEFF': (8000, 3200),
    'LOGG': (5.5, -0.5),
    'M_H': (-2.5, 0.5),
    'AK_WISE': (0, 2)
}
# (x, y) quantity shown in each of the three panels
panels = [('TEFF', 'LOGG'),
          ('TEFF', 'M_H'),
          ('M_H', 'AK_WISE')]

# The slice already limits this to the first 10 neighborhoods, so no extra
# break is needed inside the loop.
for n, hood in enumerate(tqdm(neighborhoods[:10])):
    stoop = g_all[hood[0]]   # the star the neighborhood is built around
    block = g_all[hood[1:]]  # every other star in the neighborhood

    fig, axes = plt.subplots(
        1, len(panels),
        figsize=(15, 5),
        constrained_layout=True
    )
    fig.suptitle(f'neighborhood {n}')

    for ax, (xname, yname) in zip(axes, panels):
        # stoop drawn on top (high zorder) of the block stars
        ax.scatter(stoop[xname], stoop[yname],
                   zorder=100, color='tab:blue', s=14)
        ax.scatter(block[xname], block[yname],
                   s=3, color='tab:orange', alpha=0.5, zorder=1)

        ax.set_xlabel(xname)
        ax.set_ylabel(yname)

        ax.set_xlim(lims[xname])
        ax.set_ylim(lims[yname])

#     fig = plot_hr_cmd(parent.stars, parent.stars,
#                       idx0=hood[0], other_idx=hood[1:])
#     fig.tight_layout()
#     fig.savefig(neighbor_plot_path / f'neighborhood-{n:03d}.png', dpi=200)
#     plt.close(fig)

In [None]:
# Same three-panel plot as for the first neighborhoods, but for a
# reproducible random selection of (up to) 10 neighborhoods.
_rng = np.random.default_rng(42)
# Drawing without replacement fails if more items are requested than exist,
# so cap the request at the number of neighborhoods.
n_random = min(10, len(neighborhoods))
for n, n_idx in enumerate(tqdm(_rng.choice(len(neighborhoods), size=n_random, replace=False))):
    hood = neighborhoods[n_idx]

    stoop = g_all[hood[0]]   # the star the neighborhood is built around
    block = g_all[hood[1:]]  # every other star in the neighborhood

    fig, axes = plt.subplots(
        1, 3,
        figsize=(15, 5),
        constrained_layout=True
    )
    fig.suptitle(f'neighborhood {n_idx}')

    for ax, names in zip(axes, [('TEFF', 'LOGG'),
                                ('TEFF', 'M_H'),
                                ('M_H', 'AK_WISE')]):
        # stoop drawn on top (high zorder) of the block stars
        ax.scatter(stoop[names[0]], stoop[names[1]],
                   zorder=100, color='tab:blue', s=14)
        ax.scatter(block[names[0]], block[names[1]],
                   s=3, color='tab:orange', alpha=0.5, zorder=1)

        ax.set_xlabel(names[0])
        ax.set_ylabel(names[1])

        ax.set_xlim(lims[names[0]])
        ax.set_ylim(lims[names[1]])

#     fig = plot_hr_cmd(parent.stars, parent.stars,
#                       idx0=hood[0], other_idx=hood[1:])
#     fig.tight_layout()
#     fig.savefig(neighbor_plot_path / f'neighborhood-{n:03d}.png', dpi=200)
#     plt.close(fig)

### Apply to the full parent sample

Every star in the parent sample is assigned to its closest stoop (in projected feature space).

In [None]:
# parent_tree = cKDTree(parent_projected_X)
# KD-tree over the stoop positions only, used below to look up the nearest
# stoop of every parent star
stoop_tree = cKDTree(stoop_projected_X)

In [None]:
# _, closest_stoop_idx = parent_tree.query(stoop_projected_X, k=2)
# Nearest stoop (k=1) of every parent star: distance to it, and its index
# into the stoop arrays (stoop_idx / stoop_projected_X / stoop_g), not into
# the parent sample.
closest_stoop_dist, closest_stoop_idx = stoop_tree.query(parent_projected_X, k=1)
# ravel() guarantees flat 1D arrays, one entry per parent star
closest_stoop_idx = closest_stoop_idx.ravel()
closest_stoop_dist = closest_stoop_dist.ravel()

In [None]:
# (number of stoops, number of distinct stoops that are the closest stoop of
# at least one parent star)
len(stoop_projected_X), np.unique(closest_stoop_idx).size

In [None]:
# Fraction of stars that end up in at least one block.
# NOTE(review): this used `conf.block_size`, but no `conf` object is defined
# in this notebook, so the cell raised a NameError. The neighborhood size used
# when building the neighborhoods is substituted here - confirm that this is
# the intended block size.
block_size = max_neighborhood_size

all_indices = [hood[:block_size] for hood in neighborhoods]
n_in_block = np.unique(np.concatenate(all_indices)).shape[0]

training_fraction = n_in_block / training_projected_X.shape[0]
print(f"{training_fraction*100:.1f}% of training stars end up in a block")

parent_fraction = n_in_block / parent_projected_X.shape[0]
print(f"{parent_fraction*100:.1f}% of parent stars end up in a block")

In [None]:
# How coverage of the training sample changes with the neighborhood size.
# Each stored neighborhood has at most max_neighborhood_size members, so any
# larger size would slice out the whole neighborhood and print the same
# number again. Scan the powers of two between the configured minimum and
# maximum sizes instead.
size_exponents = np.arange(int(np.log2(min_neighborhood_size)),
                           int(np.log2(max_neighborhood_size)) + 1)
for size in 2 ** size_exponents:
    all_indices = [hood[:size] for hood in neighborhoods]
    tmp = np.unique(np.concatenate(all_indices)).shape[0] / training_projected_X.shape[0]
    print(f"{tmp*100:.1f}% of training stars end up in a neighborhood of size {size}")

In [None]:
# Sanity check: the parent catalog and its projected feature matrix should
# have the same number of rows. (`parent` is not defined in this notebook;
# the parent sample is g_all.)
len(g_all), parent_projected_X.shape

In [None]:
# Save the neighborhood index arrays; element 0 of each one is the stoop.
# Every neighborhood comes from a tree query with the same k, so np.array
# should give a 2D integer array (one row per neighborhood).
# NOTE(review): `conf` is not defined anywhere in the visible notebook (the
# other `conf` uses are commented out), so this cell raises a NameError as
# written - restore the config object or set the output path explicitly.
np.save(conf.neighborhood_index_file, 
        np.array(neighborhoods))

In [None]:
# Save, for every parent star, the index of its closest stoop (`idx`, an index
# into the list of stoops) and the distance to it (`dist`).
# NOTE(review): `conf` is not defined anywhere in the visible notebook, so
# this cell raises a NameError as written - restore the config object or set
# the output path explicitly.
np.savez(conf.parent_closest_stoop_file, 
         idx=closest_stoop_idx, dist=closest_stoop_dist)