In [1]:
import numpy as np
import matplotlib.pyplot as plt
from astropy.table import Table
from glob import glob

import stan_utils as stan
from mpl_utils import (mpl_style, common_limits)

# Apply the project's matplotlib style (mpl_style is imported from mpl_utils).
plt.style.use(mpl_style)

# Seed NumPy's global random number generator so that any random draws made
# later in the notebook are repeatable from run to run.
np.random.seed(42)

%matplotlib inline

In [2]:
# glob() returns its matches in arbitrary, filesystem-dependent order. Sort
# the paths so that the position of each survey in `survey_data` (and hence
# the survey index used for the data array) is the same on every run.
survey_data_paths = sorted(glob("data/*.csv"))
assert survey_data_paths, "no survey tables found matching data/*.csv"

# One table per survey, in the same order as `survey_data_paths`.
survey_data = [Table.read(path) for path in survey_data_paths]

In [3]:
# Show which columns each survey file provides.
for path, table in zip(survey_data_paths, survey_data):
    print(path, table.dtype.names)

data/Adibekyan12-all-renorm.csv ('HIP', 'Star', 'Teff', 'logg', 'FeH', 'NaH', 'MgH', 'AlH', 'SiH', 'CaH', 'ScIH', 'ScIIH', 'TiIH', 'TiIIH', 'VH', 'CrIH', 'CrIIH', 'MnH', 'CoH', 'NiH')
data/Bensby14-all-renorm.csv ('HIP', 'Teff', 'logg', 'FeH', 'OH', 'NaH', 'MgH', 'AlH', 'SiH', 'CaH', 'TiH', 'CrH', 'NiH', 'ZnH', 'YH', 'BaH')
data/Valenti05-all-renorm.csv ('HIP', 'HD', 'Teff', 'logg', 'NaH', 'SiH', 'TiH', 'FeH', 'NiH')


In [4]:
# Name of the column that identifies a star. It is used to match each survey
# row against the combined list of star identifiers.
label_identifier = "HIP"
# Columns to extract from every survey, in the order in which they are stored
# along the last axis of the data array. A survey that lacks one of these
# columns gets NaN for that label.
label_names = ("Teff", "logg", "FeH", "MgH", "SiH", "NaH")

In [8]:
# Generate the data array, indexed as (star, survey, label).
# The star list is the union of identifiers over all surveys. The identifier
# column is taken from `label_identifier` rather than a hard-coded name, and
# np.unique already returns its result sorted, so no extra sort is needed.
unique_hip_names = np.unique(
    np.hstack([survey[label_identifier] for survey in survey_data]))
N = unique_hip_names.size  # number of distinct stars across all surveys
M = len(survey_data)       # number of surveys
D = len(label_names)       # number of labels per star

# Start with every entry missing (NaN); values are filled in from the surveys.
y = np.full((N, M, D), np.nan, dtype=float)

In [17]:
# Copy every survey's labels into the (star, survey, label) array.
for m, survey in enumerate(survey_data):
    # Which of the requested labels this survey provides depends only on the
    # survey, so work it out once per survey instead of once per star.
    survey_columns = survey.dtype.names
    has_label = [(ln in survey_columns) for ln in label_names]

    for star in survey:
        # Row of this star in the combined, de-duplicated identifier list.
        n = np.where(star[label_identifier] == unique_hip_names)[0][0]
        # Labels that the survey does not report are stored as NaN.
        y[n, m, :] = np.array([(star[ln] if present else np.nan)
                               for ln, present in zip(label_names, has_label)])
        

In [18]:
# Abundance values of 90 or more are treated as bad and replaced with NaN.
# Teff and logg are exempt from this check.
for d, label_name in enumerate(label_names):
    if label_name in ("Teff", "logg"):
        continue

    # Basic slicing gives a view, so the assignment below edits y in place.
    abundances = y[:, :, d]
    abundances[abundances >= 90] = np.nan


In [21]:
# Checks: for each label print its range, mean/median/standard deviation
# (ignoring NaNs), and the number of finite (star, survey) entries.
for d, label_name in enumerate(label_names):
    # Use a real name here: `_` is IPython's "last output" shortcut, and
    # assigning to it at cell level shadows that variable.
    label_values = y[:, :, d]
    print("{0}: ({1:.1f} to {2:.1f}), mean/median/std: {3:.1f}/{4:.1f}/{5:.1f} ({6:.0f} finite)".format(
          label_name, np.nanmin(label_values), np.nanmax(label_values),
          np.nanmean(label_values), np.nanmedian(label_values),
          np.nanstd(label_values),
          np.sum(np.isfinite(label_values))))

Teff: (4556.0 to 7212.0), mean/median/std: 5667.9/5726.0/423.6 (2718 finite)
logg: (2.7 to 5.1), mean/median/std: 4.3/4.4/0.2 (2718 finite)
FeH: (-2.6 to 0.6), mean/median/std: -0.1/-0.1/0.3 (2718 finite)
MgH: (-1.4 to 0.6), mean/median/std: 0.0/0.0/0.2 (1715 finite)
SiH: (-1.6 to 0.7), mean/median/std: -0.0/0.0/0.2 (2714 finite)
NaH: (-2.0 to 1.0), mean/median/std: -0.0/0.0/0.3 (2710 finite)


In [59]:
# Create an additive variance array to account for missing data.
def de_gapify_data(y, additional_variance=1e8):
    """
    Fill missing entries of the label array and flag them with extra variance.

    Every non-finite entry (NaN or +/-inf) is treated as missing. A missing
    label is replaced by the mean of that label over the surveys that did
    report it for the same star. If no survey reported that label for the
    star, the mean of that label over all stars and surveys is used instead.
    No structure is assumed for the missing labels (e.g., some labels could
    be missing from some surveys, but not all).

    :param y:
        The label array of shape (N, M, D) -- stars, surveys, labels -- where
        non-finite values represent missing data. It is not modified.

    :param additional_variance: [optional]
        The variance to assign to every entry that was missing.

    :returns:
        A tuple ``(y_full_rank, variance)`` of two arrays with the same shape
        as `y`: the labels with all gaps filled, and an array that is
        `additional_variance` where a value was filled in and zero elsewhere.

    :raises ValueError:
        If `y` is not three-dimensional.

    :raises AssertionError:
        If some label has no finite value for any star in any survey, so
        there is nothing to fill its gaps with.
    """
    import warnings

    y = np.asarray(y)
    if y.ndim != 3:
        raise ValueError(
            "y must have shape (N, M, D), got {0}".format(y.shape))
    D = y.shape[2]

    missing = ~np.isfinite(y)

    # Infinite entries count as missing, so they must not contribute to the
    # means either: np.nanmean only skips NaNs, and a single inf would
    # otherwise turn the fill values themselves into inf.
    observed = np.where(missing, np.nan, y)

    # All-NaN slices are expected (a star may lack a label in every survey);
    # they yield NaN, which is replaced by the global mean just below.
    with warnings.catch_warnings():
        warnings.simplefilter("ignore", category=RuntimeWarning)
        mean_stellar_labels = np.nanmean(observed, axis=1)           # (N, D)
        mean_labels = np.nanmean(observed.reshape((-1, D)), axis=0)  # (D,)

    # Per-star mean where one exists, otherwise the global mean of the label.
    fill_values = np.where(
        np.isfinite(mean_stellar_labels), mean_stellar_labels, mean_labels)

    # Broadcast the (N, D) fill values over the survey axis.
    y_full_rank = np.where(missing, fill_values[:, np.newaxis, :], y)

    variance = np.zeros_like(y_full_rank)
    variance[missing] = additional_variance

    assert np.all(np.isfinite(y_full_rank)), \
        "at least one label has no finite value in any survey"
    return (y_full_rank, variance)

# Fill the gaps in y and build the matching extra-variance array, using the
# default additional_variance (1e8) for every entry that was missing.
y_full_rank, extra_variance = de_gapify_data(y)



In [24]:
# Load the Stan model through the project's stan_utils helper and print its
# source code.
model = stan.read_model("model-missing-data.stan")
print(model.model_code)

INFO:root:Using pre-compiled model from model-missing-data.stan.cached



/*
  Latent factor model for chemical abundances from multiple studies, allowing
  for missing data.
*/

data {
  int<lower=1> N; // number of stars
  int<lower=1> D; // dimensionality of the data (number of labels)
  int<lower=1> M; // number of surveys (or studies)
  vector[D] y[N, M]; // the labels as reported by various surveys.
  vector<lower=0>[D] scales; // fixed relative scales for latent factors
  vector[D] y_additive_variance[N, M]; // variance to add for missing data.
}

transformed data {
  vector[D] mu; // the mean of the data in each dimension
  int<lower=1> Q; // the number of non-zero lower-triangular entries that we
                  // need for the decomposoition of our theta matrix
  Q = M * choose(D, 2);

  // TODO: Stop assuming that the user is not an idiot
  mu = rep_vector(0.0, D);
}

parameters {
  vector[D] X[N]; // latent factors for each star
  vector<lower=0>[M] phi[D]; // variance on survey labels

  vector[Q] L_lower_triangular; // lower triangular entri

In [None]:
# Optimize the model.
# Per-label mean and standard deviation over every (star, survey) row of the
# gap-filled labels.
mu = y_full_rank.reshape((-1, D)).mean(axis=0)
scales = y_full_rank.reshape((-1, D)).std(axis=0)

# The labels are handed to Stan with the per-label mean subtracted.
data = {
    "N": N,
    "M": M,
    "D": D,
    "scales": scales,
    "y": y_full_rank - mu,
    "y_additive_variance": extra_variance,
}

op_kwds = {
    "data": data,
    "iter": 100000,
    "tol_obj": np.finfo(float).eps,   # machine precision
    "tol_grad": np.finfo(float).eps,  # machine precision
    "tol_rel_grad": 1e3,
    "tol_rel_obj": 1e4,
}

p_opt = model.optimizing(**op_kwds)

In [None]:
# TODO: It is not yet known whether the optimizer has converged to something
# sensible; verifying that is left as "an exercise for the reader".