In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# Render all figure text through LaTeX. matplotlib's usetex mode needs a working
# LaTeX installation on the machine that executes this notebook.
plt.rc("text", usetex=True)

In [None]:

# Speed-of-light measurements. Lines starting with '#' in the CSV are skipped as
# comments; later cells read the `value` and `year` columns of this frame.
c_df = pd.read_csv('data/c.csv', comment='#')
c_df

In [None]:
# Earth-density measurements. Lines starting with '#' in the CSV are skipped as
# comments; later cells read the `value` and `year` columns of this frame.
rho_df = pd.read_csv('data/rho.csv', comment='#')
rho_df

In [None]:
# Astronomical-unit measurements. Lines starting with '#' in the CSV are skipped as
# comments; later cells read the `value` and `year` columns of this frame.
au_df = pd.read_csv('data/au.csv', comment='#')
au_df

In [None]:
# Lookup tables keyed by a short dataset id. Later cells add further entries to
# `datasets`, `truths` and `nice_names`, and the insertion order of `datasets`
# determines the row order of the tables written at the end of the notebook.
datasets = {
    'rho': rho_df,
    'c': c_df,
    'au': au_df,
}
# Reference ("true") value of each historical quantity, as a bare number.
# The plotting cell labels differences in g/cm^3 for rho and km/s for c; the au
# differences are divided by 1000 before being labelled in km, so au is
# presumably stored in metres -- TODO confirm against data/au.csv.
truths = {
    'rho': 5.513,
    'c': 299792.458,
    'au': 149597870700,
}
# Axis scale per dataset. NOTE: despite the name, the plotting cell applies this
# to the x axis (via `set_xscale`).
yscales = {
    'rho': 'symlog',
    'c': 'symlog',
    'au': 'symlog',
}
# `linthresh` argument handed to the symlog scale for each dataset.
linthresh = {
    'rho': 0.01,
    'c': 0.1,
    'au': 1,
}

# Human-readable names used as subplot titles and as table row labels.
nice_names = {
    'c': 'Speed of light',
    'rho': 'Density of Earth',
    'au': 'Astronomical Unit',
}

In [None]:
# Figure: historical measurements of each quantity, drawn as (measured - true)
# on a symlog x axis against the year of the measurement on a shared y axis.
fig, axs = plt.subplots(1, 3, figsize=(10, 5), sharey=True, gridspec_kw={'wspace': 0.05})
xlabels = {
    'rho': r'Difference from true value $[\mathrm{g/cm^3}]$',
    'c': r'Difference from true value $[\mathrm{km/s}]$',
    'au': r'Difference from true value $[\mathrm{km}]$',
}
historical_keys = ['rho', 'c', 'au']
for i, name in enumerate(historical_keys):
    ax = axs[i]

    values = datasets[name].value - truths[name]
    if name == 'au':
        # rescale so the numbers match the km label in `xlabels`
        values = values / 1000
    dates = datasets[name].year

    ax.plot(values, dates, '.', color='black')
    ax.axvline(0, color='black', linestyle='--', linewidth=1)

    if yscales[name] == 'symlog':
        ax.set_xscale('symlog', linthresh=linthresh[name])
        # Hide every other tick label, but never the one for zero.
        # NOTE(review): the labels are thinned before the x limits are made
        # symmetric below -- confirm the surviving labels are still the intended
        # ones once the limits change.
        for n, label in enumerate(ax.xaxis.get_ticklabels()):
            if n % 2 != 0 and label.get_text() != '$\\mathdefault{0}$':
                label.set_visible(False)
        ax.tick_params(axis='both', which='both', direction='in', top=True, right=True)
    ax.set_xlabel(xlabels[name])
    # Descending limits put the earliest year at the top. Because the y axis is
    # shared, a per-panel invert_yaxis() would toggle the same axis repeatedly,
    # so the orientation is fixed here only.
    ax.set_ylim(2000, 1650)
    ax.set_title(nice_names[name])
    # Centre the panel on zero by mirroring the larger of the two x limits.
    xlim = max(abs(bound) for bound in ax.get_xlim())
    ax.set_xlim(-xlim, xlim)
axs[0].set_ylabel('Year')

plt.tight_layout()
plt.savefig('figs/historical.pdf', bbox_inches='tight')
plt.show()

In [None]:
# get error on ratio calculations
def clarke_ratio(A, a, B, b, C=100, c=0):
    """Solve the proportion ``A : B = C : x`` for ``x`` and propagate its error.

    ``A``, ``B`` and ``C`` are the central values and ``a``, ``b`` and ``c``
    the corresponding uncertainties. The three contributions are combined in
    quadrature.

    Returns:
        ``(value, uncertainty)`` with ``value = (B / A) * C``.
    """
    value = (B / A) * C
    # Squared contribution of each input to A * sigma_x.
    from_A = (B * C * a / A) ** 2
    from_B = (C * b) ** 2
    from_C = (B * c) ** 2
    uncertainty = np.sqrt(from_A + from_B + from_C) / A
    return value, uncertainty
def clarke_quotient(A, a, B, b):
    """Return the quotient ``B / A`` together with its propagated error.

    ``a`` and ``b`` are the uncertainties of ``A`` and ``B``; their
    contributions are combined in quadrature.

    Returns:
        ``(value, uncertainty)`` with ``value = B / A``.
    """
    from_A = (B * a / A) ** 2
    from_B = b ** 2
    return B / A, np.sqrt(from_A + from_B) / A

In [None]:
def _read_clarke(path):
    """Load one mass-ratio CSV and attach the derived `uncertainty` column.

    The files carry a `proberr` column (presumably a probable error -- confirm
    against the data source). Dividing by 0.6745 yields the `uncertainty`
    column that the plotting and table cells read.
    """
    frame = pd.read_csv(path, comment='#')
    frame['uncertainty'] = frame['proberr'] / 0.6745
    return frame


ho_df = _read_clarke('data/clarke/H-O-mass.csv')
agcl_df = _read_clarke('data/clarke/Ag-Cl-mass.csv')
agi_df = _read_clarke('data/clarke/Ag-I-mass.csv')
agbr_df = _read_clarke('data/clarke/Ag-Br-mass.csv')
# The N and C frames are loaded but not registered in `datasets` below.
no_df = _read_clarke('data/clarke/N-mass.csv')
co_df = _read_clarke('data/clarke/C-mass.csv')

datasets.update({
    'ho': ho_df,
    'agcl': agcl_df,
    'agi': agi_df,
    'agbr': agbr_df,
    # 'no': no_df,
    # 'co': co_df,
})

# Reference values as (value, uncertainty) pairs computed from atomic masses.
truths.update({
    'agcl': clarke_ratio(107.8682, 0.0002, 35.453, 0.004, 100, 0),
    'agbr': clarke_ratio(107.8682, 0.0002, 79.904, 0.003, 100, 0),
    'agi': clarke_ratio(107.8682, 0.0002, 126.90447, 0.00003, 100, 0),
    'ho': clarke_quotient(1.0080, 0.0002, 15.9995, 0.0005),
    # 'no': clarke_ratio(15.9995, 0.0005, 14.007, 0.001, 16, 0),
    # 'co': clarke_ratio(15.9995, 0.0005, 12.011, 0.002, 16, 0),
})

nice_names.update({
    'ho': 'O:H mass ratio',
    'agcl': 'Ag:Cl mass ratio',
    'agi': 'Ag:I mass ratio',
    'agbr': 'Ag:Br mass ratio',
    # 'no': 'N mass (O=16)',
    # 'co': 'C mass (O=16)',
})

In [None]:
from methods import birge, random_effects_hksj, binomial_method

# Figure: one panel per chemical mass ratio. Each measurement is drawn with its
# error bar, and the grey band marks the reference value +/- its uncertainty.
plot_est = False  # set to True to overlay the interval estimates
chemistry_keys = ['ho', 'agcl', 'agi', 'agbr']

fig, axs = plt.subplots(2, 2, figsize=(8, 3))
for ax, name in zip(axs.flat, chemistry_keys):
    truth, err = truths[name]
    ax.axvspan(truth - err, truth + err, color='black', alpha=0.3)

    # Order by decreasing uncertainty; with the y axis inverted below, the
    # widest error bar ends up at the top of the panel.
    frame = datasets[name]
    errs = np.array(frame.uncertainty)
    sort_idx = np.argsort(errs)[::-1]
    values = np.array(frame.value)[sort_idx]
    errs = errs[sort_idx]

    ax.errorbar(values, np.arange(len(values)), xerr=errs, fmt='.', markersize=2, linewidth=1, color='black')
    ax.invert_yaxis()

    if plot_est:
        # Interval end points from the two interval estimators ...
        for estimator, colour in ((birge, 'red'), (random_effects_hksj, 'blue')):
            interval, _, _, _ = estimator(values, errs, coverage=0.6827)
            for bound in (interval[0], interval[1]):
                ax.axvline(bound, color=colour, linestyle='--', linewidth=1)
        # ... and the two bounds from the binomial method.
        for side in ('lower', 'upper'):
            bound, _ = binomial_method(values, p=0.5, target=0.15865, which=side)
            ax.axvline(bound, color='green', linestyle='--', linewidth=1)

    ax.set_title(nice_names[name])

    # The vertical position is only an index, so no y ticks are shown.
    ax.set_yticks([])
    ax.get_xaxis().get_major_formatter().set_useOffset(False)
    # Widen the x range if needed so the reference band is fully visible.
    xmin, xmax = ax.get_xlim()
    ax.set_xlim(min(xmin, truth - err), max(xmax, truth + err))
    # Ticks on the top edge as well, all pointing inwards.
    ax.tick_params(axis='x', top=True)
    ax.tick_params(direction='in')
plt.tight_layout()
plt.savefig('figs/chemical.pdf', bbox_inches='tight')
plt.show()

In [None]:
import os

# Every CSV in data/pdg1970 is one dataset. Its first four lines form a header:
#   line 1: display name                  -> nice_names
#   line 2: units                         -> units
#   line 3: "<label>: <reference value>"  -> truths[p][0]
#   line 4: "<label>: <reference sigma>"  -> truths[p][1]
# The header lines start with '#', so read_csv skips them when loading the data.
csv_suffix = '.csv'
# Strip exactly the suffix that was tested, so a file name containing further
# dots still maps back to its own path below.
particle_keys = sorted(
    fname[:-len(csv_suffix)]
    for fname in os.listdir('data/pdg1970')
    if fname.endswith(csv_suffix)
)
print(particle_keys)
units = {}
for p in particle_keys:
    path = f'data/pdg1970/{p}.csv'
    datasets[p] = pd.read_csv(path, comment='#')
    with open(path, 'r') as f:
        # Only the header is needed here; the data rows were parsed above.
        lines = [f.readline() for _ in range(4)]
    nice_names[p] = lines[0].strip('# ').strip('\n')
    units[p] = lines[1].strip('# ').strip('\n')
    value = float(lines[2].split(':')[1].strip())
    sigma = float(lines[3].split(':')[1].strip())
    truths[p] = (value, sigma)

In [None]:
from methods import birge, random_effects_hksj, binomial_method

# Figure: one panel per particle-property dataset. Each measurement is drawn
# with its error bar, and the grey band marks the reference value +/- sigma.
print(len(particle_keys))
fig, axs = plt.subplots(6, 3, figsize=(8, 8))
assert len(particle_keys) <= axs.size, (
    f'{len(particle_keys)} particle datasets do not fit into the {axs.size}-panel grid'
)
plot_est = False  # set to True to overlay the interval estimates

for i, name in enumerate(particle_keys):
    ax = axs.flatten()[i]
    truth = truths[name][0]
    err = truths[name][1]
    ax.axvspan(truth-err, truth+err, color='black', alpha=0.3)

    values = np.array(datasets[name].value)
    errs = np.array(datasets[name].uncertainty)
    # Order chronologically (earliest measurement at the top once the y axis
    # is inverted). The stable sort keeps same-year measurements in file order.
    sort_idx = np.argsort(np.asarray(datasets[name].year), kind='stable')
    values = values[sort_idx]
    errs = errs[sort_idx]

    ax.errorbar(values, np.arange(len(values)), xerr=errs, fmt='.', markersize=2, linewidth=1, color='black')
    ax.invert_yaxis()
    if plot_est:
        interval_birge, _, _, _ = birge(values, errs, coverage=0.6827)
        ax.axvline(interval_birge[0], color='red', linestyle='--', linewidth=1)
        ax.axvline(interval_birge[1], color='red', linestyle='--', linewidth=1)
        interval_re, _, _, _ = random_effects_hksj(values, errs, coverage=0.6827)
        ax.axvline(interval_re[0], color='blue', linestyle='--', linewidth=1)
        ax.axvline(interval_re[1], color='blue', linestyle='--', linewidth=1)
        binomial_lower, _ = binomial_method(values, p=0.5, target=0.15865, which='lower')
        ax.axvline(binomial_lower, color='green', linestyle='--', linewidth=1)
        binomial_upper, _ = binomial_method(values, p=0.5, target=0.15865, which='upper')
        ax.axvline(binomial_upper, color='green', linestyle='--', linewidth=1)

    ax.set_title(f'{nice_names[name]} {units[name]}')

    # The vertical position is only an index, so no y ticks are shown.
    ax.set_yticks([])
    ax.get_xaxis().get_major_formatter().set_useOffset(False)
    # Widen the x range if needed so the reference band is fully visible.
    xmin, xmax = ax.get_xlim()
    ax.set_xlim(min(xmin, truth-err), max(xmax, truth+err))
    # add top ticks
    ax.tick_params(axis='x', top=True)
    # make ticks point inwards
    ax.tick_params(direction='in')

# remove unused axes
for ax in axs.flatten()[len(particle_keys):]:
    ax.set_visible(False)

plt.tight_layout()
plt.savefig('figs/particles.pdf', bbox_inches='tight')
plt.show()

In [None]:
from scipy.stats import binomtest
from methods import sign_rank_test

# Dataset ids in insertion order; this fixes the row order of the tables below.
names = list(datasets.keys())

# Per-dataset table cells, filled by the loop further down in this cell.
results = {}

def format_number(x):
    """Turn a single number into an inline-math LaTeX table cell.

    Infinite values become an "approximately infinity" symbol, values with a
    magnitude of at least 100 are truncated to an integer, and everything else
    is written with two significant digits inside ``\\num{...}``.
    """
    if abs(x) == np.inf:
        sign = '-' if x < 0 else ''
        return r'$\approx' + sign + r'\infty$'
    if abs(x) >= 1e2:
        return '$' + str(int(x)) + '$'
    return r'$\num{' + format(x, '.2g') + '}$'

def fmt_result(result):
    """Format a list of numbers as one table cell.

    Only the smallest and largest entries are formatted. If both render to the
    same text, that text is returned on its own; otherwise the cell reads
    ``[min, max]``.
    """
    lowest, highest = min(result), max(result)
    # pd.unique drops the duplicate while keeping the min-before-max order.
    cell_parts = pd.unique(np.array([format_number(lowest), format_number(highest)]))
    if len(cell_parts) != 1:
        return f'[{", ".join(cell_parts)}]'
    return cell_parts[0]

# For every dataset: how many measurements lie above the reference value, and
# the p-values of a two-sided binomial test and of the sign-rank test.
for n in names:
    sample = datasets[n].value
    n_obs = len(datasets[n])

    # A reference with an uncertainty is evaluated at both ends of its interval
    # and at its centre; fmt_result then reports the resulting range.
    reference = truths[n]
    if hasattr(reference, '__iter__'):
        truth_vals = [reference[0] - reference[1], reference[0] + reference[1], reference[0]]
    else:
        truth_vals = [reference]

    # Number of measurements above each candidate reference value.
    over_counts = [np.sum(sample > t) for t in truth_vals]

    results[n] = {
        'count': fmt_result([n_obs]),
        'num_over': fmt_result(over_counts),
        'prop_over': fmt_result([k / n_obs for k in over_counts]),
        'binom_p_value': fmt_result([
            binomtest(k, n_obs, p=0.5, alternative='two-sided').pvalue for k in over_counts
        ]),
        'sign_rank_p_value': fmt_result([sign_rank_test(sample, t) for t in truth_vals]),
    }

# One row per dataset, one column per statistic.
results_df = pd.DataFrame(results).T
results_df.index = [nice_names[n] for n in names]

# Write the table body as LaTeX: cells joined by '&', rows ended by '\\'.
rows = [[nice_names[n], *cells] for n, cells in zip(names, results_df.values.tolist())]
txt = ' \\\\\n'.join(' & '.join(row) for row in rows)
print(txt)
with open('tables/hist-sym.tex', 'w') as f:
    f.write(txt)

In [None]:
from methods import birge, random_effects_mle
from scipy.stats import norm
from collections import defaultdict
# Start from an empty mapping: `results` was used by the previous cell with a different layout.
results = {}
def fmt_result(r, bold=False):
    """Format one number as an inline-math LaTeX table cell.

    NOTE: this redefines the list-based ``fmt_result`` of the previous cell;
    from here on the name refers to this scalar version.

    Args:
        r: the number to format (an integer count or a float log-likelihood).
        bold: wrap the number in ``\\mathbf{...}``.

    Returns:
        ``$\\approx-\\infty$`` for minus infinity (never bold). Integers are
        printed exactly, floats below -100 are truncated to an integer, and
        all other floats are written with three significant digits.
    """
    if r == -np.inf:
        return r'$\approx-\infty$'
    start, end = (r'$\mathbf{', r'}$') if bold else (r'$', r'$')
    # Integers (the count column) must not go through '.3g': that would print
    # 1234 as 1.23e+03.
    if isinstance(r, (int, np.integer)) or r < -100:
        body = str(int(r))
    else:
        body = '{:.3g}'.format(r)
    return start + body + end
# Category of every dataset; each category gets its own subtotal row.
category_map = {
    **{k: 'chemistry' for k in chemistry_keys},
    **{k: 'particle' for k in particle_keys},
    **{k: 'historical' for k in historical_keys},
}

# Datasets that actually enter the table, in the order they are processed.
key_order = []
summed_fields = ('birge_loglike', 're_loglike', 'fe_loglike')

for n in names:
    values = datasets[n].value
    sigmas = datasets[n].uncertainty
    # Keep only measurements that come with an uncertainty.
    has_sigma = ~np.isnan(sigmas)
    values, sigmas = values[has_sigma], sigmas[has_sigma]
    if len(values) < 2:
        continue
    key_order.append(n)
    truth = truths[n][0] if hasattr(truths[n], '__iter__') else truths[n]

    # Fitted location plus the error-inflation parameter of each model: a
    # multiplicative factor for Birge, an additive tau for random effects.
    _, muhat_birge, _, chat = birge(values, sigmas, coverage=0.6827, truth=truth)
    _, muhat_re, _, tau = random_effects_mle(values, sigmas, coverage=0.6827, truth=truth)

    # Log-likelihood of the data under each model; the fixed-effects row uses
    # the Birge location with the quoted sigmas left untouched.
    results[n] = {
        'name': nice_names[n],
        'count': len(values),
        'birge_loglike': np.sum(norm.logpdf(values, loc=muhat_birge, scale=sigmas*chat)),
        're_loglike': np.sum(norm.logpdf(values, loc=muhat_re, scale=np.sqrt(sigmas**2+tau**2))),
        'fe_loglike': np.sum(norm.logpdf(values, loc=muhat_birge, scale=sigmas)),
    }

    # Add this dataset to its category subtotal and to the grand total; the
    # accumulators start at zero the first time a key is touched.
    category = category_map[n]
    for bucket in (category, 'total'):
        if bucket not in results:
            results[bucket] = defaultdict(int)
        running = results[bucket]
        running['count'] += len(values)
        for field in summed_fields:
            running[field] += results[n][field]



# Convert every numeric entry to its LaTeX cell, printing the best (largest)
# log-likelihood of each row in bold.
loglike_keys = ['birge_loglike', 're_loglike', 'fe_loglike']
for res in results.values():
    res['count'] = fmt_result(res['count'])

    loglike_values = [res[k] for k in loglike_keys]
    # When all three models tie there is no winner; NaN never equals an index,
    # so nothing is printed in bold.
    all_tied = len(np.unique(loglike_values)) == 1
    largest_idx = np.nan if all_tied else np.argmax(loglike_values)
    for i, r in enumerate(loglike_keys):
        res[r] = fmt_result(res[r], bold=(i == largest_idx))

results['total']['name'] = 'Total'
results['historical']['name'] = 'Total (historical)'
results['chemistry']['name'] = 'Total (chemistry)'
results['particle']['name'] = 'Total (particle)'

results_df = pd.DataFrame(results).T
# Row order: the individual datasets first, then the category subtotals and
# the grand total.
row_order = key_order + ['historical', 'chemistry', 'particle', 'total']
results_df = results_df.loc[row_order]

# Write the table body as LaTeX: cells joined by '&', rows ended by '\\'.
rows = results_df.values.tolist()

# Put a rule in front of the first row of every group. The group sizes are
# counted from the datasets that actually made it into the table, so the rules
# stay on the right rows when a dataset is skipped for having too few
# measurements. This relies on key_order being grouped historical ->
# chemistry -> particle, which follows from the insertion order of `datasets`.
rows_per_category = {
    cat: sum(category_map[k] == cat for k in key_order)
    for cat in ('historical', 'chemistry', 'particle')
}
first_chemistry = rows_per_category['historical']
first_particle = first_chemistry + rows_per_category['chemistry']
first_subtotal = first_particle + rows_per_category['particle']
# first_subtotal and -4 both address the 'Total (historical)' row, so the block
# of totals is introduced by a double rule; -1 is the grand total.
hline_idxs = [first_chemistry, first_particle, first_subtotal, -4, -1]
for i in hline_idxs:
    rows[i][0] = r'\hline ' + rows[i][0]
txt = ' \\\\\n'.join([' & '.join(row) for row in rows])
print(txt)
with open('tables/hist-syst.tex', 'w') as f:
    f.write(txt)

In [None]:
# Inspect the reference values: bare numbers for the historical datasets,
# (value, uncertainty) pairs for the chemistry and particle datasets.
truths

In [None]:
import pymc as pm

# Ag:I ratio measurements and their quoted uncertainties.
y = np.array(datasets['agi'].value)
sigma = np.array(datasets['agi'].uncertainty)

print(y)
n = len(y)
width = np.max(y) - np.min(y)

# Fixed seed so that the trace, and every plot derived from it, is identical
# on each Restart & Run All.
RANDOM_SEED = 0

with pm.Model() as model:
    # Prior on the common value, centred on the sample mean with an equally
    # large standard deviation.
    theta = pm.Normal('theta', np.mean(y), np.mean(y))

    # Alternative model (disabled): one multiplicative scale factor per measurement.
    # scaler_rate = pm.Exponential('scaler_rate', 0.2)
    # scalers = pm.Exponential('scalers', scaler_rate, shape=n)+1
    # y_pred = pm.Normal('y_pred', theta, sigma*scalers, observed=y)

    # Active model: each measurement gets its own additive error term, which is
    # combined with the quoted sigma in quadrature. The rate 5/width gives the
    # exponential prior a mean of one fifth of the data range.
    # adder_rate = pm.Exponential('adder_rate', 10/width)
    adders = pm.Exponential('adders', 5/width, shape=n)
    y_pred = pm.Normal('y_pred', theta, np.sqrt(sigma**2+adders**2), observed=y)

    trace = pm.sample(draws=1000, tune=1000, chains=4, target_accept=0.99, random_seed=RANDOM_SEED)

In [None]:
# Trace plots of the sampled variables, as a visual check of the sampler output.
pm.plot_trace(trace)

In [None]:
# Posterior of the per-measurement error term, one histogram per measurement.
# The trace holds 'scalers' or 'adders' depending on which model variant was
# sampled, so only the variable that is present is plotted. The +1 turns the
# sampled exponential into the actual scale factor of the 'scalers' variant;
# the adders are plotted as sampled.
for var_name, offset in (('scalers', 1), ('adders', 0)):
    if var_name not in trace.posterior:
        continue
    draws = trace.posterior[var_name].values
    for i in range(n):
        plt.hist(draws[:, :, i].flatten() + offset, bins=100, histtype='step')
plt.show()

In [None]:
# Shape of the sampled per-measurement variable, for whichever model variant
# ('scalers' or 'adders') is present in the trace.
{name: trace.posterior[name].values.shape for name in ('scalers', 'adders') if name in trace.posterior}

In [None]:
# Number of measurements that went into the model.
len(y)

In [None]:
# Posterior of theta with its 16%/84% quantiles, overlaid with the measurements
# and their error bars: the quoted sigma (thick) and the sigma inflated by the
# posterior quartiles of the per-measurement error term (thin, with caps).
theta_draws = trace.posterior['theta'].values.flatten()
plt.hist(theta_draws, bins=100)
qs = np.quantile(theta_draws, [0.16, 0.84])
plt.axvline(qs[0], color='red', linestyle='--', linewidth=1)
plt.axvline(qs[1], color='red', linestyle='--', linewidth=1)
ymax = plt.ylim()[1]
# Spread the measurements evenly over the height of the histogram.
y_positions = np.linspace(ymax/len(y), ymax, len(y))
if 'scalers' in trace.posterior:
    # +1 turns the sampled exponential into the actual scale factor.
    scaler_qs = np.quantile(trace.posterior['scalers'].values, [0.25, 0.5, 0.75], axis=(0,1))+1
    print(scaler_qs.shape)
    for i in range(3):
        plt.errorbar(y, y_positions, xerr=sigma*scaler_qs[i], fmt='.', markersize=2, linewidth=1, color='black', capsize=2)
if 'adders' in trace.posterior:
    adder_qs = np.quantile(trace.posterior['adders'].values, [0.25, 0.5, 0.75], axis=(0,1))
    print(adder_qs.shape)
    for i in range(3):
        # Combine in quadrature, exactly as the model's likelihood does.
        inflated = np.sqrt(sigma**2 + adder_qs[i]**2)
        plt.errorbar(y, y_positions, xerr=inflated, fmt='.', markersize=2, linewidth=1, color='black', capsize=2)

plt.errorbar(y, y_positions, xerr=sigma, fmt='.', markersize=2, linewidth=2, color='black')

plt.show()






