In [None]:
import pdg
import pandas as pd
import sqlite3
import numpy as np
import matplotlib.pyplot as plt
from measurement_dist import measurement_dist

In [None]:

# Global matplotlib style applied to every figure in this notebook.
plt.rcParams.update(
    {
        # tick marks point into the axes ...
        "xtick.direction": "in",
        "ytick.direction": "in",
        # ... and are also drawn on the top and right edges
        # (the spines themselves are left at their defaults)
        "xtick.top": True,
        "ytick.right": True,
        # render all text through LaTeX
        "text.usetex": True,
    }
)



In [None]:
from pdg_data import load_pdg_data

# load_pdg_data() returns four objects; they are unpacked in the order it returns them.
(
    pdg2025_stat_dfs,
    pdg2025_both_dfs,
    pdg2025_stat_quantities,
    pdg2025_both_quantities,
) = load_pdg_data()

In [None]:
# Run measurement_dist once on the "stat" frames and once on the "both" frames.
stat_dist, both_dist = map(measurement_dist, (pdg2025_stat_dfs, pdg2025_both_dfs))

In [None]:
both_dist.keys()

In [None]:
# One figure is built up in stages and written to disk after each stage, so every later
# PNG also contains everything that was drawn before it. Statement order matters here.
plt.figure(figsize=(4,3))
zspace = stat_dist['zspace']
colors = {'stat': 'grey', 'both': 'red'}
dists = {'stat': stat_dist, 'both': both_dist}
plt.plot(zspace, stat_dist['pair'], label='stat', color='grey')
plt.plot(zspace, stat_dist['norm'], color='black', linestyle='dashed', linewidth=2, label='$|N(0,1)|$')
# plt.text(0.95, 0.5, r'$z=\frac{|y_1-y_2|}{\sqrt{\sigma_1^2+\sigma_2^2}}$', transform=plt.gca().transAxes, ha='right')

plt.legend(frameon=False)
plt.xlim(0, 2.5)
plt.ylim(0, 1)
plt.xlabel('$z$')
plt.ylabel('$P(Z>z)$')
plt.title('Empirical distribution of differences\nbetween studies in 2025 PDG data')

# Stage 1: "stat" curve plus the reference curve only.
plt.savefig('figs/pdg_pairdists_stat.png', dpi=300, bbox_inches='tight')

# Stage 2: add the "stat+syst" curve and error bars at selected z values.
plt.plot(zspace, both_dist['pair'], label='stat+syst', color='red')
plt.legend(frameon=False)
for sig in [0.5, 1, 2, 5]:
    for which in ['stat', 'both']:
        # Keys are built from the number itself, e.g. 'pair0.5' and 'pair_boot_ci_0.5'.
        val = dists[which]['pair' + str(sig)][0]
        # presumably a (lower, upper) bootstrap interval — TODO confirm against measurement_dist
        ci = np.array([dists[which]['pair_boot_ci_' + str(sig)]]).T
        # errorbar expects distances from the point, not absolute bounds
        ci[0,:] = val - ci[0,:]
        ci[1,:] = ci[1,:] - val
        plt.errorbar([sig], [val], yerr=ci, color=colors[which])


plt.savefig('figs/pdg_pairdists_statvsboth.png', dpi=300, bbox_inches='tight')
# Stage 3: same content on a wider z range with a logarithmic y axis.
plt.xlim(0, 10)
plt.yscale('log')
plt.ylim(1e-4, 1)
plt.savefig('figs/pdg_pairdists_statvsboth_full.png', dpi=300, bbox_inches='tight')
# Stages 4 and 5: overlay the 'h' and then the 'hprime' curves of the stat+syst distribution.
plt.plot(zspace, both_dist['h'], label=r'stat+syst, $h$', color='lightblue')
plt.legend(frameon=False)
plt.savefig('figs/pdg_pairdists_statvsboth_h.png', dpi=300, bbox_inches='tight')
plt.plot(zspace, both_dist['hprime'], label=r'stat+syst, $h^\prime$', color='blue')
plt.legend(frameon=False)
plt.savefig('figs/pdg_pairdists_statvsboth_hprime.png', dpi=300, bbox_inches='tight')
# plt.xlim(0, 2.5)
# plt.yscale('linear')

In [None]:
from collections import defaultdict
from methods import birge, fixed_effect, random_effects_pm

# For every property, compute the 68.27% interval with three methods and record its
# width; the Birge and random-effects intervals themselves are kept for later plots.
widths = defaultdict(list)
intervals_birge, intervals_re = [], []
for df in pdg2025_both_dfs:
    values, sigmas = np.array(df.value), np.array(df.uncertainty)

    # fixed effect
    interval, _, _ = fixed_effect(values, sigmas, coverage=0.6827)
    widths['fe'].append(interval[1] - interval[0])

    # Birge ratio, called with pdg=True
    interval, _, _, _ = birge(values, sigmas, coverage=0.6827, pdg=True)
    widths['birge'].append(interval[1] - interval[0])
    intervals_birge.append(interval)

    # random effects (random_effects_pm)
    interval, _, _, _ = random_effects_pm(values, sigmas, coverage=0.6827)
    widths['pm'].append(interval[1] - interval[0])
    intervals_re.append(interval)

# turn the accumulated lists into arrays so they can be divided element-wise
for key, value in list(widths.items()):
    widths[key] = np.array(value)

In [None]:
# Interval widths of both methods, each relative to the fixed-effect width.
birge_rel_width = widths['birge'] / widths['fe']
pm_rel_width = widths['pm'] / widths['fe']

plt.scatter(birge_rel_width, pm_rel_width, s=1, alpha=0.3, color='black')
plt.xlabel('Birge Ratio interval width / FE')
plt.ylabel('Random Effects interval width / FE')
# red diagonal: points on it have equal Birge and random-effects widths
plt.plot([0, 20], [0, 20], color='red')
plt.xlim(0, 10)
plt.ylim(0, 10)
plt.gca().set_aspect('equal')
plt.savefig('figs/pdg2025_width_scatter.png', dpi=300, bbox_inches='tight')

In [None]:
# import seaborn as sns
# sns.scatterplot(x=widths['pm']/widths['fe'], y=widths['birge']/widths['fe'], hue=data_types, s=5)
# plt.gca().set_aspect('equal')
# plt.xlim(0,10)
# plt.ylim(0,10)

In [None]:
# Random-effects width relative to the Birge width (y) against the Birge width relative
# to the fixed-effect width (x). Axis labels added so the figure can be read on its own.
plt.scatter(widths['birge']/widths['fe'], widths['pm']/widths['birge'], color='black', s=1, alpha=0.3)
plt.xlabel('Birge Ratio interval width / FE')
plt.ylabel('Random Effects width / Birge Ratio width')

In [None]:
# Ratio of random-effects to Birge interval width, one entry per property.
width_ratios = widths['pm'] / widths['birge']
same = np.isclose(width_ratios, 1)
same_width_prop = np.mean(same)

# Properties where both widths agree are left out of the histogram and reported as text.
differing_ratios = width_ratios[~same]
plt.hist(differing_ratios, bins=np.linspace(0, 5, 50), color='grey')
# data = {'widths': width_ratios[~np.isclose(width_ratios,1)], 'datatype': np.array(data_types)[~same]}
# sns.histplot(data, x='widths', hue='datatype', multiple='stack', bins=np.linspace(0,5,50))
same_width_text = f'Same width (not shown): {same_width_prop*100:2.1f}' + r'\%'
plt.text(0.8, 0.5, same_width_text, transform=plt.gca().transAxes, ha='right')
plt.xlabel('Random Effects width / Birge Ratio width')
plt.ylabel('Count')
plt.xlim(0, 5)
plt.savefig('figs/pdg2025_width_hist.png', dpi=300, bbox_inches='tight')

In [None]:
df

In [None]:
print(np.min(width_ratios))

# Left panel: property with the smallest RE/Birge width ratio; right panel: the largest.
extreme_indices = [np.argmin(width_ratios), np.argmax(width_ratios)]
fig, axs = plt.subplots(1, 2, figsize=(8,4))
pdgids = []
for i, idx in enumerate(extreme_indices):
    ax = axs[i]
    df = pdg2025_both_dfs[idx]
    pdgids.append(df['pdgid'].iloc[0])

    # one row per measurement, stacked vertically in data-frame order
    row_positions = np.arange(len(df))
    ax.errorbar(df['value'], row_positions, xerr=df['uncertainty'], color='black', fmt='.')
    ax.set_yticks([])
    # ax.set_xticks([])

    # grey band: random-effects interval; translucent red band: Birge interval
    re_interval = intervals_re[idx]
    birge_interval = intervals_birge[idx]
    ax.axvspan(re_interval[0], re_interval[1], color='lightgrey')
    ax.axvspan(birge_interval[0], birge_interval[1], color='red', alpha=0.2, linewidth=0)
# print(descriptions)
axs[0].set_title(r'RE $\ll$ Birge' + '\n' + pdgids[0])
axs[1].set_title(r'Birge $\ll$ RE' + '\n' + pdgids[1])
plt.savefig('figs/pdg2025_br_re_extremes.png', dpi=300, bbox_inches='tight')

In [None]:
from pdg_methods import birge_ratio


In [None]:
# NOTE(review): `wm1s`, `wm0s` and `meanerrs` are not defined in any visible cell of this
# notebook, so this presumably raises NameError on a fresh kernel — restore the cell that
# computes them or remove this one.
np.argmax(np.abs(wm1s-wm0s)/meanerrs)

In [None]:
# NOTE(review): `wm0s` / `wm1s` are not defined in any visible cell, and 1141 is a
# hard-coded index — presumably copied from the argmax cell above; confirm or remove.
print(wm0s[1141])
print(wm1s[1141])

In [None]:
pdg2025_both_dfs[1141]

In [None]:
from scipy.stats import norm
# Twice the lower-tail probability at -5 for a normal with mean 0 and standard deviation 1.5.
norm.cdf(-5,0,1.5)*2

In [None]:
import sqlite3
from contextlib import closing

# Build a lookup from the PDGID DATA_TYPE code to the label used in legends and tables.
# The connection is closed as soon as the rows are fetched; it is not needed afterwards.
# NOTE(review): this cell opens v0.2.1 of the database while the other sqlite3/pdg
# connections in this notebook open v0.2.0 — confirm which file is intended.
with closing(sqlite3.connect("data/pdgall-2025-v0.2.1.sqlite")) as con:
    cur = con.cursor()
    res = cur.execute("SELECT * FROM pdgdoc WHERE table_name='PDGID' AND column_name='DATA_TYPE'")
    data = res.fetchall()
# Columns are addressed by position: index 3 becomes the key, index 5 the label
# (presumably the code and its description — TODO confirm against the pdgdoc schema).
datatype_map = {row[3]: row[5] for row in data}
# An empty data-type code is given a catch-all label.
datatype_map[''] = 'Other'

In [None]:
datatype_map

In [None]:
from collections import defaultdict
# Group the per-property data frames by data type, taking the type from each frame's
# first row, and record the mapped label of every frame in `data_types`
# (one entry per frame, in the order of pdg2025_both_dfs).
df_groups = defaultdict(list)
data_types = []
for i, df in enumerate(pdg2025_both_dfs):
    data_type = df['data_type'].iloc[0]
    df_groups[data_type].append(df)
    # raises KeyError if the code is missing from datatype_map
    data_types.append(datatype_map[data_type])


In [None]:
# Run measurement_dist separately on each data-type group.
# NOTE: this rebinds `dists`, which the pair-distribution figure cell used for its
# {'stat', 'both'} mapping; re-running that cell afterwards overwrites it again.
dists = {}
for data_type, dfs in df_groups.items():
    dists[data_type] = measurement_dist(dfs)

In [None]:
# One 'pair' curve per data type, plus the 'norm' reference curve in black.
# The x grid and the reference curve are taken from the 'G' group only; this raises
# KeyError if no frame has data type 'G' and assumes all groups share the same zspace
# — TODO confirm against measurement_dist.
zspace = dists['G']['zspace']
for data_type, dist in dists.items():
    plt.plot(zspace, dist['pair'], label=datatype_map[data_type])
plt.plot(zspace, dists['G']['norm'], color='black')
plt.xlim(0, 2.5)
plt.legend()

In [None]:

# Keep only measurements that report both a statistical and a total error in each
# direction. The filtered frames go into a NEW list: `pdg2025_both_dfs` is no longer
# overwritten in place, so cells that use it see the data exactly as loaded, whether
# they run before or after this one.
error_columns = ['stat_error_positive', 'stat_error_negative', 'error_positive', 'error_negative']
pdg2025_both_complete_dfs = [df.dropna(subset=error_columns) for df in pdg2025_both_dfs]

props = []
techniques = []
data_types = []
years = []
descriptions = []
for df in pdg2025_both_complete_dfs:
    # Statistical share of the variance, once for the positive and once for the
    # negative error of every measurement.
    props += list(df['stat_error_positive']**2/df['error_positive']**2)
    props += list(df['stat_error_negative']**2/df['error_negative']**2)
    # Metadata is repeated twice so it stays aligned with the two blocks of props above.
    techniques += list(df['technique'])*2
    years += list(df['year'])*2
    data_types += list(df['data_type'])*2
    descriptions += list(df['pdgid.description'])*2
# Replace data-type codes by their labels. NOTE: `data_types` now holds one entry per
# error value, not one per data frame as in the grouping cell.
data_types = [datatype_map[dt] for dt in data_types]
props = np.array(props)
years = np.array(years)

In [None]:
pdg2025_stat_dfs[300]

In [None]:
import seaborn as sns

# Assemble one long table with a row per error value and plot the statistical share of
# the variance, stacked by data type.
techniques = pd.Series(techniques)
# the nine most frequent techniques; everything else is relabelled 'Other' below
top_categories = techniques.value_counts().nlargest(9).index
data_types = pd.Series(data_types)

techniques_other = techniques.where(techniques.isin(top_categories), 'Other')
data = {'prop': props, 'technique': techniques_other, 'data_type': data_types, 'year': years, 'description': descriptions}
# NOTE: rebinds `df` (previously a per-property frame) to the long table.
df = pd.DataFrame(data)
# sns.histplot(data=df, x='prop', hue='technique', multiple="stack", bins=51)
# sns.histplot(data=df[~df['data_type'].isin(['branching ratio', 'Other'])], x='prop', hue='data_type', multiple="stack", bins=21)
sns.histplot(data=df, x='prop', hue='data_type', multiple="stack", bins=21)

In [None]:
# Statistical share of the variance (x) against the `year` column (y), one point per error value.
plt.scatter(df['prop'], df['year'], s=2, alpha=0.1, color='black')

In [None]:
# Simulation: draw independent uniform(0, 1) systematic and statistical errors and
# histogram the statistical share of the variance, zoomed to 0.4-0.6.
# No random seed is set, so the exact histogram changes from run to run.
syst = np.random.uniform(0, 1, 100000)
stat = np.random.uniform(0, 1, 100000)

plt.hist(stat**2 / (stat**2 + syst**2), bins=101)
plt.xlim(0.4, 0.6)
plt.show()

def round_err(x):
    """Round values to a number of decimal places that depends on their size.

    Values below 0.355 keep 2 decimal places, values in [0.355, 0.950) keep
    1 decimal place, and values of 0.950 or more are rounded to the nearest integer.

    Parameters
    ----------
    x : array_like
        Values to round.

    Returns
    -------
    numpy.ndarray
        A new array with the rounded values; ``x`` itself is not modified.
    """
    x = np.asarray(x)
    # np.select picks the first matching condition, so the second condition only
    # applies to values that are already >= 0.355.
    return np.select(
        [x < 0.355, x < 0.950],
        [np.round(x, 2), np.round(x, 1)],
        default=np.round(x, 0),
    )

# Repeat the histogram after passing both error arrays through round_err, to compare
# against the unrounded histogram drawn earlier in this cell.
syst = round_err(syst)
stat = round_err(stat)

plt.hist(stat**2 / (stat**2 + syst**2), bins=101)
plt.xlim(0.4, 0.6)
plt.show()

In [None]:
len(df)/2

In [None]:
df.iloc[20:50]

In [None]:
df[df['data_type'] == 'Other']['description'].unique()

In [None]:
top_categories

In [None]:
import seaborn as sns

plt.hist(props, bins=101, range=(0, 1), density=True, color='grey')
# plt.xlim(0.4, 0.6)
plt.title('Proportion of variance which is statistical in PDG data')
plt.show()

In [None]:
np.mean(props==0.5)
props[(props<0.5-1e-10) & (props>0.49)]
# np.mean((props<0.45) & (props>0.44))

In [None]:
pdg2025_both_dfs

In [None]:

api = pdg.connect("sqlite:///data/pdgall-2025-v0.2.0.sqlite")

In [None]:
print(api.doc_value_type_keys())

In [None]:
# Fetch every measurement value flagged `used_in_average`, together with the stored
# 2025-edition pdgdata value of type 'AC' for the same PDG identifier.
# `con` / `cur` stay open; a later cell reuses `cur`.
con = sqlite3.connect("data/pdgall-2025-v0.2.0.sqlite")
cur = con.cursor()
command = """
SELECT pdgid.description, pdgmeasurement.pdgid, pdgdata.value_type, pdgdata.in_summary_table, pdgdata.value, pdgmeasurement_values.value, pdgmeasurement_values.error_positive, pdgmeasurement_values.error_negative
FROM pdgmeasurement_values
     JOIN pdgmeasurement ON pdgmeasurement.id = pdgmeasurement_values.pdgmeasurement_id
     JOIN pdgid ON pdgid.id = pdgmeasurement.pdgid_id
     JOIN pdgdata ON pdgdata.pdgid_id = pdgid.id
--     JOIN pdgparticle ON pdgparticle.pdgid = pdgid.parent_pdgid
WHERE pdgmeasurement_values.used_in_average AND pdgmeasurement_values.value IS NOT NULL AND pdgdata.edition = '2025' AND pdgdata.value_type = 'AC'
"""
res = cur.execute(command)
data = res.fetchall()  # WHERE
# Column names as reported by the cursor; two of the selected columns are both called
# `value`, which is why the DataFrame cell assigns its own names instead.
columns = [col[0] for col in res.description]
print(len(data), "measurements")
print(columns)

In [None]:
# cur.execute("SELECT * FROM pdgmeasurement").fetchall()

In [None]:
# Column names, in the order of the SELECT list of the query cell.
measurement_columns = [
    "pdgid.description",
    "pdgid",
    "type",
    "insummary",
    "avg",
    "measurement",
    "error_positive",
    "error_negative",
]
df = pd.DataFrame(data, columns=measurement_columns)

# symmetrised error and the plain standardized residual against the stored average
df["error"] = (df["error_positive"] + df["error_negative"]) / 2
df["std_resid"] = (df["measurement"] - df["avg"]) / df["error"]

# only keep rows where there are at least 3 measurements
measurements_per_pdgid = df.groupby("pdgid")["pdgid"].transform("size")
df = df[measurements_per_pdgid >= 3]
print("Number of properties:", df["pdgid"].nunique())
print("Number of measurements:", len(df))


# for each pdgid, do some operations on each row with that pdgid
def process_group(group):
    """Add a standardized residual that accounts for each point's weight in the mean.

    Parameters
    ----------
    group : pandas.DataFrame
        Rows of a single property. Must provide the columns ``measurement``,
        ``avg`` and ``error``.

    Returns
    -------
    pandas.DataFrame
        A copy of ``group`` with an extra ``std_resid_adj`` column. The input
        frame is left untouched, so it is safe to pass groups from ``groupby``.
    """
    sigma2 = np.array(group["error"]) ** 2
    # total inverse-variance weight of the group
    S = np.sum(1 / sigma2)

    # Standard deviation of (measurement_i - inverse-variance weighted mean) when
    # measurement i has variance sigma_i^2: the first term is the measurement's own
    # contribution, the second the contribution of all other measurements.
    # Algebraically this equals sqrt(sigma_i^2 - 1/S).
    std = np.sqrt(sigma2 * (1 - 1 / (sigma2 * S)) ** 2 + (S - 1 / sigma2) / (S**2))

    # The residual is taken against the stored `avg` column, not against a weighted
    # mean recomputed here.
    return group.assign(std_resid_adj=(group["measurement"] - group["avg"]) / std)


# process_group(df[df['pdgid'] == 'Q007TP'])
# `dfs`: one frame per pdgid, built BEFORE the adjusted residuals are added, so these
# frames keep the `pdgid` column and do not have `std_resid_adj`.
df_gb = df.groupby("pdgid", group_keys=False)
dfs = [df_gb.get_group(x) for x in df_gb.groups]
# `include_groups=False` hides the `pdgid` column from process_group.
# NOTE: this rebinds `df`; re-running the cell applies the step to the already processed frame.
df = df.groupby("pdgid").apply(process_group, include_groups=False)
df
# df = df[df['pdgid.description'].str.contains('MASS')]

In [None]:
len(dfs)

In [None]:
from methods import birge, random_effects_dl_base, random_effects_mle, I2, errscale_test
from scipy.stats import norm
from tqdm import tqdm

brs = []
taus = []
I2s = []
errscale_ps = []
brs_cont = []
taus_cont = []
I2s_cont = []
errscale_ps_cont = []
# birge_logprobs = []
# re_logprobs = []
# fe_logprobs = []
# mix_logprobs = []
bad = [
    "M047R7",
    "M002R19",
    "M049R52",
    "M055R6",
    "M053R02",
    "M052R4",
    "M056R4",
    "M057R4",
    "M070R24",
    "M070R50",
    "M070R60",
    "M070R7",
    "M070R82",
    "M070R83",
    "M070R84",
    "M070R86",
    "M070R87",
    "M070R9",
    "M070S6",
    "M071R22",
    "M071R28",
    "M071S10",
    "S040R11",
    "S041B24",
    "S041B41",
    "S041C5",
    "S041R3",
    "S041R39",
    "S041R90",
    "S041S47",
    "S041R65",
    "S041S50",
    "S041T03",
    "S042B26",
    "S042B27",
    "S042B43",
    "S042B47",
    "S042B58",
    "S042P59",
    "S042R2",
    "S042R20",
    "S042R22",
    "S042R23",
    "S042R3",
    "S042R47",
    "S042R48",
    "S042S24",
    "S042S59",
    "S049R21",
    "S049S7",
    "S049R24",
    "S042S88",
    "S086R3",
    "S086R33",
    "S086R32",
    "S086R8",
    "S086R34",
    "S086R6",
]

birge_loglikes = []
re_loglikes = []
fe_loglikes = []
ns = []

# Fit the Birge-ratio and random-effects models to every property, plus a control data
# set drawn with the same sigmas, and record the log-likelihood of each model.
# The loop variable is called `prop_df` so it no longer shadows the builtin `property`.
for i, prop_df in tqdm(enumerate(dfs), total=len(dfs)):
    if prop_df["pdgid"].iloc[0] in bad:
        continue
    values = np.array(prop_df["measurement"])
    sigmas = np.array(prop_df["error"])
    # values = values-np.mean(values)
    # Rescale by the spread of the values; properties whose values are all identical
    # cannot be rescaled and are skipped.
    scaler = np.std(values)
    if scaler == 0:
        continue
    values = values / scaler
    sigmas = sigmas / scaler

    # sigmas = sigmas/np.mean(sigmas)
    _, muhat_birge, _, chat = birge(values, sigmas, coverage=0.6827)
    brs.append(chat)
    # muhat_re, _, tau = random_effects_dl_base(values, sigmas)
    # taus.append(np.mean(tau/sigmas))
    _, muhat_re, _, tau = random_effects_mle(values, sigmas, coverage=0.6827)
    taus.append(np.mean(tau / sigmas))
    I2s.append(I2(values, sigmas))

    # generate values with same sigmas but no unaccounted for errors.
    # to be used as a control when analyzing the distribution of chat and tau
    values_control = np.random.normal(loc=0, scale=sigmas)
    _, _, _, chat_cont = birge(values_control, sigmas, coverage=0.6827)
    brs_cont.append(chat_cont)
    _, _, _, tau_cont = random_effects_mle(values_control, sigmas, coverage=0.6827)
    taus_cont.append(np.mean(tau_cont / sigmas))
    I2s_cont.append(I2(values_control, sigmas))

    # errscale_ps.append(errscale_test(values, sigmas))
    # errscale_ps_cont.append(errscale_test(values_control, sigmas))

    # Log-likelihoods are summed in log space. Taking the log of a product of
    # densities underflows to log(0) = -inf once a property has many measurements.
    birge_loglike = np.sum(norm.logpdf(values, loc=muhat_birge, scale=sigmas * chat))
    # Stop at the first property whose likelihood is still -inf. The check happens
    # before anything is appended, so the three log-likelihood lists and `ns` always
    # have the same length.
    if birge_loglike == -np.inf:
        print(i)
        break
    birge_loglikes.append(birge_loglike)
    re_loglikes.append(
        np.sum(norm.logpdf(values, loc=muhat_re, scale=np.sqrt(sigmas**2 + tau**2)))
    )
    # the fixed-effect likelihood is evaluated at the Birge location estimate
    fe_loglikes.append(np.sum(norm.logpdf(values, loc=muhat_birge, scale=sigmas)))
    ns.append(len(prop_df))
    # birge_probs = []
    # re_probs = []
    # fe_probs = []
    # mix_probs = []

    # for j in range(400):
    #     spike = np.random.rand() < 0.5
    #     if spike:
    #         br = 1
    #         tau = 0
    #     else:
    #         br = np.random.exponential(1)+1
    #         tau = np.random.exponential(1)
    #     mu = np.random.standard_cauchy()
    #     birge_probs.append(np.prod(norm.pdf(values, loc=mu, scale=sigmas*br)))
    #     # if np.any(np.log(norm.pdf(values, loc=mu, scale=sigmas*br))==-np.inf):
    #     #     print(i)
    #     #     print(property)
    #     #     print('BR:', br)
    #     #     print(mu)
    #     #     print(values)
    #     #     print(sigmas*br)
    #     #     print(norm.pdf(values, loc=mu, scale=sigmas*br))
    #     #     raise ValueError("Log probability is -inf, check values and sigmas.")
    #     # print(norm.pdf(values, loc=mu, scale=sigmas*br))
    #     re_probs.append(np.prod(norm.pdf(values, loc=mu, scale=np.sqrt(sigmas**2+tau**2))))
    #     fe_probs.append(np.prod(norm.pdf(values, loc=mu, scale=sigmas)))

    #     mix_probs.append(np.prod(norm.pdf(values, loc=mu, scale=np.sqrt((br*sigmas)**2 + tau**2))))
    # if np.mean(birge_probs) == 0:
    #     print(i, property)
    #     raise ValueError("Mean of birge_probs is zero, check values and sigmas.")

    # birge_logprobs.append(np.log(np.mean(birge_probs)))
    # re_logprobs.append(np.log(np.mean(re_probs)))
    # fe_logprobs.append(np.log(np.mean(fe_probs)))
    # mix_logprobs.append(np.log(np.mean(mix_probs)))

# birge_logprobs = np.array(birge_logprobs)
# re_logprobs = np.array(re_logprobs)
# fe_logprobs = np.array(fe_logprobs)
birge_loglikes = np.array(birge_loglikes)
re_loglikes = np.array(re_loglikes)
fe_loglikes = np.array(fe_loglikes)
ns = np.array(ns)

In [None]:
# plt.hist(birge_logprobs - re_logprobs, bins=100, color='grey')
# plt.show()

In [None]:
plt.hist(I2s)

In [None]:
plt.hist(birge_loglikes - re_loglikes, bins=100, color="grey")
plt.show()

In [None]:
# dfs[np.argmax(birge_logprobs - re_logprobs)]

In [None]:
# print('log probabilities')
# print(np.sum(birge_logprobs))
# print(np.sum(re_logprobs))
# print(np.sum(fe_logprobs))
# print(np.sum(mix_logprobs))
print("log likelihoods")
loglikes = np.array([np.sum(birge_loglikes), np.sum(re_loglikes), np.sum(fe_loglikes)])
print(loglikes)
# Per-property information criteria: BIC = k*ln(n) - 2*lnL and AIC = 2k - 2*lnL, with
# k = 2 fitted parameters for Birge and random effects and k = 1 for fixed effects.
birge_bics = 2 * np.log(ns) - 2 * birge_loglikes
re_bics = 2 * np.log(ns) - 2 * re_loglikes
fe_bics = 1 * np.log(ns) - 2 * fe_loglikes
birge_aics = 2 * 2 - 2 * birge_loglikes
re_aics = 2 * 2 - 2 * re_loglikes
fe_aics = 2 * 1 - 2 * fe_loglikes

# The table reports the log-likelihood SUMMED over properties next to the MEAN
# per-property BIC and AIC.
col1 = loglikes
col2 = np.array([np.mean(birge_bics), np.mean(re_bics), np.mean(fe_bics)])
col3 = np.array([np.mean(birge_aics), np.mean(re_aics), np.mean(fe_aics)])
colnames = ["log-likelihood", "BIC", "AIC"]
rownames = ["Birge Ratio", "Random Effects", "Fixed Effects"]
# Stored under its own name: `df` still has to be the measurement frame for the
# residual cells further down (they read df["std_resid_adj"] and df["pdgid.description"]).
model_comparison = pd.DataFrame(
    np.array([col1, col2, col3]).T, columns=colnames, index=rownames
)

# print in latex format
print(model_comparison.to_latex(index=True, float_format="%.2f"))

In [None]:
# Per-property log-likelihood of the Birge fit (x) against the random-effects fit (y),
# coloured by the number of measurements of the property.
plt.scatter(
    birge_loglikes, re_loglikes, marker=".", s=4, edgecolor="none", c=ns, vmin=0
)
# common axis range covering both sets of log-likelihoods
ymin = np.min([np.min(birge_loglikes), np.min(re_loglikes)])
ymax = np.max([np.max(birge_loglikes), np.max(re_loglikes)])
# dotted diagonal: both models have the same log-likelihood
plt.plot(
    [ymin - 1, ymax + 1], [ymin - 1, ymax + 1], color="red", linewidth=1, linestyle=":"
)
# set aspect ratio to 1
plt.gca().set_aspect("equal", adjustable="box")
plt.xlim(ymin - 0.2, ymax + 0.2)
plt.ylim(ymin - 0.2, ymax + 0.2)
plt.xlabel("Birge Ratio log-likelihood")
plt.ylabel("Random Effects log-likelihood")
plt.colorbar(label="Number of measurements")
# Strict comparisons: ties count for neither model, so the two percentages need not sum to 100.
re_better_percent = np.round(np.mean(re_loglikes > birge_loglikes) * 100, 1)
birge_better_percent = np.round(np.mean(birge_loglikes > re_loglikes) * 100, 1)
# Points above the diagonal favour random effects (upper-left label) ...
plt.text(
    0.1,
    0.9,
    f"Random Effects better ({re_better_percent}\\%)",
    transform=plt.gca().transAxes,
    fontsize=12,
    fontweight="bold",
    color="black",
    ha="left",
    va="top",
)
# ... points below it favour the Birge ratio (lower-right label).
plt.text(
    0.9,
    0.1,
    f"Birge Ratio better ({birge_better_percent}\\%)",
    transform=plt.gca().transAxes,
    fontsize=12,
    fontweight="bold",
    color="black",
    ha="right",
    va="bottom",
)
plt.title("Random Effects and Birge Ratio \n MLE log-likelihoods for each property")
plt.savefig("figs/pdg_loglike.pdf", bbox_inches="tight")
plt.show()
# Second figure (not saved): distribution of the log-likelihood difference, with a
# solid line at zero and a dashed red line at the mean difference.
plt.hist(re_loglikes - birge_loglikes, bins=100, density=True, color="grey")
plt.axvline(0, color="black")
plt.axvline(np.mean(re_loglikes - birge_loglikes), color="red", linestyle="--")
plt.show()

In [None]:
np.mean(np.log(ns))

In [None]:
np.mean(re_loglikes == birge_loglikes)

In [None]:
np.mean(re_loglikes > birge_loglikes)

In [None]:
# Pooled BIC over all properties, in the order Birge, random effects, fixed effects:
# (parameters per property) * (number of properties) * ln(total number of measurements)
# minus twice the summed log-likelihood.
print(2 * len(ns) * np.log(np.sum(ns)) - 2 * np.sum(birge_loglikes))
print(2 * len(ns) * np.log(np.sum(ns)) - 2 * np.sum(re_loglikes))
print(1 * len(ns) * np.log(np.sum(ns)) - 2 * np.sum(fe_loglikes))

In [None]:
# Histogram the error-scale test p-values of properties with I2 > 0, for the PDG data
# and for the control data. `errscale_ps` / `errscale_ps_cont` are only filled when the
# `errscale_test` lines in the fitting loop are enabled; while they are commented out
# the lists are empty and masking them with the I2 arrays would raise IndexError, so a
# mismatch is reported instead of plotted.
errscale_inputs = [
    (errscale_ps, I2s, "PDG data"),
    (errscale_ps_cont, I2s_cont, "control"),
]
for errscale_values, i2_values, errscale_label in errscale_inputs:
    errscale_values = np.array(errscale_values)
    i2_values = np.array(i2_values)
    if len(errscale_values) != len(i2_values):
        print(
            f"Skipping errscale histogram ({errscale_label}): "
            f"{len(errscale_values)} p-values for {len(i2_values)} properties"
        )
        continue
    plt.hist(errscale_values[i2_values > 0])
    plt.show()

In [None]:
# Freeze the per-property estimates as arrays so they can be masked below.
brs = np.array(brs)
taus = np.array(taus)
brs_cont = np.array(brs_cont)
taus_cont = np.array(taus_cont)

# Only estimates away from the boundary (Birge ratio > 1, tau > 0) are histogrammed;
# the share sitting exactly on the boundary is reported in the title / legend instead.
brs_big = brs[brs > 1]
taus_big = taus[taus > 0]
brs_cont_big = brs_cont[brs_cont > 1]
taus_cont_big = taus_cont[taus_cont > 0]

fig, axs = plt.subplots(1, 2, figsize=(10, 5))

# Left panel: Birge ratios. The legend labels contain "\n" line breaks, so they cannot
# be raw strings; the LaTeX percent sign is therefore written as "\\%", which yields
# the same text as before without relying on an invalid "\%" escape sequence.
axs[0].hist(
    brs_big, range=(1, 4), bins=30, color="grey", label="PDG data"
)  # , weights=np.ones(len(brs_big))/len(brs_big))
axs[0].hist(
    brs_cont_big,
    range=(1, 4),
    bins=30,
    color="black",
    histtype="step",
    label=f"Control experiment\n(no systematics)\n(${int(np.mean(brs_cont==1)*100)}\\%=1$)",
)
axs[0].set_title(
    rf"Non-unity Birge ratios within each property (${int(np.mean(brs==1)*100)}\%=1$)"
)
axs[0].set_xlim(1, 4)
axs[0].set_xlabel(r"Estimated Birge ratio of a property")
axs[0].set_ylabel("Count")
axs[0].legend(frameon=False)

# Right panel: mean ratio tau / sigma_i of each property.
axs[1].hist(
    taus_big, range=(0, 3), bins=30, color="grey", label="PDG data"
)  # , weights=np.ones(len(taus_big))/len(taus_big))
axs[1].hist(
    taus_cont_big,
    range=(0, 3),
    bins=30,
    color="black",
    histtype="step",
    label=f"Control experiment\n(no systematics)\n(${int(np.mean(taus_cont==0)*100)}\\%=0$)",
)
axs[1].set_title(
    rf"Non-zero mean ratios $\hat\tau/\sigma_i$ within each property (${int(np.mean(taus==0)*100)}\%=0$)"
)
axs[1].set_xlim(0, 3)
axs[1].set_xlabel(r"Mean ratio $\hat\tau/\sigma_i$ within a property")
axs[1].set_ylabel("Count")
axs[1].legend(frameon=False)

plt.savefig("figs/pdg_birge_re.pdf", bbox_inches="tight")
plt.show()

In [None]:
brs_cont

In [None]:
np.mean(np.array(brs) == 1)

In [None]:
np.max(taus)

In [None]:
np.sum(np.isnan(taus))

In [None]:
np.max(taus)

In [None]:
# NOTE(review): the argmax on the first line is not the cell's last expression, so its
# result is discarded, and 1231 is hard-coded. The fitting loop skips some properties
# (`bad` ids, zero spread), so a position in `taus` does not necessarily correspond to
# the same position in `dfs` — verify before relying on this lookup.
np.argmax(taus)
dfs[1231]

In [None]:
df

In [None]:
df["pdgid.description"][df["pdgid.description"].str.contains("MASS")].unique()

In [None]:
type(df)

In [None]:
# NOTE(review): `.groups` is an attribute of GroupBy objects; every assignment to `df`
# visible in this notebook produces a DataFrame, so this presumably raises
# AttributeError — confirm or remove.
df.groups

In [None]:
df

In [None]:
# NOTE(review): none of the DataFrame constructions visible in this notebook creates a
# `limit` column, so this presumably raises KeyError — confirm or remove.
df["limit"].unique()

In [None]:
df[df["std_resid"] == 0]

In [None]:
# Density histogram of the adjusted standardized residuals, restricted to [-5, 5], with
# the standard normal density overlaid for comparison.
# Requires `df` to be the measurement frame that carries the `std_resid_adj` column.
plt.hist(
    df["std_resid_adj"],
    bins=100,
    range=(-5, 5),
    density=True,
    color="grey",
    label="Standardized residuals",
)
# plot normal pdf
from scipy.stats import norm

# `x` is also read by the residual-simulation cell further down
x = np.linspace(-5, 5, 100)
plt.xlim(-5, 5)
plt.axvline(0, color="black", linestyle="--")
plt.plot(x, norm.pdf(x, 0, 1), color="red", label="Standard Normal PDF")
plt.title("Standardized residuals of PDG measurements")
plt.legend(frameon=False)
plt.savefig("figs/pdg_std_residuals.pdf", bbox_inches="tight")
plt.show()

In [None]:
## qq plot
import statsmodels.api as sm

sm.qqplot(df["std_resid_adj"])
plt.ylim(-7, 7)
plt.xlim(-7, 7)

In [None]:
# ks test
from scipy.stats import kstest

ks_stat, ks_pvalue = kstest(df["std_resid_adj"], "norm")
ks_pvalue

In [None]:
# Simulation: residuals of n standard-normal draws about their own sample mean,
# compared with a normal density of standard deviation sqrt((n - 1) / n).
# `x` and `norm` are not defined here; they come from the residual-histogram cell.
# No random seed is set.
n = 5
data = np.random.normal(0, 1, (100000, n))
avg = np.mean(data, axis=1)
# subtract each row's mean from that row
resid = data - avg[:, None]
plt.hist(resid.flatten(), bins=100, range=(-5, 5), density=True)
plt.plot(x, norm.pdf(x, 0, np.sqrt((n - 1) / n)), color="red", label="Normal PDF")

In [None]:
data

In [None]:
len(data)

In [None]:
cur.execute("SELECT * FROM pdgdata").fetchall()

In [None]:
data

In [None]:
api.editions

In [None]:
particle = api.get_particle_by_name("t")
measurement = list(particle.mass_measurements())[0]

In [None]:
dir(particle)

In [None]:
particle

In [None]:
measurement

In [None]:
# Same query and post-processing as the earlier measurement cells, but WITHOUT the
# `used_in_average` and `value_type = 'AC'` filters.
# NOTE(review): without the value_type filter each measurement is joined to every
# 2025 pdgdata row of its pdgid, which presumably duplicates measurements when a pdgid
# has more than one value type — confirm against the schema.
# This opens a new connection and rebinds `con` / `cur` without closing the previous one.
con = sqlite3.connect("data/pdgall-2025-v0.2.0.sqlite")
cur = con.cursor()
command = """
SELECT pdgid.description, pdgmeasurement.pdgid, pdgdata.value_type, pdgdata.in_summary_table, pdgdata.value, pdgmeasurement_values.value, pdgmeasurement_values.error_positive, pdgmeasurement_values.error_negative
FROM pdgmeasurement_values
     JOIN pdgmeasurement ON pdgmeasurement.id = pdgmeasurement_values.pdgmeasurement_id
     JOIN pdgid ON pdgid.id = pdgmeasurement.pdgid_id
     JOIN pdgdata ON pdgdata.pdgid_id = pdgid.id
--     JOIN pdgparticle ON pdgparticle.pdgid = pdgid.parent_pdgid
WHERE pdgmeasurement_values.value IS NOT NULL AND pdgdata.edition = '2025'
"""
res = cur.execute(command)
data = res.fetchall()  # WHERE
columns = [col[0] for col in res.description]
print(len(data), "measurements")
print(columns)
# Column names are assigned by position, in the order of the SELECT list.
df = pd.DataFrame(
    data,
    columns=[
        "pdgid.description",
        "pdgid",
        "type",
        "insummary",
        "avg",
        "measurement",
        "error_positive",
        "error_negative",
    ],
)
# symmetrised error and plain standardized residual against the stored value
df["error"] = (df["error_positive"] + df["error_negative"]) / 2
df["std_resid"] = (df["measurement"] - df["avg"]) / df["error"]
# only keep rows where there are at least 3 measurements
df = df.groupby("pdgid").filter(lambda x: len(x) >= 3)
print("Number of properties:", len(df["pdgid"].unique()))
print("Number of measurements:", len(df))
# NOTE: rebinds `dfs` to the per-pdgid frames of this unfiltered query.
df_gb = df.groupby("pdgid", group_keys=False)
dfs = [df_gb.get_group(x) for x in df_gb.groups]

In [None]:
# Print the ten per-pdgid frames with the most rows.
# NOTE: rebinds `ns` (a list here) and leaves `df` bound to the last frame of `dfs`.
ns = []
for df in dfs:
    ns.append(len(df))
# argpartition puts the ten largest counts in the last ten slots, in no particular
# order; it raises if there are fewer than ten frames.
biggest = np.argpartition(ns, -10)[-10:]
for idx in biggest:
    print(dfs[idx])

In [None]:
dfs[np.argmax(ns)]

In [None]:
import pymc as pm
from pymc_extras.inference import fit_dadvi
import random
# Hierarchical Birge-ratio model: every dataset gets its own location `theta` and its
# own error-scale factor (1 + a Gamma variable) with shared Gamma hyper-parameters.
# to_include=100
to_include = np.inf
with pm.Model() as model:
    N = min(len(pdg2025_both_dfs), to_include)
    # dataset_perm = random.sample(pdg2025_both_dfs, N)
    # Use exactly the first N datasets, so that the per-dataset parameters (shape=N)
    # always match the dataset indices in `ds_idx`, also when `to_include` is finite.
    dataset_perm = pdg2025_both_dfs[:N]

    all_ys = []
    all_sigmas = []
    ds_idx = []
    scales = []
    for i, df in enumerate(dataset_perm):
        # Standardise each dataset: centre on the mean value and express everything in
        # units of the mean uncertainty.
        # scale = np.median(df['uncertainty'])
        scale = np.mean(df['uncertainty'])
        # shift = np.median(df['value'])
        shift = np.mean(df['value'])
        ys = (np.array(df['value']) - shift) / scale
        sigmas = np.array(df['uncertainty']) / scale
        all_ys.append(ys)
        all_sigmas.append(sigmas)
        # dataset index of every observation, used to pick the matching parameters
        ds_idx.append(np.full(len(ys), i))
        scales.append(scale)
    all_ys = np.concatenate(all_ys)
    all_sigmas = pm.Data('sigmas', np.concatenate(all_sigmas))
    ds_idx = np.concatenate(ds_idx).astype(int)
    # ds_idx = pm.Data('ds_idx', ds_idx, dims="obs")
    print(all_ys.shape, all_sigmas.shape, ds_idx.shape)

    alpha = pm.Exponential('alpha', 1)
    beta = pm.Exponential('beta', 1)
    # The trace variable named 'c' is the Gamma draw itself; the scale factor used in
    # the likelihood is that draw plus 1.
    c = pm.Gamma('c', alpha, beta, shape=N)+1
    theta = pm.Normal('theta', 0, 100, shape=N)
    # random_effect = pm.Normal('random_effect', 0, tau[ds_idx])
    # theta_obs = pm.Normal('theta_obs', theta[ds_idx] + random_effect[ds_idx], tau[ds_idx])
    y_obs = pm.Normal('y_obs', theta[ds_idx], all_sigmas*c[ds_idx], observed=all_ys)

    print('model defined')
pm.model_to_graphviz(model)

In [None]:
with model:
    res = pm.sample(tune=4000, draws=4000, target_accept=0.95)

In [None]:
from methods import birge

# Birge-ratio interval and scale factor of every property, for comparison with the
# posterior of the PyMC model.
# NOTE: rebinds `brs`, which the fitting loop also uses for its own Birge estimates.
brs = []
intervals = []
for df in pdg2025_both_dfs:
    # Pass plain arrays, as the other birge() calls in this notebook do, so the result
    # cannot depend on the data frame's index labels.
    interval, _, _, br = birge(
        np.array(df.value), np.array(df.uncertainty), coverage=0.6827
    )
    brs.append(br)
    intervals.append(interval)

In [None]:
np.random.choice(10)

In [None]:
# Spot-check 20 randomly chosen properties: posterior of the scale factor (left) and
# the measurements with both intervals (right). No random seed is set.
pos = res.posterior
for i in range(20):
    # plt.figure(figsize=(3,2))
    fig, axs = plt.subplots(1,2,figsize=(5,2))
    idx = np.random.choice(len(pdg2025_both_dfs))
    df = pdg2025_both_dfs[idx]
    # NOTE(review): plt.title targets the current axes (presumably axs[1] here), not the
    # whole figure — confirm this is the intended placement.
    plt.title(df['pdgid'].iloc[0] + ': n='+str(len(df)))
    # The trace variable 'c' is the Gamma draw; the model adds 1 to obtain the scale
    # factor, hence the +1 here. The red line is the Birge estimate from `brs`.
    axs[0].hist(pos['c'].values[:,:,idx].flatten()+1, bins=np.linspace(1,10,200), color='grey')
    axs[0].axvline(brs[idx], color='red')
    axs[0].set_xlim(0.8,5)

    # Central 68.27% posterior interval of theta, mapped back to the original units
    # with the same mean-based shift and scale that the model cell applied.
    theta_quantiles = np.quantile(pos['theta'].values[:,:,idx].flatten(), q=[0.15865,0.84135])
    theta_quantiles = theta_quantiles * np.mean(df['uncertainty']) + np.mean(df['value'])

    axs[1].errorbar(df.value, np.arange(len(df)), xerr=df.uncertainty, fmt='.', color='black')
    axs[1].set_yticks([])
    axs[1].set_xticks([])
    # grey band: Birge interval from `intervals`; translucent red band: posterior interval
    axs[1].axvspan(intervals[idx][0], intervals[idx][1], color='grey', alpha=1, linewidth=0)
    axs[1].axvspan(theta_quantiles[0], theta_quantiles[1], color='red', alpha=0.3, linewidth=0)
    plt.show()

In [None]:
from scipy.stats import chi2
import numpy as np
import matplotlib.pyplot as plt

In [None]:
# CDF of the inferred scale factor, evaluated as a chi-square CDF with n-1 degrees of
# freedom at (n-1)*x^2, one curve per sample size n.
x = np.linspace(1, 4, 100)
ns = [2, 3, 4, 5, 100]
for n in ns:
    dof = n - 1
    cdf_values = chi2.cdf(dof * x**2, df=dof)
    plt.plot(x, cdf_values, label=n)
plt.title('Inferred scale factors when truth is 1')
plt.xlabel('Inferred Scale Factor')
plt.ylabel('CDF')
plt.ylim(0, 1)
plt.legend()

In [None]:
# Same curves as for a true scale factor of 1, but with the chi-square distribution
# scaled by truth^2; the red line marks the true value.
x = np.linspace(1, 4, 100)
ns = [2, 3, 5, 10, 100]
truth = 2
for n in ns:
    dof = n - 1
    cdf_values = chi2.cdf(dof * x**2, df=dof, scale=truth**2)
    plt.plot(x, cdf_values, label=n)
plt.ylim(0, 1)
plt.axvline(truth, color='red')
plt.legend()
plt.title(f'Inferred scale factors when truth is {truth}')
plt.xlabel('Inferred Scale Factor')
plt.ylabel('CDF')