In [None]:
import pdg
import pandas as pd
import sqlite3
import numpy as np
import matplotlib.pyplot as plt

# point matplotlib ticks inwards
plt.rcParams['xtick.direction'] = 'in'
plt.rcParams['ytick.direction'] = 'in'
# keep the top and right spines visible (this is also matplotlib's default)
plt.rcParams['axes.spines.top'] = True
plt.rcParams['axes.spines.right'] = True
# add top and right ticks: the spines alone carry no ticks, these two switches do
plt.rcParams['xtick.top'] = True
plt.rcParams['ytick.right'] = True
# use tex: all figure text is rendered through LaTeX, so literal '%' etc. must be escaped in labels
plt.rcParams['text.usetex'] = True

# PDG API handle backed by the local SQLite dump (the same file is also queried directly with sqlite3 below)
api = pdg.connect('sqlite:///data/pdgall-2025-v0.2.0.sqlite')

In [None]:
# Print what api.doc_value_type_keys() returns (presumably the documented value_type codes,
# such as the 'AC' used in the SQL filter below — confirm against the pdg package docs).
print(api.doc_value_type_keys())

In [None]:
# Query the PDG SQLite dump directly: one row per measurement value that is flagged
# used_in_average, has a non-NULL value, and whose pdgid has a 2025-edition pdgdata row
# with value_type 'AC'. The pdgdata.value column is later treated as the PDG average.
# NOTE(review): `con` is never closed in this notebook.
con = sqlite3.connect('data/pdgall-2025-v0.2.0.sqlite')
cur = con.cursor()
command = """
SELECT pdgid.description, pdgmeasurement.pdgid, pdgdata.value_type, pdgdata.in_summary_table, pdgdata.value, pdgmeasurement_values.value, pdgmeasurement_values.error_positive, pdgmeasurement_values.error_negative
FROM pdgmeasurement_values
     JOIN pdgmeasurement ON pdgmeasurement.id = pdgmeasurement_values.pdgmeasurement_id
     JOIN pdgid ON pdgid.id = pdgmeasurement.pdgid_id
     JOIN pdgdata ON pdgdata.pdgid_id = pdgid.id
--     JOIN pdgparticle ON pdgparticle.pdgid = pdgid.parent_pdgid
WHERE pdgmeasurement_values.used_in_average AND pdgmeasurement_values.value IS NOT NULL AND pdgdata.edition = '2025' AND pdgdata.value_type = 'AC'
"""
res = cur.execute(command)
data = res.fetchall()  # list of row tuples, in SELECT column order
# Column names as reported by the cursor. Two selected columns are both called `value`,
# so these names are likely ambiguous; the DataFrame cell assigns its own names by position.
columns = [col[0] for col in res.description]
print(len(data), 'measurements')
print(columns)

In [None]:
# cur.execute("SELECT * FROM pdgmeasurement").fetchall()

In [None]:
# Build the per-measurement frame; column names are assigned positionally to match the SELECT order.
# 'avg' is pdgdata.value and 'measurement' is pdgmeasurement_values.value.
df = pd.DataFrame(data, columns=['pdgid.description', 'pdgid', 'type', 'insummary', 'avg', 'measurement', 'error_positive', 'error_negative'])
# Symmetrize asymmetric errors by averaging the two sides.
# NOTE(review): assumes error_negative is stored as a positive magnitude — TODO confirm;
# if it were stored with a negative sign the two sides would cancel here.
df['error'] = (df['error_positive'] + df['error_negative'])/2
# Naive standardized residual of each measurement with respect to the stored average.
df['std_resid'] = (df['measurement'] - df['avg']) / df['error']
# only keep rows where there are at least 3 measurements
df = df.groupby('pdgid').filter(lambda x: len(x) >= 3)
print('Number of properties:', len(df['pdgid'].unique()))
print('Number of measurements:', len(df))

# for each pdgid, do some operations on each row with that pdgid
def process_group(group):
    """Return a copy of one property's measurements with an adjusted standardized residual.

    The plain residual ``(measurement - avg) / error`` ignores that the average is
    itself built from the measurements. This function divides instead by the standard
    deviation of ``x_i - Xbar``, where ``Xbar`` is the inverse-variance weighted mean
    of the group:

        Var(x_i - Xbar) = sigma_i^2 (1 - 1/(sigma_i^2 S))^2 + (S - 1/sigma_i^2) / S^2
        with S = sum_j 1/sigma_j^2   (algebraically equal to sigma_i^2 - 1/S).

    The residual itself is taken against the stored ``avg`` column, not a recomputed mean.

    Parameters
    ----------
    group : pandas.DataFrame
        Rows of a single property; must contain 'measurement', 'avg' and 'error'.

    Returns
    -------
    pandas.DataFrame
        A copy of ``group`` with an extra 'std_resid_adj' column. The input is left
        untouched, because mutating the object handed over by ``GroupBy.apply`` is
        not supported by pandas.
    """
    sigma2 = np.asarray(group['error'], dtype=float) ** 2

    # total inverse-variance weight of the group
    S = np.sum(1/sigma2)

    # standard deviation of (x_i - weighted mean); see the docstring for the derivation
    std = np.sqrt(sigma2*(1-1/(sigma2*S))**2 + (S-1/sigma2)/(S**2))

    result = group.copy()
    result['std_resid_adj'] = (result['measurement'] - result['avg']) / std
    return result
# process_group(df[df['pdgid'] == 'Q007TP'])
# One DataFrame per property, captured before 'std_resid_adj' is added;
# the fitting loop further down iterates over `dfs`.
df_gb = df.groupby('pdgid', group_keys=False)
dfs = [df_gb.get_group(x) for x in df_gb.groups]
# include_groups=False keeps the 'pdgid' key column out of the frames handed to process_group.
# NOTE(review): the result therefore presumably carries pdgid in its index rather than as a
# column — confirm with the installed pandas version before using df['pdgid'] downstream.
df = df.groupby('pdgid').apply(process_group, include_groups=False)
df
# df = df[df['pdgid.description'].str.contains('MASS')]

In [None]:
# Number of per-property frames in `dfs`.
len(dfs)

In [None]:
from methods import birge, random_effects_dl_base, random_effects_mle, I2, errscale_test
from scipy.stats import norm
from tqdm import tqdm

# Seeded generator so the simulated control experiment is reproducible across re-runs.
CONTROL_SEED = 0
rng = np.random.default_rng(CONTROL_SEED)

# Per-property results on the PDG data ...
brs = []
taus = []
I2s = []
errscale_ps = []
# ... and on the simulated control data (same sigmas, no unaccounted-for errors).
brs_cont = []
taus_cont = []
I2s_cont = []
errscale_ps_cont = []
# NOTE: errscale_ps / errscale_ps_cont stay empty while the errscale_test calls below are disabled.

# pdgids excluded from the fits
bad = ['M047R7', 'M002R19', 'M049R52', 'M055R6', 'M053R02', 'M052R4', 'M056R4', 'M057R4', 'M070R24', 'M070R50', 'M070R60', 'M070R7', 'M070R82', 'M070R83', 'M070R84', 'M070R86', 'M070R87','M070R9', 'M070S6', 'M071R22', 'M071R28','M071S10', 'S040R11', 'S041B24', 'S041B41', 'S041C5', 'S041R3', 'S041R39', 'S041R90', 'S041S47', 'S041R65', 'S041S50', 'S041T03', 'S042B26', 'S042B27', 'S042B43', 'S042B47', 'S042B58', 'S042P59', 'S042R2', 'S042R20', 'S042R22', 'S042R23', 'S042R3', 'S042R47', 'S042R48', 'S042S24', 'S042S59', 'S049R21', 'S049S7', 'S049R24', 'S042S88', 'S086R3', 'S086R33', 'S086R32', 'S086R8', 'S086R34', 'S086R6']

# Maximised log-likelihood of each model per fitted property, plus the number of measurements.
birge_loglikes = []
re_loglikes = []
fe_loglikes = []
ns = []

for i, prop_df in tqdm(enumerate(dfs), total=len(dfs)):

    if prop_df['pdgid'].iloc[0] in bad:
        continue
    values = np.array(prop_df['measurement'])
    sigmas = np.array(prop_df['error'])
    # Rescale each property by the spread of its measurements so results are unit-free;
    # a zero spread (all measurements identical) cannot be rescaled and is skipped.
    scaler = np.std(values)
    if scaler == 0:
        continue
    values = values / scaler
    sigmas = sigmas / scaler

    _, muhat_birge, _, chat = birge(values, sigmas, coverage=0.6827)
    _, muhat_re, _, tau = random_effects_mle(values, sigmas, coverage=0.6827)
    i2 = I2(values, sigmas)

    # generate values with same sigmas but no unaccounted for errors.
    # to be used as a control when analyzing the distribution of chat and tau
    values_control = rng.normal(loc=0, scale=sigmas)
    _, _, _, chat_cont = birge(values_control, sigmas, coverage=0.6827)
    _, _, _, tau_cont = random_effects_mle(values_control, sigmas, coverage=0.6827)
    i2_cont = I2(values_control, sigmas)

    # Sum log-densities instead of taking the log of a product of densities:
    # the product underflows to 0 (log -> -inf) for properties with many or
    # poorly fitting measurements.
    birge_ll = np.sum(norm.logpdf(values, loc=muhat_birge, scale=sigmas*chat))
    re_ll = np.sum(norm.logpdf(values, loc=muhat_re, scale=np.sqrt(sigmas**2+tau**2)))
    fe_ll = np.sum(norm.logpdf(values, loc=muhat_birge, scale=sigmas))

    # Stop on a degenerate fit, before anything is appended, so all result lists stay aligned.
    if birge_ll == -np.inf:
        print(i)
        break

    brs.append(chat)
    taus.append(np.mean(tau/sigmas))
    I2s.append(i2)
    brs_cont.append(chat_cont)
    taus_cont.append(np.mean(tau_cont/sigmas))
    I2s_cont.append(i2_cont)

    # errscale_ps.append(errscale_test(values, sigmas))
    # errscale_ps_cont.append(errscale_test(values_control, sigmas))

    birge_loglikes.append(birge_ll)
    re_loglikes.append(re_ll)
    fe_loglikes.append(fe_ll)
    ns.append(len(prop_df))

# NOTE: a Monte-Carlo estimate of marginal log-probabilities (birge_logprobs, re_logprobs,
# fe_logprobs, mix_logprobs) was prototyped in this loop and is disabled; those names are
# therefore not defined by this cell.
birge_loglikes = np.array(birge_loglikes)
re_loglikes = np.array(re_loglikes)
fe_loglikes = np.array(fe_loglikes)
ns = np.array(ns)

In [None]:
# birge_logprobs / re_logprobs only exist when the Monte-Carlo block in the fitting loop
# (currently commented out) has been run; without this guard the cell raises NameError.
if 'birge_logprobs' in globals() and 're_logprobs' in globals():
    plt.hist(np.asarray(birge_logprobs) - np.asarray(re_logprobs), bins=100, color='grey')
    plt.show()
else:
    print('birge_logprobs / re_logprobs are not defined (Monte-Carlo block disabled); skipping histogram')

In [None]:
# Histogram of the I2 values collected in the fitting loop, one per fitted property.
# (I2 comes from the local `methods` module; presumably the I^2 heterogeneity statistic — confirm.)
plt.hist(I2s)

In [None]:
# Per-property difference in maximised log-likelihood; positive values favour the Birge-ratio model.
plt.hist(birge_loglikes - re_loglikes, bins=100, color='grey')
plt.show()

In [None]:
# Property where the Birge-ratio model beats random effects by the largest log-probability margin.
# Two fixes relative to indexing `dfs` directly:
#  * the *_logprobs arrays only exist when the (disabled) Monte-Carlo block has been run, so guard on that;
#  * they have one entry per *fitted* property, so positions are mapped back through the same skip
#    rules the fitting loop applies (excluded pdgids, zero spread) before indexing.
_fitted_dfs = [d for d in dfs
               if d['pdgid'].iloc[0] not in bad and np.std(np.array(d['measurement'])) != 0]
(_fitted_dfs[int(np.argmax(np.asarray(birge_logprobs) - np.asarray(re_logprobs)))]
 if 'birge_logprobs' in globals() and 're_logprobs' in globals() else None)

In [None]:
# Summed maximised log-likelihood of each model over all fitted properties.
print('log likelihoods')
loglikes = np.array([np.sum(birge_loglikes), np.sum(re_loglikes), np.sum(fe_loglikes)])
print(loglikes)

# Per-property information criteria: BIC = k ln(n) - 2 ln L and AIC = 2k - 2 ln L.
# k = 2 for Birge (location + scale factor) and random effects (location + tau), k = 1 for fixed effects.
birge_bics = 2 * np.log(ns) - 2 * birge_loglikes
re_bics = 2 * np.log(ns) - 2 * re_loglikes
fe_bics = 1 * np.log(ns) - 2 * fe_loglikes
birge_aics = 2 * 2 - 2 * birge_loglikes
re_aics = 2 * 2 - 2 * re_loglikes
fe_aics = 2 * 1 - 2 * fe_loglikes

# Table columns: total log-likelihood, then BIC and AIC averaged over properties.
col1 = loglikes
col2 = np.array([np.mean(birge_bics), np.mean(re_bics), np.mean(fe_bics)])
col3 = np.array([np.mean(birge_aics), np.mean(re_aics), np.mean(fe_aics)])
colnames = ['log-likelihood', 'BIC', 'AIC']
rownames = ['Birge Ratio', 'Random Effects', 'Fixed Effects']
# Use a dedicated name: rebinding `df` here would replace the per-measurement frame that
# later cells (residual histogram, QQ plot, KS test) still read.
model_comparison = pd.DataFrame(np.array([col1, col2, col3]).T, columns=colnames, index=rownames)

# print in latex format
print(model_comparison.to_latex(index=True, float_format='%.2f'))



In [None]:
# Scatter of per-property maximised log-likelihoods: Birge ratio on x, random effects on y,
# coloured by the number of measurements in the property.
plt.scatter(birge_loglikes, re_loglikes, marker='.', s=4, edgecolor='none', c=ns, vmin=0)
# common axis range covering both models
ymin = np.min([np.min(birge_loglikes), np.min(re_loglikes)])
ymax = np.max([np.max(birge_loglikes), np.max(re_loglikes)])
# y = x reference line: points above it are properties where random effects fits better
plt.plot([ymin-1, ymax+1], [ymin-1, ymax+1], color='red', linewidth=1, linestyle=':')
# set aspect ratio to 1
plt.gca().set_aspect('equal', adjustable='box')
plt.xlim(ymin-0.2, ymax+0.2)
plt.ylim(ymin-0.2, ymax+0.2)
plt.xlabel('Birge Ratio log-likelihood')
plt.ylabel('Random Effects log-likelihood')
plt.colorbar(label='Number of measurements')
# Shares of properties with a strictly higher log-likelihood under each model (ties count for neither).
re_better_percent = np.round(np.mean(re_loglikes > birge_loglikes) * 100,1)
birge_better_percent = np.round(np.mean(birge_loglikes > re_loglikes) * 100,1)
# '\\%' yields a literal backslash-percent, which LaTeX (text.usetex) needs for a percent sign
plt.text(0.1, 0.9, f'Random Effects better ({re_better_percent}\\%)', transform=plt.gca().transAxes, fontsize=12, fontweight='bold', color='black', ha='left', va='top')
plt.text(0.9, 0.1, f'Birge Ratio better ({birge_better_percent}\\%)', transform=plt.gca().transAxes, fontsize=12, fontweight='bold', color='black', ha='right', va='bottom')
plt.title('Random Effects and Birge Ratio \n MLE log-likelihoods for each property')
# NOTE(review): savefig does not create the 'figs' directory; it must already exist.
plt.savefig('figs/pdg_loglike.pdf', bbox_inches='tight')
plt.show()
# Second figure: distribution of the log-likelihood gain of random effects over Birge,
# with zero (black) and the mean gain (red dashed) marked.
plt.hist(re_loglikes-birge_loglikes, bins=100, density=True, color='grey')
plt.axvline(0, color='black')
plt.axvline(np.mean(re_loglikes-birge_loglikes), color='red', linestyle='--')
plt.show()

In [None]:
# Mean of ln(n) over fitted properties, i.e. the average per-parameter BIC penalty.
np.mean(np.log(ns))

In [None]:
# Fraction of properties where both models reach exactly the same log-likelihood (exact float equality).
np.mean(re_loglikes == birge_loglikes)

In [None]:
# Fraction of properties where random effects has the strictly higher log-likelihood.
np.mean(re_loglikes > birge_loglikes)

In [None]:
# Pooled BIC over all properties treated as one model: k = (parameters per property) * (number of
# properties), n = total number of measurements. Printed in the order Birge, random effects, fixed effects.
print(2 * len(ns) * np.log(np.sum(ns)) - 2 * np.sum(birge_loglikes))
print(2 * len(ns) * np.log(np.sum(ns)) - 2 * np.sum(re_loglikes))
print(1 * len(ns) * np.log(np.sum(ns)) - 2 * np.sum(fe_loglikes))

In [None]:
# errscale_ps / errscale_ps_cont are only filled when the errscale_test calls in the fitting loop
# are enabled (they are commented out there). Masking an empty array with the I2 mask raises
# IndexError, so only plot when the lists line up with the I2 lists.
if (len(errscale_ps) > 0 and len(errscale_ps) == len(I2s)
        and len(errscale_ps_cont) == len(I2s_cont)):
    # p-values restricted to properties with I2 > 0, PDG data first and control second
    plt.hist(np.array(errscale_ps)[np.array(I2s)>0])
    plt.show()
    plt.hist(np.array(errscale_ps_cont)[np.array(I2s_cont)>0])
    plt.show()
else:
    print('errscale_ps is empty or misaligned with I2s (errscale_test disabled); skipping histograms')

In [None]:
brs = np.array(brs)
taus = np.array(taus)
brs_cont = np.array(brs_cont)
taus_cont = np.array(taus_cont)

# Only the estimates away from the null value (Birge ratio 1, tau 0) are histogrammed;
# the share sitting exactly at the null value is reported in the titles and legends instead.
brs_big = brs[brs>1]
taus_big = taus[taus>0]
brs_cont_big = brs_cont[brs_cont>1]
taus_cont_big = taus_cont[taus_cont>0]

# Integer percentages (truncated) of properties at the null value.
pct_brs_unity = int(np.mean(brs==1)*100)
pct_brs_cont_unity = int(np.mean(brs_cont==1)*100)
pct_taus_zero = int(np.mean(taus==0)*100)
pct_taus_cont_zero = int(np.mean(taus_cont==0)*100)

fig, axs = plt.subplots(1, 2, figsize=(10,5))

# The legend labels need real '\n' newlines, so they cannot be raw strings; the LaTeX percent
# sign is therefore written '\\%' ('\%' is an invalid escape sequence and warns on Python 3.12+).
axs[0].hist(brs_big, range=(1, 4), bins=30, color='grey', label='PDG data')
axs[0].hist(brs_cont_big, range=(1, 4), bins=30, color='black', histtype='step', label=f'Control experiment\n(no systematics)\n(${pct_brs_cont_unity}\\%=1$)')
axs[0].set_title(fr'Non-unity Birge ratios within each property (${pct_brs_unity}\%=1$)')
axs[0].set_xlim(1, 4)
axs[0].set_xlabel(r'Estimated Birge ratio of a property')
axs[0].set_ylabel('Count')
axs[0].legend(frameon=False)

axs[1].hist(taus_big, range=(0,3), bins=30, color='grey', label='PDG data')
axs[1].hist(taus_cont_big, range=(0, 3), bins=30, color='black', histtype='step', label=f'Control experiment\n(no systematics)\n(${pct_taus_cont_zero}\\%=0$)')
axs[1].set_title(fr'Non-zero mean ratios $\hat\tau/\sigma_i$ within each property (${pct_taus_zero}\%=0$)')
axs[1].set_xlim(0, 3)
axs[1].set_xlabel(r'Mean ratio $\hat\tau/\sigma_i$ within a property')
axs[1].set_ylabel('Count')
axs[1].legend(frameon=False)

plt.savefig('figs/pdg_birge_re.pdf', bbox_inches='tight')
plt.show()

In [None]:
# Inspect the control-experiment Birge ratios (one per fitted property).
brs_cont

In [None]:
# Fraction of fitted properties whose estimated Birge ratio is exactly 1.
np.mean(np.array(brs)==1)

In [None]:
# Largest mean tau/sigma ratio over the fitted properties (np.max returns nan if any entry is NaN).
np.max(taus)

In [None]:
# Number of NaN entries in `taus`.
np.sum(np.isnan(taus))

In [None]:
# Largest mean tau/sigma ratio again (this cell repeats an earlier check).
np.max(taus)

In [None]:
# Show the property with the largest mean tau/sigma ratio.
# `taus` has one entry per *fitted* property, whereas `dfs` also contains the properties the
# fitting loop skips (excluded pdgids, zero spread). Positions in `taus` are therefore mapped
# back through the same skip rules instead of indexing `dfs` with a hard-coded number.
fitted_dfs = [d for d in dfs
              if d['pdgid'].iloc[0] not in bad and np.std(np.array(d['measurement'])) != 0]
# nanargmax: a NaN in taus would otherwise win the argmax
fitted_dfs[int(np.nanargmax(taus))]

In [None]:
# Display `df` as currently bound in the kernel.
df

In [None]:
# Distinct property descriptions containing 'MASS'.
# NOTE(review): needs `df` to be the per-measurement frame (with a 'pdgid.description' column);
# make sure no intervening cell has rebound `df` to something else.
df['pdgid.description'][df['pdgid.description'].str.contains('MASS')].unique()

In [None]:
# Debug check of what kind of object `df` currently is.
type(df)

In [None]:
# NOTE(review): `.groups` is an attribute of GroupBy objects (see `df_gb.groups` earlier in the
# notebook), not of DataFrame; if `df` is a DataFrame here this line raises AttributeError.
# Probably a leftover debugging cell — fix or delete.
df.groups

In [None]:
# Display `df` as currently bound in the kernel.
df

In [None]:
# NOTE(review): no cell visible in this notebook creates a 'limit' column (the SQL queries do not
# select one), so this presumably raises KeyError — leftover from an earlier query? Fix or delete.
df['limit'].unique()

In [None]:
# Rows whose measurement equals the stored average exactly (naive standardized residual of 0).
# NOTE(review): needs `df` to be the per-measurement frame with a 'std_resid' column.
df[df['std_resid'] == 0]

In [None]:
# Histogram of the adjusted standardized residuals, clipped to [-5, 5], against a standard normal.
# NOTE(review): needs `df` to be the per-measurement frame with 'std_resid_adj';
# make sure no intervening cell has rebound `df`.
plt.hist(df['std_resid_adj'], bins=100, range=(-5, 5), density=True, color='grey', label='Standardized residuals')
# plot normal pdf
from scipy.stats import norm
# grid for the reference curve; `x` is also reused by the simulation cell further down
x = np.linspace(-5, 5, 100)
plt.xlim(-5, 5)
plt.axvline(0, color='black', linestyle='--')
plt.plot(x, norm.pdf(x, 0, 1), color='red', label='Standard Normal PDF')
plt.title('Standardized residuals of PDG measurements')
plt.legend(frameon=False)
# NOTE(review): savefig does not create the 'figs' directory; it must already exist.
plt.savefig('figs/pdg_std_residuals.pdf', bbox_inches='tight')
plt.show()

In [None]:
## qq plot
# Quantile-quantile plot of the adjusted residuals (statsmodels compares against a normal
# distribution by default), with both axes clipped to [-7, 7].
import statsmodels.api as sm
sm.qqplot(df['std_resid_adj'])
plt.ylim(-7, 7)
plt.xlim(-7, 7)

In [None]:
# ks test
# One-sample Kolmogorov-Smirnov test of the adjusted residuals against the standard normal;
# the cell displays the p-value.
# NOTE(review): NaN or infinite residuals (e.g. from missing or zero errors) are not filtered
# out before the test — confirm none are present.
from scipy.stats import kstest
ks_stat, ks_pvalue = kstest(df['std_resid_adj'], 'norm')
ks_pvalue

In [None]:
# Sanity check of the residual-variance correction: for n iid N(0, 1) draws, the residual
# x_i - mean(x) is normal with variance (n - 1)/n, i.e. narrower than the raw draws.
from scipy.stats import norm

n = 5
sim_rng = np.random.default_rng(0)  # seeded so the figure is reproducible
data = sim_rng.normal(0, 1, (100000, n))
avg = np.mean(data, axis=1)
resid = data - avg[:, None]
# define the grid here rather than relying on `x` left over from another cell
x = np.linspace(-5, 5, 100)
plt.hist(resid.flatten(), bins=100, range=(-5, 5), density=True, color='grey', label='Simulated residuals')
plt.plot(x, norm.pdf(x, 0, np.sqrt((n-1)/n)), color='red', label='Normal PDF')
plt.xlabel('Residual')
plt.ylabel('Density')
plt.legend(frameon=False)
plt.show()

In [None]:
# Display `data` as currently bound (note the name is used both for SQL rows and for simulated draws).
data

In [None]:
# Length of whatever `data` currently holds.
len(data)

In [None]:
# Peek at the pdgdata table. Only a few rows are fetched: displaying the whole table
# floods the cell output and bloats the saved notebook.
cur.execute("SELECT * FROM pdgdata LIMIT 5").fetchall()

In [None]:
# Display `data` as currently bound in the kernel.
data

In [None]:
# Show the `editions` attribute of the PDG API handle (presumably the editions in the database file).
api.editions

In [None]:
# Look up the particle named 't' (presumably the top quark) through the PDG API and keep the
# first item produced by its mass_measurements() iterable.
particle = api.get_particle_by_name('t')
measurement = list(particle.mass_measurements())[0]

In [None]:
# Exploratory: list the attributes and methods available on the particle object.
dir(particle)

In [None]:
# Display the particle object's repr.
particle

In [None]:
# Display the measurement object's repr.
measurement

In [None]:
# Second pass over the database: same SELECT as the first query cell but without the
# used_in_average and value_type = 'AC' filters, so every measurement value with a non-NULL
# value and a 2025-edition pdgdata row is returned. This cell rebinds con, cur, data, df, df_gb and dfs.
# NOTE(review): a new connection is opened while the earlier one is still open; neither is closed.
# NOTE(review): without a value_type filter each measurement is paired with every 2025 pdgdata
# row of its pdgid, so measurements repeat if a pdgid has several such rows — confirm this is intended.
con = sqlite3.connect('data/pdgall-2025-v0.2.0.sqlite')
cur = con.cursor()
command = """
SELECT pdgid.description, pdgmeasurement.pdgid, pdgdata.value_type, pdgdata.in_summary_table, pdgdata.value, pdgmeasurement_values.value, pdgmeasurement_values.error_positive, pdgmeasurement_values.error_negative
FROM pdgmeasurement_values
     JOIN pdgmeasurement ON pdgmeasurement.id = pdgmeasurement_values.pdgmeasurement_id
     JOIN pdgid ON pdgid.id = pdgmeasurement.pdgid_id
     JOIN pdgdata ON pdgdata.pdgid_id = pdgid.id
--     JOIN pdgparticle ON pdgparticle.pdgid = pdgid.parent_pdgid
WHERE pdgmeasurement_values.value IS NOT NULL AND pdgdata.edition = '2025'
"""
res = cur.execute(command)
data = res.fetchall()  # list of row tuples, in SELECT column order
columns = [col[0] for col in res.description]
print(len(data), 'measurements')
print(columns)
# Column names are assigned positionally to match the SELECT order.
df = pd.DataFrame(data, columns=['pdgid.description', 'pdgid', 'type', 'insummary', 'avg', 'measurement', 'error_positive', 'error_negative'])
# Symmetrized error: mean of the two sides (assumes both are stored as positive magnitudes — TODO confirm).
df['error'] = (df['error_positive'] + df['error_negative'])/2
df['std_resid'] = (df['measurement'] - df['avg']) / df['error']
# only keep rows where there are at least 3 measurements
df = df.groupby('pdgid').filter(lambda x: len(x) >= 3)
print('Number of properties:', len(df['pdgid'].unique()))
print('Number of measurements:', len(df))
# One DataFrame per property, in the order of df_gb.groups.
df_gb = df.groupby('pdgid', group_keys=False)
dfs = [df_gb.get_group(x) for x in df_gb.groups]

In [None]:
# Print the (up to) ten properties with the most measurements.
# A comprehension is used instead of `for df in dfs`, which would leave the global `df`
# bound to the last per-property frame.
# NOTE: this rebinds `ns` to the group sizes of the current `dfs` (a plain list).
ns = [len(group) for group in dfs]
n_show = min(10, len(ns))  # argpartition raises if asked for more entries than exist
# argpartition puts the n_show largest sizes at the end, in no particular order
biggest = np.argpartition(ns, -n_show)[-n_show:] if n_show else []
for idx in biggest:
    print(dfs[idx])

In [None]:
# The single property with the most measurements (relies on `ns` being aligned with `dfs`).
dfs[np.argmax(ns)]