In [None]:
import pdg

In [None]:
# Open the local PDG SQLite database (the file name indicates the 2024 data, schema v0.1.4).
api = pdg.connect("sqlite:///data/pdgall-2024-v0.1.4.sqlite")

In [None]:
# Show the editions exposed by this connection; they are iterated over below.
api.editions

In [None]:
# Walk through every edition and record the top-quark mass and its error.
# The scan stops at the first edition that has no summary mass, after
# printing some diagnostics for that edition.
masses, uncertainties = [], []
for edition in api.editions:
    api.edition = edition  # all subsequent queries use this edition
    particle = api.get_particle_by_name("t")
    print(particle.pdgid)
    mass = particle.mass
    if mass is not None:
        masses.append(particle.mass)
        uncertainties.append(particle.mass_error)
        continue
    # No mass for this edition: show what the database holds, then stop.
    print(particle.has_mass_entry)
    print([m.summary_values() for m in particle.masses()])
    break

In [None]:
# Number of editions for which the scan loop collected a mass.
len(masses)

In [None]:
import matplotlib.pyplot as plt
import numpy as np

# Turn the collected lists into arrays so the band edges can be computed element-wise.
masses = np.array(masses)
uncertainties = np.array(uncertainties)
# The scan may have stopped early, so keep only as many editions as there are masses.
editions = np.array(api.editions).astype(int)[: len(masses)]

# Central value as a line, mass +/- mass_error as a shaded band.
band_low = masses - uncertainties
band_high = masses + uncertainties
plt.plot(editions, masses, color="black")
plt.fill_between(editions, band_low, band_high, color="gray", alpha=0.5)

In [None]:
# Select the 2020 edition for the spot checks in the following cells.
api.edition = "2020"

In [None]:
# Top-quark mass as reported for the currently selected edition.
api.get_particle_by_name("t").mass

In [None]:
# Print the summary values of every mass entry of the top quark in the selected edition.
print([m.summary_values() for m in api.get_particle_by_name("t").masses()])

In [None]:
import pandas as pd

# Fetch every HTML table on the pdgLive data-block page for node Q007TP.
# Requires network access; the page is decoded as ISO-8859-1.
PDGLIVE_URL = "https://pdglive.lbl.gov/DataBlock.action?node=Q007TP"
dfs = pd.read_html(PDGLIVE_URL, encoding="ISO-8859-1")

In [None]:
# How many tables pandas found on the page.
len(dfs)

In [None]:
# Clean the measurement listing scraped from the page.
# NOTE(review): index 2 is presumably the measurement table; confirm if the page layout changes.
df = dfs[2]
# Keep rows that cite a document and have nothing in the "Unnamed: 6" column.
df = df[df["DOCUMENT ID"].notna() & df["Unnamed: 6"].isna()]
# drop all unnamed columns
df = df.loc[:, ~df.columns.str.contains("^Unnamed")]
# drop rows whose DOCUMENT ID cell is a repeated header or a 'References' marker
is_header = df["DOCUMENT ID"].str.contains("DOCUMENT ID")
is_refs = df["DOCUMENT ID"].str.contains("References")
# Copy *after* filtering: the result is then an independent frame, so adding
# columns to `df` later does not raise SettingWithCopyWarning (copying only
# the unfiltered table, as before, left `df` flagged as a slice).
df = df[~(is_header | is_refs)].copy()

In [None]:
# Display the cleaned measurement table.
df

In [None]:
# Hand-maintained map between document IDs.
# NOTE(review): presumably "older result -> the result that superseded it";
# it is not referenced by any other cell visible in this notebook.
superseded = {
    "SIRUNYAN 2018DE": "TUMASYAN 2023BB",
    "SIRUNYAN 2017L": "TUMASYAN 2021G",
    "AALTONEN 2007B": "AALTONEN 2011AK",
    "AALTONEN 2011AK": "AALTONEN 2013H",
    "ABAZOV 2011R": "ABAZOV 2012AB",
    "CHATRCHYAN 2011F": "CHATRCHYAN 2012BA",
    "AALTONEN 2010E": "AALTONEN 2012G",
    "AALTONEN 2007D": "AALTONEN 2012G",
    "ABAZOV 2006U": "ABAZOV 2008AH",
}
# Document IDs used to select the rows of the 2024 subset (`df_2024`).
# They are joined with "|" and used as a regex against the DOCUMENT ID column.
current = [
    "AAD 2023N",
    "TUMASYAN 2023BB",
    "TUMASYAN 2023Z",
    "TUMASYAN 2021G",
    "SIRUNYAN 2020AR",
    "AABOUD 2019AC",
    "SIRUNYAN 2019AP",
    "SIRUNYAN 2019AR",
    "KHACHATRYAN 2016AK",
    "TEVEWWG 2016",
]

In [None]:
from methods import binomial_method

In [None]:
import re


def parse_measurement(s):
    r"""
    Parse a measurement string of the form:
      $value$ [ $±stat$ ] [ ${}^{+stat+}_{-stat-}$ ] [ $±syst$ ] [ ${}^{+syst+}_{-syst-}$ ]
    where a symmetric error may be written either as ``±x`` or as ``\pm x``.

    Returns a dict with keys:
      'value', 'stat-', 'stat+', 'syst-', 'syst+', 'err-', 'err+', 'symerr'

    * one error group  -> it is taken as the total error ('err-', 'err+');
      the 'stat*' / 'syst*' entries stay None.
    * two error groups -> first is statistical, second systematic; the total
      error is their sum in quadrature, separately for each side.
    * no error group   -> every error entry, including 'symerr', is None.

    'symerr' is the mean of 'err-' and 'err+'.

    Raises ValueError if the string has no $...$ group, if an error group
    cannot be parsed, or if there are more than two error groups.
    """
    # 1) pull out all the $...$ groups
    groups = re.findall(r"\$([^$]+)\$", s)
    if not groups:
        raise ValueError("No $...$ groups found")
    # first group is the central value
    value = float(groups[0].strip())
    err_groups = groups[1:]  # the rest are error specs

    # helper to parse one error group (symmetric or asymmetric) -> (neg, pos)
    def _parse_err(g):
        g = g.strip()
        # symmetric: a literal "±" or the LaTeX command "\pm".
        # (The alternatives must be spelled out: a character class followed
        # by "pm" would require "±pm" and reject a plain "±".)
        m = re.search(r"(?:±|\\pm)\s*([0-9]*\.?[0-9]+)", g)
        if m:
            v = float(m.group(1))
            return v, v
        # asymmetric: look for +num and -num
        m_plus = re.search(r"\+\s*([0-9]*\.?[0-9]+)", g)
        m_minus = re.search(r"-\s*([0-9]*\.?[0-9]+)", g)
        if m_plus and m_minus:
            return float(m_minus.group(1)), float(m_plus.group(1))
        raise ValueError(f"Could not parse error group: {g}")

    # initialize result; "symerr" is kept last so the key order is unchanged
    res = {
        "value": value,
        "stat-": None,
        "stat+": None,
        "syst-": None,
        "syst+": None,
        "err-": None,
        "err+": None,
        "symerr": None,
    }

    if len(err_groups) == 1:
        # single error → total error
        neg, pos = _parse_err(err_groups[0])
        res["err-"], res["err+"] = neg, pos
    elif len(err_groups) == 2:
        # two errors → stat then syst, combined in quadrature per side
        stat_neg, stat_pos = _parse_err(err_groups[0])
        syst_neg, syst_pos = _parse_err(err_groups[1])
        res["stat-"], res["stat+"] = stat_neg, stat_pos
        res["syst-"], res["syst+"] = syst_neg, syst_pos
        res["err-"] = np.sqrt(res["stat-"] ** 2 + res["syst-"] ** 2)
        res["err+"] = np.sqrt(res["stat+"] ** 2 + res["syst+"] ** 2)
    elif len(err_groups) > 2:
        # three or more error groups are not supported
        raise ValueError(f"Unexpected number of error groups: {len(err_groups)}")

    # Only average when a total error exists; a bare value keeps symerr=None
    # instead of failing on None + None.
    if res["err-"] is not None and res["err+"] is not None:
        res["symerr"] = (res["err-"] + res["err+"]) / 2

    return res


# --- examples ---
# One sample per supported layout: stat+syst, single symmetric error,
# single asymmetric error, and the two mixed symmetric/asymmetric orders.
tests = [
    r"$174.41$ $\pm0.39$ $\pm0.71$",
    r"$171.77$ $\pm0.37$",
    r"$172.13$ ${}^{+0.76}_{-0.77}$",
    r"$199$ ${}^{+19}_{-21}$ $\pm22$",
    r"$174$ $\pm10$ ${}^{+13}_{-12}$",
]

for t in tests:
    parsed_example = parse_measurement(t)
    print(t, "→", parsed_example)

In [None]:
def get_year(s):
    """Return the first year of the form 19xx or 20xx found in ``s`` as an int.

    The year must start at a word boundary but may be followed directly by
    other characters (e.g. the suffix in "SIRUNYAN 2018DE").

    Raises ValueError if ``s`` contains no such year.
    """
    found = re.search(r"\b(19|20)\d{2}", s)
    if found is None:
        raise ValueError(f"No year found in string: {s}")
    return int(found.group())

In [None]:
# Parse every 'VALUE (GeV)' cell into a dict of value / error fields.
parsed = [parse_measurement(cell) for cell in df["VALUE (GeV)"]]
# Spread the parsed fields over new dataframe columns, one column per dict key.
for key in parsed[0]:
    df[key] = [p[key] for p in parsed]

In [None]:
from methods import binomial_method, random_effects_hksj, birge

# Rows whose DOCUMENT ID matches any of the `current` references
# (the IDs are joined into one regex alternation).
df_2024 = df[df["DOCUMENT ID"].str.contains("|".join(current))]
# take only the first row for each document
df_2024 = df_2024.drop_duplicates(subset=["DOCUMENT ID"], keep="first")

# Subset-specific names on purpose: `uncertainties` already holds the
# per-edition mass errors that the later band plot reads together with
# `masses` and `editions`; rebinding it here would hand that plot an array
# of the wrong length.
values_2024 = np.array(df_2024["value"])
uncertainties_2024 = np.array(df_2024["symerr"])
sorted_values_2024 = np.sort(values_2024)

# Lower / upper bounds from the binomial method at p = 0.5.
# `prob` is the value returned by the second (upper) call.
l, prob = binomial_method(sorted_values_2024, 0.5, which="lower")
u, prob = binomial_method(sorted_values_2024, 0.5, which="upper")
print(l, u)
print(1 - 2 * prob)
# Intervals at 68.27% coverage from the two combination methods.
interval, _, _, _ = random_effects_hksj(
    values_2024, uncertainties_2024, coverage=0.6827
)
print(interval)
interval, _, _, c = birge(values_2024, uncertainties_2024, coverage=0.6827, pdg=True)
print(interval, c)

In [None]:
# Publication year of every measurement, taken from its document ID.
years = df["DOCUMENT ID"].apply(get_year).to_numpy()
# add jitter so points from the same year do not overlap; the generator is
# seeded so that re-running the notebook reproduces the same figure
jitter = np.random.default_rng(0).normal(0, 0.5, len(years))

In [None]:
# Central values and symmetrised total errors of all parsed measurements.
values, errors = (
    np.array([p[field] for p in parsed]) for field in ("value", "symerr")
)
n = len(values)

In [None]:
# Individual measurements (x = value, y = jittered year) drawn over the
# per-edition band (y = edition, x between mass - error and mass + error).
# The band expects `masses` and `uncertainties` to be the per-edition arrays
# with the same length as `editions`.
fig, ax = plt.subplots(figsize=(10, 6))
ax.errorbar(
    values,
    years + jitter,
    xerr=errors,
    fmt="o",
    color="black",
    capsize=3,
    linewidth=0.5,
    markersize=2,
)
band_left = masses - uncertainties
band_right = masses + uncertainties
ax.fill_betweenx(editions, band_left, band_right, color="red", alpha=0.5)
# change y axis to df['DOCUMENT ID']
# plt.yticks(range(n), df['DOCUMENT ID'])
# invert y axis so the y values increase downwards
ax.invert_yaxis()
plt.show()

In [None]:
# Inverse-variance weighted mean: each measurement is weighted by 1/sigma^2
# (weighting by 1/sigma is not the minimum-variance combination).
w = 1 / errors**2
wm = np.sum(w * values) / np.sum(w)
# Residuals about the weighted mean, raw and in units of each measurement's error.
resids = values - wm
resids_norm = resids / errors

In [None]:
## qq plot
import scipy.stats as stats

plt.figure(figsize=(5, 5))
# Theoretical normal quantiles vs. the sorted normalised residuals.
theoretical, actual = stats.probplot(resids_norm, dist="norm", fit=False)
# Reorder the errors the same way the residuals were sorted, so each point
# can be coloured by the log of its own measurement error.
order = np.argsort(resids_norm)
errors_sort = errors[order]
plt.scatter(theoretical, actual, c=np.log(errors_sort))
# same fixed range on both axes
plt.xlim([-3, 3])
plt.ylim([-3, 3])

# plot x=y
plt.plot(theoretical, theoretical, "r--", linewidth=0.5)

In [None]:
# Kurtosis of the normalised residuals. With scipy's defaults this is the
# Fisher (excess) definition, i.e. 0 for a normal distribution.
kurtosis = stats.kurtosis(resids_norm)
kurtosis

In [None]:
# Histogram of the first 10 normalised residuals only.
# NOTE(review): the [:10] slice is unexplained; confirm whether all residuals were meant.
plt.hist(resids_norm[:10])

In [None]:
from scipy.stats import normaltest

# Normality test (scipy.stats.normaltest) on at most the first 100 normalised residuals.
# NOTE(review): the [:100] cutoff is unexplained; confirm it is intentional.
normaltest(resids_norm[:100])