In [1]:
import numpy as np
import pandas as pd
from nustattools.stats import DF, cee, cee2, OptimalFMaxStatistic
from scipy.stats import chi2

# https://arxiv.org/abs/2407.10962

In [2]:
# Spot check: evaluate the pdf of cee2 at x = 1 with k built from DF(1, 2, 3).
# NOTE(review): presumably a sanity check against the referenced paper - confirm.
cee2(k=DF(1, 2, 3)).pdf(1)

0.12506796951321578

In [3]:
# Spot check: evaluate the pdf of cee at x = 1 with the same k = DF(1, 2, 3)
# as used for cee2, so the two values can be compared side by side.
cee(k=DF(1, 2, 3)).pdf(1)

0.2501359390297275

In [4]:
# Load the benchmark table. The analysis cells read the "Measurement" and
# "N_bins" columns and treat every other column as one model's M2 values.
data = pd.read_csv("benchmark.csv")
# Named groups of measurements to combine. An entry may itself be a key of
# this dict (e.g. "MicroBooNE" inside "MicroBooNE all"); get_measurements
# expands such references recursively. Key order is the table row order.
combinations = {
    "T2K": ("T2K dalphat", "T2K dpt"),
    "MINERvA": ("MINERvA dalphat", "MINERvA dpt", "MINERvA p_N"),
    "MicroBooNE": ("MicroBooNE dalphat", "MicroBooNE dpt"),
    "MicroBooNE all": (
        "MicroBooNE",
        "MicroBooNE dpt low dalphat",
        "MicroBooNE dpt mid-low dalphat",
        "MicroBooNE dpt mid-high dalphat",
        "MicroBooNE dpt high dalphat",
    ),
    "T2K+MicroBooNE": ("T2K", "MicroBooNE"),
    "T2K+MicroBooNE all": ("T2K", "MicroBooNE all"),
    "all": ("T2K", "MicroBooNE all", "MINERvA"),
}
# Filled by the analysis cells:
#   results[model][combo] -> p-value
#   n_bins[model][combo]  -> list of N_bins values that entered the p-value
results = {}
n_bins = {}
# Display one model column as a quick look at the loaded values (NaN = missing)
data["RFG"]

0        NaN
1      85.55
2        NaN
3     415.29
4     549.46
5       5.33
6      31.52
7      16.06
8      16.06
9      25.32
10     20.78
Name: RFG, dtype: float64

In [5]:
def get_measurements(combo):
    """Return the flat list of measurement names behind a combination.

    Looks up ``combo`` in the module-level ``combinations`` dict. Every entry
    that is itself a key of ``combinations`` is expanded recursively in place;
    all other entries are taken as measurement names. Order is preserved and
    duplicates are not removed.

    Raises ``KeyError`` if ``combo`` is not a key of ``combinations``.
    """
    expanded = []
    for name in combinations[combo]:
        # Nested combination -> splice in its members; otherwise a plain leaf
        expanded.extend(get_measurements(name) if name in combinations else [name])
    return expanded


for model in data.columns:
    # The two metadata columns are not models
    if model == "Measurement" or model == "N_bins":
        continue
    print(model)
    res, bins = {}, {}
    for combo in combinations:
        measurements = get_measurements(combo)

        # Rows of this combination for which the model actually has a value
        selected = data["Measurement"].isin(measurements) & data[model].notnull()
        dat = data.loc[selected]

        # A combination needs at least two usable measurements
        if len(dat) < 2:
            continue

        dof_list = dat["N_bins"].to_list()
        # Largest M2 among the combined measurements
        M2 = dat[model].max()
        # Degrees of freedom of all combined measurements
        k = DF(*dof_list)

        # p-value from the survival function, plus the bin counts that went in
        res[combo] = cee2(k=k).sf(M2)
        bins[combo] = dof_list

    # Only keep models that produced at least one combined p-value
    if res:
        results[model] = res
        n_bins[model] = bins

SF/SF*
LFG
RFG
More 2p2h
More FSI
Less FSI
More pi abs.
Less pi abs.
GENIE


In [6]:
# Emit the body of a LaTeX (booktabs) table: one row per combination,
# one column per model that has any result.
models = list(results)

print(r"\toprule")
print(r"Measurements & " + " & ".join(models) + r" \\")
print(r"\midrule")
for comb in combinations:
    # Each cell reads "p-value (number of measurements)" or stays empty
    cells = []
    for model in models:
        if comb not in results[model]:
            cells.append("")
            continue
        p_value = results[model][comb]
        n_meas = len(n_bins[model][comb])
        cells.append(f"{p_value:.3f} ({n_meas})")
    line = " & ".join([f"{comb}", *cells]) + r" \\"
    print(line)
print(r"\bottomrule")

\toprule
Measurements & SF/SF* & LFG & RFG & More 2p2h & More FSI & Less FSI & GENIE \\
\midrule
T2K & 0.024 (2) &  &  &  & 0.009 (2) & 0.032 (2) &  \\
MINERvA & 0.000 (2) & 0.000 (2) & 0.000 (2) &  &  &  &  \\
MicroBooNE & 0.109 (2) & 0.452 (2) & 0.003 (2) &  & 0.251 (2) & 0.021 (2) &  \\
MicroBooNE all & 0.135 (6) & 0.456 (6) & 0.011 (6) & 0.421 (5) & 0.312 (6) & 0.021 (6) & 0.569 (5) \\
T2K+MicroBooNE & 0.129 (4) & 0.506 (3) & 0.000 (3) & 0.001 (2) & 0.061 (4) & 0.024 (4) & 0.808 (2) \\
T2K+MicroBooNE all & 0.140 (8) & 0.466 (7) & 0.000 (7) & 0.004 (6) & 0.200 (8) & 0.021 (8) & 0.580 (6) \\
all & 0.000 (10) & 0.000 (9) & 0.000 (9) & 0.000 (7) & 0.000 (9) & 0.000 (9) & 0.002 (7) \\
\bottomrule


In [7]:
# Combined p-value from the smallest individual chi2 p-value, corrected for
# the number of measurements that were combined: 1 - (1 - p_min)**N.
for model in data.columns:
    # The two metadata columns are not models
    if model in ("Measurement", "N_bins"):
        continue
    print(model)
    res = {}
    bins = {}
    for combo in combinations:
        measurements = get_measurements(combo)

        # Get measurements that should be combined
        dat = data.loc[data["Measurement"].isin(measurements)]
        # Filter out NaNs
        dat = dat.loc[dat[model].notnull()]

        # At least 2 data points left?
        if len(dat.index) < 2:
            continue

        # Individual chi2 p-values of all measurements in one vectorised call
        p = chi2.sf(dat[model].to_numpy(), df=dat["N_bins"].to_numpy())
        # Get minimum p value
        p_min = np.min(p)

        # Calculate and store p value
        res[combo] = 1 - (1 - p_min) ** len(p)
        # Record which bin counts entered, so the table printed from
        # `n_bins` matches the rows actually used in this cell
        bins[combo] = dat["N_bins"].to_list()

    if len(res) > 0:
        results[model] = res
        n_bins[model] = bins

SF/SF*
LFG
RFG
More 2p2h
More FSI
Less FSI
More pi abs.
Less pi abs.
GENIE


In [8]:
# Emit the body of a LaTeX (booktabs) table: one row per combination,
# one column per model that has any result.
models = list(results)

print(r"\toprule")
print(r"Measurements & " + " & ".join(models) + r" \\")
print(r"\midrule")
for comb in combinations:
    # Each cell reads "p-value (number of measurements)" or stays empty
    cells = []
    for model in models:
        if comb not in results[model]:
            cells.append("")
            continue
        p_value = results[model][comb]
        n_meas = len(n_bins[model][comb])
        cells.append(f"{p_value:.3f} ({n_meas})")
    line = " & ".join([f"{comb}", *cells]) + r" \\"
    print(line)
print(r"\bottomrule")

\toprule
Measurements & SF/SF* & LFG & RFG & More 2p2h & More FSI & Less FSI & GENIE \\
\midrule
T2K & 0.024 (2) &  &  &  & 0.009 (2) & 0.032 (2) &  \\
MINERvA & 0.000 (2) & 0.000 (2) & 0.000 (2) &  &  &  &  \\
MicroBooNE & 0.034 (2) & 0.659 (2) & 0.006 (2) &  & 0.332 (2) & 0.001 (2) &  \\
MicroBooNE all & 0.099 (6) & 0.576 (6) & 0.017 (6) & 0.480 (5) & 0.414 (6) & 0.004 (6) & 0.630 (5) \\
T2K+MicroBooNE & 0.047 (4) & 0.801 (3) & 0.000 (3) & 0.000 (2) & 0.017 (4) & 0.003 (4) & 0.721 (2) \\
T2K+MicroBooNE all & 0.092 (8) & 0.633 (7) & 0.000 (7) & 0.000 (6) & 0.034 (8) & 0.005 (8) & 0.697 (6) \\
all & 0.000 (10) & 0.000 (9) & 0.000 (9) & 0.000 (7) & 0.001 (9) & 0.000 (9) & 0.013 (7) \\
\bottomrule


In [9]:
# Combined p-value from the OptimalFMaxStatistic of the combined measurements.
for model in data.columns:
    # The two metadata columns are not models
    if model in ("Measurement", "N_bins"):
        continue
    print(model)
    res = {}
    bins = {}
    for combo in combinations:
        measurements = get_measurements(combo)

        # Get measurements that should be combined
        dat = data.loc[data["Measurement"].isin(measurements)]
        # Filter out NaNs
        dat = dat.loc[dat[model].notnull()]

        # At least 2 data points left?
        if len(dat.index) < 2:
            continue

        # Get optimal fmax statistic
        k = np.array(dat["N_bins"])
        fmax = OptimalFMaxStatistic(k=k)

        # Evaluate the statistic on this model's M2 values
        t_max = fmax(dat[model])

        # Calculate and store p value
        res[combo] = 1 - fmax.cdf(t_max)
        # Record which bin counts entered, so the table printed from
        # `n_bins` matches the rows actually used in this cell
        bins[combo] = dat["N_bins"].to_list()

    if len(res) > 0:
        results[model] = res
        n_bins[model] = bins

SF/SF*
LFG
RFG
More 2p2h
More FSI
Less FSI
More pi abs.
Less pi abs.
GENIE


In [10]:
# Emit the body of a LaTeX (booktabs) table: one row per combination,
# one column per model that has any result.
models = list(results)

print(r"\toprule")
print(r"Measurements & " + " & ".join(models) + r" \\")
print(r"\midrule")
for comb in combinations:
    # Each cell reads "p-value (number of measurements)" or stays empty
    cells = []
    for model in models:
        if comb not in results[model]:
            cells.append("")
            continue
        p_value = results[model][comb]
        n_meas = len(n_bins[model][comb])
        cells.append(f"{p_value:.3f} ({n_meas})")
    line = " & ".join([f"{comb}", *cells]) + r" \\"
    print(line)
print(r"\bottomrule")

\toprule
Measurements & SF/SF* & LFG & RFG & More 2p2h & More FSI & Less FSI & GENIE \\
\midrule
T2K & 0.024 (2) &  &  &  & 0.009 (2) & 0.032 (2) &  \\
MINERvA & 0.000 (2) & 0.000 (2) & 0.000 (2) &  &  &  &  \\
MicroBooNE & 0.038 (2) & 0.605 (2) & 0.005 (2) &  & 0.373 (2) & 0.001 (2) &  \\
MicroBooNE all & 0.114 (6) & 0.556 (6) & 0.016 (6) & 0.473 (5) & 0.398 (6) & 0.004 (6) & 0.622 (5) \\
T2K+MicroBooNE & 0.049 (4) & 0.740 (3) & 0.000 (3) & 0.000 (2) & 0.018 (4) & 0.003 (4) & 0.772 (2) \\
T2K+MicroBooNE all & 0.099 (8) & 0.605 (7) & 0.000 (7) & 0.000 (6) & 0.037 (8) & 0.006 (8) & 0.678 (6) \\
all & 0.000 (10) & 0.000 (9) & 0.000 (9) & 0.000 (7) & 0.001 (9) & 0.000 (9) & 0.011 (7) \\
\bottomrule
