In [13]:
import h5py
from collections import Counter

H5_PATH = "../data/cpc_snn_train.h5"

# Opened read-only and intentionally left open: later cells keep reading
# from the module-level `h5` handle.
h5 = h5py.File(H5_PATH, "r")

labels = []

# Iterating the file yields the names of its top-level groups.
for gid in h5:
    grp = h5[gid]
    if "label" in grp:
        # `label` is a scalar dataset: [()] reads its value, int() truncates it.
        label_value = grp["label"][()]
        labels.append(int(label_value))
    # Groups without a `label` dataset are skipped and leave no trace in `labels`.

print("Pozytywy:", sum(1 for value in labels if value == 1))
print("Negatywy:", sum(1 for value in labels if value == 0))


Pozytywy: 243
Negatywy: 2999


In [14]:
import numpy as np

def is_valid(gid):
    """Return True when the group named `gid` has the expected layout.

    The group is looked up in the module-level `h5` file. It must contain
    the members "H1", "L1", "f" and "label", and both "H1" and "L1" must
    contain "cos", "sin" and "mag".

    Args:
        gid: Name of a top-level group in `h5`.

    Returns:
        bool: True if every required member is present, False otherwise.
    """
    sample = h5[gid]

    # Guard clause: bail out as soon as one top-level member is missing.
    if any(name not in sample for name in ("H1", "L1", "f", "label")):
        return False

    # Checked detector by detector, component by component; stops at the
    # first missing component.
    return all(
        component in sample[detector]
        for detector in ("H1", "L1")
        for component in ("cos", "sin", "mag")
    )

# Validate every group exactly once and split on the cached result.
# Running two separate comprehensions would call is_valid() twice per group,
# doubling the lookups in the HDF5 file.
validity = {gid: is_valid(gid) for gid in h5.keys()}

# dicts preserve insertion order, so both lists keep the file's key order.
valid = [gid for gid, ok in validity.items() if ok]
invalid = [gid for gid, ok in validity.items() if not ok]

print("Valid samples:", len(valid))
print("Invalid samples:", len(invalid))


Valid samples: 3242
Invalid samples: 0


In [15]:
c = Counter(labels)

# `labels` only holds values from groups that actually have a `label`
# dataset, so c[-1] can never count groups whose label is missing.
# Count everything that is not a clean 0/1 label instead: groups without a
# `label` dataset plus any stored value other than 0 or 1.
# NOTE(review): assumes `labels` was built from the currently open `h5`.
n_unlabeled = len(h5.keys()) - c[1] - c[0]

print("Statystyki:")
print("  pozytywy:", c[1])
print("  negatywy:", c[0])
print("  brak label:", n_unlabeled)
print("  uszkodzone sample:", len(invalid))


Statystyki:
  pozytywy: 243
  negatywy: 2999
  brak label: 0
  uszkodzone sample: 0


In [16]:
# Header first, then the preview as the cell's displayed value.
print("Lista uszkodzonych (brak detektorów):")
invalid[0:20]  # only the first 20 entries, to keep the output short


Lista uszkodzonych (brak detektorów):


[]

In [17]:
# Inspect the alphabetically first group as a representative sample.
gid = sorted(h5.keys())[0]
grp = h5[gid]
child_names = list(grp.keys())

print("▶ GID:", gid)
print("Children:", child_names)

print("\nAtrybuty:")
for attr_name, attr_value in grp.attrs.items():
    print(attr_name, "=", attr_value)

# Deeper look: describe each direct child of the group.
for child in child_names:
    print("\n---", child, "---")
    node = grp[child]
    print("type:", type(node))
    if not isinstance(node, h5py.Dataset):
        # Anything that is not a dataset is listed by its member names.
        print("subchildren:", list(node.keys()))
    else:
        print("shape:", node.shape, "dtype:", node.dtype)


▶ GID: 000001
Children: ['H1', 'L1', 'f', 'label', 'mask_H1', 'mask_L1']

Atrybuty:

--- H1 ---
type: <class 'h5py._hl.group.Group'>
subchildren: ['cos', 'mag', 'sin']

--- L1 ---
type: <class 'h5py._hl.group.Group'>
subchildren: ['cos', 'mag', 'sin']

--- f ---
type: <class 'h5py._hl.dataset.Dataset'>
shape: (124,) dtype: float32

--- label ---
type: <class 'h5py._hl.dataset.Dataset'>
shape: () dtype: float32

--- mask_H1 ---
type: <class 'h5py._hl.dataset.Dataset'>
shape: (7,) dtype: float32

--- mask_L1 ---
type: <class 'h5py._hl.dataset.Dataset'>
shape: (7,) dtype: float32


In [18]:
def is_valid_sample(grp):
    """Check that a sample group contains every member the notebook expects.

    Args:
        grp: Mapping-like group that supports `in` and `[]`.

    Returns:
        bool: True when "H1", "L1", "f" and "label" are present and both
        "H1" and "L1" contain "cos", "sin" and "mag"; False otherwise.
    """
    has_toplevel = all(name in grp for name in ("H1", "L1", "f", "label"))
    if not has_toplevel:
        return False

    # Walk detectors in order and stop at the first missing component.
    for detector in ("H1", "L1"):
        if not all(part in grp[detector] for part in ("cos", "sin", "mag")):
            return False

    return True

valid, invalid = [], []

# Walk the groups in sorted name order and file each id under the list
# that matches its validation result.
for gid in sorted(h5.keys()):
    grp = h5[gid]
    bucket = valid if is_valid_sample(grp) else invalid
    bucket.append(gid)

print("Valid samples:", len(valid))
print("Invalid samples:", len(invalid))
print("Invalid list:", invalid[:50])  # capped at 50 ids


Valid samples: 3242
Invalid samples: 0
Invalid list: []


In [19]:
# Tally the labels of the valid groups: 1 is positive, 0 is negative and
# any other stored value lands in "nolabel".
tally = {"pos": 0, "neg": 0, "nolabel": 0}

for gid in valid:
    label = h5[gid]["label"][()]
    kind = "pos" if label == 1 else ("neg" if label == 0 else "nolabel")
    tally[kind] += 1

pos = tally["pos"]
neg = tally["neg"]
nolabel = tally["nolabel"]

print("Pozytywy:", pos)
print("Negatywy:", neg)
print("Brak label:", nolabel)


Pozytywy: 243
Negatywy: 2999
Brak label: 0
