In [1]:
from omegaconf import OmegaConf
from plyfile import PlyData
import os
import json
import numpy as np

from data.fwf_dataset import FwfDataset

# Build the run configuration: defaults first, then the split file they point to.
base_cfg = OmegaConf.load('./config/default.yaml')
split_cfg = OmegaConf.load(base_cfg.data.split)
cfg = OmegaConf.merge(base_cfg, split_cfg)

# Read the class dictionary stored in the dataset root and expose it as
# cfg.data.label_schema.
schema_path = os.path.join(cfg.data.dataset_root, 'class_dict.json')
with open(schema_path, 'r') as f:
    label_schema = json.load(f)
cfg = OmegaConf.merge(cfg, OmegaConf.create({'data': {'label_schema': label_schema}}))

# Override the configured label names so that all four label levels are loaded.
cfg.data.label_names = [f'labels_{level_idx}' for level_idx in range(4)]

# Instantiate the training and validation datasets with their respective
# transform lists and project lists taken from the config.
train_ds = FwfDataset(
    cfg,
    cfg.data.preprocessing._transformsTraining_,
    cfg.data._trainProjects_,
)
val_ds = FwfDataset(
    cfg,
    cfg.data.preprocessing._transformsValidation_,
    cfg.data._valProjects_,
)


Loading '2024-03-22_FW_Koenigshuegel.FwfProj'; Bounding box IDs = default
Loading '2024-04-05_FW_Westbahnhof_02.FwfProj'; Bounding box IDs = default
Loading '2024-04-05_FW_Westbahnhof_03.FwfProj'; Bounding box IDs = default
Loading '2024-04-05_FW_Westbahnhof_04.FwfProj'; Bounding box IDs = default
Loading '2024-04-05_FW_Westbahnhof_05.FwfProj'; Bounding box IDs = default
Loading '2024-05-10_FW_RWTH_Zentrum_01.FwfProj'; Bounding box IDs = default
Loading '2024-07-31_FW_Bruecke_Koenigstr.FwfProj'; Bounding box IDs = [0, 2]
Loading '2024-08-02_FW_Bruecke_A44_VerlautenheidenerStr.FwfProj'; Bounding box IDs = default
Loading '2024-08-02_FW_Bruecke_Deltourserb.FwfProj'; Bounding box IDs = default
Loading '2024-08-02_FW_Bruecke_Kasinostrasse.FwfProj'; Bounding box IDs = [1]
Loading '2024-08-02_FW_Bruecke_RotheErde.FwfProj'; Bounding box IDs = default
Loading '2024-08-02_FW_Bruecke_Rottstrasse.FwfProj'; Bounding box IDs = default
Loading '2023-08-28_FW_EingangBauing.FwfProj'; Bounding box IDs 

In [47]:
# Count how often every class occurs, nested as: dataset -> level -> class.
# The datasets are listed explicitly instead of being looked up by name via
# locals(), which would silently yield None for a misspelled name.
datasets = dict(train_ds=train_ds, val_ds=val_ds)
statistics = {dataset_name: dict() for dataset_name in datasets}

# for every dataset
for dataset_name, dataset in datasets.items():
    # for every level (level i is column i of each project's label array)
    for i, level in enumerate(cfg.data.label_names):
        num_classes = len(cfg.data.label_schema[level])
        counter = np.zeros(num_classes, dtype=np.int64)
        for project in dataset.projects:
            # Count labels in this project. bincount tallies the ids directly,
            # so no (n_rows, num_classes) one-hot matrix has to be built and
            # summed just to obtain the per-class totals.
            project_counts = np.bincount(project['labels'][:, i], minlength=num_classes)
            if project_counts.size > num_classes:
                # A label id outside the schema; fail loudly with context
                # (the one-hot lookup used previously raised IndexError too).
                raise IndexError(
                    f"{dataset_name}/{level}: found label id {project_counts.size - 1}, "
                    f"but the label schema only defines {num_classes} classes"
                )
            counter += project_counts
        # plain Python ints so the result is JSON-serialisable
        statistics[dataset_name][level] = [int(c) for c in counter]

# gather complete statistics over both datasets (element-wise sum per level)
full = dict()
for level in cfg.data.label_names:
    counts = [statistics[dataset_name][level] for dataset_name in datasets]
    full[level] = [int(sum(per_class)) for per_class in zip(*counts)]

# add to dict
statistics.update(full = full)

# NOTE: despite the file name, what is written here are the raw class counts.
with open(os.path.join(cfg.data.dataset_root, 'class_weights.json'),'w') as f:
    json.dump(statistics, f, indent=3)


In [3]:
# Display the nested count dictionary (dataset -> level -> per-class counts).
statistics

{'train_ds': {'labels_0': [73641615, 15583841, 1368776],
  'labels_1': [22744,
   46910754,
   1364776,
   1237165,
   23916116,
   2250175,
   14346668,
   545834],
  'labels_2': [1443607,
   159141,
   370931,
   681130,
   45956569,
   450222,
   1309643,
   517010,
   1237173,
   15668918,
   653492,
   15762,
   67791,
   175490,
   107384,
   7422802,
   10499,
   14346668],
  'labels_3': [2192166,
   13521951,
   6197361,
   149414,
   35502219,
   440880,
   2268701,
   8351338,
   4710122,
   1681201,
   584644,
   14994235]},
 'val_ds': {'labels_0': [32561170, 7343657, 489903],
  'labels_1': [11606,
   20676566,
   489871,
   310136,
   10201192,
   1583999,
   7033517,
   87843],
  'labels_2': [671753,
   0,
   66923,
   339968,
   20030569,
   335269,
   838595,
   296660,
   310140,
   5667618,
   460414,
   11893,
   43931,
   80383,
   159011,
   4034836,
   13250,
   7033517],
  'labels_3': [1225311,
   5146020,
   726378,
   72820,
   13263239,
   174486,
   2050733,
 

In [4]:
# Peek at the label array of the first training project; each column holds the
# class ids of one label level (column i corresponds to cfg.data.label_names[i]).
train_ds.projects[0]['labels']

array([[ 1,  6, 17, 11],
       [ 1,  6, 17, 11],
       [ 1,  6, 17, 11],
       ...,
       [ 0,  4, 15,  4],
       [ 0,  4, 15,  4],
       [ 0,  4, 15,  4]], dtype=uint8)

In [50]:
from copy import deepcopy  # no longer needed by this cell; kept in case later cells rely on the name


def _inverse_frequency_weights(counts):
    """Turn per-class counts into normalised inverse-frequency weights.

    Each class k gets the weight (1 / (n_k + 1)) / sum_j (1 / (n_j + 1)), so
    rarer classes receive larger weights and the weights of one level sum to 1.
    The +1 keeps classes with a count of zero from causing a division by zero.

    Args:
        counts: sequence of non-negative class counts for one label level.

    Returns:
        np.ndarray of float64 with one weight per class, in the same order.
    """
    inverse = 1.0 / (np.array(counts, dtype=np.float64) + 1.0)
    # The normalisation constant is the same for every class, so it is
    # computed once here instead of once per class.
    return inverse / inverse.sum()


# Same nesting as `statistics` (dataset -> level), but holding weight arrays
# instead of count lists. A fresh dict is built, so `statistics` stays untouched.
class_weights = dict()
for ds, levels in statistics.items():
    class_weights[ds] = dict()
    for level, counts in levels.items():
        class_weights[ds][level] = _inverse_frequency_weights(counts)


In [17]:
# Shape of the first training project's label array: (rows, label levels).
train_ds.projects[0]['labels'].shape

(870520, 4)

In [56]:
# Print the normalised weights (dataset -> level -> array with one weight per class).
print(class_weights)

{'train_ds': {'labels_0': array([0.01679923, 0.07938494, 0.90381583]), 'labels_1': array([9.17557495e-01, 4.44884019e-04, 1.52917621e-02, 1.68690743e-02,
       8.72626824e-04, 9.27476127e-03, 1.45468228e-03, 3.82347142e-02]), 'labels_2': array([3.34436330e-03, 3.03373692e-02, 1.30157269e-02, 7.08813666e-03,
       1.05054612e-04, 1.07234628e-02, 3.68645953e-03, 9.33819515e-03,
       3.90240145e-03, 3.08122699e-04, 7.38791328e-03, 3.06283678e-01,
       7.12170995e-02, 2.75110952e-02, 4.49592551e-02, 6.50421359e-04,
       4.59804725e-01, 3.36520597e-04]), 'labels_3': array([0.03556773, 0.00576621, 0.01258122, 0.52183787, 0.00219621,
       0.17685136, 0.03436785, 0.00933628, 0.01655379, 0.04637777,
       0.13336367, 0.00520003])}, 'val_ds': {'labels_0': array([0.01390853, 0.06166928, 0.92442218]), 'labels_1': array([8.30614205e-01, 4.66273685e-04, 1.96805269e-02, 3.10860655e-02,
       9.45079569e-04, 6.08645144e-03, 1.37071364e-03, 1.09750684e-01]), 'labels_2': array([1.48829198e-0

In [61]:
# Which set of class weights to use: a key of `class_weights`.
kind = 'full'

# Derive one weight per row of the first training project: look up the class
# weight of every row on each label level, normalise each level so that its
# weights sum to 1, and finally average across the levels.
proj = train_ds.projects[0]
columns = []
for li, label_name in enumerate(cfg.data.label_names):
    labels = proj['labels'][:, li]
    level_lookup = class_weights[kind][label_name]
    # fancy-index the per-class weights with the class ids of this level
    weights_level = np.array(level_lookup[labels], dtype=np.float64)
    weights_level = weights_level / weights_level.sum()
    columns.append(weights_level[:, np.newaxis])
weights = np.mean(np.concatenate(columns, axis=-1), axis=-1)