Let's put together the full data set. Three issues to be manually fixed here:
- for minflux mESC TSA, use rep2b instead of rep2
- for SPT, kick out movies of dead cells. These are:
   + in (100ms, mESC, ctrl, rep3) cell  000
   + in (100ms, mESC, ICRF, rep3) cell  000
   + in (  2s,  mESC, ctrl, rep3) cells 000, 001, 002, 003
- remove MINFLUX U2OS stuck fraction and save separately. We do this by tagging the stuck fraction and saving a selection of the full data set using `nl.io.write.hdf5_subTaggedSet`. So the stuck fraction can be loaded from the same file, simply by using `data_withU2OSstuck` instead of `data`.

In [1]:
import numpy as np
import noctiluca as nl

# MINFLUX

In [2]:
# Load the cleaned MINFLUX H2B file. It provides both the trajectories
# ('data') and the list of condition names ('conditions') used further below.
filename = 'data/20250302_H2B_clean.h5'
h2b_file = nl.io.load.hdf5(filename)
data_mf, conditions = h2b_file['data'], h2b_file['conditions']
del h2b_file  # only the two entries above are needed

In [3]:
# Attach the single-trajectory MCI to each trajectory's metadata.
# Do this before anything else touches `data_mf`: the MCIs stored in the file
# are matched to the dataset exactly as it is in 20250302_H2B_clean.h5, and
# they are paired with the trajectories purely by position.
mcis = nl.io.load.hdf5('data/20250302_single-traj_NPFit_H2B_clean.h5', '/mcis')

data_mf.makeSelection()  # work on the full data set
assert len(data_mf) == len(mcis)
for mci, traj in zip(mcis, data_mf):
    traj.meta['mci'] = mci

del mcis

In [4]:
# Adjust replicate tags for consistency:
# whatever is tagged 'rep=<r>' additionally gets the tag 'rep<r>'
for rep_idx in range(10):
    old_tag = f'rep={rep_idx}'
    new_tag = f'rep{rep_idx}'
    data_mf.makeSelection(old_tag)
    data_mf.addTags(new_tag)

In [5]:
# Use only mESC TSA2b, not TSA2
# But then change name/tags of TSA2b to TSA2, for consistency with the rest
#
# Step 1: delete every trajectory that carries both 'mESC' and 'TSA2'.
# Step 2: give everything tagged 'TSA2b' the replicate tag 'rep2', so that it
#         takes the place of the deleted replicate.
# NOTE(review): only 'rep2' is added here; the 'TSA2b' tag itself is not
# renamed to 'TSA2'. The second selection is also not restricted to 'mESC' —
# presumably 'TSA2b' only exists for mESC; confirm.
data_mf.makeSelection(['mESC', 'TSA2'], logic=all)
data_mf.deleteSelection()
data_mf.makeSelection(['TSA2b'])
data_mf.addTags(['rep2'])

In [6]:
# Time lag (for later): average of the per-trajectory 'dt' metadata entry,
# taken over the whole MINFLUX H2B data set
data_mf.makeSelection()
lags_H2B = [traj.meta['dt'] for traj in data_mf]
dt_H2B = np.mean(lags_H2B)

In [7]:
# done: label the complete MINFLUX H2B data set
data_mf.makeSelection()
data_mf.addTags(['minflux', 'H2B', 'minflux-H2B'])

# MINFLUX array

In [8]:
# Load the cleaned MINFLUX array file; only its 'data' entry is used
filename = 'data/20250302_array_clean.h5'
data_array = nl.io.load.hdf5(filename)['data']

In [9]:
# add tags: existing tag -> condition tag to add alongside it
condition_for_tag = {
    'NT'   : 'C36',
    'ctrl' : 'ΔRAD21 (inactive)',
    'IAA'  : 'ΔRAD21 (active)',
}
for old_tag, condition_tag in condition_for_tag.items():
    data_array.makeSelection(old_tag)
    data_array.addTags(condition_tag)

In [10]:
# remove 'ctrl' tag, which is confusing anyways
# (the tag sets handed out with giveTags=True are edited in place)
data_array.makeSelection('ctrl')
for _, tags in data_array(giveTags=True):
    tags.discard('ctrl')

In [11]:
# Adjust replicate tags for consistency:
# whatever is tagged 'rep=<r>' additionally gets the tag 'rep<r>'
for rep_idx in range(10):
    old_tag = f'rep={rep_idx}'
    new_tag = f'rep{rep_idx}'
    data_array.makeSelection(old_tag)
    data_array.addTags(new_tag)

In [12]:
# Time lag (for later): average of the per-trajectory 'dt' metadata entry,
# taken over the whole MINFLUX array data set
data_array.makeSelection()
lags_array = [traj.meta['dt'] for traj in data_array]
dt_array = np.mean(lags_array)

In [13]:
# done: label the complete MINFLUX array data set
data_array.makeSelection()
data_array.addTags(['minflux', 'array', 'minflux-array', 'mESC'])

# SPT

In [14]:
# Load the SPT H2B data set from its named entry in the file
filename = 'data/20250121_data_H2B.h5'
entry = 'data_twoLocus_acyclic_3um'
data_conv = nl.io.load.hdf5(filename, entry)

In [15]:
# add useful tags

# Treatment aliases. These have to be in place before the condition tags are
# assigned below, since the conditions refer to 'ctrl' and 'TSA'.
data_conv.makeSelection('DMSO', logic=any)
data_conv.addTags('ctrl')

data_conv.makeSelection('150nM-TSA')
data_conv.addTags('TSA')

# Condition tags: each condition name is a ', '-separated list; a trajectory
# gets the full name as a tag if it carries every component except the first.
for cond in conditions:
    required_tags = cond.split(', ')[1:]
    data_conv.makeSelection(required_tags, logic=all)
    data_conv.addTags(cond)

# Frame-rate specific modality tags
for dt_tag in ('100ms', '2s'):
    data_conv.makeSelection(dt_tag)
    data_conv.addTags('SPT-' + dt_tag)

In [16]:
# remove what we're not using (500nM-TSA & 1μM-TSA),
# i.e. every trajectory that did not receive any of the condition tags
def matches_no_condition(hits):
    return not any(hits)

data_conv.makeSelection(conditions, logic=matches_no_condition)
data_conv.deleteSelection()

In [17]:
# remove dead cells
def dead_cells(traj, tags):
    """Selector: does this trajectory come from a movie of a dead cell?

    Parameters
    ----------
    traj : unused; present only to match the (traj, tags) selector signature
    tags : set of str, the tags of the trajectory

    Returns
    -------
    bool
        True if ``tags`` contains all tags of one of the listed conditions and
        the trajectory's ``'file=...'`` tag contains one of the cell
        identifiers listed for that condition; False otherwise.

    Raises
    ------
    KeyError
        if the tags match a listed condition but contain no ``'file=...'`` tag.
    """
    dead_by_condition = (
        (frozenset(('100ms', 'mESC', 'ctrl', 'rep3')), ('000cell',)),
        (frozenset(('100ms', 'mESC', 'ICRF', 'rep3')), ('000cell',)),
        (frozenset(('2s', 'mESC', 'ctrl', 'rep3')), ('000cell', '001cell', '002cell', '003cell')),
    )

    for required, cell_ids in dead_by_condition:
        if not required <= tags:
            continue  # trajectory does not belong to this condition

        # The first matching condition decides the outcome
        file_tags = {tag for tag in tags if tag.startswith('file=')}
        file_tag = file_tags.pop()
        for cell_id in cell_ids:
            if cell_id in file_tag:
                return True
        return False

    return False

# Use `dead_cells` as the selector (it is called with each trajectory and its
# tags), then delete everything it selected from the SPT H2B data set.
data_conv.makeSelection(dead_cells)
data_conv.deleteSelection()

In [18]:
# done: label the complete SPT H2B data set
data_conv.makeSelection()
data_conv.addTags(['SPT', 'H2B'])

# SPT (Fbn2)

In [19]:
# Load the SPT Fbn2/CTCF data set from its named entry in the file
filename = 'data/20250411_data_SPT-Fbn2-CTCF.h5'
entry = 'data_twoLocus_maxsep-3um'
data_conv_Fbn2 = nl.io.load.hdf5(filename, entry)

In [20]:
# add useful tags

# treatment tag -> ΔRAD21 condition tag to add alongside it
condition_for_treatment = {
    'DMSO' : 'ΔRAD21 (inactive)',
    'IAA'  : 'ΔRAD21 (active)',
}
for treatment, condition_tag in condition_for_treatment.items():
    data_conv_Fbn2.makeSelection(treatment)
    data_conv_Fbn2.addTags(condition_tag)

# Frame-rate specific modality tags
for dt_tag in ('100ms', '2s'):
    data_conv_Fbn2.makeSelection(dt_tag)
    data_conv_Fbn2.addTags('SPT-' + dt_tag)

In [21]:
# done: label the complete SPT array data set
data_conv_Fbn2.makeSelection()
data_conv_Fbn2.addTags(['SPT', 'array', 'mESC'])

# Fbn2

In [22]:
# Load the 2022 Fbn2 file; only its 'data' entry is used
fbn2_file = nl.io.load.hdf5('data/2022_Fbn2_full_data.h5')
data_fbn2 = fbn2_file['data']
del fbn2_file

In [23]:
# Add a few tags:
# 'Rad21_0_hr' is labelled "inactive"; 'Rad21_2_hr' and 'Rad21_4_hr' "active"
data_fbn2.makeSelection('Rad21_0_hr')
data_fbn2.addTags('ΔRAD21 (inactive)')

active_tags = ['Rad21_2_hr', 'Rad21_4_hr']
data_fbn2.makeSelection(active_tags, logic=any)
data_fbn2.addTags('ΔRAD21 (active)')

# Drop every trajectory that carries none of the three condition tags
def carries_none(hits):
    return not any(hits)

kept_conditions = ['C36', 'ΔRAD21 (inactive)', 'ΔRAD21 (active)']
data_fbn2.makeSelection(kept_conditions, logic=carries_none)
data_fbn2.deleteSelection()

In [24]:
# done: label the complete SRLCI data set
# (all of it is assigned to a single replicate, 'rep1')
data_fbn2.makeSelection()
data_fbn2.addTags(['SRLCI', 'array', 'mESC', 'rep1'])

# Merge all together

In [25]:
# Start from an empty set and merge the five data sets into it, in this order
data = nl.TaggedSet()
for part in (data_mf, data_array, data_conv, data_conv_Fbn2, data_fbn2):
    data |= part

In [26]:
# Add time lags to everything: each trajectory gets a 'Δt' metadata entry
# according to its modality tag. The MINFLUX values are the mean lags computed
# above; SPT values are consistent with seconds (100ms -> 0.1, 2s -> 2).
# NOTE(review): units of the MINFLUX and SRLCI entries are presumably seconds
# as well — confirm.
time_lags = {
    'minflux-H2B'   : dt_H2B,
    'minflux-array' : dt_array,
    'SPT-100ms'     : 0.1,
    'SPT-2s'        : 2,
    'SRLCI'         : 20,
}
for modality_tag in time_lags:
    data.makeSelection(modality_tag)
    for traj in data:
        traj.meta['Δt'] = time_lags[modality_tag]

In [27]:
# Remove unnecessary tags: keep only the tags listed here, plus all 'file=...' tags
data.makeSelection()
file_tags = {tag for tag in data.tagset() if tag.startswith('file=')}

useful_tags = set()
for tag_group in (
    ['U2OS', 'mESC', 'H2B', 'array', 'minflux', 'SPT', 'SRLCI'],
    ['ctrl', 'DMSO', 'DRB', 'TSA', 'ICRF'],
    ['minflux-H2B', 'minflux-array', 'SPT-100ms', 'SPT-2s', '100ms', '2s'],
    conditions,
    ['C36', 'ΔRAD21 (inactive)', 'ΔRAD21 (active)'],
    ['rep1', 'rep2', 'rep3'],
    file_tags,
):
    useful_tags.update(tag_group)

# The tag sets handed out with giveTags=True are edited in place
for _, tags in data(giveTags=True):
    tags.intersection_update(useful_tags)

In [28]:
# Add "stuck" tag to minflux U2OS
def is_stuck(traj, _):
    # Threshold on one entry of the stored single-trajectory MCI for α.
    # NOTE(review): [1][1] is presumably the upper bound of the interval — confirm.
    return traj.meta['mci']['α (dim 0)'][1][1] < 0.1

data.makeSelection(['minflux', 'U2OS'], logic=all)
data.refineSelection(is_stuck)
data.addTags('stuck')

In [29]:
# Save
# Writes the full data set (including the trajectories tagged 'stuck'), the
# condition lists, and a human-readable description to `outfile`.
# The commented-out `raise` below is a manual overwrite protection: un-comment
# it to keep this cell from writing `outfile` again.
# raise NotImplementedError # overwrite protection
outfile = 'data/20250411_Hansenlab_chromatin_dynamics_all-data.h5'
data.makeSelection()
nl.io.write.hdf5({
    'data_withU2OSstuck' : data,
    'H2B_conditions' : conditions,
    'array_conditions' : ['C36', 'ΔRAD21 (inactive)', 'ΔRAD21 (active)'],
    'description' : """
Joint file containing all our chromatin dynamics data

The production dataset is stored in 'data'; 'data_withU2OSstuck' is a version of
that which also contains the "stuck" trajectories of minflux U2OS (labelled as such)

Tags:
 - 'U2OS', 'mESC'
 - 'H2B', 'array'
 - 'minflux', 'SPT', 'SRLCI'
 - 'ctrl', 'DMSO', 'DRB', 'TSA', 'ICRF' ('ctrl' and 'DMSO' are identical)
 - 'minflux-H2B', 'minflux-array'
 - 'SPT-100ms', 'SPT-2s', '100ms', '2s'
 - all combinations: "H2B, {'U2OS', 'mESC'}, {'ctrl', 'DRB', 'TSA', 'ICRF'}"
 - 'C36', 'ΔRAD21 (inactive)', 'ΔRAD21 (active)'
 - 'rep1', 'rep2', 'rep3' (all SRLCI data is tagged 'rep1')
 - 'file=<original filename>'
 - 'stuck' indicates low-α trajectories in minflux U2OS data
   (only applicable when loading 'data_withU2OSstuck')
 
Notes:
 - minflux data is true single particle tracking, in 2D.
 - SPT H2B data is two-locus, in 2D; inter-locus distance is constrained to <3μm.
   The trajectories contain the full 2-locus data, but have MSDs for relative distance pre-calculated.
 - SPT array data is two-locus, with CTCF as reference; inter-locus distance is <3μm.
 - SRLCI data is relative position of two loci, in 3D.
"""[1:-1],  # strip the leading and trailing newline of the literal
}, outfile)

In [30]:
# Production data set: everything not tagged 'stuck'. It is written to the
# same file under '/data', as a selection of the '/data_withU2OSstuck' entry
# saved before.
def not_stuck(hits):
    return not any(hits)

data.makeSelection('stuck', logic=not_stuck)
nl.io.write.hdf5_subTaggedSet(
    data,
    filename=outfile,
    group='/data',
    refTaggedSet='/data_withU2OSstuck',
)

# Sanity checks

In [31]:
# Tag combinations that every trajectory is checked against below.
# NOTE: this rebinds `conditions`; up to here it held the list of condition
# name strings loaded from the MINFLUX H2B file. Cells above that use it
# (e.g. via `cond.split(', ')`) must not be re-run after this one.
h2b_checks = [
    ['H2B', modality, cell_line, treatment]
    for modality in ('minflux', 'SPT-100ms', 'SPT-2s')
    for cell_line in ('mESC', 'U2OS')
    for treatment in ('ctrl', 'DRB', 'TSA', 'ICRF')
]
array_checks = [
    ['array', modality, condition]
    for modality in ('minflux', 'SPT-100ms', 'SPT-2s', 'SRLCI')
    for condition in ('C36', 'ΔRAD21 (inactive)', 'ΔRAD21 (active)')
]
conditions = h2b_checks + array_checks

In [32]:
# Trajectories are uniquely associated with one of the above conditions
# A trajectory matches a condition if it carries all of that condition's tags;
# every offending trajectory is printed with its match count and tags.
data.makeSelection()
unique_condition = True
for _, tags in data(giveTags=True):
    cnt = sum(all(tag in tags for tag in cond) for cond in conditions)
    if cnt != 1:
        unique_condition = False
        print(cnt, tags)

if unique_condition:
    print('All trajectories uniquely associated with one condition')

# Same check for reps within conditions
# This check keeps its own flag, so that its verdict is reported independently
# of whether the condition check above passed.
unique_rep = True
rep_tags = {'rep1', 'rep2', 'rep3'}
for cond in conditions:
    data.makeSelection(cond, logic=all)
    for _, tags in data(giveTags=True):
        cnt = len(tags & rep_tags)
        if cnt != 1:
            unique_rep = False
            print(cnt, tags)

if unique_rep:
    print('All trajectories uniquely associated with one repeat within each condition')

# Overall result (True only if both checks passed)
success = unique_condition and unique_rep

All trajectories uniquely associated with one condition
All trajectories uniquely associated with one repeat within each condition


In [33]:
# Check repeats: for each condition, print the number of trajectories in each
# replicate (or the total, if the condition has no 'rep...' tags at all)
def count_in_rep(cond, rep_tag):
    """Select `cond` (all tags required), narrow down to `rep_tag`, and count."""
    data.makeSelection(cond, logic=all)
    data.refineSelection(rep_tag)
    return len(data)

for cond in conditions:
    data.makeSelection(cond, logic=all)
    reptags = sorted({tag for tag in data.tagset() if tag.startswith('rep')})

    if reptags:
        lens = [count_in_rep(cond, rep_tag) for rep_tag in reptags]
    else:
        lens = [len(data)]

    print(f'{str(cond):<45s}, {str(lens):>20s}')

['H2B', 'minflux', 'mESC', 'ctrl']           ,      [180, 309, 177]
['H2B', 'minflux', 'mESC', 'DRB']            ,      [218, 150, 157]
['H2B', 'minflux', 'mESC', 'TSA']            ,      [159, 180, 146]
['H2B', 'minflux', 'mESC', 'ICRF']           ,      [162, 233, 190]
['H2B', 'minflux', 'U2OS', 'ctrl']           ,      [537, 484, 496]
['H2B', 'minflux', 'U2OS', 'DRB']            ,      [465, 395, 443]
['H2B', 'minflux', 'U2OS', 'TSA']            ,      [582, 305, 486]
['H2B', 'minflux', 'U2OS', 'ICRF']           ,      [487, 410, 498]
['H2B', 'SPT-100ms', 'mESC', 'ctrl']         ,      [681, 875, 770]
['H2B', 'SPT-100ms', 'mESC', 'DRB']          ,      [541, 940, 933]
['H2B', 'SPT-100ms', 'mESC', 'TSA']          ,    [665, 1159, 1200]
['H2B', 'SPT-100ms', 'mESC', 'ICRF']         ,     [812, 1162, 746]
['H2B', 'SPT-100ms', 'U2OS', 'ctrl']         ,     [376, 706, 1037]
['H2B', 'SPT-100ms', 'U2OS', 'DRB']          ,     [720, 1197, 280]
['H2B', 'SPT-100ms', 'U2OS', 'TSA']          ,  