In [1]:
import os
os.environ['OMP_NUM_THREADS'] = '1'

from tqdm.auto import tqdm
from pathlib import Path
import re
from multiprocessing import Pool

import numpy as np
from scipy.sparse import csgraph, csr_matrix
import xml.etree.ElementTree as et

import noctiluca as nl

In [2]:
def loadXML(file, root='.'):
    try:
        tree = et.parse(file);
    except OSError:
        print(f'Failed to read XML file {file}.')
        raise
    
    data = nl.TaggedSet()
    
    tracks = tree.getroot()
    assert tracks.tag == 'Tracks'
    
    for particle in tracks:
        if not particle.tag == 'particle':
            continue
        
        frame  = np.array( [  int(spot.attrib['t']) for spot in particle  ])
        xy     = np.array([[float(spot.attrib[key]) for key  in ['x', 'y']]
                           for spot in particle])
        
        t = frame - np.min(frame)
        
        trajdata = np.empty((np.max(t)+1, 2), dtype=float)
        trajdata[t] = xy
        
        traj = nl.Trajectory(trajdata)
        traj.meta['real frame'] = np.arange(np.min(frame), np.max(frame)+1)
        
        data.add(traj)
    
    # get tags
    tags = set()
    tags.add(f'file={str(file.relative_to(root))}')
    
    for search, m_name in [('_(rep\d)/', 'rep'),
                           ('(U2OS|mESC)', 'cell line'),
                          ]:
        m = re.search(search, str(file))
        if m is None:
            print(f"Warning: could not identify which {m_name} the file '{str(file)}' belongs to")
        else:
            tags.add(m[1])
    
    m = re.match('Tracks_([^_]*)_([^_]*)_([^_]*)_([^_]*)_([^_]*)_\d+.xml', file.name)
    if m is None:
        raise RuntimeError(f"Filename {file.name} did not match expected pattern")
    else:
        date      = m[1]
        cellline  = m[2]
        condition = m[3]
        framerate = m[4]
        cell      = m[5]
        
        # Fix inconsistencies etc.
        if cellline  == 'U20S'    : cellline  = 'U2OS'
        if cellline  == 'mESCs'   : cellline  = 'mESC'
        if condition == '1uM-TSA' : condition = '1μM-TSA'
        
        tags |= {f'date={date}', condition, framerate, cellline}
    
    data.addTags(tags)
    return data

In [4]:
datapath = Path('../../raw_data/SPT/H2B')
files = list(datapath.rglob('*.xml'))
len(files)

1398

In [5]:
# Load all
# Files double as cell identifiers
data = nl.TaggedSet()
for file in tqdm(files):
    data |= loadXML(file, datapath)

  0%|          | 0/1398 [00:00<?, ?it/s]

In [6]:
sorted(filter(lambda tag: not tag.startswith('file='), data.tagset()))

['100ms',
 '150nM-TSA',
 '1μM-TSA',
 '2s',
 '500nM-TSA',
 'DMSO',
 'DRB',
 'ICRF',
 'U2OS',
 'date=20241207',
 'date=20241208',
 'date=20241209',
 'date=20241213',
 'date=20250116',
 'date=20250118',
 'mESC',
 'rep1',
 'rep2',
 'rep3']

In [7]:
# Precompute MSDs
with nl.Parallelize():
    _ = nl.analysis.MSD(data, chunksize=1, show_progress=True)

  0%|          | 0/103281 [00:00<?, ?it/s]

# Get (all) pairwise trajectories

In [8]:
def SD(xm, xn): # for MSD calculation
    return np.sum((xm-xn)**2, axis=-1)

def parfun(file_tag):
    data.makeSelection(file_tag)
    data_out = nl.TaggedSet()

    for i, (traj0, tags) in enumerate(data(giveTags=True)):
        rf0 = traj0.meta['real frame']
        rf0 = rf0[~np.any(np.isnan(traj0[:]), axis=-1)]
        for j in range(i+1, len(data)):
            traj1 = data[j]
            rf1 = traj1.meta['real frame']
            rf1 = rf1[~np.any(np.isnan(traj1[:]), axis=-1)]

            rf_min = min(rf0.min(), rf1.min())
            rf_max = max(rf0.max(), rf1.max())
            cnt = np.zeros(rf_max+1-rf_min, dtype=int)
            cnt[rf0-rf_min] += 1
            cnt[rf1-rf_min] += 2
            
            valid = np.nonzero(cnt == 3)[0]
            if len(valid) < 20: # the original data is also cut to >= 20 valid frames
                continue
                
            valid_rf = valid + rf_min
                
            traj_dat = np.empty((2, valid_rf[-1]+1-valid_rf[0], traj0.d), dtype=float)
            traj_dat[:] = np.nan
            traj_dat[0, valid_rf - valid_rf[0]] = traj0[valid_rf - rf0.min()]
            traj_dat[1, valid_rf - valid_rf[0]] = traj1[valid_rf - rf1.min()]
            
            traj = nl.Trajectory(traj_dat)
            traj.meta['real frame'] = np.arange(valid_rf[0], valid_rf[-1]+1)
            traj.meta['original data index 0'] = i
            traj.meta['original data index 1'] = j
            
            # Precompute MSDs
            # Note the use of preproc=... to calculate the correct MSD for the two-locus trajectories
            # This will allow us to naively use nl.analysis.MSD in the future
            # Do this here (instead of later), because it works nicely with parallelization
            _ = nl.analysis.p2.P2(traj, function=SD, writeto='MSD', preproc=lambda traj: traj.relative())
            
            data_out.add(traj, tags.copy())
            
    return data_out

data.makeSelection()
file_tags = {tag for tag in data.tagset() if tag.startswith('file=')}

data2 = nl.TaggedSet()
todo = file_tags
with Pool() as mypool:
    imap = mypool.imap_unordered(parfun, todo)
    imap = tqdm(imap, total=len(todo))
    for dat in imap:
        data2 |= dat

print(len(data2))

  0%|          | 0/1397 [00:00<?, ?it/s]

720398


# Remove cycles from two locus data

In [9]:
seps = [1, 3, 5] # μm
for max_mean_separation in seps:
    sep_tag = f'acyclic <{max_mean_separation}μm'
    
    for file_tag in tqdm(file_tags):
        data2.makeSelection(file_tag)
        # Restrict to loci that are close in space
        data2.refineSelection(lambda traj, _ : np.mean(np.linalg.norm(traj.relative()[:], axis=-1)) <= max_mean_separation)
        if len(data2) == 0:
            continue

        # Assemble overlap matrix
        rows = np.array([traj.meta['original data index 0'] for traj in data2])
        cols = np.array([traj.meta['original data index 1'] for traj in data2])
        overlaps = np.array([traj.relative().F              for traj in data2])

        # Get maximal spanning tree
        # (as minimal spanning tree of the negative overlaps)
        N = np.max(np.concatenate([rows, cols]))+1
        G = csr_matrix((-overlaps, (rows, cols)), shape=(N, N))
        H = csgraph.minimum_spanning_tree(G)

        # This is a bit hacky, maybe should be made easier in noctiluca
        for (_, tags), i, j in zip(data2(giveTags=True), rows, cols):
            if H[i, j] < 0:
                tags.add(sep_tag)
    
    data2.makeSelection(sep_tag)
    print(f'{sep_tag} : {len(data2):8>d} trajectories')
data2.makeSelection()

  0%|          | 0/1397 [00:00<?, ?it/s]

acyclic <1μm : 5382 trajectories


  0%|          | 0/1397 [00:00<?, ?it/s]

acyclic <3μm : 50238 trajectories


  0%|          | 0/1397 [00:00<?, ?it/s]

IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)



In [10]:
out_filename = '../../data/20250121_SPT_H2B.h5'

data.makeSelection()
nl.io.write.hdf5({
    'data' : data,
    'conditions' : ['DMSO', 'DRB', '150nM-TSA', '500nM-TSA', '1μM-TSA', 'ICRF'],
    'cell lines' : ['U2OS', 'mESC'],
    'exposure_seconds' : {'100ms' : 0.08671, '2s' : 0.08671},
    'description' : """
H2B SPT data

Tracking H2B in U2OS/mESC cells, to be supplemented with MINFLUX. Conditions are

    DMSO   DMSO-only control
    DRB    txn inhibition
    TSA    hyperacetylation (150nM, 500nM, 1μM; last only for mESC)
    ICRF   topo-II inhibition
    
Framerates are 100ms or 2s.

For all conditions we have three biological replicates, tagged 'rep1/2/3'.
"""[1:-1]}, out_filename)

data2.makeSelection()
nl.io.write.hdf5(data2, out_filename, '/data_twoLocus_all')
nl.io.write.hdf5(nl.io.load.hdf5(out_filename, 'description') + r"""

The group 'data_twoLocus_all' contains the full pairwise data set, i.e. all
two-locus trajectories within a given movie. Note that this is *highly*
redundant and is saved here mostly for completeness. For processing, use
one of the acyclic data sets below.
"""[1:], out_filename, 'description')

sep_tags = {tag for tag in data2.tagset() if tag.startswith('acyclic')}
for sep_tag in sep_tags:
    sep = int(re.search('<(\d)+μm', sep_tag)[1])
    data2.makeSelection(sep_tag)
    nl.io.write.hdf5_subTaggedSet(data2, out_filename,
                                  f'/data_twoLocus_acyclic_{sep}um',
                                  refTaggedSet='/data_twoLocus_all',
                                 )
nl.io.write.hdf5(nl.io.load.hdf5(out_filename, 'description') + r"""

The groups 'data_twoLocus_acyclic_Xum' are subsets of the two-locus data; the idea
is to make the neighbor graph acyclic, which exactly removes redundancy (e.g. with
loci 1, 2, 3, only two of the three two-locus trajectories 1--2, 2--3, 3--1 are
independent). Clearly we get some freedom of choice in which trajectories to kick
out when removing cycles; so we can score trajectories by some metric and choose
the best ones. We generally want to keep longer trajectories over shorter ones; in
addition, we favor trajectories that are closer together (the whole idea of this
two-locus data set is to remove large scale motion, so we want to be local). In
the present case, we assembled three data sets, with distance cutoffs 1, 3, 5 μm
as indicated in the group name. Choose whichever seems most useful/reasonable.
"""[1:], out_filename, 'description')