In [1]:
import os
os.environ['OMP_NUM_THREADS'] = '1'

from tqdm.auto import tqdm
from pathlib import Path
import re
from multiprocessing import Pool
import itertools

import numpy as np
from scipy.sparse import csgraph, csr_matrix
import xml.etree.ElementTree as et

import noctiluca as nl

In [2]:
def coord(s, digit_cutoff=5):
    """
    Convert string to float, nan'ing out everything with low precision

    Precision is judged by the number of characters between the decimal point
    and an (optional) exponent. Strings without a decimal point carry no
    fractional digits at all and are thus always considered low precision.

    Parameters
    ----------
    s : str
        textual representation of the number, e.g. '12.345678'
    digit_cutoff : int, optional
        the value is kept only if it has strictly more than this many digits
        after the decimal point

    Returns
    -------
    float
        ``float(s)`` if the precision is sufficient, ``np.nan`` otherwise
    """
    # Cut off a potential exponent, such that its characters are not counted
    # as digits ('1.25e-05' has two fractional digits, not six)
    mantissa = s.strip().lower().split('e')[0]
    _, point, fraction = mantissa.partition('.')
    if not point:
        # No decimal point: without this check the whole string would be
        # counted as fractional digits
        return np.nan
    return float(s) if len(fraction) > digit_cutoff else np.nan

def loadXML(file, root='.'):
    """
    Load the trajectories of one tracking XML file into a tagged set

    The XML root has to be a 'Tracks' element. Each of its children tagged
    'particle' is read as one trajectory, whose children carry the attributes
    't' (frame number), 'x', and 'y'. Localizations for which `coord` returns
    NaN in x or y are dropped; particles without any remaining localization
    are skipped. Frames missing between the remaining localizations are filled
    with NaN.

    Parameters
    ----------
    file : pathlib.Path or str
        the XML file to read. Its name has to follow the pattern
        'Tracks_<date>_<cellline>_<condition>_<framerate>_<cell>_<roi>_<channel>.xml'
        where <roi> is an integer and <channel> is 'CTCF' or 'Fbn2'.
    root : pathlib.Path or str, optional
        directory relative to which the 'file=...' tag is written; `file` has
        to be located below it.

    Returns
    -------
    nl.TaggedSet
        the trajectories found in `file`. Each trajectory stores the original
        frame numbers of its rows in ``meta['real frame']``. The tags
        'file=<relative path>', 'date=<date>', 'roi=<roi>', the replicate
        ('repN', if found in the path), and the bare cell, condition,
        framerate, cell line, and channel strings are added via `addTags`.

    Raises
    ------
    OSError, xml.etree.ElementTree.ParseError
        if the file cannot be read or parsed (re-raised after printing the
        file name)
    RuntimeError
        if the file name does not match the expected pattern
    """
    file = Path(file)
    try:
        tree = et.parse(file)
    except (OSError, et.ParseError):
        print(f'Failed to read XML file {file}.')
        raise

    data = nl.TaggedSet()

    tracks = tree.getroot()
    assert tracks.tag == 'Tracks'

    for particle in tracks:
        if particle.tag != 'particle':
            continue

        spots = list(particle)
        frame = np.array([int(spot.attrib['t']) for spot in spots], dtype=int)
        # reshape ensures a (N, 2) array even for particles without any spot,
        # such that those are properly skipped below instead of crashing
        xy = np.array([[coord(spot.attrib[key]) for key in ['x', 'y']]
                       for spot in spots], dtype=float).reshape(-1, 2)

        # keep only localizations with valid x and y
        ind = ~np.any(np.isnan(xy), axis=-1)
        if not np.any(ind):
            continue
        frame = frame[ind]
        xy    = xy[ind]
        t     = frame - np.min(frame)

        # one row per frame from first to last valid localization, gaps are NaN
        trajdata = np.full((np.max(t)+1, 2), np.nan, dtype=float)
        trajdata[t] = xy

        traj = nl.Trajectory(trajdata)
        traj.meta['real frame'] = np.arange(np.min(frame), np.max(frame)+1)

        data.add(traj)

    # get tags
    tags = set()
    filetag = str(file.relative_to(root))
    filetag = re.sub(r'_rep_(\d)', r'_rep\1', filetag) # bugfix
    tags.add(f'file={filetag}')

    # Search the POSIX form of the path, such that the '/' in the pattern also
    # works where the native path separator is different
    for search, m_name in [(r'_(rep\d)/', 'rep'),
                          ]:
        m = re.search(search, file.as_posix())
        if m is None:
            print(f"Warning: could not identify which {m_name} the file '{str(file)}' belongs to")
        else:
            tags.add(m[1])

    m = re.match(r'Tracks_([^_]*)_([^_]*)_([^_]*)_([^_]*)_([^_]*)_(\d+)_(CTCF|Fbn2)\.xml', file.name)
    if m is None:
        raise RuntimeError(f"Filename {file.name} did not match expected pattern")

    date      = m[1]
    cellline  = m[2]
    condition = m[3]
    framerate = m[4]
    cell      = m[5]
    roi       = m[6]
    channel   = m[7]

    tags |= {f'date={date}', cell, f'roi={roi}', condition, framerate, cellline, channel}

    data.addTags(tags)
    return data

In [3]:
# Collect all tracking XML files below the raw data directory.
# rglob() yields files in arbitrary (filesystem dependent) order; sorting makes
# the order in which the files are loaded reproducible across runs and machines
datapath = Path('../../raw_data/SPT/array')
files = sorted(datapath.rglob('*.xml'))
len(files)

742

In [4]:
# Read every XML file and merge its trajectories into one common TaggedSet.
# The 'file=...' tag attached by loadXML doubles as cell identifier
data = nl.TaggedSet()
for xml_file in tqdm(files):
    data |= loadXML(xml_file, datapath)

  0%|          | 0/742 [00:00<?, ?it/s]

In [5]:
# Assign situation IDs
# each recording gets a unique integer ID that allows us to match Fbn2 and CTCF data
#
# A recording is identified by its combination of replicate, framerate,
# treatment, cell, and ROI. For each combination that has CTCF data we require
# exactly one CTCF file and exactly one Fbn2 file, and tag all of its
# trajectories (both channels) with 'id=<ID>'.
data.makeSelection()
tags = set(filter(lambda tag: not tag.startswith('file='), data.tagset()))
reptags = sorted(tag for tag in tags if re.match(r'rep\d', tag))
framerates = ['100ms', '2s']
treatments = ['NT', 'DMSO', 'IAA']
cells = sorted(tag for tag in tags if re.match(r'\d+cell', tag))
rois = sorted(tag for tag in tags if tag.startswith('roi='))

ID = 0
# the context manager closes the progress bar also if one of the checks fails
with tqdm() as bar:
    for sel in itertools.product(reptags, framerates, treatments, cells, rois):
        sel = list(sel)
        data.makeSelection(sel, logic=all)
        data.refineSelection('CTCF')
        if len(data) > 0:
            filetags = sorted(filter(lambda tag: tag.startswith('file='), data.tagset()))
            assert len(filetags) == 1, f"Expected exactly one CTCF file for {sel}, found {filetags}"
            CTCF_file = filetags.pop()

            data.makeSelection(sel, logic=all)
            data.refineSelection('Fbn2')
            filetags = sorted(filter(lambda tag: tag.startswith('file='), data.tagset()))
            if len(filetags) != 1:
                for filetag in filetags:
                    print(filetag)
                raise RuntimeError(f"Found the above files corresponding to {CTCF_file}")

            # tag everything belonging to this recording, i.e. both channels
            data.makeSelection(sel, logic=all)
            data.addTags(f'id={ID}')
            ID += 1
            bar.update()

print(f"Found Fbn2 file for each CTCF file and assigned {ID} IDs")

0it [00:00, ?it/s]

Found Fbn2 file for each CTCF file and assigned 371 IDs


In [6]:
# Report the files whose trajectories did not receive any 'id=...' tag, i.e.
# which could not be matched between the CTCF and Fbn2 channels
data.makeSelection()
ids = sorted(filter(lambda tag: tag.startswith('id='), data.tagset()))
data.makeSelection(ids, logic=lambda x: not any(x))

# strip the 'file=' prefix from the tags of the remaining trajectories
files = sorted(tag[len('file='):] for tag in data.tagset() if tag.startswith('file='))
if not files:
    print("All data successfully matched for CTCF/Fbn2")
else:
    print("Data from the following files could not be CTCF/Fbn2 matched:")
    for f in files:
        print('  ', f)

All data successfully matched for CTCF/Fbn2


# Get pairwise trajectories

In [7]:
def SD(xm, xn):
    """
    Squared displacement between `xm` and `xn`, summed over the last axis

    Used as two-point function for the MSD calculation.
    """
    delta = xm - xn
    return np.sum(np.square(delta), axis=-1)

def parfun(id_tag):
    """
    Assemble all pairwise Fbn2/CTCF trajectories for one recording

    Operates on the global TaggedSet `data` and changes its selection. Each
    Fbn2 trajectory tagged `id_tag` is paired with each CTCF trajectory tagged
    `id_tag` that passes the length filter (``traj.F >= 20``, or ``>= 50`` if
    the selection carries the tag '100ms'). A pair is kept if the two
    trajectories have at least 20 frames in common in which both have data.

    Parameters
    ----------
    id_tag : str
        the 'id=...' tag identifying the recording

    Returns
    -------
    nl.TaggedSet
        one two-locus trajectory (locus 0: Fbn2, locus 1: CTCF) per kept pair,
        spanning first to last common valid frame, NaN where either of the
        two is missing. Each trajectory carries the tags of its Fbn2
        trajectory, the meta entries 'real frame', 'original data index Fbn2',
        'original data index CTCF' (indices into the respective selections
        used here), and 'MSD' as written by nl.analysis.p2.P2.
    """
    data.makeSelection([id_tag, 'Fbn2'], logic=all)
    data_fbn2 = data.copySelection()
    data.makeSelection([id_tag, 'CTCF'], logic=all)
    data.refineSelection(lambda traj, _: traj.F >= 20)
    if '100ms' in data.tagset(): # tighter filter to get rid of unbound CTCFs
        data.refineSelection(lambda traj, _: traj.F >= 50)
    data_out = nl.TaggedSet()

    # The valid frames of the CTCF trajectories do not depend on the Fbn2
    # trajectory they are paired with, so determine them once up front instead
    # of once per pair
    ctcf = []
    for j in range(len(data)):
        traj1 = data[j]
        frames1 = traj1.meta['real frame']
        rf1 = frames1[~np.any(np.isnan(traj1[:]), axis=-1)]
        ctcf.append((j, traj1, frames1[0], rf1))

    for i, (traj0, tags) in enumerate(data_fbn2(giveTags=True)):
        frames0 = traj0.meta['real frame']
        rf0 = frames0[~np.any(np.isnan(traj0[:]), axis=-1)]
        for j, traj1, start1, rf1 in ctcf:
            # Mark which frames are valid in which trajectory:
            # 1 = only Fbn2, 2 = only CTCF, 3 = both
            rf_min = min(rf0.min(), rf1.min())
            rf_max = max(rf0.max(), rf1.max())
            cnt = np.zeros(rf_max+1-rf_min, dtype=int)
            cnt[rf0-rf_min] += 1
            cnt[rf1-rf_min] += 2

            valid = np.nonzero(cnt == 3)[0]
            if len(valid) < 20: # the original data is also cut to >= 20 valid frames
                continue

            valid_rf = valid + rf_min

            # Row k of a trajectory corresponds to meta['real frame'][k], so
            # convert real frames to row indices relative to the first stored
            # frame (this coincides with the first valid frame as long as
            # trajectories do not start with NaN, as is the case for loadXML)
            traj_dat = np.empty((2, valid_rf[-1]+1-valid_rf[0], traj0.d), dtype=float)
            traj_dat[:] = np.nan
            traj_dat[0, valid_rf - valid_rf[0]] = traj0[valid_rf - frames0[0]]
            traj_dat[1, valid_rf - valid_rf[0]] = traj1[valid_rf - start1]

            traj = nl.Trajectory(traj_dat)
            traj.meta['real frame'] = np.arange(valid_rf[0], valid_rf[-1]+1)
            traj.meta['original data index Fbn2'] = i
            traj.meta['original data index CTCF'] = j

            # Precompute MSDs
            # Note the use of preproc=... to calculate the correct MSD for the two-locus trajectories
            # This will allow us to naively use nl.analysis.MSD in the future
            # Do this here (instead of later), because it works nicely with parallelization
            _ = nl.analysis.p2.P2(traj, function=SD, writeto='MSD', preproc=lambda traj: traj.relative())

            data_out.add(traj, tags.copy())

    return data_out

# Run parfun for every recording in parallel and merge the results
data.makeSelection()
id_tags = {tag for tag in data.tagset() if tag.startswith('id=')}

data2 = nl.TaggedSet()
# Iterating over a set of strings and collecting results in order of
# completion would make the order of trajectories in data2 differ from run to
# run. Use a sorted task list and ordered imap() instead; the (len, tag) key
# sorts 'id=2' before 'id=10'
todo = sorted(id_tags, key=lambda tag: (len(tag), tag))
with Pool() as mypool:
    imap = mypool.imap(parfun, todo)
    imap = tqdm(imap, total=len(todo))
    for dat in imap:
        data2 |= dat

print(len(data2))

  0%|          | 0/371 [00:00<?, ?it/s]

11334


In [9]:
# Write everything to one HDF5 file:
#   - the single-locus data, the list of conditions, and a description
#   - all two-locus (Fbn2/CTCF) trajectories
#   - subsets of the two-locus trajectories, stored relative to the full set
out_filename = '../../data/20250411_SPT_array_CTCF.h5'

data.makeSelection()
nl.io.write.hdf5({
    'data' : data,
    'conditions' : ['NT', 'DMSO', 'IAA'],
    'description' : """
SPT tracking for Fbn2 cell lines

    NT     C36
    DMSO   ΔRAD21 (inactive)
    IAA    ΔRAD21 (active)
    
Framerates are 100ms or 2s, exposure is 86.71ms for all data.

For all conditions we have three biological replicates, tagged 'rep1/2/3'.
"""[1:-1]}, out_filename)

data2.makeSelection()
nl.io.write.hdf5(data2, out_filename, '/data_twoLocus_all')
# extend the description stored in the file
nl.io.write.hdf5(nl.io.load.hdf5(out_filename, 'description') + r"""

The group 'data_twoLocus_all' contains pairwise trajectories between the
Fbn2 locus and CTCF tracks.
"""[1:], out_filename, 'description')

for sep in [1, 3, 5]:
    # keep pairs whose NaN-ignoring mean of the first column of
    # traj.relative().abs() is below the cutoff
    # NOTE(review): presumably this is the mean Fbn2-CTCF distance - confirm
    data2.makeSelection(lambda traj, _:
                        np.nanmean(traj.relative().abs()[:][:, 0]) < sep,
                       )
    nl.io.write.hdf5_subTaggedSet(data2, out_filename,
                                  f'/data_twoLocus_maxsep-{sep}um',
                                  refTaggedSet='/data_twoLocus_all',
                                 )
# The group name given in the description has to match the one written above
# ('maxsep-Xum', with a hyphen)
nl.io.write.hdf5(nl.io.load.hdf5(out_filename, 'description') + r"""

The groups 'data_twoLocus_maxsep-Xum' are subsets of the two-locus data with
distance cutoffs 1, 3, 5 μm, respectively.
"""[1:], out_filename, 'description')