In [1]:
# OPTIONAL: Load the "autoreload" extension so that code can change
%reload_ext autoreload

# OPTIONAL: always reload modules so that as you change code in src, it gets loaded
%autoreload 2

import sys
sys.path.append('../src')

from datetime import datetime
import os
import numpy as np
from numpy.lib.stride_tricks import sliding_window_view as sww
import pandas as pd
import re

from metadata import metadata
from utils import list_subdir_filter as lsd, unique

# Project configuration object; the cells below read `md.folders[...]` and
# `md.markers` from it.
# No `global` statement is needed here: a name assigned at the top level of a
# notebook cell is already a module-level global.
md = metadata()

# Marker names as a plain list, reused when building per-marker file names.
markers = list(md.markers)

In [13]:
def get_id(f):
    """
    Extract the numeric ID that identifies an image/sample from a file name.

    All images are identified by a 4-digit number which we use as main ID
    from now onwards.

    Parameters
    ----------
    f : str
        File or folder name (may include a directory part).

    Returns
    -------
    str
        * If ``f`` contains ``A40.`` or ``A40_``: the 4 digits that directly
          follow that prefix (the last such occurrence when there are
          several). If no digits directly follow the prefix, the last run of
          4 digits found after ``A40`` is returned instead (legacy behaviour).
        * Otherwise: the digits following ``segmentation_``.
        * If nothing matches, ``f`` is returned unchanged.
    """
    if re.search(r'A40[._]', f):
        # Prefer the digits immediately after the prefix. The previous greedy
        # pattern picked the *last* 4-digit run after "A40", so a name such as
        # "A40.2419_20230914.tif" was resolved to "0914" instead of "2419".
        direct_hits = re.findall(r'A40[._]([0-9]{4})', f)
        if direct_hits:
            return direct_hits[-1]
        # Legacy fallback for names where the ID does not directly follow the
        # prefix (e.g. "A40_sample_2419.tif").
        return re.sub(r'^.*A40.*([0-9]{4}).*$', r'\1', f)
    return re.sub(r'^.*segmentation_([0-9]+).*$', r'\1', f)

def date_of_files(files):
    """
    Return the most recent ``os.path.getctime`` timestamp among existing files.

    Parameters
    ----------
    files : iterable of str or path-like
        Candidate paths; paths that do not exist are skipped.

    Returns
    -------
    float or int
        The largest ctime (seconds since the epoch) of the existing files, or
        ``0`` when ``files`` is empty or none of the paths exists.

    Notes
    -----
    Per the Python documentation, ``getctime`` is the last metadata change
    time on Unix and the creation time on Windows.
    """
    timestamps = []
    for path in files:
        if not os.path.exists(path):
            continue
        try:
            timestamps.append(os.path.getctime(path))
        except OSError:
            # The file vanished or became unreadable between the existence
            # check and the stat call: treat it like a missing file. Anything
            # else (e.g. KeyboardInterrupt) is deliberately not swallowed.
            continue
    return max(timestamps, default=0)


# Collect the IDs for which we have sample folders, images, segmentations and
# aggregated regionprops pickles.
# NOTE(review): the meaning of `lsd`'s 2nd positional argument (True/False) is
# defined in utils.list_subdir_filter -- confirm there; the 3rd argument is
# used as a name filter and is written as a raw string so that `\.` is not an
# invalid string escape.
mouse_dir = os.path.join(md.folders['images'], 'mouse')

all_samples = lsd(mouse_dir)
all_imgs = lsd(mouse_dir, True, r'\.tif')
all_segs = lsd(md.folders['segmented'], True)
pickle_files = lsd(md.folders['regionprops'], False, 'A40_')

# One de-duplicated ID collection per source listing.
all_samples_ids, all_imgs_ids, all_segs_ids, all_pickles = (
    unique([get_id(path) for path in listing])
    for listing in (all_samples, all_imgs, all_segs, pickle_files)
)



In [27]:
# Import the original Excel sheet as a pandas DataFrame and add one boolean
# column per processing stage, flagging whether data for that ID was found.

xls_candidates = lsd(md.folders['images'], True, 'xls')
if not xls_candidates:
    # Fail with a readable message instead of an IndexError on `[0]`.
    raise FileNotFoundError(
        f"No Excel file ('xls') found under {md.folders['images']!r}"
    )
xfile = pd.read_excel(xls_candidates[0])

# 'A40.2419' -> '2419'; raw string so that `\.` is a regex escape rather than
# an invalid string escape.
xfile['id_code'] = [re.sub(r'A40\.', '', ident) for ident in xfile.Identification]

# Availability flags, based on the ID collections gathered from disk.
xfile['samples'] = xfile['id_code'].isin(all_samples_ids)
xfile['images'] = xfile['id_code'].isin(all_imgs_ids)
xfile['segmentation'] = xfile['id_code'].isin(all_segs_ids)
xfile['regionprops'] = xfile['id_code'].isin(all_pickles)

In [30]:
# Figure out for which samples (and when) we have already removed the
# autofluorescence background, and for which the segmentation is complete.

# Number of `clean_<marker>_<id>.npy` files that must exist for an ID to count
# as completely processed.
# NOTE(review): presumably this should equal len(markers) -- confirm, and
# consider deriving it from `markers` instead of hard-coding it.
N_EXPECTED_CLEAN_FILES = 9

# Number of cellpose segmentation files expected per ID.
N_EXPECTED_SEG_FILES = 10


def _clean_marker_paths(img_id):
    """Return the background-removed .npy path of every marker for `img_id`."""
    return [
        os.path.join(md.folders['bg_removed'], f'clean_{marker}_{img_id}.npy')
        for marker in markers
    ]


xfile['complete_AF_removal'] = [
    sum(os.path.exists(path) for path in _clean_marker_paths(img_id)) == N_EXPECTED_CLEAN_FILES
    for img_id in xfile.id_code
]

# Timestamp of the most recent cleaned file per ID. IDs without any cleaned
# file get timestamp 0 from `date_of_files`, which is rendered as the epoch in
# local time (e.g. '1970.01.01 01:00').
xfile['AF_time'] = [
    datetime.fromtimestamp(
        date_of_files(_clean_marker_paths(img_id))
    ).strftime('%Y.%m.%d %H:%M')
    for img_id in xfile.id_code
]

seg_dir = os.path.join(md.folders['segmented'], 'cellpose')
# `rf'...'` keeps `\.` as a regex escape without an invalid string escape.
xfile['complete_segmentation'] = [
    len(lsd(seg_dir, True, rf'A40_{i}_[0-9]_dapi1\.npy')) == N_EXPECTED_SEG_FILES
    for i in xfile.id_code
]

# Show the status of every ID for which a sample folder exists.
xfile.loc[xfile.samples].sort_values(['Condition', 'id_code'])

Unnamed: 0,Code,Identification,Group,Condition,File,id_code,samples,images,segmentation,regionprops,complete_AF_removal,AF_time,complete_segmentation
12,JNUDE_16,A40.2419,ko_osi_2,ko_TT,20230914_173737_4_Er644v_Run 53_A40.2419.csv,2419,True,True,True,True,True,2024.06.14 09:39,True
13,JNUDE_17,A40.2429,ko_osi_3,ko_TT,non,2429,True,True,True,False,True,2024.06.14 11:11,True
16,JNUDE_22,A40.2430,ko_osi_6,ko_TT,20230918_152613_3_9pzeAy_Run 55_A40.2430.csv,2430,True,True,True,True,True,2024.06.14 10:26,True
19,JNUDE_24,A40.2300,ko_ctr_1,ko_ctr,20230831_152523_1_ExRj6r_Run52_A40.2300.csv,2300,True,True,True,True,True,2024.06.14 08:56,True
23,JNUDE_28,A40.2338,ko_ctr_7,ko_ctr,20230829_102724_2_4kJsI2_Run48_A40.2338.csv,2338,True,True,False,False,True,2024.06.14 08:41,False
18,JNUDE_23_li,A40.2393,ko_ctr_3,ko_ctr,20230921_134356_1_EgBQRX_Run 59_A40.2393.csv,2393,True,True,True,True,True,2024.06.14 10:55,True
15,JNUDE_19,A40.2416,ko_ctr_2,ko_ctr,20230914_173736_3_D61Agp_Run 53_A40.2416.csv,2416,True,True,True,True,True,2024.06.14 09:33,True
21,JNUDE_26,A40.2427,ko_ctr_5,ko_ctr,20230830_123250_4_QIfXt0_Run49_A40.2427.csv,2427,True,True,True,True,True,2024.06.14 08:49,True
20,JNUDE_25,A40.2432,ko_ctr_4,ko_ctr,20230920_173137_3_371MkO_Run 57_A40.2432.csv,2432,True,True,False,False,True,2024.06.14 10:48,False
22,JNUDE_27,A40.2433,ko_ctr_6,ko_ctr,non,2433,True,False,False,False,False,1970.01.01 01:00,False


In [26]:
# Prepare and export a simple CSV that keeps track of which condition each ID
# belongs to (and whether everything we need is already available).
#
# `xfile` has the columns `images` and `regionprops`; the `img` / `regions`
# names previously used here are not created by any cell, so the cell failed
# on a fresh kernel. `regionprops` is renamed to `regions` on export so the
# CSV keeps the column name shown in the previously recorded output.
# NOTE(review): confirm that downstream readers of the CSV expect `regions`.

id_condition_table = (
    xfile
    .loc[
        xfile['images'],
        ['id_code', 'Condition', 'complete_AF_removal', 'complete_segmentation', 'regionprops'],
    ]
    .rename(columns={'regionprops': 'regions'})
    .sort_values(['Condition', 'id_code'])
    .reset_index(drop=True)
)
id_condition_table.to_csv('../data/interim/id_condition_table.csv')
id_condition_table

Unnamed: 0,id_code,Condition,complete_AF_removal,complete_segmentation,regions
0,2419,ko_TT,True,True,True
1,2429,ko_TT,True,True,False
2,2430,ko_TT,True,True,True
3,2300,ko_ctr,True,True,True
4,2338,ko_ctr,True,False,False
5,2393,ko_ctr,True,True,True
6,2416,ko_ctr,True,True,True
7,2427,ko_ctr,True,True,True
8,2432,ko_ctr,True,False,False
9,2407,res_TT,False,True,False
