# Import packages

In [1]:
import os

import h5py
import sys
import numpy as np
import pandas as pd
import copy
import tqdm
import matplotlib.pyplot as plt

import ast
# Add this user folder to the module search path so that modules stored there
# can be imported from the notebook.
# NOTE(review): absolute, machine-specific path — adjust when running elsewhere.
sys.path.append(r"C:\Users\shiwei\Documents")

# Print the ID of the current Python process.
print(os.getpid())

44808


# Define functions

## functions to convert data format

In [2]:
# function to convert a picked-spot dataframe into the 4DN FOF-CT core table layout
def format_4dn_dataframe_trace (spot_df, pixel_size, required_cols,
                                 ):
    """Reformat one picked-spot dataframe into the 4DN FOF-CT core column layout.

    This function is shipped to worker processes as stand-alone source text by
    ``parallel_task``; it therefore must stay self-contained and may not rely
    on module-level imports or on helper functions defined elsewhere.

    Parameters
    ----------
    spot_df : pandas.DataFrame
        Spot table. The columns read here are 'FOV', 'orig_cellID',
        'finalcellID', 'x_hat', 'y_hat', 'z_hat', 'chr', 'hyb', 'region_name',
        'fiberidx' and 'rowID'. 'orig_cellID' is split on 'Cell-', 'chr' on
        'chr' and 'region_name' on ':' and '-' (i.e. 'chrN:start-end').
        The input dataframe is never modified.
    pixel_size : number
        Scale factor applied to the '*_hat' coordinates; the output
        coordinates are ``hat * pixel_size / 1000``.
        NOTE(review): if pixel_size is given in nm per pixel, this result is in
        microns, whereas the file header declares '##XYZ_unit=' as 'nm' —
        confirm which one is intended.
    required_cols : list of str
        Columns that must be present in ``spot_df``.

    Returns
    -------
    pandas.DataFrame
        Columns, in order: Spot_ID, Trace_ID, X, Y, Z, Chrom, Chrom_Start,
        Chrom_End, Chrom_order, Cell_ID, FOV_ID, CellID_byFOV. The index of
        ``spot_df`` is preserved. Chrom_Start / Chrom_End are kept as strings.

    Raises
    ------
    KeyError
        If a column listed in ``required_cols`` (or read here) is missing.
    ValueError
        If an identifier cannot be parsed from 'orig_cellID', 'chr' or
        'region_name' for at least one row.
    """
    print (f'Process dataframe with {len(spot_df)} spots.')

    # Fail early, and with a clear message, when the input lacks a required column.
    missing_cols = [_c for _c in required_cols if _c not in spot_df.columns]
    if missing_cols:
        raise KeyError(f'Columns missing from spot_df: {missing_cols}')

    # 1. cell identifiers, read directly from the dataframe
    # Explicit copy: the assignments below then never touch the caller's
    # dataframe and never trigger pandas' SettingWithCopyWarning.
    df_4dn = spot_df[['FOV','orig_cellID','finalcellID']].copy()
    df_4dn['FOV_ID'] = spot_df['FOV'].map(str)
    df_4dn['CellID_byFOV'] = spot_df['orig_cellID'].str.split('Cell-').str[1]
    df_4dn['Cell_ID'] = spot_df['finalcellID'].map(str)

    ##################################################
    # 2. coordinates and genomic annotation
    df_4dn['Z'] = spot_df['z_hat'] * pixel_size / 1000
    df_4dn['X'] = spot_df['x_hat'] * pixel_size / 1000
    df_4dn['Y'] = spot_df['y_hat'] * pixel_size / 1000

    df_4dn['Chrom'] = spot_df['chr']
    df_4dn['Chrom_order'] = spot_df['hyb']
    # 'chrN:start-end' -> ['start', 'end']
    region_bounds = spot_df['region_name'].str.split(':').str[1].str.split('-')
    df_4dn['Chrom_Start'] = region_bounds.str[0]
    df_4dn['Chrom_End'] = region_bounds.str[1]

    # Trace_ID = '<FOV>_<cell within FOV>_<chromosome label>_<fiber index>',
    # built column-wise instead of with a slow row-wise DataFrame.apply.
    chrom_label = spot_df['chr'].str.split('chr').str[1]
    df_4dn['Trace_ID'] = (df_4dn['FOV_ID'] + '_' + df_4dn['CellID_byFOV'] + '_'
                          + chrom_label + '_' + spot_df['fiberidx'].map(str))

    df_4dn['Spot_ID'] = spot_df['rowID']

    # The vectorised string operations yield NaN for values that cannot be
    # parsed; refuse to return silently incomplete identifiers.
    parsed_cols = ['CellID_byFOV','Chrom_Start','Chrom_End','Trace_ID']
    malformed_rows = df_4dn[parsed_cols].isna().any(axis=1)
    if malformed_rows.any():
        raise ValueError(f'{int(malformed_rows.sum())} spots have identifiers that cannot be parsed '
                         f"from 'orig_cellID', 'chr' or 'region_name'.")

    # 3. keep only the 4DN columns, in the order announced in the file header
    df_4dn=df_4dn[['Spot_ID','Trace_ID','X','Y','Z','Chrom','Chrom_Start','Chrom_End','Chrom_order','Cell_ID','FOV_ID','CellID_byFOV']]

    return df_4dn





import multiprocessing as mp
from functools import partial
import inspect

# create temp py to parallel
def parallel_task(func, zipped_iterables, num_processes=16):
    """Run ``func`` over many argument tuples in a pool of worker processes.

    The source of ``func`` is written to ``tmp_func.py`` in the current working
    directory (with every occurrence of the function's name replaced by
    ``function_to_mp``) and imported from there, so that worker processes can
    import it by module name. Because only the function's own source is
    written, ``func`` must be self-contained: it cannot use names defined at
    notebook level.

    Parameters
    ----------
    func : function
        Function whose source can be retrieved with ``inspect.getsource``.
    zipped_iterables : iterable of tuples
        One tuple of positional arguments per call (as for ``Pool.starmap``).
    num_processes : int, optional
        Number of worker processes; defaults to 16.

    Returns
    -------
    list
        Results of the calls, as returned by ``Pool.starmap``.

    Raises
    ------
    RuntimeError
        If called from code whose ``__name__`` is not ``'__main__'``.
    """
    import importlib, os, sys, time

    # Check the guard before touching the file system so that a refused call
    # leaves no temp file behind. (Raising a plain string, as done previously,
    # is itself a TypeError in Python 3.)
    if __name__ != '__main__':
        raise RuntimeError("Not in Jupyter Notebook")

    tmp_module_name = 'tmp_func'
    tmp_module_path = os.path.join(os.getcwd(), f'{tmp_module_name}.py')

    print ("Write in the function to multiprocess as a temp file.")
    with open(tmp_module_path, 'w') as file:
        file.write(inspect.getsource(func).replace(func.__name__, "function_to_mp"))

    try:
        # Drop any module cached by an earlier call and refresh the import
        # system's directory caches, so the file just written is what gets loaded.
        sys.modules.pop(tmp_module_name, None)
        importlib.invalidate_caches()
        function_to_mp = importlib.import_module(tmp_module_name).function_to_mp

        start_time = time.time()
        # starmap blocks until every result is available; leaving the ``with``
        # block then shuts the pool down.
        with mp.Pool(num_processes) as mp_pool:
            mp_res = mp_pool.starmap(function_to_mp, zipped_iterables,chunksize=1)

        elapsed_time = time.time() - start_time
        hours, rem = divmod(elapsed_time, 3600)
        minutes, seconds = divmod(rem, 60)
        print (f"Complete multiprocess; remove the temp file for the function.")
        print("{:0>2}:{:0>2}:{:05.2f}".format(int(hours),int(minutes),seconds))
    finally:
        # Remove the temp module even when the import or a worker failed.
        if os.path.exists(tmp_module_path):
            os.remove(tmp_module_path)

    return mp_res

## prepare header lines 

In [3]:
# Table written by this notebook, and the full set of tables in the deposit.
curr_table_name = '4dn_FOF-CT_core'
table_name_list = [
    '4dn_FOF-CT_core',
    '4dn_FOF-CT_cell',
    '4dn_FOF-CT_rna_spot',
    '4dn_FOF-CT_demultiplexing',
]

# Comma-separated names of every table except the current one.
add_table_name = ', '.join(filter(lambda _name: _name != curr_table_name, table_name_list))
add_table_name

'4dn_FOF-CT_cell, 4dn_FOF-CT_rna_spot, 4dn_FOF-CT_demultiplexing'

In [4]:
# change header line below manually
# Each header line of the output file is built as key + value, so every key
# must carry its own trailing ':' or '='.
# NOTE(review): format_4dn_dataframe_trace computes X/Y/Z as
#   hat * pixel_size / 1000; confirm that this really is the unit declared by
#   '##XYZ_unit=' below.
# NOTE(review): the saving loop appends RNA_experiment_ID, DNA_experiment_ID and
#   Sample_ID columns that '##columns=' does not list — confirm whether the
#   header should mention them.
header_dict_v0 = {'##FOF-CT_version=':'v0.1',
                  '##Table_namespace=':curr_table_name,
                  '#lab_name:':'Xiaowei Zhuang lab', 
                  '#experimenter_name:':'Pu Zheng, Shiwei Liu',
                  '#experimenter_contact:':'pu_zheng@g.harvard.edu, shiweiliu@fas.harvard.edu',
                  '#experiment_description:':'This experiment set contains integrated RNA- and DNA-MERFISH of the mouse MOp region and a portion of adjacent regions',
                  '#table_description:':'This table contains picked DNA spots for DNA fibers in individual cells from the mouse MOp region and a portion of adjacent regions',
                  '#Software_Title:':'ImageAnalysis3, jie',
                  '#Software_Type:':'SpotLoc(by ImageAnalysis3), Tracing(by jie)',
                  '#Software_Authors:':'Pu Zheng(ImageAnalysis3), Blair Bojing Jia(jie)',
                  '#Software_Description:':'ImageAnalysis3 for decoding and partition, jie for picking',
                  '#Software_Repository:':'https://github.com/zhengpuas47/ImageAnalysis3, https://github.com/b2jia/jie',
                  # both citations as resolvable DOI URLs
                  '#Software_PreferredCitationID:':'https://doi.org/10.1016/j.cell.2020.07.032, https://doi.org/10.1101/2022.03.25.485845',
                  '#additional_tables:':add_table_name,
                  '##genome_assembly=':'GRCm38(mm10)',
                  '##XYZ_unit=':'nm',
                  '##columns=':'(Spot_ID, Trace_ID, X, Y, Z, Chrom, Chrom_Start, Chrom_End, Chrom_order, Cell_ID, FOV_ID, CellID_byFOV)',
                  # per-column descriptions; keys match the column names listed in '##columns='
                  '#Spot_ID:':'unique DNA spot identifier across 4dn_FOF-CT_core and 4dn_FOF-CT_demultiplexing for the same replicate',
                  '#Trace_ID:':'number format as fovXX_cellXX_chrXX_chromatidXX',
                  '#Chrom_order:':'order of DNA region on its respective chromosome',
                  '#Cell_ID:':'unique cell identifier across the datatables',
                  '#FOV_ID:':'FOV ID across datatables for the same replicate',
                  '#CellID_byFOV:':'specific cell identifier relative to the FOV',
                 }


In [5]:
# Show the assembled header dictionary.
header_dict_v0

{'##FOF-CT_version=': 'v0.1',
 '##Table_namespace=': '4dn_FOF-CT_core',
 '#lab_name:': 'Xiaowei Zhuang lab',
 '#experimenter_name:': 'Pu Zheng, Shiwei Liu',
 '#experimenter_contact:': 'pu_zheng@g.harvard.edu, shiweiliu@fas.harvard.edu',
 '#experiment_description:': 'This experiment set contains integrated RNA- and DNA-MERFISH of the mouse MOp region and a portion of adjacent regions',
 '#table_description:': 'This table contains picked DNA spots for DNA fibers in inidividual cells from the mouse MOp region and a portion of adjacent regions',
 '#Software_Title:': 'ImageAnalysis3, jie',
 '#Software_Type:': 'SpotLoc(by ImageAnalysis3), Tracing(by jie)',
 '#Software_Authors:': 'Pu Zheng(ImageAnalysis3), Blair Bojing Jia(jie)',
 '#Software_Description:': 'ImageAnalysis3 for decoding and partition, jie for picking',
 '#Software_Repository:': 'https://github.com/zhengpuas47/ImageAnalysis3, https://github.com/b2jia/jie',
 '#Software_PreferredCitationID:': 'https://doi: 10.1016/j.cell.2020.07.0

In [6]:
# Turn every (key, value) header entry into one text line, then stack the
# lines into a single-column dataframe (one header line per row).
header_lines = [''.join(_entry) for _entry in header_dict_v0.items()]
header_lines_df = pd.DataFrame(header_lines)
header_lines_df

Unnamed: 0,0
0,##FOF-CT_version=v0.1
1,##Table_namespace=4dn_FOF-CT_core
2,#lab_name:Xiaowei Zhuang lab
3,"#experimenter_name:Pu Zheng, Shiwei Liu"
4,"#experimenter_contact:pu_zheng@g.harvard.edu, ..."
5,#experiment_description:This experiment set co...
6,#table_description:This table contains picked ...
7,"#Software_Title:ImageAnalysis3, jie"
8,"#Software_Type:SpotLoc(by ImageAnalysis3), Tra..."
9,"#Software_Authors:Pu Zheng(ImageAnalysis3), Bl..."


# Load and convert spot dataframe

## define parameters for replicate

In [7]:
# pick main folder; where sorted spots are saved
pick_folder = r'\\10.245.74.158\Chromatin_NAS_8\Exported_data\jie_CW_decode'
# parameter version used for pick
pick_param_name = 'CW_v2_sorted'

# define name and other metadata info in correct order for each experiment below
# (the lists are parallel: entry i of each list describes experiment i)
exp_name_list = ['20220316exp', '20220402exp', '20220419exp', '20220713exp']
RNA_experiment_ID_list = ['20220304','20220329','20220415','20220418']
DNA_experiment_ID_list = ['20220316','20220402','20220419','20220713']
sample_ID_list = ['C57BL/6_M_1_MOp_1','C57BL/6_M_2_MOp_1','C57BL/6_M_3_MOp_1','C57BL/6_M_3_MOp_2']

exp_sample_type = 'WT' # to further distinguish the output exp name in case the date is the same
# naming for 4DN alias: biological / technical replicate number of each experiment
Bio_num_list = [1,2,3,3]
Tech_num_list = [1,1,1,2]
# table type is the table name without its '4dn_FOF-CT_' prefix
table_type = curr_table_name.split('4dn_FOF-CT_')[1]

# The processing loop indexes every list above by experiment position, so a
# list of the wrong length would only fail (or mislabel data) late in the run.
assert all(len(_lst) == len(exp_name_list)
           for _lst in (RNA_experiment_ID_list, DNA_experiment_ID_list,
                        sample_ID_list, Bio_num_list, Tech_num_list)), \
    'Per-experiment metadata lists must have one entry per experiment.'


# output folder to save the converted df
output_folder =r'F:\4DN_deposit\new_version'


In [8]:
# shared parameters
# columns of the picked-spot data that the conversion needs
# (genomic 'start' / 'end' columns are not requested; see 'region_name')
required_cols = [
    'FOV',
    'chr',
    'hyb',
    'x_hat',
    'y_hat',
    'z_hat',
    'finalcellID',
    'region_name',
    'orig_cellID',
    'fiberidx',
    'rowID',
]

# 'z_hat', 'x_hat', 'y_hat' are pseudo pixels where the z_hat unit is equivalent to x_hat
pixel_size = 108

## process data

In [9]:
# loop through all replicate and batch process each
# For every experiment: load all picked-spot .h5 files, convert them in
# parallel, attach the experiment metadata and write one CSV with the 4DN
# header lines on top. Experiments whose output file already exists are skipped.
for _exp_ind, _exp_name in enumerate(exp_name_list):

    pick_subfolder = os.path.join(pick_folder,_exp_name,pick_param_name)

    experiment_key= RNA_experiment_ID_list[_exp_ind] + '_' + exp_sample_type
    Bio_num = Bio_num_list[_exp_ind]
    Tech_num = Tech_num_list[_exp_ind]
    data_4dn_savename = os.path.join(output_folder, f'fileproc_genome_wide_brain_ct_B{Bio_num}_T{Tech_num}_{table_type}_f1.csv')

    if os.path.exists(data_4dn_savename):
        print (f'Output already exists for {experiment_key}. Skip.')
        continue

    if not os.path.exists(pick_subfolder):
        print ('Pick results not availale. Check results.')
        continue

    # 1. load data
    # Only true .h5 files (not e.g. 'x.h5.bak'), in a deterministic order.
    file_list = sorted(os.path.join(pick_subfolder, _fl)
                       for _fl in os.listdir(pick_subfolder) if _fl.endswith('.h5'))
    if not file_list:
        # Nothing to convert; concatenating an empty list would raise.
        print (f'No .h5 pick files found in {pick_subfolder}. Skip.')
        continue

    # Make sure the destination exists before spending time on the conversion.
    os.makedirs(output_folder, exist_ok=True)

    # output dict to load chromosome pick h5: file name -> spot dataframe
    output_df_dict = {}
    for file in file_list:
        output_df_dict[os.path.basename(file)]=pd.read_hdf(file)
    print(f'Load the pick result as {list(output_df_dict.keys())[0]}', end=' ')
    print(f'with {len(list(output_df_dict.keys()))} files in total.')

    # 2. batch convert data: one (dataframe, pixel size, required columns) task per file
    print(f'Start processing.')
    spot_df_list = list(output_df_dict.values())
    pixel_size_list = [pixel_size,]*len(spot_df_list)
    required_cols_list = [required_cols,]*len(spot_df_list)

    df_4dn_list = parallel_task(format_4dn_dataframe_trace,
                                zip(spot_df_list, pixel_size_list, required_cols_list))

    # 3. merge the per-file tables with a fresh index and add experiment metadata
    df_4dn_all = pd.concat(df_4dn_list, ignore_index=True)
    df_4dn_all['RNA_experiment_ID'] = RNA_experiment_ID_list[_exp_ind]
    df_4dn_all['DNA_experiment_ID'] = DNA_experiment_ID_list[_exp_ind]
    df_4dn_all['Sample_ID'] = sample_ID_list[_exp_ind]
    df_4dn_all_sort = df_4dn_all.sort_values(by='Spot_ID')

    # 4. save output
    # modify column names and add 4dn header lines:
    # the single-column header frame is widened to the data width, the column
    # names become the first data row, and both parts are stacked.
    # NOTE(review): the three metadata columns added above are written to the
    # file but are not listed in the '##columns=' header line — confirm.
    n_cols = len(df_4dn_all_sort.columns)
    header_lines_df_fill = header_lines_df.reindex(columns = list(range(n_cols)))
    df_4dn_new = pd.DataFrame(np.vstack([df_4dn_all_sort.columns,df_4dn_all_sort]))
    df_4dn_clean = pd.concat([header_lines_df_fill,df_4dn_new])

    df_4dn_clean.to_csv(data_4dn_savename, header=None, index=False)
    print(f'Result saved.')

Load the pick result as 20220316exp_pick_chr1.h5 with 20 files in total.
Start processing.
Write in the function to multiprocess as a temp file.
Complete multiprocess; remove the temp file for the function.
00:00:31.04
Result saved.
Load the pick result as 20220402exp_pick_chr1.h5 with 20 files in total.
Start processing.
Write in the function to multiprocess as a temp file.
Complete multiprocess; remove the temp file for the function.
00:00:30.72
Result saved.
Load the pick result as 20220419exp_pick_chr1.h5 with 20 files in total.
Start processing.
Write in the function to multiprocess as a temp file.
Complete multiprocess; remove the temp file for the function.
00:00:29.86
Result saved.
Load the pick result as 20220713exp_pick_chr1.h5 with 20 files in total.
Start processing.
Write in the function to multiprocess as a temp file.
Complete multiprocess; remove the temp file for the function.
00:00:39.92
Result saved.
