# Import packages

In [1]:
import os

import h5py
import sys
import numpy as np
import pandas as pd
import copy
import tqdm
import matplotlib.pyplot as plt

import ast
sys.path.append(r"C:\Users\shiwei\Documents")

print(os.getpid())

36244


# Define functions

## functions to convert data format

In [2]:
# funtion to convert
# funtion to convert
def format_4dn_dataframe_demultiplexing (spot_df, pixel_size, required_cols,
                                #ExpFovCell_2_uid  #old version
                                 ):
    
    from tqdm import tqdm
    print (f'Process dataframe with {len(spot_df)} spots.')
    
    df_4dn = spot_df[['FOV','orig_cellID',]]
    df_4dn['FOV_ID']= df_4dn['FOV'].apply(lambda x: str(x))
    df_4dn['CellID_byFOV'] = df_4dn['orig_cellID'].apply(lambda x: x.split('Cell-')[1])

    ##################################################
    # 2. get other columns
    other_cols = [_c for _c in required_cols if _c not in ['FOV','orig_cellID',]]
    df_4dn[other_cols] = spot_df[other_cols]
    df_4dn[['Z','X','Y']] = df_4dn[['z_hat','x_hat','y_hat']] * pixel_size / 1000

    df_4dn=df_4dn[['X','Y','Z',
                   'center_intensity','center_internal_dist',
                 'FOV','orig_cellID', 'region_name',]]
    
    return df_4dn



import multiprocessing as mp
from functools import partial
import inspect

# create temp py to parallel
def parallel_task(func, zipped_iterables):
    import os, time
    cwd = os.getcwd()
    print ("Write in the function to multiprocess as a temp file.")
    
    with open(os.path.join(cwd,f'tmp_func.py'), 'w') as file:
        file.write(inspect.getsource(func).replace(func.__name__, "function_to_mp"))
        
    from tmp_func import function_to_mp

    if __name__ == '__main__':
        start_time = time.time()
        with mp.Pool(16) as mp_pool:
            mp_res = mp_pool.starmap(function_to_mp, zipped_iterables,chunksize=1)
            mp_pool.close()
            mp_pool.join()
            mp_pool.terminate()
        
        elapsed_time = time.time() - start_time
        hours, rem = divmod(elapsed_time, 3600)
        minutes, seconds = divmod(rem, 60)
        print (f"Complete multiprocess; remove the temp file for the function.")
        print("{:0>2}:{:0>2}:{:05.2f}".format(int(hours),int(minutes),seconds))
        if os.path.exists(os.path.join(cwd,f'tmp_func.py')):
            os.remove(os.path.join(cwd,f'tmp_func.py'))
        
        return mp_res
    else:
        raise "Not in Jupyter Notebook"

## prepare header lines 

In [3]:
table_name_list = ['4dn_FOF-CT_core', '4dn_FOF-CT_cell', '4dn_FOF-CT_rna_spot', '4dn_FOF-CT_demultiplexing']
curr_table_name = '4dn_FOF-CT_demultiplexing'
add_table_name = ', '.join([t for t in table_name_list if t!=curr_table_name])
add_table_name

'4dn_FOF-CT_core, 4dn_FOF-CT_cell, 4dn_FOF-CT_rna_spot'

In [4]:
# change header line below manually
header_dict_v0 = {'##FOF-CT_version=':'v0.1',
                  '##Table_namespace=':curr_table_name,
                  '#lab_name:':'Xiaowei Zhuang lab', 
                  '#experimenter_name:':'Pu Zheng, Shiwei Liu',
                  '#experimenter_contact:':'pu_zheng@g.harvard.edu, shiweiliu@fas.harvard.edu',
                  '#experiment_description:':'This experiment set contains integrated RNA- and DNA-MERFISH of the mouse MOp region and a portion of adjacent regions',
                  '#table_description:':'This table contains decoded candidate DNA spots from inidividual cells from the mouse MOp region and a portion of adjacent regions',
                  '#Software_Title:':'ImageAnalysis3',
                  '#Software_Type:':'SpotLoc(by ImageAnalysis3)',
                  '#Software_Authors:':'Pu Zheng(ImageAnalysis3)',
                  '#Software_Description:':'ImageAnalysis3 for decoding and partition',
                  '#Software_Repository:':'https://github.com/zhengpuas47/ImageAnalysis3',
                  '#Software_PreferredCitationID:':'https://doi: 10.1016/j.cell.2020.07.032',
                  #'#additional_tables:':'4dn_FOF-CT_rna, 4dn_FOF-CT_quality, 4dn_FOF-CT_bio, 4dn_FOF-CT_demultiplexing, 4dn_FOF-CT_trace, 4dn_FOF-CT_cell',
                  '#additional_tables:':add_table_name,
                  '##genome_assembly=':'GRCm38(mm10)',
                  '##XYZ_unit=':'nm',
                  '##columns=':'(Spot_ID, X, Y, Z, Cell_ID, FOV_ID, CellID_byFOV)',
                  '##Intensity_measurement_method':'Spot centroid intensity by 3D Gaussian function',
                  '##intensity_unit=':'a.u.',
                  '#Loc_ID:':'Same as below',
                  '#Spot_ID:':'unique DNA spot identifier across 4dn_FOF-CT_core and 4dn_FOF-CT_demultiplxeing for the same replicate',
                  '#X:':'X coordinate of the fitted spot relative to the FOV',
                  '#Y:':'Y coordinate of the fitted spot relative to the FOV',
                  '#Z:':'Z coordinate of the fitted spot relative to the FOV',
                  '#center_intensity':'Fitted spot intensity by 3D Gaussian function',
                  '#center_internal_dist':'Fitted spot distance metrics by 3D Gaussian function',
                  '#FOV:':'FOV ID across datatables for the same replicate',
                  '#CellID_byFOV':'specific cell identifier relative to the FOV',
                  '#region_name':'encoding name of the fitted spot according to the DNA-MERFISH codebook',

                 }


In [5]:
header_dict_v0

{'##FOF-CT_version=': 'v0.1',
 '##Table_namespace=': '4dn_FOF-CT_demultiplexing',
 '#lab_name:': 'Xiaowei Zhuang lab',
 '#experimenter_name:': 'Pu Zheng, Shiwei Liu',
 '#experimenter_contact:': 'pu_zheng@g.harvard.edu, shiweiliu@fas.harvard.edu',
 '#experiment_description:': 'This experiment set contains integrated RNA- and DNA-MERFISH of the mouse MOp region and a portion of adjacent regions',
 '#table_description:': 'This table contains decoded candidate DNA spots from inidividual cells from the mouse MOp region and a portion of adjacent regions',
 '#Software_Title:': 'ImageAnalysis3',
 '#Software_Type:': 'SpotLoc(by ImageAnalysis3)',
 '#Software_Authors:': 'Pu Zheng(ImageAnalysis3)',
 '#Software_Description:': 'ImageAnalysis3 for decoding and partition',
 '#Software_Repository:': 'https://github.com/zhengpuas47/ImageAnalysis3',
 '#Software_PreferredCitationID:': 'https://doi: 10.1016/j.cell.2020.07.032',
 '#additional_tables:': '4dn_FOF-CT_core, 4dn_FOF-CT_cell, 4dn_FOF-CT_rna_spot'

In [6]:
# convert header lines to dataframe
header_lines = [k+v for k,v in header_dict_v0.items()]
header_lines_df = pd.DataFrame(header_lines)
header_lines_df

Unnamed: 0,0
0,##FOF-CT_version=v0.1
1,##Table_namespace=4dn_FOF-CT_demultiplexing
2,#lab_name:Xiaowei Zhuang lab
3,"#experimenter_name:Pu Zheng, Shiwei Liu"
4,"#experimenter_contact:pu_zheng@g.harvard.edu, ..."
5,#experiment_description:This experiment set co...
6,#table_description:This table contains decoded...
7,#Software_Title:ImageAnalysis3
8,#Software_Type:SpotLoc(by ImageAnalysis3)
9,#Software_Authors:Pu Zheng(ImageAnalysis3)


# Load and convert spot dataframe

## define parameters for replicate

In [7]:
# pick main folder; where sorted spots are saved
export_main_folder = r'\\10.245.74.158\Chromatin_NAS_8\Exported_data\jie_CW_decode'
# paramter version used for pick
cand_file_name = 'cand_spots_part'

# define name and other metadata info in correct order for each experiment below
exp_name_list = ['20220316exp', '20220402exp', '20220419exp', '20220713exp']
RNA_experiment_ID_list = ['20220304','20220329','20220415','20220418']
DNA_experiment_ID_list = ['20220316','20220402','20220419','20220713']
sample_ID_list = ['C57BL/6_M_1_MOp_1','C57BL/6_M_2_MOp_1','C57BL/6_M_3_MOp_1','C57BL/6_M_3_MOp_2']

exp_sample_type = 'WT' # to further distinguish the output exp name in case the date is the same
# naming for 4DN alias
Bio_num_list = [1,2,3,3]
Tech_num_list = [1,1,1,2]
table_type = curr_table_name.split('4dn_FOF-CT_')[1]

# output folder to save the converted df
output_folder =r'F:\4DN_deposit\new_version'


In [8]:
# shared parameters
# required cols from the data to be processed
required_cols = ['x_hat', 'y_hat', 'z_hat', 
                 'center_intensity','center_internal_dist',
                 'FOV','orig_cellID', 'region_name', 
                ]

pixel_size = 108 #'z_hat','x_hat','y_hat' are pseduo pixels where z_hat unit is equvilant to x_hat 

## process data

In [10]:
# loop through all replicate and batch process each
import tqdm
for _exp_ind, _exp_name in enumerate(exp_name_list[:]):
    experiment_key= RNA_experiment_ID_list[_exp_ind] + '_' + exp_sample_type
    data_4dn_savename = os.path.join(output_folder, f'{experiment_key}_demultiplexing_table.csv')
    Bio_num = Bio_num_list[_exp_ind]
    Tech_num = Tech_num_list[_exp_ind]
    data_4dn_savename = os.path.join(output_folder, f'fileproc_genome_wide_brain_ct_B{Bio_num}_T{Tech_num}_{table_type}_f1.csv')
    if os.path.exists(data_4dn_savename):
        print (f'Output already exists for {experiment_key}. Skip.')
    
    else:
        # 1. load subdf into list for mp
        spot_df_list = []
        export_data_folder = os.path.join(export_main_folder, exp_name_list[_exp_ind])
        decoded_file_list = [os.path.join(export_data_folder, _fl) for _fl in os.listdir(export_data_folder) if '.csv' in _fl]
        decoded_file_list = [_fl for _fl in decoded_file_list if cand_file_name in _fl]

        for _decoded_file in tqdm.tqdm(decoded_file_list[:]):
            spot_df_list.append(pd.read_csv(_decoded_file))

        # 2. batch convert data
        print(f'Start processing.')
        pixel_size_list = [pixel_size,]*len(spot_df_list)
        required_cols_list = [required_cols,]*len(spot_df_list)

        df_4dn_list = parallel_task(format_4dn_dataframe_demultiplexing, zip(spot_df_list,
                                                      pixel_size_list,
                                                      required_cols_list,
                                                      #ExpFovCell_2_uid_list
                                                           ))
        
        
        df_4dn_all= pd.concat(df_4dn_list)
        df_4dn_all['RNA_experiment_ID'] = RNA_experiment_ID_list[_exp_ind]
        df_4dn_all['DNA_experiment_ID'] = DNA_experiment_ID_list[_exp_ind]
        df_4dn_all['Sample_ID'] = sample_ID_list[_exp_ind]

        # 3. reset index after pd concat and add Spot ID
        ## note the Spot ID is the original index after merging csv parts; which goes from 0, 10, 1, 2... by deafult
        ## do not change this to make sure the Spot ID is correct across core table and this table
        df_4dn_all.reset_index(inplace=True)
        df_4dn_all.reset_index(inplace=True)
        df_4dn_all = df_4dn_all.rename(columns={'level_0':'Loc_ID', 'index':'Spot_ID'})
        df_4dn_all_sort = df_4dn_all.sort_values(by='Spot_ID')


        # 4. save output
        # modify column names and add 4dn header lines
        header_lines_df_fill = header_lines_df.reindex(columns = list(np.arange(0,len(df_4dn_all_sort.columns))))
        df_4dn_new = pd.DataFrame(np.vstack([df_4dn_all_sort.columns,df_4dn_all_sort]))
        df_4dn_clean = pd.concat([header_lines_df_fill,df_4dn_new])

        df_4dn_clean.to_csv(data_4dn_savename, header=None, index=False)
        print(f'Result saved.')

      


100%|██████████████████████████████████████████████████████████████████████████████████| 14/14 [02:39<00:00, 11.39s/it]


Start processing.
Write in the function to multiprocess as a temp file.
Complete multiprocess; remove the temp file for the function.
00:00:55.51
Result saved.


100%|██████████████████████████████████████████████████████████████████████████████████| 14/14 [02:28<00:00, 10.59s/it]


Start processing.
Write in the function to multiprocess as a temp file.
Complete multiprocess; remove the temp file for the function.
00:00:52.92
Result saved.


100%|██████████████████████████████████████████████████████████████████████████████████| 18/18 [03:29<00:00, 11.61s/it]


Start processing.
Write in the function to multiprocess as a temp file.
Complete multiprocess; remove the temp file for the function.
00:01:08.91
Result saved.


100%|██████████████████████████████████████████████████████████████████████████████████| 15/15 [02:43<00:00, 10.91s/it]


Start processing.
Write in the function to multiprocess as a temp file.
Complete multiprocess; remove the temp file for the function.
00:01:02.83
Result saved.


In [14]:
df_4dn_clean[20:40]

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12
20,#Spot_ID:unique DNA spot identifier across 4dn...,,,,,,,,,,,,
21,#X:X coordinate of the fitted spot relative to...,,,,,,,,,,,,
22,#Y:Y coordinate of the fitted spot relative to...,,,,,,,,,,,,
23,#Z:Z coordinate of the fitted spot relative to...,,,,,,,,,,,,
24,#center_intensityFitted spot intensity by 3D G...,,,,,,,,,,,,
25,#center_internal_distFitted spot distance metr...,,,,,,,,,,,,
26,#FOV:FOV ID across datatables for the same rep...,,,,,,,,,,,,
27,#CellID_byFOVspecific cell identifier relative...,,,,,,,,,,,,
28,#region_nameencoding name of the fitted spot a...,,,,,,,,,,,,
0,Loc_ID,Spot_ID,X,Y,Z,center_intensity,center_internal_dist,FOV,orig_cellID,region_name,RNA_experiment_ID,DNA_experiment_ID,Sample_ID
