# Import packages

In [2]:
import os

import h5py
import sys
import numpy as np
import pandas as pd
import copy
import tqdm
import matplotlib.pyplot as plt

import ast
# Add a user-specific folder to the module search path.
# NOTE(review): hard-coded absolute Windows path; presumably it holds local helper
# packages -- confirm it is still needed, since no import in the visible cells relies on it.
sys.path.append(r"C:\Users\shiwei\Documents")

# Print the process ID of the running kernel.
print(os.getpid())

42352


# Define functions

## functions to convert data format

In [2]:
## function to convert
def format_4dn_dataframe_adata (adata_analyzed, adata_raw,required_cols,experiment_key=None):
    """Build the 4DN cell table (metadata + raw RNA counts) for one experiment.

    Parameters
    ----------
    adata_analyzed : AnnData-like
        Analyzed data. Its ``obs`` must contain an 'experiment' column and every
        column listed in ``required_cols``; ``obs`` is indexed by cell ID.
    adata_raw : AnnData-like
        Raw counts. Its ``obs`` must contain an 'experiment' column; ``X`` holds the
        counts (dense array, or any object exposing ``toarray()``), ``var_names`` the
        gene names and ``obs_names`` the cell IDs.
    required_cols : list of str
        Columns of ``adata_analyzed.obs`` to export. Known columns are renamed to
        their 4DN names (see ``rename_dict`` below); others keep their name.
    experiment_key : optional
        Value of the 'experiment' column to export. When None, the first value
        returned by ``np.unique`` over ``adata_analyzed.obs['experiment']`` is used.

    Returns
    -------
    pandas.DataFrame
        One row per cell: 'Cell_ID', the renamed metadata columns, then one column
        per gene with the raw count. If the number of retrieved count rows differs
        from the number of cells, a warning is issued and the count columns are omitted.

    Raises
    ------
    KeyError
        If ``adata_analyzed.obs`` has no 'experiment' column, or if a selected cell
        is missing from the raw counts.
    """
    import warnings

    # 1. get and rename required column from analyzed adata
    if 'experiment' not in adata_analyzed.obs.columns:
        raise KeyError("'experiment' column not found in adata_analyzed.obs")
    if experiment_key is None:
        experiment_key = np.unique(adata_analyzed.obs['experiment'])[0]
    adata_analyzed = adata_analyzed[adata_analyzed.obs['experiment']==experiment_key]
    print (f'Process Experiment_{experiment_key} from with {len(adata_analyzed)} cells.')

    rename_dict = {'fov': 'FOV_ID',
                   'volume':'cell_volume_from_merlin',
                   'center_x':'cell_center_x_global',
                   'center_y':'cell_center_y_global',
                   'experiment':'RNA_experiment_ID',
                   'subclass_label_new':'cluster_subclass',
                   'class_label_new':'cluster_class',
                   'neuron_identity':'neuron_identity'}
    # rename_axis/rename return new objects, so the index of the caller's `obs`
    # is never renamed as a side effect
    df_4dn = (adata_analyzed.obs[required_cols]
              .rename_axis('Cell_ID')
              .rename(columns=rename_dict))

    # 2. get gene count column from raw adata
    adata_raw = adata_raw[adata_raw.obs['experiment']==experiment_key]
    raw_counts = adata_raw.X
    if hasattr(raw_counts, 'toarray'):
        # sparse matrices cannot be passed to pd.DataFrame directly
        raw_counts = raw_counts.toarray()
    adata_raw_count = pd.DataFrame(raw_counts,
                                   columns=adata_raw.var_names,
                                   index=adata_raw.obs_names).rename_axis('Cell_ID')
    # align the raw counts to the analyzed cells (KeyError if a cell is missing)
    sel_adata_raw_count = adata_raw_count.loc[df_4dn.index]
    if len(sel_adata_raw_count)==len(df_4dn):
        print (f'Retrieve RNA raw counts from {len(sel_adata_raw_count)} cells')
        df_4dn=pd.concat([df_4dn, sel_adata_raw_count], axis=1)
    else:
        # previously skipped silently; keep returning metadata only, but say so
        warnings.warn(
            f'Retrieved {len(sel_adata_raw_count)} raw-count rows for {len(df_4dn)} cells '
            f'(duplicated cell IDs?); RNA counts are NOT included in the output.'
        )

    # 3. reset index
    df_4dn = df_4dn.reset_index()

    return df_4dn


#import multiprocessing as mp
#from functools import partial
#import inspect


## prepare header lines

In [4]:
# Namespace of the table produced by this notebook
curr_table_name = '4dn_FOF-CT_cell'
# Namespaces of all tables in the deposit
table_name_list = [
    '4dn_FOF-CT_core',
    '4dn_FOF-CT_cell',
    '4dn_FOF-CT_rna_spot',
    '4dn_FOF-CT_demultiplexing',
]
# Comma-separated list of every table except the current one
add_table_name = ', '.join(name for name in table_name_list if name != curr_table_name)
add_table_name

'4dn_FOF-CT_core, 4dn_FOF-CT_rna_spot, 4dn_FOF-CT_demultiplexing'

In [7]:
# Header entries of the 4DN cell table. Each key already carries its prefix
# ('##'/'#') and separator ('='/':'); a header line is simply key + value.
# '##columns=' must list the columns in the order they are written to the file:
# Cell_ID, the renamed metadata columns, 'Sample_ID' inserted at position 6,
# then the per-gene raw counts.
# NOTE(review): some description strings below contain spelling errors
# ('inidividual', 'lable', 'higer'); they are kept verbatim here -- fix before deposit if desired.
header_dict_v0 = {'##FOF-CT_version=':'v0.1',
                  '##Table_namespace=':curr_table_name,
                  '#lab_name:':'Xiaowei Zhuang lab', 
                  '#experimenter_name:':'Pu Zheng, Shiwei Liu',
                  '#experimenter_contact:':'pu_zheng@g.harvard.edu, shiweiliu@fas.harvard.edu',
                  '#experiment_description:':'This experiment set contains integrated RNA- and DNA-MERFISH of the mouse MOp region and a portion of adjacent regions',
                  '#table_description:':'This table contains RNA counts of interest in inidividual cells from the mouse MOp region and a portion of adjacent regions',
                  '#Software_Title:':'Merlin, Scanpy',
                  '#Software_Type:':'SpotLoc + Decoding (by Merlin), RNA preprocessing and RNA-based cluster analysis (by Scanpy)',
                  '#Software_Authors:':'George Emanuel, Stephen Eichhorn, Leonardo Sepulveda (Merlin)',
                  '#Software_Description:':'Merlin for RNA-MERFISH decoding',
                  '#Software_Repository:':'https://github.com/emanuega/MERlin, https://github.com/scverse/scanpy',
                  '#Software_PreferredCitationID:':'doi:10.5281/zenodo.3758540, https://doi.org/10.1186/s13059-017-1382-0',
                  '#additional_tables:':add_table_name,
                  '##genome_assembly=':'GRCm38(mm10)',
                  '##XYZ_unit=':'micron',
                  # fixed: previously omitted FOV_ID and Sample_ID and named the experiment column 'experiment_ID'
                  '##columns=':'(Cell_ID, FOV_ID, cell_volume_from_merlin, cell_center_x_global, cell_center_y_global, RNA_experiment_ID, Sample_ID, cluster_subclass, cluster_class, neuron_identity, RNA count for gene A,B,...)',
                  '#Cell_ID:':'unique cell identifier across the datatables',
                  '#cell_center_x_global:':'x coordinate of the cell relative to all cells for the experiment replicate',
                  '#cell_center_y_global:':'y coordinate of the cell relative to all cells for the experiment replicate',
                  '#cluster_subclass:':'transcriptionally defined cell cluster lable at subclass level',
                  '#cluster_class:':'transcriptionally defined cell cluster lable at a higer level',
                  '#RNA count for gene A:':'decoded raw RNA count for gene A (e.g., gene 1700022I11Rik)'
                 }


In [8]:
# Join every header key with its value into one text line, then store the
# lines as a single-column dataframe (one row per header line)
header_lines = [f'{key}{value}' for key, value in header_dict_v0.items()]
header_lines_df = pd.DataFrame(header_lines)
header_lines_df

Unnamed: 0,0
0,##FOF-CT_version=v0.1
1,##Table_namespace=4dn_FOF-CT_cell
2,#lab_name:Xiaowei Zhuang lab
3,"#experimenter_name:Pu Zheng, Shiwei Liu"
4,"#experimenter_contact:pu_zheng@g.harvard.edu, ..."
5,#experiment_description:This experiment set co...
6,#table_description:This table contains RNA cou...
7,"#Software_Title:Merlin, Scanpy"
8,"#Software_Type:SpotLoc + Decoding (by Merlin),..."
9,"#Software_Authors:George Emanuel, Stephen Eich..."


# Load and convert anndata

## define parameters for experiment 

In [9]:
# Per-experiment metadata. The lists below are parallel: position i in every
# list refers to the same experiment, so keep them in the same order.
#exp_name_list = ['20220316exp', '20220402exp', '20220419exp', '20220713exp']
#DNA_experiment_ID_list = ['20220316','20220402','20220419','20220713']
RNA_experiment_ID_list = [
    '20220304',
    '20220329',
    '20220415',
    '20220418',
]
sample_ID_list = [
    'C57BL/6_M_1_MOp_1',
    'C57BL/6_M_2_MOp_1',
    'C57BL/6_M_3_MOp_1',
    'C57BL/6_M_3_MOp_2',
]

# numbers used for the 4DN alias (B{Bio_num}_T{Tech_num} in the output file name)
Bio_num_list = [1, 2, 3, 3]
Tech_num_list = [1, 1, 1, 2]

# suffix to further distinguish the output exp name in case the date is the same
exp_sample_type = 'WT'

# table type = the part of the table namespace after the '4dn_FOF-CT_' prefix
table_type = curr_table_name.split('4dn_FOF-CT_')[1]

# folder where the converted tables are written
output_folder = r'F:\4DN_deposit\new_version'

In [10]:
# Columns of the analyzed `obs` table that are exported to the 4DN cell table
required_cols = [
    'fov',
    'volume',
    'center_x',
    'center_y',
    'experiment',
    'subclass_label_new',
    'class_label_new',
    'neuron_identity',
]

## load analyzed rna-merfish data

In [11]:
import scanpy as sc

# Read the analyzed (labeled) RNA-MERFISH AnnData file from the merged-analysis folder
rna_folder = r'L:\Shiwei\RNA_MERFISH_analysis\Merged_nonclear'
analyzed_adata_fname = os.path.join(rna_folder, 'new_labeled_data.h5ad')
adata_analyzed = sc.read(analyzed_adata_fname)

## load raw data from the merlin output

In [12]:
import anndata

# Raw counts come straight from the MERlin output folders, one per experiment.
# (A merged raw AnnData, if available, could be used instead.)
data_folders = [
    r'\\10.245.74.158\Chromatin_NAS_8\MERFISH\v2\20220304-storm6_M1',
    r'\\10.245.74.158\Chromatin_NAS_8\MERFISH\v2\20220329-storm6_M1',
    r'\\10.245.74.158\Chromatin_NAS_8\MERFISH\v2\20220415-storm65',
    r'\\10.245.74.158\Chromatin_NAS_8\MERFISH\v2\20220418-storm6',
]

adata_list = []
for _data_folder in data_folders:

    # per-cell metadata exported by MERlin
    cell_meta_data_file = os.path.join(_data_folder, 'ExportCellMetadata', 'feature_metadata.csv')
    cell_meta_data = pd.read_csv(cell_meta_data_file)

    # per-cell barcode counts, indexed by the first (unnamed) csv column
    counts_file_mer = os.path.join(_data_folder, 'ExportPartitionedBarcodes', 'barcodes_per_feature.csv')
    df = pd.read_csv(counts_file_mer)
    df = df.rename(columns={'Unnamed: 0': 'index'}).set_index('index')

    # discard every column whose name contains 'Blank-'
    blank_cols = [_col for _col in df.columns if 'Blank-' in _col]
    df = df.drop(columns=blank_cols)

    # wrap the counts and attach the metadata rows matching each cell
    adata = anndata.AnnData(df)
    merged_obs = adata.obs.merge(cell_meta_data, left_index=True, right_on='Unnamed: 0')
    adata.obs = merged_obs.set_index('Unnamed: 0')

    # experiment label = folder name up to the first '-'
    folder_name = os.path.basename(_data_folder)
    adata.obs['experiment'] = folder_name.split('-')[0]

    adata_list.append(adata)

# stack all experiments into one raw AnnData
adata_raw = anndata.concat(adata_list)


  adata = anndata.AnnData(df)
  adata = anndata.AnnData(df)
  adata = anndata.AnnData(df)
  adata = anndata.AnnData(df)


## Convert each replicate and save the 4DN cell table

In [13]:
# Convert every replicate to a 4DN cell table and write it as csv.
# An existing output file is never overwritten; that replicate is skipped.
os.makedirs(output_folder, exist_ok=True)  # make sure the target folder exists before writing

for _exp_ind, _exp_name in enumerate(RNA_experiment_ID_list):

    # kept for reference; not used by the export below
    experiment_name_key = f'{_exp_name}_{exp_sample_type}'
    Bio_num = Bio_num_list[_exp_ind]
    Tech_num = Tech_num_list[_exp_ind]
    data_4dn_savename = os.path.join(output_folder, f'fileproc_genome_wide_brain_ct_B{Bio_num}_T{Tech_num}_{table_type}_f1.csv')

    if os.path.exists(data_4dn_savename):
        # fixed: the message used the undefined name `experiment_key` (NameError on skip)
        print (f'Output already exists for {_exp_name}. Skip.')
        continue

    df_4dn = format_4dn_dataframe_adata (adata_analyzed, adata_raw,required_cols,experiment_key=_exp_name)
    # Sample_ID goes right after RNA_experiment_ID (column position 6)
    df_4dn.insert(6, 'Sample_ID', sample_ID_list[_exp_ind])

    # Pad the single-column header block to the width of the data table, turn the
    # column names into the first data row, and stack header lines on top.
    header_lines_df_fill = header_lines_df.reindex(columns = list(np.arange(0,len(df_4dn.columns))))
    df_4dn_new = pd.DataFrame(np.vstack([df_4dn.columns,df_4dn]))
    df_4dn_clean = pd.concat([header_lines_df_fill,df_4dn_new])

    # the column names are already a data row, so no pandas header/index is written
    df_4dn_clean.to_csv(data_4dn_savename, header=False, index=False)
    print('Result saved.')
        

Process Experiment_20220304 from with 17856 cells.
Retrieve RNA raw counts from 17856 cells
Result saved.
Process Experiment_20220329 from with 13383 cells.
Retrieve RNA raw counts from 13383 cells
Result saved.
Process Experiment_20220415 from with 16760 cells.
Retrieve RNA raw counts from 16760 cells
Result saved.
Process Experiment_20220418 from with 14733 cells.
Retrieve RNA raw counts from 14733 cells
Result saved.


In [14]:
# Show the table of the last replicate converted by the loop above.
# `df_4dn` is only assigned when a replicate is actually processed, so this
# fails on a fresh kernel if every output file already existed.
df_4dn

Unnamed: 0,Cell_ID,FOV_ID,cell_volume_from_merlin,cell_center_x_global,cell_center_y_global,RNA_experiment_ID,Sample_ID,cluster_subclass,cluster_class,neuron_identity,...,Unc13c,Unc5b,Unc5d,Ust,Vipr2,Vtn,Vwc2,Wipf3,Wnt7b,Zfp804b
0,102234820813605596443261069649938311266,0,331.958507,-1544.349973,835.380014,20220418,C57BL/6_M_3_MOp_2,Oligo,Oligo,Non-Neuronal,...,1.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,106948495560734704290624946485686169930,0,888.207581,-1524.477972,965.898018,20220418,C57BL/6_M_3_MOp_2,Oligo,Oligo,Non-Neuronal,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,112222019090017532719270731442181879359,0,1630.954810,-1500.501971,827.442014,20220418,C57BL/6_M_3_MOp_2,L6b,Gluta,Neuronal,...,1.0,0.0,0.0,0.0,1.0,0.0,1.0,38.0,0.0,0.0
3,116703671006753370581678359143865340850,0,374.513680,-1624.863975,779.544013,20220418,C57BL/6_M_3_MOp_2,Oligo,Oligo,Non-Neuronal,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,117433500093892789984785934662224853977,0,685.967690,-1515.891972,863.730015,20220418,C57BL/6_M_3_MOp_2,Oligo,Oligo,Non-Neuronal,...,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14728,216743509822996359971425086529065233172,199,1524.797824,756.829978,-466.464022,20220418,C57BL/6_M_3_MOp_2,other,other,other,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
14729,235791795914903367868231154493525204064,199,1391.762666,811.045979,-394.590020,20220418,C57BL/6_M_3_MOp_2,other,other,other,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
14730,281137009595534032327996677664017059078,199,1145.897197,816.877979,-425.208021,20220418,C57BL/6_M_3_MOp_2,other,other,other,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
14731,53631180073186654291241335400076244597,199,1751.923793,783.721978,-462.360022,20220418,C57BL/6_M_3_MOp_2,other,other,other,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
