# Import libraries

In [1]:
import pandas as pd
import anndata
import os

# Define output

In [2]:
# flag checked by the final save step before the h5ad file is written
overwrite = True

# folder and file name used for the saved AnnData (.h5ad) output
output_folder = r'output_folder'
output_name = 'MeCP2_raw_adata.h5ad'

# Define experiment ID and bad fovs

Bad FOVs are identified by manually checking the fiducial warps of every experiment. FOVs with poor alignment (usually resulting from improper focus lock) are marked as bad FOVs.

In [3]:
### each experiment is identified by the date on which the MERFISH run was started
experiment_ids = ['0722', '0724', '0808', '0809', '1027', '1029', '1101', '1103', '1105', '1107']

### experiments that belong to each mouse
mouse_id = {
    'M1': ['0722', '0724'],
    'M2': ['0808', '0809'],
    'Ma': ['1027', '1029'],
    'Mb': ['1101', '1103'],
    'Mc': ['1105', '1107'],
}
### age label of each mouse
mouse_age = {
    'M1': '2.5_month',
    'M2': '2.5_month',
    'Ma': '6_month',
    'Mb': '6_month',
    'Mc': '6_month',
}

### reverse lookup: experiment id -> mouse it came from
exp_to_mouse = {
    _exp: _mouse
    for _mouse, _exps in mouse_id.items()
    for _exp in _exps
}

In [4]:
### fovs to exclude, keyed by experiment id (an empty list means nothing is excluded)
bad_fovs = {
    '0722': [],
    '0724': [0, 1, 2, 5, 6],
    '0808': [],
    '0809': [],
    '1027': [0, 1, 55, 157, 158],
    '1029': [0, 123, 54, 55, 56, 57, 172, 224, 78],
    '1101': [57, 73, 172, 0],
    '1103': [0, 49, 103, 158],
    '1105': [68, 133],
    '1107': [63, 107, 110, 130, 186, 195, 203, 205, 247, 253],
}

# Define files for cell meta and gene expression data

In [5]:
# parent directory; every experiment sub-folder in it holds the MERlin output
analysis_parent_folder = r'MERFISH_analysis'

# experiment id -> analysis folder of that experiment
analysis_folder = {}
for exp in experiment_ids:
    _plain = os.path.join(analysis_parent_folder, f'Mecp2_{exp}')
    _suffixed = os.path.join(analysis_parent_folder, f'Mecp2_{exp}_p1')
    # certain replicates carry a '_p1' suffix: use it when the plain name is absent
    analysis_folder[exp] = _plain if os.path.exists(_plain) else _suffixed

In [6]:
# experiment id -> csv with the per-cell metadata
cell_metadata_csv = {
    exp: os.path.join(fd, 'ExportCellMetadata', 'feature_metadata.csv')
    for exp, fd in analysis_folder.items()
}
# experiment id -> csv with the barcode counts per cell
barcode_csv = {
    exp: os.path.join(fd, 'ExportPartitionedBarcodes', 'barcodes_per_feature.csv')
    for exp, fd in analysis_folder.items()
}

# Load data

In [10]:
# Read the cell metadata of every experiment, discard cells located in bad
# fovs, annotate each table with its experiment / mouse / age, and stack
# everything into a single uid-indexed table.
cell_tables = []
for exp, cell_file in cell_metadata_csv.items():
    if not os.path.exists(cell_file):
        print(f'Data for {exp} is NOT available!')
        continue

    _table = pd.read_csv(cell_file)
    # the first (unnamed) csv column holds the cell uid
    _table = _table.rename(columns={'Unnamed: 0':'uid'}).set_index('uid')

    # keep only the cells whose fov is not listed as bad for this experiment
    _in_bad_fov = _table.fov.isin(bad_fovs[exp])
    _table = _table[~_in_bad_fov].copy()

    _mouse_name = exp_to_mouse[exp]
    _table['experiment'] = exp
    _table['mouse'] = _mouse_name
    _table['age'] = mouse_age[_mouse_name]
    cell_tables.append(_table.copy())

    num_cells = len(_table)
    print(f'MeCP2_{exp} has {num_cells} cells.')

df_cell = pd.concat(cell_tables)

# free memory
del cell_tables

df_cell.head()

MeCP2_0722 has 8359 cells.
MeCP2_0724 has 9036 cells.
MeCP2_0808 has 10419 cells.
MeCP2_0809 has 12224 cells.
MeCP2_1027 has 10825 cells.
MeCP2_1029 has 12065 cells.
MeCP2_1101 has 10547 cells.
MeCP2_1103 has 9804 cells.
MeCP2_1105 has 10081 cells.
MeCP2_1107 has 9966 cells.


Unnamed: 0_level_0,fov,volume,center_x,center_y,min_x,max_x,min_y,max_y,experiment,mouse,age
uid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
101667997425461984138916033203233536307,0,472.728414,-246.106008,-68.714001,-284.402809,-207.809207,-88.758802,-48.669201,722,M1,2.5_month
106449295704141667843638205429707704856,0,474.689483,-255.070008,61.210002,-279.866809,-230.273207,54.233202,68.186802,722,M1,2.5_month
132212124482257977606837917069895160679,0,483.730716,-256.798008,-79.946002,-293.690809,-219.905207,-124.398803,-35.493201,722,M1,2.5_month
132758886733787380228930901777346454633,0,300.552602,-231.526007,-36.854001,-256.754808,-206.297207,-48.366801,-25.3412,722,M1,2.5_month
165347171870155874748097163141548009156,0,105.266556,-312.850009,-16.01,-416.378812,-209.321207,-97.614802,65.594802,722,M1,2.5_month


In [None]:
### uids of the cells that remain in df_cell
kept_uids = df_cell.index.values
### display how many cells are kept
n_kept_cells = len(kept_uids)
n_kept_cells

In [None]:
# Load the per-cell barcode counts of every experiment, restricted to the
# cells listed in `kept_uids`, and build one uid-indexed table without the
# 'Blank-' columns.
df_barcode_list = []
for exp, barcode_file in barcode_csv.items():
    if not os.path.exists(barcode_file):
        # report the gap (as the cell metadata loop does) instead of silently
        # leaving this experiment out of the expression table
        print(f'Barcode data for {exp} is NOT available!')
        continue
    # the first (unnamed) csv column holds the cell uid
    _df = pd.read_csv(barcode_file).rename(columns={'Unnamed: 0':'uid'})
    # filter each table before concatenating so that discarded cells are never
    # copied into the merged table; row order is the same as filtering afterwards
    df_barcode_list.append(_df[_df['uid'].isin(kept_uids)])
df_barcode = pd.concat(df_barcode_list, ignore_index=True)

# free memory
del df_barcode_list

# drop blank columns (all at once, so the table is rebuilt only one time)
blank_columns = [_c for _c in df_barcode.columns if 'Blank-' in _c]
df_barcode = df_barcode.drop(columns=blank_columns)

df_barcode.set_index('uid', drop=True, inplace=True)
df_barcode.head()

In [None]:
# wrap the uid-indexed barcode table in an AnnData object
adata = anndata.AnnData(df_barcode)
# attach the cell metadata by matching on the uid index
# (DataFrame.merge performs an inner join by default)
_obs_with_meta = adata.obs.merge(df_cell, left_index=True, right_index=True)
adata.obs = _obs_with_meta
adata.obs.head()

# Save adata

In [None]:
# Write the AnnData object to <output_folder>/<output_name>.
# An existing file is only replaced when `overwrite` is True; a missing file
# is always written.
save_name = os.path.join(output_folder, output_name)
if overwrite or not os.path.exists(save_name):
    # make sure the destination folder exists before writing
    # (an empty folder name means the current directory, nothing to create)
    if output_folder:
        os.makedirs(output_folder, exist_ok=True)
    # announce before writing so the message is visible while the save runs
    print(f'Saving adata file into {save_name}')
    adata.write_h5ad(save_name)
else:
    print(f'{save_name} already exists, skip saving.')