# Import module

The link to get [ImageAnalysis3](https://github.com/zhengpuas47/ImageAnalysis3) 

Or from the Zhuang lab archived [source_tools](https://github.com/ZhuangLab/Chromatin_Analysis_2020_cell/tree/master/sequential_tracing/source)

## ImageAnalysis3 and basic modules

In [1]:
%run "C:\Users\shiwei\Documents\ImageAnalysis3\required_files\Startup_py3.py"
sys.path.append(r"C:\Users\shiwei\Documents")

import ImageAnalysis3 as ia
from ImageAnalysis3 import *
from ImageAnalysis3.classes import _allowed_kwds

import h5py
import ast
import pandas as pd

print(os.getpid())

40320


## Chromatin_analysis_tools etc

See **functions** in the repository for [AnalysisTool_Chromatin](../../README.md)

In [2]:
# Chromatin_analysis_tools (ATC)
# Get path for the py containing functions
import os
import sys
import importlib
module_path =r'C:\Users\shiwei\Documents\AnalysisTool_Chromatin'
if module_path not in sys.path:
    sys.path.append(module_path)
    
# import relevant modules
import gene_selection 
importlib.reload(gene_selection)
import gene_to_loci
importlib.reload(gene_to_loci)
import gene_activity
importlib.reload(gene_activity)
import loci_1d_features
importlib.reload(loci_1d_features)  

import atac_to_loci
importlib.reload(atac_to_loci)

<module 'atac_to_loci' from 'C:\\Users\\shiwei\\Documents\\AnalysisTool_Chromatin\\atac_to_loci.py'>

# Define folders

In [3]:
# root folder of this postanalysis version
postanalysis_folder = r'L:\Shiwei\postanalysis_2024\v0'
# resources exported by the preprocessing step (inputs of this notebook)
input_folder = os.path.join(postanalysis_folder, 'resources_from_preprocess')

# everything written by this notebook goes below <postanalysis_folder>/radial_position
output_main_folder = os.path.join(postanalysis_folder, 'radial_position')
output_analysis_folder = os.path.join(output_main_folder, 'analysis')
output_figure_folder = os.path.join(output_main_folder, 'figures')

# when False, missing folders are not created (existing ones are still reported)
make_output_folder = True

# same create-or-report logic for both output folders
for _folder_label, _folder in (('analysis', output_analysis_folder),
                               ('figure', output_figure_folder)):
    _folder_exists = os.path.exists(_folder)
    if make_output_folder and not _folder_exists:
        os.makedirs(_folder)
        print(f'Generating {_folder_label} folder: {_folder}.')
    elif _folder_exists:
        print(f'Use existing {_folder_label} folder: {_folder}.')

Use existing analysis folder: L:\Shiwei\postanalysis_2024\v0\radial_position\analysis.
Use existing figure folder: L:\Shiwei\postanalysis_2024\v0\radial_position\figures.


# Plotting parameters

In [4]:
%matplotlib inline
import matplotlib
matplotlib.rcParams['pdf.fonttype'] = 42
import matplotlib.pyplot as plt
plt.rc('font', family='serif')
plt.rc('font', serif='Arial')

from ImageAnalysis3.figure_tools import _double_col_width, _single_col_width, _font_size, _ticklabel_size,_ticklabel_width

import seaborn as sns
sns.set_context("paper", rc={"font.size":_font_size,"axes.titlesize":_font_size+1,"axes.labelsize":_font_size})  

In [5]:
# Other required plotting parameters
_dpi = 300
_font_size = 7
_page_width = 5.5


## cell type color-codes

In [6]:
# cell labels from RNA-MERFISH and celltype prediction
selected_cell_labels = ['L2/3 IT','L4/5 IT','L5 IT','L6 IT','L5 ET','L5/6 NP','L6 CT','L6b',
                           'Sst','Pvalb','Lamp5','Sncg','Vip',
                           'Astro','Oligo','OPC','Micro','Endo','VLMC','SMC','Peri', 
                           #'other',
                          ]
# cell palette from RNA-MERFISH UMAP and stats
celltype_palette = {'Astro':'lightcoral', 
                    'Endo':'skyblue', 
                    'L2/3 IT':'gold', 
                    'L4/5 IT':'darkorange', 
                    'L5 ET':'mediumseagreen', 
                    'L5 IT':'aqua',
                    'L5/6 NP':'darkgreen',
                    'L6 CT':'brown',
                    'L6 IT':'magenta',
                    'L6b':'blue', 
                    'Lamp5':'orange', 
                    'Micro':'peachpuff',
                    'OPC':'thistle', 
                    'Oligo':'darkviolet',
                    'Peri':'sandybrown',
                    'Pvalb':'springgreen',
                    'SMC':'rosybrown',
                    'Sncg':'darkkhaki',
                    'Sst':'steelblue', 
                    'VLMC':'saddlebrown', 
                    'Vip':'red',
                    'other':'slategray'}


In [7]:
# plotting order based on the snRNA transcriptional activity (use if needed)
sorted_cellplot_order_byRNA = ['Micro', 'Oligo', 'Endo', 'OPC', 'Astro', 'Vip', 'Lamp5',
                  'L5/6 NP', 'Sst', 'Sncg', 'Pvalb', 'L4/5 IT', 'L6 CT',
                  'L6 IT', 'L6b', 'L2/3 IT', 'L5 IT', 'L5 ET']

# Load data relevant information

## load and format codebook

[merged codebook](../resources/merged_codebook.csv) as in the repository (merged for all DNA-MERFISH libraries)

In [8]:
# Load codebook 
codebook_fname = os.path.join(input_folder,'merged_codebook.csv')
codebook_df = pd.read_csv (codebook_fname, index_col=0)

# sort df by chr and chr_order
codebook_df = loci_1d_features.sort_loci_df_by_chr_order (codebook_df)
codebook_df.head()

Unnamed: 0,name,id,NDB_784,NDB_755,NDB_826,NDB_713,NDB_865,NDB_725,NDB_817,NDB_710,...,NDB_479,NDB_562,NDB_608,NDB_460,NDB_563,NDB_592,NDB_368,NDB_436,NDB_629,NDB_604
0,1:3742742-3759944,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1:6245958-6258969,2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1:8740008-8759916,3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1016,1:9627926-9637875,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1017,1:9799472-9811359,2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0


In [9]:
# Format the chr loci name by
# 1. changing the loci name format (e.g. '1:3742742-3759944' -> 'chr1_3742742_3759944')
# 2. keeping only the relevant information such as id, chr, chr_order, and library
from gene_to_loci import loci_pos_format
loci_name_list = list(map(loci_pos_format, codebook_df['name'].tolist()))
loci_name_arr = np.array(loci_name_list)

# Take an explicit copy of the column subset before adding a column to it:
# assigning into a bare selection of another dataframe triggers pandas'
# SettingWithCopyWarning and leaves it ambiguous which object is modified.
codebook_df = codebook_df[['name','id','chr','chr_order','library']].copy()
# the first element returned by loci_pos_format is used as the loci name
codebook_df['loci_name'] = list(loci_name_arr[:,0])
codebook_df = codebook_df.set_index ('loci_name')

codebook_df.head()

Unnamed: 0_level_0,name,id,chr,chr_order,library
loci_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
chr1_3742742_3759944,1:3742742-3759944,1,1,0.0,CTP11
chr1_6245958_6258969,1:6245958-6258969,2,1,1.0,CTP11
chr1_8740008_8759916,1:8740008-8759916,3,1,2.0,CTP11
chr1_9627926_9637875,1:9627926-9637875,1,1,3.0,CTP13
chr1_9799472_9811359,1:9799472-9811359,2,1,4.0,CTP13


## Load Chr2Zxys dict and medianDict

The data loaded below can be generated with the following notebook:

[preprocess/2_dna_merfish/scripts/2_spot_pick/4_summarize_jie_to_dict](../../preprocess/2_dna_merfish/scripts/2_spot_pick/4_summarize_jie_to_dict.ipynb)

In [10]:
# NOTE: this rebinds `postanalysis_folder` (previously the local v0 folder) to the
# network share holding the per-subclass dictionaries. Folder variables that were
# derived from the previous value before this cell keep their old paths.
postanalysis_folder = r'\\10.245.74.158\Chromatin_NAS_8\Analyzed_data\MouseBrain_PostAnalysis_20230201\MOp_WT_postanalysis\postanalysis_vCW2_sorted'
subclass_2_chr2Zxys_filename = os.path.join(postanalysis_folder, 'subclass_2_chr2Zxys.pkl')
# NOTE(security): unpickling can execute arbitrary code; only load trusted files.
# The context manager closes the file handle as soon as loading is done.
with open(subclass_2_chr2Zxys_filename, 'rb') as _pkl_handle:
    subclass_2_chr2ZxysList = pickle.load(_pkl_handle)

In [11]:
subclass_2_chr2ZxysList.keys()

dict_keys(['Oligo', 'L5 IT', 'Micro', 'Peri', 'Endo', 'Astro', 'OPC', 'L6 CT', 'L5 ET', 'L5/6 NP', 'Pvalb', 'L6 IT', 'Lamp5', 'L6b', 'Sst', 'SMC', 'L4/5 IT', 'L2/3 IT', 'Vip', 'Sncg', 'VLMC'])

In [12]:
# subclass info: per-subclass list of cell information, loaded from the same folder
subclass_2_cellInfoList_filename = os.path.join(postanalysis_folder, 'subclass_2_cellInfo.pkl')
# NOTE(security): unpickling can execute arbitrary code; only load trusted files.
# The context manager closes the file handle as soon as loading is done.
with open(subclass_2_cellInfoList_filename, 'rb') as _pkl_handle:
    subclass_2_cellInfoList = pickle.load(_pkl_handle)

# Calculate radial position for cell (dataframe version)

## define functions

In [13]:
from scipy.spatial.distance import euclidean, cdist, pdist
from scipy.spatial import ConvexHull

In [14]:
# quick get cell pts as zxy arr
def chrZXY_to_cell_pts_arr (chrZxysList_cell:dict):
    """Flatten all fibers of one cell into a single list of points.

    chrZxysList_cell maps a chromosome key to a sequence of fibers, and each
    fiber is a sequence of points (presumably z, x, y coordinates).  Points are
    returned in dict order, then fiber order, then point order; nothing is
    filtered, so NaN points are kept.
    """
    return [_pt
            for _fibers in chrZxysList_cell.values()
            for _fiber in _fibers
            for _pt in _fiber]
    
# generate cell pts df with region names
from AnalysisTool_Chromatin import loci_1d_features
importlib.reload(loci_1d_features)  

def chrZXY_to_cell_pts_dataframe (chrZxysList_cell:dict, 
                                  codebook_df:pd.core.frame.DataFrame, 
                                  cellgroup = 'None'):
    """Build one long dataframe of the points of a cell, annotated from the codebook.

    Every fiber of every chromosome becomes a block of rows with columns
    'z', 'x', 'y', 'chr' and a 1-based 'fiberidx'.  When the fiber has as many
    points as the codebook has loci for that chromosome, the codebook 'name'
    and integer 'chr_order' are attached row by row; otherwise a message is
    printed and the fiber is kept without these two columns.  The 'celltype'
    column is filled with `cellgroup` for all rows.
    """
    _per_fiber_frames = []
    for _chr_key, _fibers in chrZxysList_cell.items():
        for _fiber_number, _fiber_zxys in enumerate(_fibers, start=1):
            _pts_df = pd.DataFrame(_fiber_zxys, columns=['z','x','y'], dtype=np.float64)
            _pts_df['chr'] = _chr_key
            _pts_df['fiberidx'] = _fiber_number
            # codebook rows of this chromosome, ordered along the chromosome
            _chr_codebook = codebook_df[codebook_df['chr']==_chr_key].copy(deep=True)
            _chr_codebook = loci_1d_features.sort_loci_df_by_chr_order(_chr_codebook)
            _chr_codebook['chr_order'] = _chr_codebook['chr_order'].map(int)
            # switch to a positional index so the columns line up with the fiber rows
            _chr_codebook = _chr_codebook.reset_index()
            if len(_chr_codebook) != len(_pts_df):
                print ('Number of loci does NOT match between input dict and codebook.')
            else:
                _pts_df[['name','chr_order']] = _chr_codebook[['name','chr_order']]
            _per_fiber_frames.append(_pts_df)

    _cell_df = pd.concat(_per_fiber_frames)
    _cell_df['celltype'] = cellgroup
    return _cell_df
                

In [15]:
np.unique(codebook_df['chr'])

array(['1', '10', '11', '12', '13', '14', '15', '16', '17', '18', '19',
       '2', '3', '4', '5', '6', '7', '8', '9', 'X', 'Y'], dtype=object)

In [16]:
# method to get the intersection of a ray from the origin with the hull (used as the radius for normalization)
# the hull must be re-centered so that its center is at 0,0,0 first
def get_intersection(U,hull):
    """Return the point where the ray from the origin along U crosses the hull.

    Each row of ``hull.equations`` holds a facet normal followed by an offset.
    The ray ``t * U`` meets the plane of a facet at ``t = -offset / (normal . U)``;
    the smallest positive ``t`` over all facets gives the returned point.
    This is the hull surface only if the origin lies inside the hull, so the
    hull is expected to be centered on the origin by the caller.

    Parameters
    ----------
    U : array-like
        Direction (and point) measured from the origin.
    hull : object with an ``equations`` attribute
        Typically a ``scipy.spatial.ConvexHull``.

    Returns
    -------
    numpy.ndarray
        ``U`` scaled onto the first facet plane hit along its direction.

    Raises
    ------
    ValueError
        If no facet plane is hit in the direction of U (for instance when U
        contains NaN or the origin is not inside the hull).
    """
    eq=hull.equations.T
    V,b=eq[:-1],eq[-1]
    # A facet parallel to U gives a zero dot product; the resulting inf/nan is
    # discarded by the positivity filter below, so the warning is only noise.
    with np.errstate(divide='ignore', invalid='ignore'):
        alpha=-b/np.dot(U,V)
    positive_alpha = alpha[alpha>0]
    if positive_alpha.size == 0:
        raise ValueError('No hull facet is intersected along the given direction; '
                         'check that the hull is centered on the origin and U has no NaN.')
    return U*np.full(np.array(U).shape, np.min(positive_alpha))

In [17]:
def Radial_position_dataframe (chr_2_zxys_list,
                               cell_infoList_class,
                               codebook_df,
                               verbose=False):
    """Compute the hull-normalized radial position of every locus of every cell.

    For each cell, all fully detected points are pooled to build a convex
    hull, and the mean of the hull vertices is used as the center.  The
    normalized radial position of a point is its distance to that center
    divided by the center-to-hull distance along the same direction (see
    get_intersection), i.e. 0 at the center and 1 on the hull surface.

    Parameters
    ----------
    chr_2_zxys_list : sequence of dict
        One dict per cell mapping a chromosome key to a sequence of fibers;
        each fiber is an array-like of 3-coordinate rows with NaN for missing
        loci.  Rows are presumably ordered like the sorted codebook loci of
        that chromosome - only the number of rows is verified here.
    cell_infoList_class : sequence
        Per-cell info indexed like chr_2_zxys_list; the 'uid' and 'subclass'
        entries are read.
    codebook_df : pandas.DataFrame
        Codebook with a 'chr' column and the loci names as index; it is
        re-sorted with loci_1d_features.sort_loci_df_by_chr_order.
    verbose : bool
        Currently unused; kept so existing calls keep working.

    Returns
    -------
    pandas.DataFrame
        Indexed by 'uid' with columns 'subclass', 'total_spots' (number of
        fully detected points) and 'radial_position' (one dataframe per cell
        with columns 'loci_name', 'fiber_id' (1-based) and
        'norm_radial_position').  Chromosomes of the codebook that are absent
        from a cell contribute no rows for that cell.

    Raises
    ------
    ValueError
        If a fiber does not have as many rows as the codebook has loci for
        its chromosome.
    Errors raised by scipy's ConvexHull (e.g. too few or degenerate points)
    are not caught.
    """
    from scipy.spatial.distance import euclidean
    from scipy.spatial import ConvexHull

    _origin = [0,0,0]
    # 1. prepare dict to store loci-2-center dist results
    _out_cell_dict = {'uid': {},
                      'subclass': {},
                      'total_spots': {},
                      'radial_position':{}
                     }
    # make sure is sorted to match the order and has the loci name as index
    codebook_df = loci_1d_features.sort_loci_df_by_chr_order (codebook_df)
    # the loci of each chromosome are the same for every cell: slice the codebook once
    _chr_2_loci_names = {
        _chr_key: list(codebook_df[codebook_df['chr']==_chr_key].index)
        for _chr_key in np.unique(codebook_df['chr'])
    }

    # loop through cells
    for _cell_idx, _cell_chr_2_zxys_dict in enumerate(chr_2_zxys_list):
        _cell_info = cell_infoList_class[_cell_idx]
        _out_cell_dict['uid'][_cell_idx] = _cell_info['uid']
        _out_cell_dict['subclass'][_cell_idx] = _cell_info['subclass']

        # 1.1. get nuclei centroid and radius
        _all_zxys = np.asarray(chrZXY_to_cell_pts_arr(_cell_chr_2_zxys_dict),
                               dtype=np.float64).reshape(-1, 3)
        # keep fully detected points only: a single NaN coordinate would make
        # ConvexHull fail (and such a point is treated as missing below as well)
        _valid_mask = ~np.isnan(_all_zxys).any(axis=1)
        points = _all_zxys[_valid_mask]
        _out_cell_dict['total_spots'][_cell_idx] = len(points)
        # convex hull of all detected points
        hull = ConvexHull(points)
        # hull center = mean of the hull vertices
        cz = np.mean(hull.points[hull.vertices,0])
        cx = np.mean(hull.points[hull.vertices,1])
        cy = np.mean(hull.points[hull.vertices,2])
        # attention: zxy in correct order!!
        _hull_ct_zxy = np.array([cz,cx,cy])

        # re-center the points on the hull center and rebuild the hull, so that
        # get_intersection can shoot rays from the origin
        points = points - _hull_ct_zxy
        hull = ConvexHull(points)

        # 1.2. calculate normalized dist to nuclei centroid for radial positioning
        _loci_names = []
        _fiber_ids = []
        _norm_radial_positions = []
        # loop through chr
        for _chr_key, _chr_loci_names in _chr_2_loci_names.items():
            # chromosomes missing from this cell are skipped, so their loci
            # do not appear in the output of this cell
            if _chr_key not in _cell_chr_2_zxys_dict:
                continue
            # calculate normalized radial pos for each fiber
            for _ichr_idx, _ichr_zxys in enumerate(_cell_chr_2_zxys_dict[_chr_key]):
                ichr_zxys = _ichr_zxys - _hull_ct_zxy
                if len(ichr_zxys) != len(_chr_loci_names):
                    # fail early with context instead of mislabelling loci
                    raise ValueError(
                        f'Cell {_cell_idx}, chr {_chr_key}, fiber {_ichr_idx+1}: '
                        f'{len(ichr_zxys)} points in the input but '
                        f'{len(_chr_loci_names)} loci in the codebook.')
                _loci_2_center_list = []
                _loci_radius_list = []
                # loop through every point
                for _zxys in ichr_zxys:
                    if np.isnan(_zxys).any():
                        # missing locus: keep a NaN placeholder to stay aligned
                        _loci_2_center_list.append(np.nan)
                        _loci_radius_list.append(np.nan)
                    else:
                        _loci_2_center_list.append(euclidean(_zxys, _origin))
                        edge_pt = get_intersection(_zxys, hull)
                        _loci_radius_list.append(euclidean(edge_pt, _origin))
                # normalize each loci on the fiber by the hull radius along its direction
                _norm_pos = np.array(_loci_2_center_list)/np.array(_loci_radius_list)
                # add result
                _loci_names.extend(_chr_loci_names)
                _fiber_ids.extend([_ichr_idx+1]*len(_chr_loci_names))
                _norm_radial_positions.extend(_norm_pos)

        # edit to use chr/chr_order or loci_name depending on the codebook format
        _out_cell_dict['radial_position'][_cell_idx] = pd.DataFrame(
            {'loci_name': _loci_names,
             'fiber_id': _fiber_ids,
             'norm_radial_position': _norm_radial_positions})

    _out_cell_df = pd.DataFrame(_out_cell_dict)
    _out_cell_df.set_index('uid', drop=True,inplace=True)
    return _out_cell_df


## process

In [18]:
import tqdm
# one result dataframe per subclass; concatenated after the loop
_out_cell_df_list = []

# each value of subclass_2_chr2ZxysList is passed to Radial_position_dataframe
# as its per-cell list for that subclass
for _class, chr_2_zxys_list in tqdm.tqdm(subclass_2_chr2ZxysList.items()):
    
    # NOTE(review): the message says 'cell hull volume' although this loop computes
    # radial positions - looks like a leftover wording; confirm before changing.
    print (f'Analyze cell hull volume for {_class}', end=' ')
    # cell info for the same subclass (same key as the chr2Zxys dict)
    cell_infoList_class = subclass_2_cellInfoList[_class]
    
    # NOTE(review): `time` is not imported in this cell - presumably provided by
    # the startup script / star import at the top of the notebook.
    _start_time = time.time()
    _out_cell_df = Radial_position_dataframe (chr_2_zxys_list,
                                              cell_infoList_class,
                                              codebook_df)
    _out_cell_df_list.append(_out_cell_df)
    print(f"in {time.time()-_start_time:.3f}s.")

# stack the per-subclass results into a single dataframe and display it
merged_radial_pos_df = pd.concat(_out_cell_df_list)
merged_radial_pos_df

  0%|                                                                                           | 0/21 [00:00<?, ?it/s]

Analyze cell hull volume for Oligo 

  5%|███▊                                                                            | 1/21 [08:44<2:54:43, 524.17s/it]

in 524.172s.
Analyze cell hull volume for L5 IT 

 10%|███████▌                                                                        | 2/21 [12:28<1:50:02, 347.51s/it]

in 223.852s.
Analyze cell hull volume for Micro 

 14%|███████████▍                                                                    | 3/21 [14:37<1:14:20, 247.79s/it]

in 129.125s.
Analyze cell hull volume for Peri 

 19%|███████████████▌                                                                  | 4/21 [15:47<50:20, 177.70s/it]

in 70.239s.
Analyze cell hull volume for Endo 

 24%|███████████████████▌                                                              | 5/21 [19:13<50:05, 187.86s/it]

in 205.890s.
Analyze cell hull volume for Astro 

 29%|███████████████████████▍                                                          | 6/21 [23:34<53:13, 212.91s/it]

in 261.541s.
Analyze cell hull volume for OPC 

 33%|███████████████████████████▎                                                      | 7/21 [25:12<40:52, 175.15s/it]

in 97.389s.
Analyze cell hull volume for L6 CT 

 38%|███████████████████████████████▏                                                  | 8/21 [32:39<56:41, 261.68s/it]

in 446.950s.
Analyze cell hull volume for L5 ET 

 43%|███████████████████████████████████▏                                              | 9/21 [34:12<41:49, 209.16s/it]

in 93.687s.
Analyze cell hull volume for L5/6 NP 

 48%|██████████████████████████████████████▌                                          | 10/21 [35:11<29:50, 162.76s/it]

in 58.868s.
Analyze cell hull volume for Pvalb 

 52%|██████████████████████████████████████████▍                                      | 11/21 [36:49<23:48, 142.82s/it]

in 97.587s.
Analyze cell hull volume for L6 IT 

 57%|██████████████████████████████████████████████▎                                  | 12/21 [39:55<23:24, 156.03s/it]

in 186.242s.
Analyze cell hull volume for Lamp5 

 62%|██████████████████████████████████████████████████▏                              | 13/21 [40:32<15:59, 119.94s/it]

in 36.917s.
Analyze cell hull volume for L6b 

 67%|██████████████████████████████████████████████████████                           | 14/21 [41:49<12:29, 107.06s/it]

in 77.276s.
Analyze cell hull volume for Sst 

 71%|██████████████████████████████████████████████████████████▌                       | 15/21 [43:01<09:38, 96.44s/it]

in 71.822s.
Analyze cell hull volume for SMC 

 76%|██████████████████████████████████████████████████████████████▍                   | 16/21 [43:36<06:30, 78.03s/it]

in 35.278s.
Analyze cell hull volume for L4/5 IT 

 81%|█████████████████████████████████████████████████████████████████▌               | 17/21 [47:46<08:37, 129.50s/it]

in 249.191s.
Analyze cell hull volume for L2/3 IT 

 86%|█████████████████████████████████████████████████████████████████████▍           | 18/21 [52:04<08:24, 168.18s/it]

in 258.241s.
Analyze cell hull volume for Vip 

 90%|█████████████████████████████████████████████████████████████████████████▎       | 19/21 [52:33<04:12, 126.38s/it]

in 29.009s.
Analyze cell hull volume for Sncg 

 95%|██████████████████████████████████████████████████████████████████████████████    | 20/21 [52:43<01:31, 91.61s/it]

in 10.556s.
Analyze cell hull volume for VLMC 

100%|█████████████████████████████████████████████████████████████████████████████████| 21/21 [53:07<00:00, 151.78s/it]

in 23.434s.





Unnamed: 0_level_0,subclass,total_spots,radial_position
uid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
259202492748634617304623818845147108919,Oligo,863,loci_name fiber_id norm...
336074706103419484501535095917410831417,Oligo,1426,loci_name fiber_id norm...
179117357717369278884189379535888871018,Oligo,1601,loci_name fiber_id norm...
36321914521561890358202806077673807953,Oligo,210,loci_name fiber_id norm_...
92271180002801802991585284299263797402,Oligo,2034,loci_name fiber_id norm...
...,...,...,...
89559371323596133331060964655277765256,VLMC,584,loci_name fiber_id norm...
144314319466287448491677682735506973,VLMC,387,loci_name fiber_id norm_...
71119619609696733523981100580894538756,VLMC,734,loci_name fiber_id norm...
29724298272863708674304529591452401156,VLMC,1018,loci_name fiber_id norm...


In [19]:
merged_radial_pos_df.iloc[0]['radial_position']

Unnamed: 0,loci_name,fiber_id,norm_radial_position
0,chr1_3742742_3759944,1,0.663333
1,chr1_6245958_6258969,1,
2,chr1_8740008_8759916,1,
3,chr1_9627926_9637875,1,
4,chr1_9799472_9811359,1,0.389658
...,...,...,...
1803,chr9_118751319_118759956,1,
1804,chr9_119425689_119434008,1,0.545641
1805,chr9_121252615_121259973,1,
1806,chr9_123457587_123467505,1,


## save

In [20]:
# saving to csv (below) would convert the nested radial_position dataframes to str; save to hdf5 instead
#output_fname = os.path.join(output_analysis_folder, f'radial_pos_convexhull_bysubclass.csv')
#merged_radial_pos_df.to_csv(output_fname)
output_fname = os.path.join(output_analysis_folder, f'radial_pos_convexhull_bysubclass_v2.h5')
merged_radial_pos_df.to_hdf(output_fname, key='df', mode='w')

your performance may suffer as PyTables will pickle object types that it cannot
map directly to c-types [inferred_type->mixed,key->block1_values] [items->Index(['subclass', 'radial_position'], dtype='object')]

  merged_radial_pos_df.to_hdf(output_fname, key='df', mode='w')


# Summarize for median radial position

## load

In [21]:
output_fname = os.path.join(output_analysis_folder, f'radial_pos_convexhull_bysubclass_v2.h5')

merged_radial_pos_df = pd.read_hdf(output_fname)
merged_radial_pos_df.head()

Unnamed: 0_level_0,subclass,total_spots,radial_position
uid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
259202492748634617304623818845147108919,Oligo,863,loci_name fiber_id norm...
336074706103419484501535095917410831417,Oligo,1426,loci_name fiber_id norm...
179117357717369278884189379535888871018,Oligo,1601,loci_name fiber_id norm...
36321914521561890358202806077673807953,Oligo,210,loci_name fiber_id norm_...
92271180002801802991585284299263797402,Oligo,2034,loci_name fiber_id norm...


In [22]:
merged_radial_pos_df.iloc[0]['radial_position']

Unnamed: 0,loci_name,fiber_id,norm_radial_position
0,chr1_3742742_3759944,1,0.663333
1,chr1_6245958_6258969,1,
2,chr1_8740008_8759916,1,
3,chr1_9627926_9637875,1,
4,chr1_9799472_9811359,1,0.389658
...,...,...,...
1803,chr9_118751319_118759956,1,
1804,chr9_119425689_119434008,1,0.545641
1805,chr9_121252615_121259973,1,
1806,chr9_123457587_123467505,1,


In [23]:
np.unique(merged_radial_pos_df['subclass'])

array(['Astro', 'Endo', 'L2/3 IT', 'L4/5 IT', 'L5 ET', 'L5 IT', 'L5/6 NP',
       'L6 CT', 'L6 IT', 'L6b', 'Lamp5', 'Micro', 'OPC', 'Oligo', 'Peri',
       'Pvalb', 'SMC', 'Sncg', 'Sst', 'VLMC', 'Vip'], dtype=object)

## define function

In [24]:
# extract radial pos from the summary df
def average_radial_pos_from_summary_df (celltype_radial_pos_df):
    """Average the normalized radial position over fibers, per locus and per cell.

    Parameters
    ----------
    celltype_radial_pos_df : pandas.DataFrame
        One row per cell, indexed by the cell uid, with a 'radial_position'
        column holding a dataframe per cell that has at least the columns
        'loci_name' and 'norm_radial_position'.

    Returns
    -------
    pandas.DataFrame
        Per-cell results stacked on top of each other: index 'loci_name',
        columns 'norm_radial_position' (mean over the rows of that locus,
        NaN ignored) and 'uid'.  An empty frame with the same layout is
        returned when the input has no cells.
    """
    _cell_radial_pos_list = []
    # walk the uid index and the nested dataframes together
    for _uid, _cell_radial_pos in zip(celltype_radial_pos_df.index,
                                      celltype_radial_pos_df['radial_position']):
        _cell_radial_pos_summary = (_cell_radial_pos
                                    .groupby(by='loci_name')['norm_radial_position']
                                    .mean()
                                    .to_frame())
        _cell_radial_pos_summary['uid'] = _uid
        _cell_radial_pos_list.append(_cell_radial_pos_summary)

    if not _cell_radial_pos_list:
        # pd.concat raises on an empty list; return an empty, correctly shaped frame
        return pd.DataFrame({'norm_radial_position': pd.Series(dtype=np.float64),
                             'uid': pd.Series(dtype=object)},
                            index=pd.Index([], name='loci_name'))

    celltype_radial_pos_summary = pd.concat(_cell_radial_pos_list)
    return celltype_radial_pos_summary

In [25]:
selected_cell_labels = ['L2/3 IT','L4/5 IT','L5 IT','L6 IT','L5 ET','L5/6 NP','L6 CT','L6b',
                           'Sst','Pvalb','Lamp5','Sncg','Vip',
                           'Astro','Oligo','OPC','Micro','Endo','VLMC','SMC','Peri', 
                           #'other',
                          ]

celltype_palette = {'Astro':'lightcoral', 
                    'Endo':'skyblue', 
                    'L2/3 IT':'gold', 
                    'L4/5 IT':'darkorange', 
                    'L5 ET':'mediumseagreen', 
                    'L5 IT':'aqua',
                    'L5/6 NP':'darkgreen',
                    'L6 CT':'brown',
                    'L6 IT':'magenta',
                    'L6b':'blue', 
                    'Lamp5':'orange', 
                    'Micro':'peachpuff',
                    'OPC':'thistle', 
                    'Oligo':'darkviolet',
                    'Peri':'sandybrown',
                    'Pvalb':'springgreen',
                    'SMC':'rosybrown',
                    'Sncg':'darkkhaki',
                    'Sst':'steelblue', 
                    'VLMC':'saddlebrown', 
                    'Vip':'red',
                    'other':'slategray'}


# restrict to the cell types listed in selected_cell_labels
_in_selected_labels = merged_radial_pos_df['subclass'].isin(selected_cell_labels)
sel_merged_radial_pos_df = merged_radial_pos_df[_in_selected_labels]

# keep cells whose number of detected spots lies in [min_pts_th, max_pts_th)
min_pts_th = 600
max_pts_th =3900
_spot_counts = sel_merged_radial_pos_df['total_spots']
df_plot = sel_merged_radial_pos_df[(_spot_counts>=min_pts_th)&(_spot_counts<max_pts_th)]
df_plot

Unnamed: 0_level_0,subclass,total_spots,radial_position
uid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
259202492748634617304623818845147108919,Oligo,863,loci_name fiber_id norm...
336074706103419484501535095917410831417,Oligo,1426,loci_name fiber_id norm...
179117357717369278884189379535888871018,Oligo,1601,loci_name fiber_id norm...
92271180002801802991585284299263797402,Oligo,2034,loci_name fiber_id norm...
237111112842863990529297306931448726772,Oligo,942,loci_name fiber_id norm...
...,...,...,...
245342798166567655308810294286385442992,VLMC,1613,loci_name fiber_id norm...
121463809145385827894569077562457697797,VLMC,913,loci_name fiber_id norm...
71119619609696733523981100580894538756,VLMC,734,loci_name fiber_id norm...
29724298272863708674304529591452401156,VLMC,1018,loci_name fiber_id norm...


## process

In [26]:
import tqdm
sorted_median_radial_pos_list = []
# enumerate() has no length, so pass the total explicitly to get a real
# progress bar (percentage / ETA) instead of a bare iteration counter
for _class_ind, _class in tqdm.tqdm(enumerate(selected_cell_labels),
                                    total=len(selected_cell_labels)):
    celltype_radial_pos_df = df_plot[df_plot['subclass']==_class]
    if celltype_radial_pos_df.empty:
        # no cell of this subclass is left in df_plot: nothing to summarize
        print(f'No cell available for {_class}; skipped.')
        continue
    # per-cell average over fibers for every locus
    celltype_radial_pos_summary = average_radial_pos_from_summary_df (celltype_radial_pos_df)
    # group once; reused for both the median and the number of contributing cells
    _per_loci_values = celltype_radial_pos_summary.groupby(by='loci_name')['norm_radial_position']
    median_radial_pos = _per_loci_values.median().to_frame()
    # start from the codebook so the rows follow the codebook loci order;
    # the assignments below align on the loci_name index
    sorted_median_radial_pos = codebook_df[['chr','chr_order']].copy(deep=True)
    sorted_median_radial_pos['norm_radial_position'] = median_radial_pos['norm_radial_position']
    # number of cells with a non-NaN value for the locus
    sorted_median_radial_pos['total_cell'] = _per_loci_values.count()
    sorted_median_radial_pos['subclass'] = _class
    sorted_median_radial_pos_list.append(sorted_median_radial_pos)
    
sorted_median_radial_pos_all = pd.concat(sorted_median_radial_pos_list)
sorted_median_radial_pos_all

21it [03:43, 10.66s/it]


Unnamed: 0_level_0,chr,chr_order,norm_radial_position,total_cell,subclass
loci_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
chr1_3742742_3759944,1,0.0,0.860136,1022.0,L2/3 IT
chr1_6245958_6258969,1,1.0,0.855358,969.0,L2/3 IT
chr1_8740008_8759916,1,2.0,0.850017,628.0,L2/3 IT
chr1_9627926_9637875,1,3.0,0.839728,960.0,L2/3 IT
chr1_9799472_9811359,1,4.0,0.826204,900.0,L2/3 IT
...,...,...,...,...,...
chrX_166247682_166259932,X,60.0,0.812004,137.0,Peri
chrX_167157164_167167452,X,61.0,0.734733,145.0,Peri
chrX_168746045_168757590,X,62.0,0.765202,134.0,Peri
chrX_169963295_170005197,X,63.0,0.739534,196.0,Peri


## save

In [27]:
output_fname = os.path.join(output_analysis_folder, f'radial_pos_convexhull_median_summary_{min_pts_th}pts_bysubclass_v2.csv')
sorted_median_radial_pos_all.to_csv(output_fname)