# Import module

Download [ImageAnalysis3](https://github.com/zhengpuas47/ImageAnalysis3) from GitHub.

Alternatively, use the archived Zhuang lab [source_tools](https://github.com/ZhuangLab/Chromatin_Analysis_2020_cell/tree/master/sequential_tracing/source).

## ImageAnalysis3 and basic modules

In [2]:
%run "C:\Users\shiwei\Documents\ImageAnalysis3\required_files\Startup_py3.py"
sys.path.append(r"C:\Users\shiwei\Documents")

import ImageAnalysis3 as ia
from ImageAnalysis3 import *
from ImageAnalysis3.classes import _allowed_kwds

import h5py
import ast
import pandas as pd

print(os.getpid())

19016


## Chromatin_analysis_tools etc

See **functions** in the repository for [AnalysisTool_Chromatin](../../README.md)

In [3]:
# Chromatin_analysis_tools (ATC)
# Get path for the py containing functions
import os
import sys
import importlib
module_path =r'C:\Users\shiwei\Documents\AnalysisTool_Chromatin'
if module_path not in sys.path:
    sys.path.append(module_path)
    
# import relevant modules
import gene_selection 
importlib.reload(gene_selection)
import gene_to_loci
importlib.reload(gene_to_loci)
import gene_activity
importlib.reload(gene_activity)
import loci_1d_features
importlib.reload(loci_1d_features)  

import atac_to_loci
importlib.reload(atac_to_loci)

<module 'atac_to_loci' from 'C:\\Users\\shiwei\\Documents\\AnalysisTool_Chromatin\\atac_to_loci.py'>

# Define folders

In [4]:
# Root folder of the postanalysis
postanalysis_folder = r'L:\Shiwei\postanalysis_2024\v0'
# Folder holding the input files for postanalysis
input_folder = os.path.join(postanalysis_folder, 'resources_from_preprocess')

# Everything this notebook writes goes below one main output folder,
# split into an analysis folder and a figure folder
output_main_folder = os.path.join(postanalysis_folder, 'nucleus_and_chromosome')
output_analysis_folder = os.path.join(output_main_folder, 'analysis')
output_figure_folder = os.path.join(output_main_folder, 'figures')

# Switch: allow creation of output folders that do not exist yet
make_output_folder = True

# An existing folder is always reused; a missing one is created only if allowed
if os.path.exists(output_analysis_folder):
    print(f'Use existing analysis folder: {output_analysis_folder}.')
elif make_output_folder:
    os.makedirs(output_analysis_folder)
    print(f'Generating analysis folder: {output_analysis_folder}.')

if os.path.exists(output_figure_folder):
    print(f'Use existing figure folder: {output_figure_folder}.')
elif make_output_folder:
    os.makedirs(output_figure_folder)
    print(f'Generating figure folder: {output_figure_folder}.')

Use existing analysis folder: L:\Shiwei\postanalysis_2024\v0\nucleus_and_chromosome\analysis.
Use existing figure folder: L:\Shiwei\postanalysis_2024\v0\nucleus_and_chromosome\figures.


# Plotting parameters

In [5]:
%matplotlib inline
import matplotlib
# Embed fonts as Type 42 (TrueType) in PDF output
matplotlib.rcParams['pdf.fonttype'] = 42
import matplotlib.pyplot as plt
# Select the 'serif' font family, then point that family's font list at Arial
plt.rc('font', family='serif')
plt.rc('font', serif='Arial')

# Shared figure-size and font constants defined by ImageAnalysis3
from ImageAnalysis3.figure_tools import _double_col_width, _single_col_width, _font_size, _ticklabel_size,_ticklabel_width

import seaborn as sns
# Use seaborn's "paper" context, overriding its font sizes with the imported _font_size
# (titles are one point larger than body and axis-label text)
sns.set_context("paper", rc={"font.size":_font_size,"axes.titlesize":_font_size+1,"axes.labelsize":_font_size})  

In [6]:
# Other required plotting parameters
_dpi = 300
# NOTE: `_font_size` is also imported from ImageAnalysis3.figure_tools in the
# previous cell, where sns.set_context already consumed the imported value;
# this assignment rebinds the name only for code that runs afterwards.
_font_size = 7
_page_width = 5.5


## cell type color-codes

In [7]:
# Cell-type labels from RNA-MERFISH and cell-type prediction.
# 'other' is left out on purpose (commented out below), although the palette
# defined afterwards still provides a color for it.
selected_cell_labels = ['L2/3 IT','L4/5 IT','L5 IT','L6 IT','L5 ET','L5/6 NP','L6 CT','L6b',
                           'Sst','Pvalb','Lamp5','Sncg','Vip',
                           'Astro','Oligo','OPC','Micro','Endo','VLMC','SMC','Peri', 
                           #'other',
                          ]
# Color for each cell-type label (palette from the RNA-MERFISH UMAP and stats);
# keys are listed alphabetically, with 'other' last
celltype_palette = {'Astro':'lightcoral', 
                    'Endo':'skyblue', 
                    'L2/3 IT':'gold', 
                    'L4/5 IT':'darkorange', 
                    'L5 ET':'mediumseagreen', 
                    'L5 IT':'aqua',
                    'L5/6 NP':'darkgreen',
                    'L6 CT':'brown',
                    'L6 IT':'magenta',
                    'L6b':'blue', 
                    'Lamp5':'orange', 
                    'Micro':'peachpuff',
                    'OPC':'thistle', 
                    'Oligo':'darkviolet',
                    'Peri':'sandybrown',
                    'Pvalb':'springgreen',
                    'SMC':'rosybrown',
                    'Sncg':'darkkhaki',
                    'Sst':'steelblue', 
                    'VLMC':'saddlebrown', 
                    'Vip':'red',
                    'other':'slategray'}


In [8]:
# Plotting order based on the snRNA transcriptional activity, for use if needed.
# NOTE(review): this lists 18 of the 21 labels in selected_cell_labels
# (VLMC, SMC and Peri are absent) - confirm that this is intended.
sorted_cellplot_order_byRNA = ['Micro', 'Oligo', 'Endo', 'OPC', 'Astro', 'Vip', 'Lamp5',
                  'L5/6 NP', 'Sst', 'Sncg', 'Pvalb', 'L4/5 IT', 'L6 CT',
                  'L6 IT', 'L6b', 'L2/3 IT', 'L5 IT', 'L5 ET']

# Load data relevant information

## load and format codebook

[merged codebook](../resources/merged_codebook.csv) as in the repository (merged for all DNA-MERFISH libraries)

In [9]:
# Read the merged codebook (first CSV column becomes the index) and
# sort it by chr and chr_order in one chained expression
codebook_fname = os.path.join(input_folder, 'merged_codebook.csv')
codebook_df = (
    pd.read_csv(codebook_fname, index_col=0)
    .pipe(loci_1d_features.sort_loci_df_by_chr_order)
)
codebook_df.head()

Unnamed: 0,name,id,NDB_784,NDB_755,NDB_826,NDB_713,NDB_865,NDB_725,NDB_817,NDB_710,...,NDB_479,NDB_562,NDB_608,NDB_460,NDB_563,NDB_592,NDB_368,NDB_436,NDB_629,NDB_604
0,1:3742742-3759944,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1:6245958-6258969,2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1:8740008-8759916,3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1016,1:9627926-9637875,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1017,1:9799472-9811359,2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0


In [10]:
# Reformat the codebook so that it is indexed by a standardized loci name:
# 1. convert every codebook 'name' entry with loci_pos_format
# 2. keep only the per-locus annotation columns (name, id, chr, chr_order, library)
from gene_to_loci import loci_pos_format
loci_name_list = list(map(loci_pos_format, codebook_df['name'].tolist()))
loci_name_arr = np.array(loci_name_list)

# Take an explicit copy of the column subset before adding a column to it;
# assigning into the bare subset can raise pandas' SettingWithCopyWarning.
codebook_df = codebook_df[['name','id','chr','chr_order','library']].copy()
# The first element returned by loci_pos_format for each locus is used as its loci name
codebook_df['loci_name'] = list(loci_name_arr[:,0])
codebook_df = codebook_df.set_index ('loci_name')

codebook_df.head()

Unnamed: 0_level_0,name,id,chr,chr_order,library
loci_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
chr1_3742742_3759944,1:3742742-3759944,1,1,0.0,CTP11
chr1_6245958_6258969,1:6245958-6258969,2,1,1.0,CTP11
chr1_8740008_8759916,1:8740008-8759916,3,1,2.0,CTP11
chr1_9627926_9637875,1:9627926-9637875,1,1,3.0,CTP13
chr1_9799472_9811359,1:9799472-9811359,2,1,4.0,CTP13


# Load Chr2Zxys coordinate dict 

The coordinate dictionary loaded below can be generated with the following Jupyter notebook:

[preprocess/2_dna_merfish/scripts/2_spot_pick/4_summarize_jie_to_dict](../../../preprocess/2_dna_merfish/scripts/2_spot_pick/4_summarize_jie_to_dict.ipynb)

In [11]:
# Load the per-subclass coordinate dict; it is iterated later in this notebook as
# {subclass: [per-cell {chromosome: zxy coordinates} dict, ...]}
subclass_2_chr2Zxys_filename = os.path.join(input_folder, 'subclass_2_chr2Zxys.pkl')
# Use a context manager so the file handle is closed right after loading.
# NOTE: unpickling can execute arbitrary code - only load files generated by this pipeline.
with open(subclass_2_chr2Zxys_filename, 'rb') as _pkl_handle:
    subclass_2_chr2ZxysList = pickle.load(_pkl_handle)

In [12]:
subclass_2_chr2ZxysList.keys()

dict_keys(['Oligo', 'L5 IT', 'Micro', 'Peri', 'Endo', 'Astro', 'OPC', 'L6 CT', 'L5 ET', 'L5/6 NP', 'Pvalb', 'L6 IT', 'Lamp5', 'L6b', 'Sst', 'SMC', 'L4/5 IT', 'L2/3 IT', 'Vip', 'Sncg', 'VLMC'])

In [14]:
# Load the per-subclass cell info; later code indexes it per cell and reads
# the 'uid' and 'subclass' entries of each cell
subclass_2_cellInfoList_filename = os.path.join(input_folder, 'subclass_2_cellInfo.pkl')
# Use a context manager so the file handle is closed right after loading.
# NOTE: unpickling can execute arbitrary code - only load files generated by this pipeline.
with open(subclass_2_cellInfoList_filename, 'rb') as _pkl_handle:
    subclass_2_cellInfoList = pickle.load(_pkl_handle)

# Calculate convex hull for single cells (dataframe version)

## define functions

In [15]:
from scipy.spatial.distance import euclidean, cdist, pdist
from scipy.spatial import ConvexHull

In [16]:
# flatten all spots of one cell into a single list of zxy points
def chrZXY_to_cell_pts_arr(chrZxysList_cell: dict):
    """Collect every point of one cell into a flat list.

    Args:
        chrZxysList_cell: dict mapping a chromosome key to a sequence of
            fibers, each fiber being an iterable of points.

    Returns:
        list with the points of all fibers of all chromosomes, in dict
        order, then fiber order, then point order.
    """
    return [
        _pt
        for _fibers in chrZxysList_cell.values()
        for _fiber in _fibers
        for _pt in _fiber
    ]
    
# generate cell pts df with region names
from AnalysisTool_Chromatin import loci_1d_features
importlib.reload(loci_1d_features)  

def chrZXY_to_cell_pts_dataframe (chrZxysList_cell:dict, 
                                  codebook_df:pd.core.frame.DataFrame, 
                                  cellgroup = 'None'):
    """Convert one cell's chromosome -> fibers dict into a long-format point table.

    Args:
        chrZxysList_cell: dict mapping a chromosome key to a sequence of fibers;
            each fiber is an array-like of (z, x, y) rows.
        codebook_df: codebook with at least the columns 'chr', 'chr_order' and
            'name'; the rows whose 'chr' equals the chromosome key are sorted
            with loci_1d_features.sort_loci_df_by_chr_order and matched to the
            fiber rows by position.
        cellgroup: value written to the 'celltype' column of every row.

    Returns:
        DataFrame with columns z, x, y, chr, fiberidx (1-based), celltype and,
        for fibers whose length matches the codebook, name and chr_order.
        An empty DataFrame with these columns is returned when the input
        contains no fibers.
    """
    _columns = ['z','x','y','chr','fiberidx','name','chr_order']

    output_df_list = []
    for _chrom, _chr_zxys in chrZxysList_cell.items():
        # nothing to annotate for a chromosome without fibers
        if len(_chr_zxys) == 0:
            continue
        # the codebook slice depends only on the chromosome, so build it once
        # per chromosome instead of once per fiber; it is only read below
        _info_df = codebook_df[codebook_df['chr']==_chrom].copy(deep=True)
        _info_df = loci_1d_features.sort_loci_df_by_chr_order(_info_df)
        _info_df['chr_order'] = _info_df['chr_order'].map(int)
        # reset index so the rows align by position with each fiber dataframe
        _info_df.reset_index(inplace=True)

        for _fiberidx, _ichr_zxys in enumerate(_chr_zxys):
            _fiber_df = pd.DataFrame(_ichr_zxys,columns = ['z','x','y'],dtype=np.float64)
            _fiber_df['chr']=_chrom
            _fiber_df['fiberidx']=_fiberidx+1
            if len(_info_df) == len(_fiber_df):
                _fiber_df[['name','chr_order']]=_info_df[['name','chr_order']]
            else:
                # best effort: keep the fiber, just without the locus annotation
                print ('Number of loci does NOT match between input dict and codebook.')
            output_df_list.append(_fiber_df)

    if output_df_list:
        output_df = pd.concat(output_df_list)
    else:
        # pd.concat raises on an empty list; return an empty table instead
        output_df = pd.DataFrame(columns=_columns)
    output_df['celltype']=cellgroup
    return output_df

In [17]:
def cell_volume_by_chrZxys_dataframe (chr_2_zxys_list, cell_infoList_class,
                                 verbose=False):
    """Estimate a per-cell nucleus volume as the convex hull of all detected spots.

    Args:
        chr_2_zxys_list: sequence with one dict per cell, mapping a chromosome
            key to a sequence of fibers; each fiber is an iterable of
            coordinate rows (NaN marks a missing spot).
        cell_infoList_class: sequence indexed like `chr_2_zxys_list`; each item
            provides the 'uid' and 'subclass' of the corresponding cell.
        verbose: if True, report cells that have too few valid points for a hull.

    Returns:
        DataFrame indexed by 'uid' with the columns 'subclass',
        'nucleus_volume' (hull volume, NaN when it cannot be computed because
        there are too few valid points) and 'total_spots' (number of rows whose
        coordinates are all finite).

    Raises:
        scipy's Qhull error if a cell has enough valid points but they are
        degenerate (e.g. all coplanar).
    """
    from scipy.spatial import ConvexHull

    # init dict of {column: {cell index: value}} to store results
    _out_cell_dict = {'uid': {},
                      'subclass': {},
                      'nucleus_volume': {},
                      'total_spots': {},
                      }

    # loop through cells
    for _cell_idx, _cell_chr_2_zxys_dict in enumerate(chr_2_zxys_list):
        _cell_info = cell_infoList_class[_cell_idx]
        _out_cell_dict['uid'][_cell_idx] = _cell_info['uid']
        _out_cell_dict['subclass'][_cell_idx] = _cell_info['subclass']

        # pool the coordinate rows of every fiber of every chromosome
        _combined_rows = []
        for _chr_zxys in _cell_chr_2_zxys_dict.values():
            for _ichr_zxys in _chr_zxys:
                _combined_rows.extend(_ichr_zxys)
        _points = np.asarray(_combined_rows, dtype=float)

        if _points.ndim == 2:
            # keep only rows where ALL coordinates are finite; a partially
            # missing row cannot be passed to Qhull
            _points = _points[np.isfinite(_points).all(axis=1)]
        else:
            # no rows at all for this cell
            _points = np.empty((0, 3))

        _n_points = int(_points.shape[0])
        _out_cell_dict['total_spots'][_cell_idx] = _n_points

        # a hull in d dimensions needs at least d+1 points
        if _n_points > _points.shape[1]:
            _volume = ConvexHull(_points).volume
        else:
            _volume = np.nan
            if verbose:
                print(f"-- cell {_cell_idx}: only {_n_points} valid points, volume set to NaN.")
        _out_cell_dict['nucleus_volume'][_cell_idx] = _volume

    _out_cell_df = pd.DataFrame(_out_cell_dict)
    _out_cell_df.set_index('uid', drop=True,inplace=True)

    return _out_cell_df

## process

In [18]:
import tqdm
# one per-cell volume table is collected for each subclass
_out_cell_df_list = []

# iterate over subclasses; each value is the list of per-cell coordinate dicts
for _class, chr_2_zxys_list in tqdm.tqdm(subclass_2_chr2ZxysList.items()):
    
    # end=' ' keeps the elapsed time printed below on the same line
    print (f'Analyze cell hull volume for {_class}', end=' ')
    # cell info (uid, subclass) of the same subclass, indexed like chr_2_zxys_list
    cell_infoList_class = subclass_2_cellInfoList[_class]
    
    _start_time = time.time()
    _out_cell_df = cell_volume_by_chrZxys_dataframe (chr_2_zxys_list, cell_infoList_class)
    _out_cell_df_list.append(_out_cell_df)
    print(f"in {time.time()-_start_time:.3f}s.")
    
# stack the per-subclass tables into one table indexed by cell uid
merged_cell_volume_df = pd.concat(_out_cell_df_list)
merged_cell_volume_df

  0%|                                                                                           | 0/21 [00:00<?, ?it/s]

Analyze cell hull volume for Oligo 

  5%|███▉                                                                               | 1/21 [00:21<07:12, 21.63s/it]

in 21.630s.
Analyze cell hull volume for L5 IT 

 10%|███████▉                                                                           | 2/21 [00:30<04:33, 14.38s/it]

in 9.299s.
Analyze cell hull volume for Micro 

 14%|███████████▊                                                                       | 3/21 [00:36<03:06, 10.36s/it]

in 5.582s.
Analyze cell hull volume for Peri 

 19%|███████████████▊                                                                   | 4/21 [00:39<02:06,  7.45s/it]

in 2.974s.
Analyze cell hull volume for Endo 

 24%|███████████████████▊                                                               | 5/21 [00:48<02:10,  8.17s/it]

in 9.449s.
Analyze cell hull volume for Astro 

 29%|███████████████████████▋                                                           | 6/21 [01:00<02:21,  9.44s/it]

in 11.903s.
Analyze cell hull volume for OPC 

 33%|███████████████████████████▋                                                       | 7/21 [01:04<01:47,  7.71s/it]

in 4.148s.
Analyze cell hull volume for L6 CT 

 38%|███████████████████████████████▌                                                   | 8/21 [01:22<02:22, 10.98s/it]

in 17.985s.
Analyze cell hull volume for L5 ET 

 43%|███████████████████████████████████▌                                               | 9/21 [01:27<01:45,  8.81s/it]

in 4.046s.
Analyze cell hull volume for L5/6 NP 

 48%|███████████████████████████████████████                                           | 10/21 [01:29<01:15,  6.84s/it]

in 2.425s.
Analyze cell hull volume for Pvalb 

 52%|██████████████████████████████████████████▉                                       | 11/21 [01:33<00:59,  5.98s/it]

in 4.025s.
Analyze cell hull volume for L6 IT 

 57%|██████████████████████████████████████████████▊                                   | 12/21 [01:41<00:58,  6.45s/it]

in 7.534s.
Analyze cell hull volume for Lamp5 

 62%|██████████████████████████████████████████████████▊                               | 13/21 [01:42<00:39,  4.97s/it]

in 1.546s.
Analyze cell hull volume for L6b 

 67%|██████████████████████████████████████████████████████▋                           | 14/21 [01:45<00:30,  4.42s/it]

in 3.152s.
Analyze cell hull volume for Sst 

 71%|██████████████████████████████████████████████████████████▌                       | 15/21 [01:48<00:23,  3.99s/it]

in 3.009s.
Analyze cell hull volume for SMC 

 76%|██████████████████████████████████████████████████████████████▍                   | 16/21 [01:50<00:16,  3.27s/it]

in 1.597s.
Analyze cell hull volume for L4/5 IT 

 81%|██████████████████████████████████████████████████████████████████▍               | 17/21 [02:00<00:21,  5.39s/it]

in 10.315s.
Analyze cell hull volume for L2/3 IT 

 86%|██████████████████████████████████████████████████████████████████████▎           | 18/21 [02:11<00:21,  7.05s/it]

in 10.907s.
Analyze cell hull volume for Vip 

 90%|██████████████████████████████████████████████████████████████████████████▏       | 19/21 [02:12<00:10,  5.30s/it]

in 1.222s.
Analyze cell hull volume for Sncg 

 95%|██████████████████████████████████████████████████████████████████████████████    | 20/21 [02:13<00:03,  3.85s/it]

in 0.460s.
Analyze cell hull volume for VLMC 

100%|██████████████████████████████████████████████████████████████████████████████████| 21/21 [02:14<00:00,  6.40s/it]

in 1.171s.





Unnamed: 0_level_0,subclass,nucleus_volume,total_spots
uid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
259202492748634617304623818845147108919,Oligo,255.043678,863
336074706103419484501535095917410831417,Oligo,298.589934,1426
179117357717369278884189379535888871018,Oligo,305.797609,1601
36321914521561890358202806077673807953,Oligo,76.777445,210
92271180002801802991585284299263797402,Oligo,376.983590,2034
...,...,...,...
89559371323596133331060964655277765256,VLMC,183.802964,584
144314319466287448491677682735506973,VLMC,92.940088,387
71119619609696733523981100580894538756,VLMC,237.601248,734
29724298272863708674304529591452401156,VLMC,210.980658,1018


## save

In [20]:
# Write the merged per-cell volume table (indexed by uid) to the analysis folder.
# The file name is a plain string: it has no placeholders, so no f-string is needed.
output_fname = os.path.join(output_analysis_folder, 'nucleus_volume_convexhull_bysubclass.csv')

merged_cell_volume_df.to_csv(output_fname)