# Import module

The link to get [ImageAnalysis3](https://github.com/zhengpuas47/ImageAnalysis3) 

Or from the Zhuang lab archived [source_tools](https://github.com/ZhuangLab/Chromatin_Analysis_2020_cell/tree/master/sequential_tracing/source)

## ImageAnalysis3 and basic modules

In [1]:
%run "C:\Users\shiwei\Documents\ImageAnalysis3\required_files\Startup_py3.py"
sys.path.append(r"C:\Users\shiwei\Documents")

import ImageAnalysis3 as ia
from ImageAnalysis3 import *
from ImageAnalysis3.classes import _allowed_kwds

import h5py
import ast
import pandas as pd

print(os.getpid())

31848


## Chromatin_analysis_tools etc

See **functions** in the repository for [AnalysisTool_Chromatin](../../README.md)

In [2]:
# Chromatin_analysis_tools (ATC)
# Get path for the py containing functions
import os
import sys
import importlib
module_path =r'C:\Users\shiwei\Documents\AnalysisTool_Chromatin'
if module_path not in sys.path:
    sys.path.append(module_path)
    
# import relevant modules
import gene_selection 
importlib.reload(gene_selection)
import gene_to_loci
importlib.reload(gene_to_loci)
import gene_activity
importlib.reload(gene_activity)
import loci_1d_features
importlib.reload(loci_1d_features)  

import atac_to_loci
importlib.reload(atac_to_loci)

<module 'atac_to_loci' from 'C:\\Users\\shiwei\\Documents\\AnalysisTool_Chromatin\\atac_to_loci.py'>

# Define folders

In [3]:
# main folder for postanalysis
postanalysis_folder = r'L:\Shiwei\postanalysis_2024\v0'
# input files for postanalysis
input_folder = os.path.join(postanalysis_folder, 'resources_from_preprocess')

# output file to be generated
output_main_folder = os.path.join(postanalysis_folder, 'chromosome_scaling')

output_analysis_folder = os.path.join(output_main_folder, 'analysis')
output_figure_folder = os.path.join(output_main_folder, 'figures')

# make new folder if needed
make_output_folder = True

if make_output_folder and not os.path.exists(output_analysis_folder):
    os.makedirs(output_analysis_folder)
    print(f'Generating analysis folder: {output_analysis_folder}.')
elif os.path.exists(output_analysis_folder):
    print(f'Use existing analysis folder: {output_analysis_folder}.')
    
if make_output_folder and not os.path.exists(output_figure_folder):
    os.makedirs(output_figure_folder)
    print(f'Generating figure folder: {output_figure_folder}.')
elif os.path.exists(output_figure_folder):
    print(f'Use existing figure folder: {output_figure_folder}.')

Use existing analysis folder: L:\Shiwei\postanalysis_2024\v0\chromosome_scaling\analysis.
Use existing figure folder: L:\Shiwei\postanalysis_2024\v0\chromosome_scaling\figures.


# Plotting parameters

In [4]:
%matplotlib inline
import matplotlib
matplotlib.rcParams['pdf.fonttype'] = 42
import matplotlib.pyplot as plt
plt.rc('font', family='serif')
plt.rc('font', serif='Arial')

from ImageAnalysis3.figure_tools import _double_col_width, _single_col_width, _font_size, _ticklabel_size,_ticklabel_width

import seaborn as sns
sns.set_context("paper", rc={"font.size":_font_size,"axes.titlesize":_font_size+1,"axes.labelsize":_font_size})  

In [5]:
# Other required plotting parameters
_dpi = 300
_font_size = 7
_page_width = 5.5


## cell type color-codess

In [6]:
# cell labels from RNA-MERFISH and celltype prediction
selected_cell_labels = ['L2/3 IT','L4/5 IT','L5 IT','L6 IT','L5 ET','L5/6 NP','L6 CT','L6b',
                           'Sst','Pvalb','Lamp5','Sncg','Vip',
                           'Astro','Oligo','OPC','Micro','Endo','VLMC','SMC','Peri', 
                           #'other',
                          ]
# cell palette from RNA-MERFISH UMAP and stats
celltype_palette = {'Astro':'lightcoral', 
                    'Endo':'skyblue', 
                    'L2/3 IT':'gold', 
                    'L4/5 IT':'darkorange', 
                    'L5 ET':'mediumseagreen', 
                    'L5 IT':'aqua',
                    'L5/6 NP':'darkgreen',
                    'L6 CT':'brown',
                    'L6 IT':'magenta',
                    'L6b':'blue', 
                    'Lamp5':'orange', 
                    'Micro':'peachpuff',
                    'OPC':'thistle', 
                    'Oligo':'darkviolet',
                    'Peri':'sandybrown',
                    'Pvalb':'springgreen',
                    'SMC':'rosybrown',
                    'Sncg':'darkkhaki',
                    'Sst':'steelblue', 
                    'VLMC':'saddlebrown', 
                    'Vip':'red',
                    'other':'slategray'}


In [7]:
# this is the plotting order noted based on the snRNA transcriptional acitivty if needed
sorted_cellplot_order_byRNA = ['Micro', 'Oligo', 'Endo', 'OPC', 'Astro', 'Vip', 'Lamp5',
                  'L5/6 NP', 'Sst', 'Sncg', 'Pvalb', 'L4/5 IT', 'L6 CT',
                  'L6 IT', 'L6b', 'L2/3 IT', 'L5 IT', 'L5 ET']

# Load data relevant information

## load and format codebook

[merged codebook](../resources/merged_codebook.csv) as in the repository (merged for all DNA-MERFISH libraries)

In [8]:
# Load codebook 
codebook_fname = os.path.join(input_folder,'merged_codebook.csv')
codebook_df = pd.read_csv (codebook_fname, index_col=0)

# sort df by chr and chr_order
codebook_df = loci_1d_features.sort_loci_df_by_chr_order (codebook_df)
codebook_df.head()

Unnamed: 0,name,id,NDB_784,NDB_755,NDB_826,NDB_713,NDB_865,NDB_725,NDB_817,NDB_710,...,NDB_479,NDB_562,NDB_608,NDB_460,NDB_563,NDB_592,NDB_368,NDB_436,NDB_629,NDB_604
0,1:3742742-3759944,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1:6245958-6258969,2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1:8740008-8759916,3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1016,1:9627926-9637875,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1017,1:9799472-9811359,2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0


In [9]:
# Format the chr loci name by 
# 1. changing loci name format
# 2. extract relevant information such as id, chr, chr_order, and library etc
from gene_to_loci import loci_pos_format
loci_name_list = list(map(loci_pos_format, codebook_df['name'].tolist()))
loci_name_arr = np.array(loci_name_list)

# convert to a new dataframe and set loci name as index
codebook_df = codebook_df[['name','id','chr','chr_order','library']]
codebook_df['loci_name'] = list(loci_name_arr[:,0])
codebook_df = codebook_df.set_index ('loci_name')

codebook_df.head()

Unnamed: 0_level_0,name,id,chr,chr_order,library
loci_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
chr1_3742742_3759944,1:3742742-3759944,1,1,0.0,CTP11
chr1_6245958_6258969,1:6245958-6258969,2,1,1.0,CTP11
chr1_8740008_8759916,1:8740008-8759916,3,1,2.0,CTP11
chr1_9627926_9637875,1:9627926-9637875,1,1,3.0,CTP13
chr1_9799472_9811359,1:9799472-9811359,2,1,4.0,CTP13


# Load spatial distance from matrices

In [10]:
# Load class_2_median
# class to median distance dict
class_2_median_filename = os.path.join(input_folder, 'subclass_2_medianDict.pkl')

class_2_medianDict = pickle.load(open(class_2_median_filename, 'rb'))
print(class_2_medianDict.keys())

dict_keys(['Oligo', 'L5 IT', 'Micro', 'Peri', 'Endo', 'Astro', 'OPC', 'L6 CT', 'L5 ET', 'L5/6 NP', 'Pvalb', 'L6 IT', 'Lamp5', 'L6b', 'Sst', 'SMC', 'L4/5 IT', 'L2/3 IT', 'Vip', 'Sncg', 'VLMC'])


# Summary distances for all celltypes

In [11]:
from ImageAnalysis3.structure_tools import distance
sort_by_region=False
chr_2_indices, chr_2_orders = distance.Generate_PlotOrder(codebook_df, codebook_df, sort_by_region=False) ## load codebook


## load unnormalized distmap

In [13]:
# get unnormalized distmap
cis_distmap_dict = {}
sel_codebook = codebook_df.copy(deep=True)
for _class in class_2_medianDict.keys():

    _cis_mat, chr_edges, chr_names = distance.assemble_ChrDistDict_2_Matrix(
                                            class_2_medianDict[_class], codebook_df, 
                                            sel_codebook=sel_codebook, 
                                            use_cis=True,use_trans=False, sort_by_region=sort_by_region,
                                  )
    
    cis_distmap_dict[_class] = _cis_mat

In [14]:
import pickle
pickle.dump(cis_distmap_dict,open(os.path.join(output_analysis_folder,'raw_cis_distmap_subclass.pkl'),'wb'))

## get all chr-by-pair dict

In [15]:
loc_pair_idx_list_bychr = {}
for _chr_key in list(chr_2_orders.keys())[:]:
    # get loc name info
    loci_list = codebook_df.iloc[chr_2_indices[_chr_key]].index.tolist()
    # make loci idx mat from codebook
    codebook_idx_mat = np.empty([len(loci_list), len(loci_list)], dtype=object)
    for i in range(len(loci_list)):
        for j in range(len(loci_list)):
            codebook_idx_mat[i,j] = (i,j)
    # upper tri for the unique pair idx set        
    loc_pair_idx_list = codebook_idx_mat[np.triu_indices(len(codebook_idx_mat),0)] # 0 to include the trans-homolog allele 
    loc_pair_idx_list_bychr[_chr_key] = loc_pair_idx_list

loc_pair_idx_list_bychr['1']

array([(0, 0), (0, 1), (0, 2), ..., (151, 151), (151, 152), (152, 152)],
      dtype=object)

## summarize unnormalized distmap

In [17]:
# summarize all unique loci interactions across celltypes            
distmap_summary_dict = {'loc_1':[],'loc_2':[], 'pairwise_distance':[],'subclass':[], 'chr':[]}

for _class in class_2_medianDict.keys():
    _cis_mat = cis_distmap_dict[_class]
    # get cist mat for each chr
    for _chr_key in list(chr_2_orders.keys())[:]:
        _cis_mat_chr = _cis_mat[chr_2_indices[_chr_key],:]
        _cis_mat_chr = _cis_mat_chr[:,chr_2_indices[_chr_key]]
        _dists = _cis_mat_chr[np.triu_indices(len(_cis_mat_chr),0)] # 0 to include the trans-homolog allele
        distmap_summary_dict['pairwise_distance'].extend(_dists)
        distmap_summary_dict['subclass'].extend([_class,]*len(_dists))
        distmap_summary_dict['chr'].extend([_chr_key,]*len(_dists))
        # add loci name saved for each chr
        loci_list = codebook_df.iloc[chr_2_indices[_chr_key]].index.tolist()
        loc_pair_idx_list = loc_pair_idx_list_bychr[_chr_key] 
        distmap_summary_dict['loc_1'].extend(loci_list[idx[0]] for idx in loc_pair_idx_list)
        distmap_summary_dict['loc_2'].extend(loci_list[idx[1]] for idx in loc_pair_idx_list)
    
distmap_summary_df = pd.DataFrame(distmap_summary_dict)
distmap_summary_df.index.name='loci_pairs'
distmap_summary_df

Unnamed: 0_level_0,loc_1,loc_2,pairwise_distance,subclass,chr
loci_pairs,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,chr1_3742742_3759944,chr1_3742742_3759944,0.000000,Oligo,1
1,chr1_3742742_3759944,chr1_6245958_6258969,0.922651,Oligo,1
2,chr1_3742742_3759944,chr1_8740008_8759916,1.070654,Oligo,1
3,chr1_3742742_3759944,chr1_9627926_9637875,1.060591,Oligo,1
4,chr1_3742742_3759944,chr1_9799472_9811359,1.093519,Oligo,1
...,...,...,...,...,...
2223475,chrX_167157164_167167452,chrX_169963295_170005197,0.845943,VLMC,X
2223476,chrX_168746045_168757590,chrX_168746045_168757590,0.000000,VLMC,X
2223477,chrX_168746045_168757590,chrX_169963295_170005197,0.706862,VLMC,X
2223478,chrX_169963295_170005197,chrX_169963295_170005197,0.000000,VLMC,X


## add genomic distance

In [18]:
for _chr_name, _chr_group in distmap_summary_df.groupby(by='chr'):
    _chr_group_temp = _chr_group.copy(deep=True)
    _chr_group_temp['loc_1_start'] = _chr_group_temp['loc_1'].map(lambda x: int(x.split('_')[1]))
    _chr_group_temp['loc_2_start'] = _chr_group_temp['loc_2'].map(lambda x: int(x.split('_')[1]))
    _chr_group_temp['genomic_distance'] = _chr_group_temp['loc_2_start']-_chr_group_temp['loc_1_start']
    distmap_summary_df.loc[distmap_summary_df['chr']==_chr_name, 'genomic_distance'] = _chr_group_temp['genomic_distance'] 

distmap_summary_df.loc[:, 'genomic_distance']= distmap_summary_df['genomic_distance'].map(lambda x: int(x))
distmap_summary_df

  distmap_summary_df.loc[:, 'genomic_distance']= distmap_summary_df['genomic_distance'].map(lambda x: int(x))


Unnamed: 0_level_0,loc_1,loc_2,pairwise_distance,subclass,chr,genomic_distance
loci_pairs,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,chr1_3742742_3759944,chr1_3742742_3759944,0.000000,Oligo,1,0
1,chr1_3742742_3759944,chr1_6245958_6258969,0.922651,Oligo,1,2503216
2,chr1_3742742_3759944,chr1_8740008_8759916,1.070654,Oligo,1,4997266
3,chr1_3742742_3759944,chr1_9627926_9637875,1.060591,Oligo,1,5885184
4,chr1_3742742_3759944,chr1_9799472_9811359,1.093519,Oligo,1,6056730
...,...,...,...,...,...,...
2223475,chrX_167157164_167167452,chrX_169963295_170005197,0.845943,VLMC,X,2806131
2223476,chrX_168746045_168757590,chrX_168746045_168757590,0.000000,VLMC,X,0
2223477,chrX_168746045_168757590,chrX_169963295_170005197,0.706862,VLMC,X,1217250
2223478,chrX_169963295_170005197,chrX_169963295_170005197,0.000000,VLMC,X,0


## save

In [21]:
import tqdm
output_analysis_subfolder = os.path.join(output_analysis_folder,'Cis_dismap_summary')
if not os.path.exists(output_analysis_subfolder):
    os.mkdir(output_analysis_subfolder)
    print ('Create analysis output subfolder.')

for _class in tqdm.tqdm(class_2_medianDict.keys()):
    _class_distmap_summary_df = distmap_summary_df[distmap_summary_df['subclass']==_class]
    _class_distmap_summary_df.to_csv(os.path.join(output_analysis_subfolder,
                                                  f'raw_cis_distmap_subclass_{_class.replace("/","_").replace(" ","_")}_summary.csv'))

Create analysis output subfolder.


100%|██████████████████████████████████████████████████████████████████████████████████| 21/21 [00:16<00:00,  1.29it/s]
