# Import module

ImageAnalysis3 is available from the [ImageAnalysis3](https://github.com/zhengpuas47/ImageAnalysis3) repository,

or from the Zhuang lab archived [source_tools](https://github.com/ZhuangLab/Chromatin_Analysis_2020_cell/tree/master/sequential_tracing/source).

## ImageAnalysis3 and basic modules

In [1]:
# Execute the ImageAnalysis3 startup script in this notebook's namespace.
# NOTE(review): `sys`, `os` and `np` are used in this notebook without an explicit
# import at this point — presumably they are provided by this startup script or by
# the star-import below; confirm.
%run "C:\Users\shiwei\Documents\ImageAnalysis3\required_files\Startup_py3.py"
# Add the folder that contains the local ImageAnalysis3 checkout to the import path
sys.path.append(r"C:\Users\shiwei\Documents")

import ImageAnalysis3 as ia
from ImageAnalysis3 import *
from ImageAnalysis3.classes import _allowed_kwds

import h5py
import ast
import pandas as pd

# Print the ID of the current process
print(os.getpid())

22196


## Chromatin_analysis_tools etc

See **functions** in the repository for [AnalysisTool_Chromatin](../../README.md)

In [2]:
# Chromatin_analysis_tools (ATC)
# Get path for the py containing functions
import os
import sys
import importlib
module_path =r'C:\Users\shiwei\Documents\AnalysisTool_Chromatin'
# Append only when missing so that re-running this cell does not duplicate the entry
if module_path not in sys.path:
    sys.path.append(module_path)
    
# import relevant modules
# Each import is followed by importlib.reload so the module is re-executed from
# its source file even if it was already imported earlier in this session.
import gene_selection 
importlib.reload(gene_selection)
import gene_to_loci
importlib.reload(gene_to_loci)
import gene_activity
importlib.reload(gene_activity)
import loci_1d_features
importlib.reload(loci_1d_features)  

import atac_to_loci
# Last expression of the cell: the reloaded module object is shown as the cell output
importlib.reload(atac_to_loci)

<module 'atac_to_loci' from 'C:\\Users\\shiwei\\Documents\\AnalysisTool_Chromatin\\atac_to_loci.py'>

# Define folders

In [3]:
# Root folder of the post-analysis and the folder holding its input files
postanalysis_folder = r'L:\Shiwei\postanalysis_2024\v0'
input_folder = os.path.join(postanalysis_folder, 'resources_from_preprocess')

# Output locations: one main folder with 'analysis' and 'figures' subfolders
output_main_folder = os.path.join(postanalysis_folder, 'higher_order_chromosome')
output_analysis_folder = os.path.join(output_main_folder, 'analysis')
output_figure_folder = os.path.join(output_main_folder, 'figures')

# When False, missing output folders are left uncreated (and nothing is printed)
make_output_folder = True

# Analysis folder: reuse it when present, otherwise create it if allowed
if os.path.exists(output_analysis_folder):
    print(f'Use existing analysis folder: {output_analysis_folder}.')
elif make_output_folder:
    os.makedirs(output_analysis_folder)
    print(f'Generating analysis folder: {output_analysis_folder}.')

# Figure folder: same policy as the analysis folder
if os.path.exists(output_figure_folder):
    print(f'Use existing figure folder: {output_figure_folder}.')
elif make_output_folder:
    os.makedirs(output_figure_folder)
    print(f'Generating figure folder: {output_figure_folder}.')

Use existing analysis folder: L:\Shiwei\postanalysis_2024\v0\higher_order_chromosome\analysis.
Use existing figure folder: L:\Shiwei\postanalysis_2024\v0\higher_order_chromosome\figures.


# Plotting parameters

In [4]:
# Render matplotlib figures inline in the notebook
%matplotlib inline
import matplotlib
# Font type 42 makes the PDF backend embed fonts as TrueType (Type 42)
matplotlib.rcParams['pdf.fonttype'] = 42
import matplotlib.pyplot as plt
# Use the 'serif' family and set Arial as the font used for that family
plt.rc('font', family='serif')
plt.rc('font', serif='Arial')

# Shared figure-size / font-size constants defined by ImageAnalysis3
from ImageAnalysis3.figure_tools import _double_col_width, _single_col_width, _font_size, _ticklabel_size,_ticklabel_width

import seaborn as sns
# NOTE(review): `_font_size` here is the value imported from figure_tools on the line
# above; the next cell rebinds `_font_size = 7` only after this call has run.
sns.set_context("paper", rc={"font.size":_font_size,"axes.titlesize":_font_size+1,"axes.labelsize":_font_size})  

In [5]:
# Other required plotting parameters
# NOTE(review): presumably figure resolution in dots per inch — confirm at the use sites
_dpi = 300
# Rebinds the `_font_size` name that was imported from ImageAnalysis3.figure_tools
# in the previous cell; code after this point sees 7.
_font_size = 7
# NOTE(review): unit not stated here; presumably inches (matplotlib figsize) — confirm
_page_width = 5.5


## cell type color-codes

In [6]:
# Cell labels from RNA-MERFISH and celltype prediction.
# 'other' is left out of this list but still has a colour in the palette below.
selected_cell_labels = [
    'L2/3 IT', 'L4/5 IT', 'L5 IT', 'L6 IT', 'L5 ET', 'L5/6 NP', 'L6 CT', 'L6b',
    'Sst', 'Pvalb', 'Lamp5', 'Sncg', 'Vip',
    'Astro', 'Oligo', 'OPC', 'Micro', 'Endo', 'VLMC', 'SMC', 'Peri',
]

# Cell-type -> matplotlib colour name (palette from RNA-MERFISH UMAP and stats).
# Entry order is kept as-is because dict iteration follows insertion order.
celltype_palette = {
    'Astro': 'lightcoral',
    'Endo': 'skyblue',
    'L2/3 IT': 'gold',
    'L4/5 IT': 'darkorange',
    'L5 ET': 'mediumseagreen',
    'L5 IT': 'aqua',
    'L5/6 NP': 'darkgreen',
    'L6 CT': 'brown',
    'L6 IT': 'magenta',
    'L6b': 'blue',
    'Lamp5': 'orange',
    'Micro': 'peachpuff',
    'OPC': 'thistle',
    'Oligo': 'darkviolet',
    'Peri': 'sandybrown',
    'Pvalb': 'springgreen',
    'SMC': 'rosybrown',
    'Sncg': 'darkkhaki',
    'Sst': 'steelblue',
    'VLMC': 'saddlebrown',
    'Vip': 'red',
    'other': 'slategray',
}


In [7]:
# Plotting order of the cell types, noted from the snRNA transcriptional activity
# (use if needed)
sorted_cellplot_order_byRNA = [
    'Micro', 'Oligo', 'Endo', 'OPC', 'Astro', 'Vip', 'Lamp5',
    'L5/6 NP', 'Sst', 'Sncg', 'Pvalb', 'L4/5 IT', 'L6 CT',
    'L6 IT', 'L6b', 'L2/3 IT', 'L5 IT', 'L5 ET',
]

# Load data relevant information

## load and format codebook

[merged codebook](../resources/merged_codebook.csv) as in the repository (merged for all DNA-MERFISH libraries)

In [8]:
# Read the merged codebook; the first CSV column is used as the row index
codebook_fname = os.path.join(input_folder, 'merged_codebook.csv')
codebook_df = pd.read_csv(codebook_fname, index_col=0)

# Reorder the rows by chr and chr_order, then preview the first rows
codebook_df = loci_1d_features.sort_loci_df_by_chr_order(codebook_df)
codebook_df.head()

Unnamed: 0,name,id,NDB_784,NDB_755,NDB_826,NDB_713,NDB_865,NDB_725,NDB_817,NDB_710,...,NDB_479,NDB_562,NDB_608,NDB_460,NDB_563,NDB_592,NDB_368,NDB_436,NDB_629,NDB_604
0,1:3742742-3759944,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1:6245958-6258969,2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1:8740008-8759916,3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1016,1:9627926-9637875,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1017,1:9799472-9811359,2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0


In [9]:
# Format the chr loci name by
# 1. changing loci name format (each codebook 'name' is passed through loci_pos_format)
# 2. keeping the relevant information: id, chr, chr_order, and library
from gene_to_loci import loci_pos_format
loci_name_list = [loci_pos_format(_name) for _name in codebook_df['name'].tolist()]
loci_name_arr = np.array(loci_name_list)

# Keep only the annotation columns. The explicit .copy() makes the result an
# independent frame, so adding a column below does not write into a slice of the
# loaded codebook (which triggers pandas' SettingWithCopyWarning).
codebook_df = codebook_df[['name','id','chr','chr_order','library']].copy()
# Column 0 of the formatted array is used as the new loci name / index
codebook_df['loci_name'] = list(loci_name_arr[:,0])
codebook_df = codebook_df.set_index('loci_name')

codebook_df.head()

Unnamed: 0_level_0,name,id,chr,chr_order,library
loci_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
chr1_3742742_3759944,1:3742742-3759944,1,1,0.0,CTP11
chr1_6245958_6258969,1:6245958-6258969,2,1,1.0,CTP11
chr1_8740008_8759916,1:8740008-8759916,3,1,2.0,CTP11
chr1_9627926_9637875,1:9627926-9637875,1,1,3.0,CTP13
chr1_9799472_9811359,1:9799472-9811359,2,1,4.0,CTP13


# Load CpG density for correlation

Example of CpG information

[external/resources/refgen_with_loci_index_CG_Density](../../external/resources/refgen_with_loci_index_CG_Density.csv)

In [10]:
# CpG-density table (one row per locus), read from the analysis folder
CG_fname = os.path.join(
    output_analysis_folder,
    'refgen_with_loci_index_CG_Density.csv',
)
MOp_CG = pd.read_csv(CG_fname)
# Show the table as the cell output
MOp_CG

Unnamed: 0,name,id,chr,chr_order,library,dtype,loci_name,start,end,hyb,delta,loci_index,CG_density_100kb,CG_density_1Mb
0,1:3742742-3759944,1,chr1,0,CTP11,combo,chr1_3742742_3759944,3742742,3759944,0,0,0,453,4624
1,1:6245958-6258969,2,chr1,1,CTP11,combo,chr1_6245958_6258969,6245958,6258969,1,2503216,1,725,5382
2,1:8740008-8759916,3,chr1,2,CTP11,combo,chr1_8740008_8759916,8740008,8759916,2,4997266,2,533,4294
3,1:9627926-9637875,1,chr1,3,CTP13,combo,chr1_9627926_9637875,9627926,9637875,3,5885184,3,683,7266
4,1:9799472-9811359,2,chr1,4,CTP13,combo,chr1_9799472_9811359,9799472,9811359,4,6056730,4,1008,7658
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1912,9:118751319-118759956,559,chr9,97,CTP11,combo,chr9_118751319_118759956,118751319,118759956,97,115000180,1074,1155,13802
1913,9:119425689-119434008,544,chr9,98,CTP13,combo,chr9_119425689_119434008,119425689,119434008,98,115674550,1075,1904,14885
1914,9:121252615-121259973,560,chr9,99,CTP11,combo,chr9_121252615_121259973,121252615,121259973,99,117501476,1076,1242,12677
1915,9:123457587-123467505,545,chr9,100,CTP13,combo,chr9_123457587_123467505,123457587,123467505,100,119706448,1077,803,7377


# Load PCA results

Data below can be generated from notebook:

[3_1_ab_compartment_and_pca_analysis_pm_majorType](3_1_ab_compartment_and_pca_analysis_pm_majorType.ipynb)

In [11]:
import pickle

# PCA results saved by the A/B compartment / PCA analysis notebook.
# NOTE: unpickling can execute arbitrary code — only load files you generated yourself.
dict_pca_fname = os.path.join(output_analysis_folder, 'majorType_pca_analysis_result.pkl')
# The context manager closes the file handle as soon as the object is loaded
with open(dict_pca_fname, 'rb') as _pca_handle:
    dict_pca_result = pickle.load(_pca_handle)

In [12]:
# Results are keyed by (celltype, chromosome) tuples; list the fields stored for one entry
dict_pca_result[('Astro','chr1')].keys()

dict_keys(['celltype', 'chrom', 'norm_pc1', 'norm_pc2', 'norm_pc3', 'pca_explained_variance_ratio', 'correlation_map'])

# Determine A/B compartment by correlation to CpG density

In [13]:
# Load class_2_median to get cell type names
# (class to median distance dict; only its keys are used as the cell-type list)
# NOTE: unpickling can execute arbitrary code — only load files you generated yourself.
class_2_median_filename = os.path.join(input_folder, 'class_2_medianDict.pkl')
# The context manager closes the file handle as soon as the object is loaded
with open(class_2_median_filename, 'rb') as _median_handle:
    class_2_medianDict = pickle.load(_median_handle)
print(class_2_medianDict.keys())

dict_keys(['Gluta', 'GABA', 'Astro', 'Endo', 'Micro', 'Oligo'])


## convert results to dataframe and determine A/B

In [14]:
from sklearn.linear_model import LinearRegression
from scipy import stats

# For every (cell type, chromosome) pair, correlate each of the three normalized
# PCs with the per-locus CpG density (Spearman) and record, per PC:
#   - the PC values, sign-flipped when needed so they correlate positively with CpG
#   - the explained variance ratio
#   - the absolute correlation to CpG density (rounded to 2 decimals)
#   - a 0/1 flag that is 1 for the PC with the highest absolute correlation,
#     i.e. the PC taken as the A/B compartment axis

# keys of the three PCs stored in each PCA result entry
_pc_keys = ['norm_pc1', 'norm_pc2', 'norm_pc3']
_n_pc = len(_pc_keys)

# init dict to save results (one row per cell type x chromosome x PC)
cell_explained_dict = {'celltype':[], 'chr':[], 'PC':[],
                       'PC_values':[],
                       'explained_variance_ratio':[],
                       'PC_correlation_to_CG':[],
                       'is_A/B_compartment_CG':[],
                       }

# process for chr1-chr19
sorted_chroms = [f'chr{_c}' for _c in range(1, 20)]


for _celltype in class_2_medianDict.keys():

    for _chr in sorted_chroms:

        # look the PCA entry up once per (cell type, chromosome)
        _pca_res = dict_pca_result[(_celltype, _chr)]

        explained_res = _pca_res['pca_explained_variance_ratio']
        cell_explained_dict['celltype'].extend([_celltype]*_n_pc)
        cell_explained_dict['chr'].extend([_chr]*_n_pc)
        cell_explained_dict['PC'].extend(range(1, _n_pc+1))
        cell_explained_dict['explained_variance_ratio'].extend(explained_res)

        # CpG density of the loci on this chromosome, in table order
        group_CG_chr = MOp_CG.loc[MOp_CG['chr']==_chr, 'CG_density_1Mb'].to_numpy()

        _corr_list_CG = []

        for _pc in _pc_keys:
            group_pc_chr = _pca_res[_pc]
            res_CG = stats.spearmanr(group_pc_chr, group_CG_chr)
            _abs_corr = abs(round(res_CG[0], 2))
            cell_explained_dict['PC_correlation_to_CG'].append(_abs_corr)
            _corr_list_CG.append(_abs_corr)

            # orient the PC so that higher values go with higher CpG density
            if res_CG[0]>0:
                cell_explained_dict['PC_values'].append(group_pc_chr)
            else:
                cell_explained_dict['PC_values'].append(-group_pc_chr)

        # An undefined correlation (e.g. a constant input) cannot be ranked; fail
        # with an explicit message instead of an opaque indexing error.
        if np.isnan(_corr_list_CG).any():
            raise ValueError(
                f'Spearman correlation to CpG density is NaN for {_celltype} {_chr}; '
                'cannot select the A/B compartment PC.'
            )

        # Flag the PC with the highest absolute correlation; np.argmax returns the
        # first maximum, so ties go to the lower-numbered PC.
        # NOTE(review): the ranking uses the correlations rounded to 2 decimals, so
        # near-ties are also resolved in favour of the lower-numbered PC.
        _assign_flags_CG = np.zeros(_n_pc, dtype=int)
        _assign_flags_CG[np.argmax(_corr_list_CG)] = 1
        cell_explained_dict['is_A/B_compartment_CG'].extend(_assign_flags_CG)


cell_explained_df = pd.DataFrame(cell_explained_dict)
cell_explained_df

Unnamed: 0,celltype,chr,PC,PC_values,explained_variance_ratio,PC_correlation_to_CG,is_A/B_compartment_CG
0,Gluta,chr1,1,"[-1.0720360862008338, -1.0057865000517257, -1....",0.790180,0.74,1
1,Gluta,chr1,2,"[0.8664122176491322, 0.40703518401269984, 0.61...",0.076719,0.01,0
2,Gluta,chr1,3,"[2.0233911831892715, 1.9016959118389605, 2.048...",0.056340,0.01,0
3,Gluta,chr2,1,"[0.13619794354487083, 0.4659224022060824, 0.47...",0.811000,0.73,1
4,Gluta,chr2,2,"[-1.6385857064731022, -1.9739079303383287, -2....",0.103980,0.08,0
...,...,...,...,...,...,...,...
337,Oligo,chr18,2,"[-1.6925514727373618, 0.28228300366875136, -1....",0.166048,0.67,1
338,Oligo,chr18,3,"[-0.8323201085715951, 0.26046478120384103, 0.0...",0.062434,0.13,0
339,Oligo,chr19,1,"[-1.1040887655234952, -1.12223034191823, -0.85...",0.670852,0.40,0
340,Oligo,chr19,2,"[1.0947162166179643, 1.2374328461304978, 1.600...",0.228201,0.73,1


## extract the PC values corresponding to A/B 

In [15]:
# Keep only the rows of the PC that was flagged as the A/B compartment axis
sel_PC_df = cell_explained_df[cell_explained_df['is_A/B_compartment_CG']==1]

# chr1-chr19 followed by chrX
sorted_chroms = [f'chr{_c}' for _c in np.arange(1,20)] + ['chrX']

# celltype -> flat list of A/B PC values, concatenated in sorted_chroms order
group_ABs_list_dict = {}

for _celltype in np.unique(sel_PC_df['celltype']):
    group_PC_df = sel_PC_df[sel_PC_df['celltype']==_celltype]
    _celltype_values = []
    for _chr in sorted_chroms:
        if _chr != 'chrX':
            # first (flagged) row for this chromosome holds the PC value array
            chr_PC_df = group_PC_df[group_PC_df['chr']==_chr]
            _celltype_values += list(chr_PC_df['PC_values'].values[0])
        else:
            # no PC values are available for chrX: pad with one NaN per chrX
            # locus in the codebook
            _n_chrX_loci = int((codebook_df['chr']=='X').sum())
            _celltype_values += [np.nan]*_n_chrX_loci
    group_ABs_list_dict[_celltype] = _celltype_values


## assign A/B identity from the PC sign (adjusting the PC cutoff to equalize the number of A/B across cell types is currently disabled)

In [16]:
# Map each cell-type group to its per-locus labels: 'A', 'B' or 'unassigned'
group_ABs_id_dict = {}

# Fixed cutoff on the oriented PC value. The per-cell-type cutoff that would give
# equal numbers of A and B (get_AB_th_for_equalAB) is not applied.
sel_AB_th = 0

for _group_name, _group_pc_values in group_ABs_list_dict.items():
    AB_group = np.array(_group_pc_values)
    # NaN -> 'unassigned'; value above the cutoff -> 'A'; otherwise -> 'B'
    group_ABs_id_dict[_group_name] = np.where(
        np.isnan(AB_group),
        'unassigned',
        np.where(AB_group > sel_AB_th, 'A', 'B'),
    )

# NOTE(review): the index drops the last codebook locus (index[:-1]); presumably the
# codebook has one trailing locus that is not covered by the chr1-19 + chrX lists
# built above — confirm that this is the row being skipped.
group_ABs_id_df = pd.DataFrame(group_ABs_id_dict, index=codebook_df.index[:-1])
group_ABs_id_df

Unnamed: 0_level_0,Astro,Endo,GABA,Gluta,Micro,Oligo
loci_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
chr1_3742742_3759944,B,A,B,B,B,B
chr1_6245958_6258969,A,A,B,B,B,B
chr1_8740008_8759916,B,B,B,B,B,B
chr1_9627926_9637875,A,A,B,B,A,B
chr1_9799472_9811359,A,A,B,B,A,A
...,...,...,...,...,...,...
chrX_163750534_163758706,unassigned,unassigned,unassigned,unassigned,unassigned,unassigned
chrX_166247682_166259932,unassigned,unassigned,unassigned,unassigned,unassigned,unassigned
chrX_167157164_167167452,unassigned,unassigned,unassigned,unassigned,unassigned,unassigned
chrX_168746045_168757590,unassigned,unassigned,unassigned,unassigned,unassigned,unassigned


In [17]:
# Count how many loci were labelled 'A' and 'B' in every cell-type group
# ('unassigned' loci are not counted)
res_summary_dict = {}
for _group, _group_AB_id in group_ABs_id_dict.items():
    _n_A = np.sum(_group_AB_id=='A')
    _n_B = np.sum(_group_AB_id=='B')
    res_summary_dict[_group] = {'A': _n_A, 'B': _n_B}

# One row per group, with the A / B counts as columns
res_summary_df = pd.DataFrame(res_summary_dict).T
res_summary_df['celltype'] = res_summary_df.index
res_summary_df['total'] = res_summary_df['A'] + res_summary_df['B']
res_summary_df.head(10)

Unnamed: 0,A,B,celltype,total
Astro,920,997,Astro,1917
Endo,1041,876,Endo,1917
GABA,864,1053,GABA,1917
Gluta,858,1059,Gluta,1917
Micro,897,1020,Micro,1917
Oligo,898,1019,Oligo,1917


## save

In [18]:
# Write the A/B assignment table (loci x cell-type groups) to the analysis folder
group_ABs_id_df.to_csv(
    os.path.join(output_analysis_folder, 'AB_assignment_CG_by_pm_majorType.csv')
)