# 1. Load packages

The analysis package is available as [ImageAnalysis3](https://github.com/zhengpuas47/ImageAnalysis3),

or as the Zhuang lab archived [source_tools](https://github.com/ZhuangLab/Chromatin_Analysis_2020_cell/tree/master/sequential_tracing/source).

In [1]:
# Run the ImageAnalysis3 start-up script inside this kernel's namespace.
# NOTE(review): `sys` and `os` are used below before any visible import, and later
# cells use `np`, `pickle`, `time` and `reload` without importing them - presumably
# they are provided by this script and/or the star-import below; confirm.
%run "C:\Users\shiwei\Documents\ImageAnalysis3\required_files\Startup_py3.py"
# Make the folder that contains the ImageAnalysis3 package importable.
sys.path.append(r"C:\Users\shiwei\Documents")

import ImageAnalysis3 as ia
# Select the interactive "notebook" matplotlib backend.
%matplotlib notebook

from ImageAnalysis3 import *
# Print the process id of this kernel.
print(os.getpid())

import h5py
from ImageAnalysis3.classes import _allowed_kwds
import ast

import pandas as pd

35420


See **functions** in the repository for [AnalysisTool_Chromatin](../../functions/README.md)

In [2]:
# Chromatin_analysis_tools (ATC)
# Get path for the py containing functions
import os
import sys
import importlib
module_path =r'C:\Users\shiwei\Documents\AnalysisTool_Chromatin'
# Append the tool folder only when it is missing, so re-running the cell
# does not add the same entry to sys.path twice.
if module_path not in sys.path:
    sys.path.append(module_path)
    
# import relevant modules
# Each import is followed by importlib.reload so that the module is re-executed
# from its file every time this cell runs.
import gene_selection 
importlib.reload(gene_selection)
import gene_to_loci
importlib.reload(gene_to_loci)
import gene_activity
importlib.reload(gene_activity)
import loci_1d_features
importlib.reload(loci_1d_features)  

import atac_to_loci
# Last expression of the cell: the returned module object is what the cell displays.
importlib.reload(atac_to_loci)

<module 'atac_to_loci' from 'C:\\Users\\shiwei\\Documents\\AnalysisTool_Chromatin\\atac_to_loci.py'>

In [3]:
import seaborn as sns
import scanpy as sc

## 1.1 Define output folder

In [4]:
# Root folder of this analysis; tables and figures go into dedicated sub-folders.
output_main_folder = r'L:\Shiwei\Figures\MOp_draft_2023_v2\AB_compartment_th_zero'
output_analysis_folder = os.path.join(output_main_folder, 'analysis')
output_figure_folder = os.path.join(output_main_folder, 'figures')

# When False, missing folders are NOT created (existing ones are still reported).
make_output_folder = True

# Same create-or-report logic for both sub-folders, analysis first, then figures.
for _folder_label, _folder in (('analysis', output_analysis_folder),
                               ('figure', output_figure_folder)):
    _folder_exists = os.path.exists(_folder)
    if make_output_folder and not _folder_exists:
        os.makedirs(_folder)
        print(f'Generating {_folder_label} folder: {_folder}.')
    elif _folder_exists:
        print(f'Use existing {_folder_label} folder: {_folder}.')


Use existing analysis folder: L:\Shiwei\Figures\MOp_draft_2023_v2\AB_compartment_th_zero\analysis.
Use existing figure folder: L:\Shiwei\Figures\MOp_draft_2023_v2\AB_compartment_th_zero\figures.


## 1.2 Set up plotting parameters

In [5]:
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns

from ImageAnalysis3.figure_tools import _double_col_width, _single_col_width, _font_size, _ticklabel_size,_ticklabel_width

# Font type 42 (TrueType) for text in exported PDF figures.
matplotlib.rcParams['pdf.fonttype'] = 42
# Use the serif family, with Arial as the serif font.
plt.rc('font', family='serif', serif='Arial')

# Seaborn "paper" context with font sizes taken from the ImageAnalysis3 figure settings;
# titles are one point larger than the base font size.
_context_rc = {
    "font.size": _font_size,
    "axes.titlesize": _font_size+1,
    "axes.labelsize": _font_size,
}
sns.set_context("paper", rc=_context_rc)

In [6]:
# Cell-type labels (from RNA-MERFISH and cell-type prediction) kept for analysis.
# The catch-all label 'other' is deliberately left out of this list.
selected_cell_labels = [
    'L2/3 IT', 'L4/5 IT', 'L5 IT', 'L6 IT', 'L5 ET', 'L5/6 NP', 'L6 CT', 'L6b',
    'Sst', 'Pvalb', 'Lamp5', 'Sncg', 'Vip',
    'Astro', 'Oligo', 'OPC', 'Micro', 'Endo', 'VLMC', 'SMC', 'Peri',
]

# Colour of every cell type, matching the RNA-MERFISH UMAP and statistics plots.
# Keys are in alphabetical order; 'other' has a colour even though it is not selected above.
celltype_palette = {
    'Astro': 'lightcoral',    'Endo': 'skyblue',
    'L2/3 IT': 'gold',        'L4/5 IT': 'darkorange',
    'L5 ET': 'mediumseagreen', 'L5 IT': 'aqua',
    'L5/6 NP': 'darkgreen',   'L6 CT': 'brown',
    'L6 IT': 'magenta',       'L6b': 'blue',
    'Lamp5': 'orange',        'Micro': 'peachpuff',
    'OPC': 'thistle',         'Oligo': 'darkviolet',
    'Peri': 'sandybrown',     'Pvalb': 'springgreen',
    'SMC': 'rosybrown',       'Sncg': 'darkkhaki',
    'Sst': 'steelblue',       'VLMC': 'saddlebrown',
    'Vip': 'red',             'other': 'slategray',
}


In [7]:
# Plotting order of the cell types, noted based on the snRNA transcriptional activity.
# It is used for all the graphs in this notebook; re-calculate it from the snRNA data
# (section 3.1.1) if it needs to be refreshed.
sorted_cellplot_order = [
    'Micro', 'Oligo', 'Endo', 'OPC', 'Astro', 'Vip',
    'Lamp5', 'L5/6 NP', 'Sst', 'Sncg', 'Pvalb', 'L4/5 IT',
    'L6 CT', 'L6 IT', 'L6b', 'L2/3 IT', 'L5 IT', 'L5 ET',
]

# 2. Load codebook

## Codebook

Example of [codebook](../../postanalysis/0_locus_annotation/resources/MERFISH_loci_adjacent_genes_tss_more_res.csv)

In [8]:
# Locate the locus codebook (with adjacent-gene / cell-type annotation columns).
# L drive is Crick Pu_SSD_0
codebook_save_folder=r'C:\Users\shiwei\Documents\AnalysisTool_Chromatin\resources'
codebook_fname = os.path.join(codebook_save_folder,'MERFISH_loci_adjacent_genes_tss_more_res.csv')

# Read the codebook (first CSV column becomes the index) and sort it by chromosome
# order right away, so that matrices can be sliced directly in DataFrame order.
codebook_df = loci_1d_features.sort_loci_df_by_chr_order(
    pd.read_csv(codebook_fname, index_col=0)
)

# Preview the first rows.
codebook_df.head()

Unnamed: 0_level_0,name,id,chr,chr_order,library,dtype,adjacent_genes_0kb_tss,adjacent_genes_10kb_tss,adjacent_genes_50kb_tss,adjacent_genes_100kb_tss,adjacent_genes_200kb_tss,adjacent_genes_500kb_tss,adjacent_genes_1000kb_tss,adjacent_genes_1500kb_tss,adjacent_genes_2000kb_tss
loci_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
chr1_3742742_3759944,1:3742742-3759944,1,1,0,CTP11,combo,intergenic,intergenic,intergenic,Xkr4,Xkr4,Gm18956; Gm7341; Xkr4,Gm18956; Gm2053; Gm6085; Gm6101; Gm6119; Gm734...,Atp6v1h; Gm16041; Gm17101; Gm18956; Gm2053; Gm...,Atp6v1h; Gm16041; Gm17101; Gm18956; Gm2053; Gm...
chr1_6245958_6258969,1:6245958-6258969,2,1,1,CTP11,combo,intergenic,intergenic,4732440D04Rik; Rb1cc1,4732440D04Rik; Rb1cc1,4732440D04Rik; Gm19026; Gm2147; Rb1cc1,4732440D04Rik; Gm19026; Gm2147; Npbwr1; Rb1cc1...,4732440D04Rik; Gm19026; Gm2147; Gm5694; Gm7182...,4732440D04Rik; Atp6v1h; Gm16041; Gm17101; Gm19...,4732440D04Rik; Atp6v1h; Gm16041; Gm17101; Gm19...
chr1_8740008_8759916,1:8740008-8759916,3,1,2,CTP11,combo,intergenic,intergenic,Gm15452,Gm15452,Gm15452,Gm15452,1700034P13Rik; 2610203C22Rik; Adhfe1; Gm15452;...,1700034P13Rik; 2610203C22Rik; Adhfe1; Arfgef1;...,1700034P13Rik; 2610203C22Rik; Adhfe1; Arfgef1;...
chr1_9627926_9637875,1:9627926-9637875,1,1,3,CTP13,combo,2610203C22Rik,2610203C22Rik,2610203C22Rik,2610203C22Rik; Adhfe1; Gm6161; Mybl1; Rrs1,1700034P13Rik; 2610203C22Rik; Adhfe1; Gm18300;...,1700034P13Rik; 2610203C22Rik; Adhfe1; Cops5; C...,1700034P13Rik; 2610203C22Rik; Adhfe1; Arfgef1;...,1700034P13Rik; 2610203C22Rik; Adhfe1; Arfgef1;...,1700034P13Rik; 2610203C22Rik; A830018L16Rik; A...
chr1_9799472_9811359,1:9799472-9811359,2,1,4,CTP13,combo,Gm6195,Gm6195; Sgk3,Gm6195; Sgk3,1700034P13Rik; Gm6195; Mcmdc2; Mybl1; Sgk3; Vc...,1700034P13Rik; 2610203C22Rik; Gm6195; Mcmdc2; ...,1700034P13Rik; 2610203C22Rik; Adhfe1; Arfgef1;...,1700034P13Rik; 2610203C22Rik; Adhfe1; Arfgef1;...,1700034P13Rik; 2610203C22Rik; Adhfe1; Arfgef1;...,1700034P13Rik; 2610203C22Rik; A830018L16Rik; A...


## Refgen for gene dist matrix

Example of [refgen](../0_locus_annotation/resources/refgen_df_for_jie.csv)

In [9]:
# Reference-genome table of the imaged loci, stored on the lab NAS.
codebook_folder = r'\\10.245.74.158\Chromatin_NAS_8\Analyzed_data\MouseBrain_PostAnalysis'
df_refgen_savename = os.path.join(codebook_folder,'refgen_df_for_jie.csv')
df_refgen = pd.read_csv(df_refgen_savename, index_col=0)

# Prefix every chromosome name with 'chr' (the preview below shows values such as 'chr1').
df_refgen['chr'] = df_refgen['chr'].map(lambda _chr_name: 'chr' + _chr_name)

df_refgen.head()

Unnamed: 0,name,id,chr,chr_order,library,dtype,loci_name,start,end,hyb,delta,region_id
0,1:3742742-3759944,1,chr1,0,CTP11,combo,chr1_3742742_3759944,3742742,3759944,0,0,1
1,1:6245958-6258969,2,chr1,1,CTP11,combo,chr1_6245958_6258969,6245958,6258969,1,2503216,2
2,1:8740008-8759916,3,chr1,2,CTP11,combo,chr1_8740008_8759916,8740008,8759916,2,4997266,3
3,1:9627926-9637875,1,chr1,3,CTP13,combo,chr1_9627926_9637875,9627926,9637875,3,5885184,4
4,1:9799472-9811359,2,chr1,4,CTP13,combo,chr1_9799472_9811359,9799472,9811359,4,6056730,5


In [10]:
def _chr_name_to_number(chr_name):
    """Return the numeric sort rank of a chromosome name.

    Names containing 'X' map to 20, names containing 'Y' map to 21 (X is tested
    first); every other name maps to the integer that follows 'chr'.
    """
    if 'X' in chr_name:
        return 20
    if 'Y' in chr_name:
        return 21
    return int(chr_name.split('chr')[-1])


# One pass over the 'chr' column instead of DataFrame.iterrows(), which builds a
# full Series for every row only to read a single field from it.
chrom_number = [_chr_name_to_number(_chr_name) for _chr_name in df_refgen['chr']]
df_refgen['chrom_number'] = chrom_number

# Order loci by chromosome rank, then by 'hyb' within a chromosome, and renumber
# the index 0..n-1.
df_refgen = df_refgen.sort_values(['chrom_number','hyb'], ignore_index=True)
df_refgen.head()

Unnamed: 0,name,id,chr,chr_order,library,dtype,loci_name,start,end,hyb,delta,region_id,chrom_number
0,1:3742742-3759944,1,chr1,0,CTP11,combo,chr1_3742742_3759944,3742742,3759944,0,0,1,1
1,1:6245958-6258969,2,chr1,1,CTP11,combo,chr1_6245958_6258969,6245958,6258969,1,2503216,2,1
2,1:8740008-8759916,3,chr1,2,CTP11,combo,chr1_8740008_8759916,8740008,8759916,2,4997266,3,1
3,1:9627926-9637875,1,chr1,3,CTP13,combo,chr1_9627926_9637875,9627926,9637875,3,5885184,4,1
4,1:9799472-9811359,2,chr1,4,CTP13,combo,chr1_9799472_9811359,9799472,9811359,4,6056730,5,1


In [11]:
from scipy.spatial.distance import squareform, pdist

# For every chromosome except Y: square matrix of pairwise distances between the
# 'delta' values of its loci, with loci ordered by 'hyb'.
chr_gene_dist_matrices = {}
for chrom, df_chr in df_refgen.groupby('chr'):
    if 'Y' in chrom:
        continue
    _df = df_chr.sort_values('hyb')
    # pdist expects 2-D points: 'delta' goes in the first column and the second
    # column is all zeros, so the distance depends on the 'delta' difference only.
    _delta_values = np.column_stack([_df.delta.values, np.zeros(len(_df))])
    _gene_dist = squareform(pdist(_delta_values))
    chr_gene_dist_matrices[chrom] = _gene_dist

# 3. Load AB assignment



Data can be generated from the notebook:

[2_ab_compartment_assignment_threshold_zero](2_ab_compartment_assignment_threshold_zero.ipynb)


An example of the data is in the repository as [AB_assignment_shared_ABth_zero](../4_compartment_analysis/resources/AB_assignment_shared_ABth_zero.csv)

In [12]:
# Table of A/B calls, read from this notebook's analysis output folder.
AB_summary_fname = os.path.join(
    output_analysis_folder,
    'AB_assignment_shared_ABth_zero.csv',
)
# The first CSV column is used as the row index.
AB_summary_df = pd.read_csv(
    AB_summary_fname,
    index_col=0,
)

In [14]:
# Preview the first rows: one row per locus, one column per cell class, values 'A' or 'B'.
AB_summary_df.head()

Unnamed: 0_level_0,Astro,Endo,GABA,Gluta,Micro,Oligo
loci_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
chr1_3742742_3759944,B,B,B,B,B,B
chr1_6245958_6258969,A,B,B,B,B,B
chr1_8740008_8759916,B,B,B,B,B,B
chr1_9627926_9637875,A,B,B,B,A,B
chr1_9799472_9811359,A,B,B,B,A,A


# 4. Generate AB vec

In [13]:
# Translate the A/B calls of every cell class into, per chromosome, the positions
# (within that chromosome's index list) of the loci called 'A' and of those called 'B'.
# NOTE(review): row positions of AB_summary_df are matched against indices derived
# from codebook_df, so both tables are assumed to list loci in the same order - confirm.
sort_by_region = False
from ImageAnalysis3.structure_tools import distance
chr_2_indices, chr_2_orders = distance.Generate_PlotOrder(codebook_df, codebook_df, sort_by_region=sort_by_region)

# Per-chromosome lookup "index value -> position in the chromosome's index list".
# It does not depend on the cell class, so it is built once here instead of calling
# list(...).index(...) for every locus, compartment and class (quadratic per chromosome).
# setdefault keeps the first position of a repeated value, exactly like list.index().
_chr_2_index_2_pos = {}
for _chr in chr_2_indices:
    _index_2_pos = {}
    for _pos, _ind in enumerate(chr_2_indices[_chr]):
        _index_2_pos.setdefault(_ind, _pos)
    _chr_2_index_2_pos[_chr] = _index_2_pos

celltype_chr_2_AB_dict = {}
for _group in AB_summary_df.columns:

    # row positions of the loci called A / B for this cell class
    _group_labels = np.array(AB_summary_df[_group].tolist())
    celltype_AB_dict = {
        'A': np.where(_group_labels == 'A')[0],
        'B': np.where(_group_labels == 'B')[0],
    }
    # by chr
    celltype_chr_2_AB = {}
    for _chr in chr_2_indices:
        _index_2_pos = _chr_2_index_2_pos[_chr]
        # np.intersect1d returns the shared indices sorted by value; each one is
        # converted to its position inside this chromosome.
        _chr_AB = {
            _comp: np.array(
                [_index_2_pos[_ind]
                 for _ind in np.intersect1d(chr_2_indices[_chr], celltype_AB_dict[_comp])],
                dtype=np.int32)
            for _comp in ('A', 'B')
        }
        celltype_chr_2_AB[_chr] = _chr_AB

    # append dict
    celltype_chr_2_AB_dict[_group] = celltype_chr_2_AB

# 5. Calculate AB (or transcription activity) score and density ratio

## Major class level

Data can be generated using the notebook:

[preprocess/2_dna_merfish/scripts/2_spot_pick/4_summarize_jie_to_dict](../../preprocess/2_dna_merfish/scripts/2_spot_pick/4_summarize_jie_to_dict.ipynb)

In [15]:
# Folder with the post-analysis results of the MOp wild-type dataset (lab NAS).
postanalysis_folder = r'\\10.245.74.158\Chromatin_NAS_8\Analyzed_data\MouseBrain_PostAnalysis_20230201\MOp_WT_postanalysis\postanalysis_vCW2_sorted'

class_2_chr2Zxys_filename = os.path.join(postanalysis_folder, 'class_2_chr2Zxys.pkl')
# Open through a context manager so the file handle is closed right after loading.
# NOTE: pickle.load can execute arbitrary code stored in the file - only load
# pickles from a trusted source.
with open(class_2_chr2Zxys_filename, 'rb') as _fp:
    class_2_chr2ZxysList = pickle.load(_fp)

In [16]:
from tqdm import tqdm
# Make sure the submodule is loaded, reload it to pick up on-disk edits, and only
# then bind BatchCompartmentDensities: reload() does not update names that were
# imported with `from ... import ...` before the reload, so importing first and
# reloading afterwards would leave the name pointing at the pre-reload function.
import ImageAnalysis3.compartment_tools.density
reload(ia.compartment_tools.density)
from ImageAnalysis3.compartment_tools.density import BatchCompartmentDensities
# Display the module in use (same output as the bare reload() call used to show).
ia.compartment_tools.density

<module 'ImageAnalysis3.compartment_tools.density' from 'C:\\Users\\shiwei\\Documents\\ImageAnalysis3\\compartment_tools\\density.py'>

In [17]:
# NOTE(review): repeats the reload already issued in the previous cell. Names bound
# earlier via `from ... import ...` keep pointing at the objects from before this reload.
reload(ia.compartment_tools.density)

<module 'ImageAnalysis3.compartment_tools.density' from 'C:\\Users\\shiwei\\Documents\\ImageAnalysis3\\compartment_tools\\density.py'>

In [18]:
# List the cell classes available in the loaded dictionary.
class_2_chr2ZxysList.keys()

dict_keys(['Gluta', 'GABA', 'Astro', 'Endo', 'Micro', 'Oligo'])

### AB or transcription activity score

In [19]:
# Parameters handed to BatchCompartmentDensities.
gaussian_radius = 0.5 # um
normalize_by_reg_num = False
num_threads=15

class_2_transScoreDicts = {}
# Left empty by this cell: only the call below is executed. An earlier draft made
# the same call with use_cis=True, use_trans=False to fill the cis scores.
class_2_cisScoreDicts = {}

# One (long) batch computation per cell class, using that class's own A/B assignment.
# The class name and the elapsed time are printed on the same line.
for _sel_class, _chr2ZxysList in class_2_chr2ZxysList.items():

    celltype_chr_2_AB = celltype_chr_2_AB_dict[_sel_class]

    print(_sel_class, end =' ')
    _class_start = time.time()
    class_2_transScoreDicts[_sel_class] = BatchCompartmentDensities(
        _chr2ZxysList,
        celltype_chr_2_AB,
        gaussian_radius,
        num_threads=num_threads,
        normalize_by_reg_num=normalize_by_reg_num,
    )
    print(f"in {time.time()-_class_start:.2f}s. ")

Gluta in 667.05s. 
GABA in 102.33s. 
Astro in 89.83s. 
Endo in 69.13s. 
Micro in 56.56s. 
Oligo in 272.83s. 


In [20]:
# save result
transABscores_ensemble_filename = os.path.join(output_analysis_folder, f'trans_AB_scores_notNorm_r{gaussian_radius}_byclass_th_zero.pkl')
if not os.path.exists(transABscores_ensemble_filename):
    print(transABscores_ensemble_filename)
    pickle.dump(class_2_transScoreDicts, open(transABscores_ensemble_filename, 'wb'))

L:\Shiwei\Figures\MOp_draft_2023_v2\AB_compartment_th_zero\analysis\trans_AB_scores_notNorm_r0.5_byclass_th_zero.pkl


### AB or transcription activity ratio

In [21]:
# convert into dict
class_2_transABRatioDicts = {}
# loop
for _class, _scoreDicts in class_2_transScoreDicts.items():
    print(_class, len(_scoreDicts))
    
    _ABratioDicts = [] 
    for _scoreDict in _scoreDicts:
        _chr_2_abratios = {}
        for _chr, _chr_AB_dict in _scoreDict.items():
            _AB_ratios = np.log(_chr_AB_dict['A']) - np.log(_chr_AB_dict['B'])
            _chr_2_abratios[_chr] = _AB_ratios
        # append
        _ABratioDicts.append(_chr_2_abratios)
    # assign
    class_2_transABRatioDicts[_class] = _ABratioDicts

Gluta 22173
GABA 3483


  _AB_ratios = np.log(_chr_AB_dict['A']) - np.log(_chr_AB_dict['B'])
  _AB_ratios = np.log(_chr_AB_dict['A']) - np.log(_chr_AB_dict['B'])


Astro 4970
Endo 3738
Micro 1836
Oligo 8057


In [22]:
# Save the log A/B ratios next to the scores. The pickle is written only when it does
# not exist yet; an existing file is left untouched.
transABratio_ensemble_filename = os.path.join(output_analysis_folder, f'trans_AB_ratio_notNorm_r{gaussian_radius}_byclass_th_zero.pkl')
if not os.path.exists(transABratio_ensemble_filename):
    print(transABratio_ensemble_filename)
    # The context manager flushes and closes the file even if pickling fails.
    with open(transABratio_ensemble_filename, 'wb') as _fp:
        pickle.dump(class_2_transABRatioDicts, _fp)

L:\Shiwei\Figures\MOp_draft_2023_v2\AB_compartment_th_zero\analysis\trans_AB_ratio_notNorm_r0.5_byclass_th_zero.pkl
