# 1. Load packages

The link to get [ImageAnalysis3](https://github.com/zhengpuas47/ImageAnalysis3) 

or the Zhuang lab archived [source_tools](https://github.com/ZhuangLab/Chromatin_Analysis_2020_cell/tree/master/sequential_tracing/source)

In [7]:
%run "C:\Users\shiwei\Documents\ImageAnalysis3\required_files\Startup_py3.py"
sys.path.append(r"C:\Users\shiwei\Documents")

import ImageAnalysis3 as ia
%matplotlib notebook

from ImageAnalysis3 import *
print(os.getpid())

import h5py
from ImageAnalysis3.classes import _allowed_kwds
import ast

import pandas as pd

43908


See **functions** in the repository for [AnalysisTool_Chromatin](../../README.md)

In [2]:
# Chromatin_analysis_tools (ATC)
# Get path for the py containing functions
import os
import sys
import importlib
module_path =r'C:\Users\shiwei\Documents\AnalysisTool_Chromatin'
if module_path not in sys.path:
    sys.path.append(module_path)
    
# import relevant modules
import gene_selection 
importlib.reload(gene_selection)
import gene_to_loci
importlib.reload(gene_to_loci)
import gene_activity
importlib.reload(gene_activity)
import loci_1d_features
importlib.reload(loci_1d_features)  

import atac_to_loci
importlib.reload(atac_to_loci)

<module 'atac_to_loci' from 'C:\\Users\\shiwei\\Documents\\AnalysisTool_Chromatin\\atac_to_loci.py'>

In [3]:
import seaborn as sns
import scanpy as sc

## 1.1 Define output folder

In [4]:
output_main_folder = r'L:\Shiwei\Figures\MOp_draft_2023_v1\Nuclear_organization\analysis'
output_analysis_folder = os.path.join(output_main_folder, 'analysis')
output_figure_folder = os.path.join(output_main_folder, 'figures')

make_output_folder = True

if make_output_folder and not os.path.exists(output_analysis_folder):
    os.makedirs(output_analysis_folder)
    print(f'Generating analysis folder: {output_analysis_folder}.')
elif os.path.exists(output_analysis_folder):
    print(f'Use existing analysis folder: {output_analysis_folder}.')
    
if make_output_folder and not os.path.exists(output_figure_folder):
    os.makedirs(output_figure_folder)
    print(f'Generating figure folder: {output_figure_folder}.')
elif os.path.exists(output_figure_folder):
    print(f'Use existing figure folder: {output_figure_folder}.')


Generating analysis folder: L:\Shiwei\Figures\MOp_draft_2023_v1\Nuclear_organization\analysis\analysis.
Generating figure folder: L:\Shiwei\Figures\MOp_draft_2023_v1\Nuclear_organization\analysis\figures.


## 1.2 setting up plotting parameters

In [8]:
import matplotlib
matplotlib.rcParams['pdf.fonttype'] = 42
import matplotlib.pyplot as plt
plt.rc('font', family='serif')
plt.rc('font', serif='Arial')

from ImageAnalysis3.figure_tools import _double_col_width, _single_col_width, _font_size, _ticklabel_size,_ticklabel_width

import seaborn as sns
sns.set_context("paper", rc={"font.size":_font_size,"axes.titlesize":_font_size+1,"axes.labelsize":_font_size})  

In [9]:
# cell labels from RNA-MERFISH and celltype prediction
selected_cell_labels = ['L2/3 IT','L4/5 IT','L5 IT','L6 IT','L5 ET','L5/6 NP','L6 CT','L6b',
                           'Sst','Pvalb','Lamp5','Sncg','Vip',
                           'Astro','Oligo','OPC','Micro','Endo','VLMC','SMC','Peri', 
                           #'other',
                          ]
# cell palette from RNA-MERFISH UMAP and stats
celltype_palette = {'Astro':'lightcoral', 
                    'Endo':'skyblue', 
                    'L2/3 IT':'gold', 
                    'L4/5 IT':'darkorange', 
                    'L5 ET':'mediumseagreen', 
                    'L5 IT':'aqua',
                    'L5/6 NP':'darkgreen',
                    'L6 CT':'brown',
                    'L6 IT':'magenta',
                    'L6b':'blue', 
                    'Lamp5':'orange', 
                    'Micro':'peachpuff',
                    'OPC':'thistle', 
                    'Oligo':'darkviolet',
                    'Peri':'sandybrown',
                    'Pvalb':'springgreen',
                    'SMC':'rosybrown',
                    'Sncg':'darkkhaki',
                    'Sst':'steelblue', 
                    'VLMC':'saddlebrown', 
                    'Vip':'red',
                    'other':'slategray'}


In [10]:
# this is the plotting order noted based on the snRNA transcriptional acitivty;
# we'd use this for all the graphs in this notebook
# use the snRNA data below (# section 3.1.1) if needs to re-calculate
sorted_cellplot_order = ['Micro', 'Oligo', 'Endo', 'OPC', 'Astro', 'Vip', 'Lamp5',
                  'L5/6 NP', 'Sst', 'Sncg', 'Pvalb', 'L4/5 IT', 'L6 CT',
                  'L6 IT', 'L6b', 'L2/3 IT', 'L5 IT', 'L5 ET']

# 2. Load codebook

## Codebook

**Codebook** example from [codebook_link](../resources/MERFISH_loci_adjacent_peaks_center.csv)

In [11]:
# load codebook
# L drive is Crick Pu_SSD_0
codebook_save_folder=r'\\10.245.74.158\Chromatin_NAS_8\Analyzed_data\MouseBrain_PostAnalysis_20230201\postanalysis_vCW2_sorted'

# Load sorted codebook (allows direct matrix slicing) with cell type info
# Load codebook (then sort to allow direct matrix slicing) with cell type info

codebook_fname = os.path.join(codebook_save_folder,'MERFISH_loci_adjacent_genes_tss_more_res.csv')
codebook_df = pd.read_csv (codebook_fname, index_col=0)

# sort df temporailiy so matrix can be sliced by df order directly
codebook_df = loci_1d_features.sort_loci_df_by_chr_order (codebook_df)

codebook_df.head()

Unnamed: 0_level_0,name,id,chr,chr_order,library,dtype,adjacent_genes_0kb_tss,adjacent_genes_10kb_tss,adjacent_genes_50kb_tss,adjacent_genes_100kb_tss,adjacent_genes_200kb_tss,adjacent_genes_500kb_tss,adjacent_genes_1000kb_tss,adjacent_genes_1500kb_tss,adjacent_genes_2000kb_tss
loci_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
chr1_3742742_3759944,1:3742742-3759944,1,1,0,CTP11,combo,intergenic,intergenic,intergenic,Xkr4,Xkr4,Gm18956; Gm7341; Xkr4,Gm18956; Gm2053; Gm6085; Gm6101; Gm6119; Gm734...,Atp6v1h; Gm16041; Gm17101; Gm18956; Gm2053; Gm...,Atp6v1h; Gm16041; Gm17101; Gm18956; Gm2053; Gm...
chr1_6245958_6258969,1:6245958-6258969,2,1,1,CTP11,combo,intergenic,intergenic,4732440D04Rik; Rb1cc1,4732440D04Rik; Rb1cc1,4732440D04Rik; Gm19026; Gm2147; Rb1cc1,4732440D04Rik; Gm19026; Gm2147; Npbwr1; Rb1cc1...,4732440D04Rik; Gm19026; Gm2147; Gm5694; Gm7182...,4732440D04Rik; Atp6v1h; Gm16041; Gm17101; Gm19...,4732440D04Rik; Atp6v1h; Gm16041; Gm17101; Gm19...
chr1_8740008_8759916,1:8740008-8759916,3,1,2,CTP11,combo,intergenic,intergenic,Gm15452,Gm15452,Gm15452,Gm15452,1700034P13Rik; 2610203C22Rik; Adhfe1; Gm15452;...,1700034P13Rik; 2610203C22Rik; Adhfe1; Arfgef1;...,1700034P13Rik; 2610203C22Rik; Adhfe1; Arfgef1;...
chr1_9627926_9637875,1:9627926-9637875,1,1,3,CTP13,combo,2610203C22Rik,2610203C22Rik,2610203C22Rik,2610203C22Rik; Adhfe1; Gm6161; Mybl1; Rrs1,1700034P13Rik; 2610203C22Rik; Adhfe1; Gm18300;...,1700034P13Rik; 2610203C22Rik; Adhfe1; Cops5; C...,1700034P13Rik; 2610203C22Rik; Adhfe1; Arfgef1;...,1700034P13Rik; 2610203C22Rik; Adhfe1; Arfgef1;...,1700034P13Rik; 2610203C22Rik; A830018L16Rik; A...
chr1_9799472_9811359,1:9799472-9811359,2,1,4,CTP13,combo,Gm6195,Gm6195; Sgk3,Gm6195; Sgk3,1700034P13Rik; Gm6195; Mcmdc2; Mybl1; Sgk3; Vc...,1700034P13Rik; 2610203C22Rik; Gm6195; Mcmdc2; ...,1700034P13Rik; 2610203C22Rik; Adhfe1; Arfgef1;...,1700034P13Rik; 2610203C22Rik; Adhfe1; Arfgef1;...,1700034P13Rik; 2610203C22Rik; Adhfe1; Arfgef1;...,1700034P13Rik; 2610203C22Rik; A830018L16Rik; A...


# 3. Initiate save main dict

In [19]:
sel_class_list = ['L2/3 IT','L4/5 IT','L5 IT','L6 IT',
                  'L5 ET','L5/6 NP','L6 CT','L6b',
                           'Sst','Pvalb', 'CGE',#'Lamp5','Sncg','Vip',
                           'Astro','Oligo','OPC','Micro','Endo',#'VLMC','SMC','Peri', 
                           #'other',
                          ]

ref_measure_dict_byGroup = {k:pd.DataFrame(codebook_df.index) for k in sel_class_list}

In [20]:
ref_measure_dict_byGroup['L2/3 IT']

Unnamed: 0,loci_name
0,chr1_3742742_3759944
1,chr1_6245958_6258969
2,chr1_8740008_8759916
3,chr1_9627926_9637875
4,chr1_9799472_9811359
...,...
1977,chrX_166247682_166259932
1978,chrX_167157164_167167452
1979,chrX_168746045_168757590
1980,chrX_169963295_170005197


# 4. Add pairtag data average for different modality

Data of different marks can be generated using the codebook as:

[external/scripts/pair_tag/2_adjacent_h3k27ac_peak_annotation_for_merfish_loci](../../../external/scripts/pair_tag/2_adjacent_h3k27ac_peak_annotation_for_merfish_loci.ipynb)

In [25]:
import tqdm

rna_folder = r'F:\Chromatin_v3r_result\Gene_expression\Pairtag\10x_subclass'
#rna_folder = r'F:\Chromatin_v3r_result\Gene_expression\10x_snRNA\10x_class'

target_mode_list = ['H3K9me3','H3K27me3','H3K27ac','H3K4me3']

#bin_size = 2000
for bin_size in  [0,50, 500, 2000]
    for target_mode in target_mode_list[:]:
        for _class in tqdm.tqdm(sel_class_list):
        #for _class in tqdm.tqdm(['Astro','Oligo','GLuta']):
            _class_name = _class.replace(' ','_').replace('/','_')
            #print(_class_name)
            rna_fname = os.path.join(rna_folder,f'MERFISH_loci_{target_mode}_2X_{bin_size}kb_for_{_class_name}.csv')
            if os.path.exists(rna_fname):
                rna_df = pd.read_csv(rna_fname,index_col=0)
                #rna_df = pd.read_csv(os.path.join(rna_folder,r'Astro_genome_Adjacent_1Mb.csv'),index_col=0)
                rna_df['mean_rna'] = rna_df.mean(axis=1)
                ref_measure_dict_byGroup[_class][f'mean_{target_mode}_2X_{bin_size}kb'] = rna_df['mean_rna'].tolist()

100%|██████████████████████████████████████████████████████████████████████████████████| 16/16 [00:02<00:00,  7.94it/s]
100%|██████████████████████████████████████████████████████████████████████████████████| 16/16 [00:01<00:00, 14.18it/s]
100%|██████████████████████████████████████████████████████████████████████████████████| 16/16 [00:02<00:00,  7.10it/s]
100%|██████████████████████████████████████████████████████████████████████████████████| 16/16 [00:00<00:00, 19.70it/s]


# 6. Save the celltype means

In [26]:
ref_measure_dict_byGroup[_class]

Unnamed: 0,loci_name,mean_H3K9me3_2X_2000kb,mean_H3K27me3_2X_2000kb,mean_H3K27ac_2X_2000kb,mean_H3K4me3_2X_2000kb
0,chr1_3742742_3759944,1.964467,0.243902,2.309735,0.225806
1,chr1_6245958_6258969,3.558376,0.536585,3.309735,0.322581
2,chr1_8740008_8759916,4.177665,0.560976,1.840708,0.322581
3,chr1_9627926_9637875,3.771574,0.487805,2.230088,0.483871
4,chr1_9799472_9811359,3.578680,0.487805,2.194690,0.483871
...,...,...,...,...,...
1977,chrX_166247682_166259932,1.269036,0.292683,2.309735,0.225806
1978,chrX_167157164_167167452,1.355330,0.439024,1.938053,0.354839
1979,chrX_168746045_168757590,0.715736,0.341463,1.902655,0.258065
1980,chrX_169963295_170005197,0.492386,0.268293,0.743363,0.193548


In [27]:
for _class, _class_df in ref_measure_dict_byGroup.items():
    _class = _class.replace(' ','_').replace('/','_')
    _df_savefname = os.path.join(output_analysis_folder,f'subclass_{_class}_pairtag_mean_by_loci_more_res.csv')
    _class_df.to_csv(_df_savefname)

In [24]:
_df_savefname

'L:\\Shiwei\\Figures\\MOp_draft_2023_v1\\Nuclear_organization\\analysis\\analysis\\subclass_Endo_pairtag_mean_by_loci.csv'