In [1]:
import os

import numpy as np
import pandas as pd
import scanpy as sc
from sklearn.neural_network import MLPClassifier

import matplotlib.pyplot as plt
import seaborn as sns

# Global scanpy configuration for this notebook: number of parallel jobs and
# the default figure appearance.
sc.settings.n_jobs = 24
sc.settings.set_figure_params(
    dpi=180,
    dpi_save=300,
    frameon=False,
    figsize=(4, 4),
    fontsize=8,
    facecolor='white',
)

In [2]:
import numpy as np
import pandas as pd
import anndata as ad
from scipy.sparse import csr_matrix
# Record the anndata version this notebook was run with.
print(ad.__version__)

0.8.0


In [3]:
%run "C:\Users\shiwei\Documents\ImageAnalysis3\required_files\Startup_py3.py"
sys.path.append(r"C:\Users\shiwei\Documents")

import ImageAnalysis3 as ia
%matplotlib notebook

from ImageAnalysis3 import *
print(os.getpid())

import h5py
from ImageAnalysis3.classes import _allowed_kwds
import ast

import pandas as pd

35968


In [4]:
# Chromatin_analysis_tools (ATC): make the local helper modules importable.
import os
import sys
import importlib

module_path = r'C:\Users\shiwei\Documents\AnalysisTool_Chromatin'
if sys.path.count(module_path) == 0:
    sys.path.append(module_path)

# Import every helper module first ...
import gene_selection
import gene_to_loci
import gene_activity
import loci_1d_features
import atac_to_loci

# ... then reload each one, in the same order as before, so that the current
# version of every .py file is the one in use.
for helper_module in (gene_selection, gene_to_loci, gene_activity, loci_1d_features):
    importlib.reload(helper_module)

# Reloaded last and left as the final expression so the cell displays the module.
importlib.reload(atac_to_loci)

<module 'atac_to_loci' from 'C:\\Users\\shiwei\\Documents\\AnalysisTool_Chromatin\\atac_to_loci.py'>

In [5]:
%matplotlib inline

sc.settings.set_figure_params(dpi=300, )

# 1. Load Paired-Tag peak data

In [6]:
# Load the AnnData object that another notebook saved as .h5ad.
import os
import scanpy as sc

# L drive is Crick Pu_SSD_0
scRNA_folder = r'L:\Shiwei\DNA_MERFISH_analysis\Paired_tag\anndata'
# target_mode is part of the file name and therefore selects which file is read.
target_mode = 'H3K4me3'
h5ad_fname = f'FC_pairtag_{target_mode}.h5ad'
adata = sc.read(os.path.join(scRNA_folder, h5ad_fname))

In [7]:
# Keep a second name for the loaded object. This is a plain assignment, so
# adata_ori and adata refer to the same object (no copy is made).
adata_ori = adata
# Display the .var table; its index is used as the list of peak names below.
adata_ori.var

chr10_100002000_100003000
chr10_100004000_100005000
chr10_100007000_100008000
chr10_100008000_100009000
chr10_100009000_100010000
...
chrY_9989000_9990000
chrY_9990000_9991000
chrY_9997000_9998000
chrY_9998000_9999000
chrY_9999000_10000000


# 2. Load MERFISH codebook

In [8]:
# Folder holding the post-analysis results (network share).
analysis_save_folder = r'\\10.245.74.158\Chromatin_NAS_8\Analyzed_data\MouseBrain_PostAnalysis_20230201'

# Read the merged codebook (with cell type info) and sort it straight away with
# the project helper, so that matrices can later be sliced directly in the
# DataFrame's row order.
celltype_codebook_fname = os.path.join(analysis_save_folder, 'merged_codebook.csv')
celltype_codebook_df = pd.read_csv(celltype_codebook_fname, index_col=0).pipe(
    loci_1d_features.sort_loci_df_by_chr_order
)

# Preview the columns that identify each locus.
celltype_codebook_df[['name', 'chr', 'chr_order']].head()

Unnamed: 0,name,chr,chr_order
0,1:3742742-3759944,1,0.0
1,1:6245958-6258969,1,1.0
2,1:8740008-8759916,1,2.0
1016,1:9627926-9637875,1,3.0
1017,1:9799472-9811359,1,4.0


In [9]:
# Re-key the codebook by a reformatted locus name (e.g. chr1_3742742_3759944).
import gene_to_loci as gl

# gl.loci_pos_format returns an indexable result; only its first element is
# kept as the locus name.
celltype_codebook_df['loci_name'] = celltype_codebook_df['name'].map(
    lambda raw_name: gl.loci_pos_format(raw_name)[0]
)
# chr_order is read as float from the CSV; store it as Python ints instead.
celltype_codebook_df['chr_order'] = celltype_codebook_df['chr_order'].map(int)
celltype_codebook_df = celltype_codebook_df.set_index('loci_name')
celltype_codebook_df[['chr', 'chr_order']]

Unnamed: 0_level_0,chr,chr_order
loci_name,Unnamed: 1_level_1,Unnamed: 2_level_1
chr1_3742742_3759944,1,0
chr1_6245958_6258969,1,1
chr1_8740008_8759916,1,2
chr1_9627926_9637875,1,3
chr1_9799472_9811359,1,4
...,...,...
chrX_166247682_166259932,X,60
chrX_167157164_167167452,X,61
chrX_168746045_168757590,X,62
chrX_169963295_170005197,X,63


# 3. Load peaks for all MERFISH loci

In [10]:
# Re-import and reload atac_to_loci so the peak-assignment cells below run
# against the module as it currently exists on disk.
import atac_to_loci
importlib.reload(atac_to_loci)

<module 'atac_to_loci' from 'C:\\Users\\shiwei\\Documents\\AnalysisTool_Chromatin\\atac_to_loci.py'>

In [14]:
# Peak names are the index of the .var table of the loaded data.
peak_names_list = adata_ori.var.index.tolist()

# Start from an explicit copy of the selected codebook columns, so that columns
# added during the peak search are never written into a slice of
# celltype_codebook_df.
loci_peak_df = celltype_codebook_df[['name', 'id', 'chr', 'chr_order', 'library']].copy()

# prepare and save this for future analyses
# Distances originally listed: 0, 50, 100, 500, 1000 and 2000 kb. Only the
# 2000 kb window is computed in this cell; the recorded run took about three
# hours for this single distance.
for extend_dist in [2000*1000]:
    # Each call returns the table with one more column of adjacent peaks
    # (adjacent_peaks_2000kb_center in the recorded output).
    loci_peak_df = atac_to_loci.find_peaks_near_gene_dataframe(
        loci_peak_df,
        peak_names_list,
        sel_loci_col=None,
        extend_dist=extend_dist,
        peak_coverage_type='center',
        key_added=None,
    )

100%|████████████████████████████████████████████████████████████████████████████| 1982/1982 [3:07:25<00:00,  5.67s/it]


In [15]:
# Display the codebook columns together with the newly added adjacent-peak column.
loci_peak_df

Unnamed: 0_level_0,name,id,chr,chr_order,library,adjacent_peaks_2000kb_center
loci_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
chr1_3742742_3759944,1:3742742-3759944,1,1,0,CTP11,chr1_3000000_3001000; chr1_3003000_3004000; ch...
chr1_6245958_6258969,1:6245958-6258969,2,1,1,CTP11,chr1_4248000_4249000; chr1_4249000_4250000; ch...
chr1_8740008_8759916,1:8740008-8759916,3,1,2,CTP11,chr1_10001000_10002000; chr1_10002000_10003000...
chr1_9627926_9637875,1:9627926-9637875,1,1,3,CTP13,chr1_10001000_10002000; chr1_10002000_10003000...
chr1_9799472_9811359,1:9799472-9811359,2,1,4,CTP13,chr1_10001000_10002000; chr1_10002000_10003000...
...,...,...,...,...,...,...
chrX_166247682_166259932,X:166247682-166259932,1059,X,60,CTP11,chrX_164254000_164255000; chrX_164255000_16425...
chrX_167157164_167167452,X:167157164-167167452,990,X,61,CTP13,chrX_165169000_165170000; chrX_165178000_16517...
chrX_168746045_168757590,X:168746045-168757590,1060,X,62,CTP11,chrX_166746000_166747000; chrX_166748000_16674...
chrX_169963295_170005197,X:169963295-170005197,991,X,63,CTP13,chrX_167963000_167964000; chrX_167965000_16796...


In [16]:
# Write the table into the same folder the input .h5ad file was read from.
output_folder = scRNA_folder
output_fname = f'MERFISH_loci_adjacent_{target_mode}_center.csv'
loci_peak_df.to_csv(os.path.join(output_folder, output_fname))

In [11]:
# Peak names are the index of the .var table of the loaded data.
peak_names_list = adata_ori.var.index.tolist()

# Start from an explicit copy of the selected codebook columns, so that columns
# added during the peak search are never written into a slice of
# celltype_codebook_df.
loci_peak_df = celltype_codebook_df[['name', 'id', 'chr', 'chr_order', 'library']].copy()

# prepare and save this for future analyses
# Distances originally listed: 0, 50, 100, 500, 1000 and 2000 kb. This cell
# computes the 0, 50 and 500 kb windows; the recorded run took roughly three
# hours per distance.
for extend_dist in [0*1000, 50*1000, 500*1000]:
    # Each call returns the table with one more column of adjacent peaks
    # (adjacent_peaks_<N>kb_center in the recorded output).
    loci_peak_df = atac_to_loci.find_peaks_near_gene_dataframe(
        loci_peak_df,
        peak_names_list,
        sel_loci_col=None,
        extend_dist=extend_dist,
        peak_coverage_type='center',
        key_added=None,
    )

100%|████████████████████████████████████████████████████████████████████████████| 1982/1982 [3:05:02<00:00,  5.60s/it]
100%|████████████████████████████████████████████████████████████████████████████| 1982/1982 [2:58:23<00:00,  5.40s/it]
100%|████████████████████████████████████████████████████████████████████████████| 1982/1982 [2:42:51<00:00,  4.93s/it]


In [12]:
# Display the codebook columns together with the three adjacent-peak columns.
loci_peak_df

Unnamed: 0_level_0,name,id,chr,chr_order,library,adjacent_peaks_0kb_center,adjacent_peaks_50kb_center,adjacent_peaks_500kb_center
loci_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
chr1_3742742_3759944,1:3742742-3759944,1,1,0,CTP11,chr1_3743000_3744000; chr1_3744000_3745000; ch...,chr1_3696000_3697000; chr1_3709000_3710000; ch...,chr1_3243000_3244000; chr1_3245000_3246000; ch...
chr1_6245958_6258969,1:6245958-6258969,2,1,1,CTP11,chr1_6246000_6247000; chr1_6248000_6249000; ch...,chr1_6197000_6198000; chr1_6198000_6199000; ch...,chr1_5746000_5747000; chr1_5747000_5748000; ch...
chr1_8740008_8759916,1:8740008-8759916,3,1,2,CTP11,chr1_8740000_8741000; chr1_8741000_8742000; ch...,chr1_8720000_8721000; chr1_8721000_8722000; ch...,chr1_8240000_8241000; chr1_8241000_8242000; ch...
chr1_9627926_9637875,1:9627926-9637875,1,1,3,CTP13,chr1_9628000_9629000; chr1_9629000_9630000; ch...,chr1_9578000_9579000; chr1_9579000_9580000; ch...,chr1_10001000_10002000; chr1_10002000_10003000...
chr1_9799472_9811359,1:9799472-9811359,2,1,4,CTP13,chr1_9799000_9800000; chr1_9800000_9801000; ch...,chr1_9752000_9753000; chr1_9753000_9754000; ch...,chr1_10001000_10002000; chr1_10002000_10003000...
...,...,...,...,...,...,...,...,...
chrX_166247682_166259932,X:166247682-166259932,1059,X,60,CTP11,chrX_166248000_166249000; chrX_166250000_16625...,chrX_166199000_166200000; chrX_166207000_16620...,chrX_165750000_165751000; chrX_165752000_16575...
chrX_167157164_167167452,X:167157164-167167452,990,X,61,CTP13,chrX_167157000_167158000; chrX_167158000_16715...,chrX_167107000_167108000; chrX_167108000_16710...,chrX_166659000_166660000; chrX_166660000_16666...
chrX_168746045_168757590,X:168746045-168757590,1060,X,62,CTP11,chrX_168748000_168749000,chrX_168696000_168697000; chrX_168701000_16870...,chrX_168246000_168247000; chrX_168248000_16824...
chrX_169963295_170005197,X:169963295-170005197,991,X,63,CTP13,chrX_169964000_169965000; chrX_169967000_16996...,chrX_169913000_169914000; chrX_169917000_16991...,chrX_169466000_169467000; chrX_169476000_16947...


In [13]:
# Write the table into the same folder the input .h5ad file was read from.
output_folder = scRNA_folder
output_fname = f'MERFISH_loci_adjacent_{target_mode}_center_other_res.csv'
loci_peak_df.to_csv(os.path.join(output_folder, output_fname))