# Import module

The link to get [ImageAnalysis3](https://github.com/zhengpuas47/ImageAnalysis3) 

Or from the Zhuang lab archived [source_tools](https://github.com/ZhuangLab/Chromatin_Analysis_2020_cell/tree/master/sequential_tracing/source)

## ImageAnalysis3 and basic modules

In [1]:
# Execute the ImageAnalysis3 startup script inside this kernel's namespace.
# NOTE(review): `sys` and `os` are used in this cell without being imported here,
# so they presumably come from this startup script - confirm.
%run "C:\Users\shiwei\Documents\ImageAnalysis3\required_files\Startup_py3.py"
# Make the folder that contains the ImageAnalysis3 package importable.
sys.path.append(r"C:\Users\shiwei\Documents")

import ImageAnalysis3 as ia
# NOTE(review): the star import hides where names come from; later cells use `np`
# without a visible import, presumably provided here or by the startup script - verify.
from ImageAnalysis3 import *
from ImageAnalysis3.classes import _allowed_kwds

import h5py
import ast
import pandas as pd

# Show the process id of the running kernel.
print(os.getpid())

11840


## Chromatin_analysis_tools etc

See **functions** in the repository for [AnalysisTool_Chromatin](../../README.md)

In [2]:
# Chromatin_analysis_tools (ATC)
# Get path for the py containing functions
import os
import sys
import importlib
# Machine-specific absolute path to the local AnalysisTool_Chromatin checkout.
module_path =r'C:\Users\shiwei\Documents\AnalysisTool_Chromatin'
# Append only once so re-running this cell does not add duplicate sys.path entries.
if module_path not in sys.path:
    sys.path.append(module_path)
    
# import relevant modules
# Every import is followed by importlib.reload so that the module code is re-executed
# from its file even when the module was already imported in this kernel.
import gene_selection 
importlib.reload(gene_selection)
import gene_to_loci
importlib.reload(gene_to_loci)
import gene_activity
importlib.reload(gene_activity)
import loci_1d_features
importlib.reload(loci_1d_features)  

import atac_to_loci
# Last expression of the cell: the reloaded module object is shown as the cell output.
importlib.reload(atac_to_loci)

<module 'atac_to_loci' from 'C:\\Users\\shiwei\\Documents\\AnalysisTool_Chromatin\\atac_to_loci.py'>

# Define folders

In [3]:
# main folder for postanalysis
postanalysis_folder = r'L:\Shiwei\postanalysis_2024\v0'
# input files for postanalysis
input_folder = os.path.join(postanalysis_folder, 'resources_from_preprocess')

# output folders to be generated
output_main_folder = os.path.join(postanalysis_folder, 'locus_annotation')

output_analysis_folder = os.path.join(output_main_folder, 'analysis')
output_figure_folder = os.path.join(output_main_folder, 'figures')

# make new folder if needed
make_output_folder = True

# Both output folders follow the same create-or-reuse logic; the label only
# changes the wording of the printed message.
for _folder_label, _folder in [('analysis', output_analysis_folder),
                               ('figure', output_figure_folder)]:
    if os.path.exists(_folder):
        print(f'Use existing {_folder_label} folder: {_folder}.')
    elif make_output_folder:
        # exist_ok avoids an error if the folder appears between the check and the call
        os.makedirs(_folder, exist_ok=True)
        print(f'Generating {_folder_label} folder: {_folder}.')
    else:
        # This case used to pass silently; report it so that a later write into the
        # missing folder does not fail without an obvious cause.
        print(f'Warning: {_folder_label} folder does not exist and was not created: {_folder}.')

Use existing analysis folder: L:\Shiwei\postanalysis_2024\v0\locus_annotation\analysis.
Use existing figure folder: L:\Shiwei\postanalysis_2024\v0\locus_annotation\figures.


# Plotting parameters

In [4]:
%matplotlib inline
import matplotlib
matplotlib.rcParams['pdf.fonttype'] = 42
import matplotlib.pyplot as plt
plt.rc('font', family='serif')
plt.rc('font', serif='Arial')

from ImageAnalysis3.figure_tools import _double_col_width, _single_col_width, _font_size, _ticklabel_size,_ticklabel_width

import seaborn as sns
sns.set_context("paper", rc={"font.size":_font_size,"axes.titlesize":_font_size+1,"axes.labelsize":_font_size})  

# Load data relevant information

## load codebook with gene annotation

The annotated codebook can be generated using the notebook below:


[0_locus_annotation/scripts/1_adjacent_gene_annotation_for_merfish_loci](../../0_locus_annotation/scripts/1_adjacent_gene_annotation_for_merfish_loci.ipynb)

In [6]:
# The annotated codebook is read from the analysis output folder
codebook_folder = output_analysis_folder
codebook_fname = os.path.join(codebook_folder,'MERFISH_loci_adjacent_genes_tss_more_res.csv')

# Read the table with its first column as the index, then sort it with the
# project helper (by chromosome order, judging from the helper's name)
codebook_df = loci_1d_features.sort_loci_df_by_chr_order(
    pd.read_csv(codebook_fname, index_col=0)
)

# preview the first rows
codebook_df.head()

Unnamed: 0_level_0,name,id,chr,chr_order,library,adjacent_genes_10kb_tss,adjacent_genes_50kb_tss,adjacent_genes_100kb_tss,adjacent_genes_200kb_tss,adjacent_genes_500kb_tss,adjacent_genes_1000kb_tss,adjacent_genes_1500kb_tss,adjacent_genes_2000kb_tss
loci_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
chr1_3742742_3759944,1:3742742-3759944,1,1,0.0,CTP11,intergenic,intergenic,Xkr4,Xkr4,Gm18956; Gm7341; Xkr4,Gm18956; Gm2053; Gm6085; Gm6101; Gm6119; Gm734...,Atp6v1h; Gm16041; Gm17101; Gm18956; Gm2053; Gm...,Atp6v1h; Gm16041; Gm17101; Gm18956; Gm2053; Gm...
chr1_6245958_6258969,1:6245958-6258969,2,1,1.0,CTP11,intergenic,4732440D04Rik; Rb1cc1,4732440D04Rik; Rb1cc1,4732440D04Rik; Gm19026; Gm2147; Rb1cc1,4732440D04Rik; Gm19026; Gm2147; Npbwr1; Rb1cc1...,4732440D04Rik; Gm19026; Gm2147; Gm5694; Gm7182...,4732440D04Rik; Atp6v1h; Gm16041; Gm17101; Gm19...,4732440D04Rik; Atp6v1h; Gm16041; Gm17101; Gm19...
chr1_8740008_8759916,1:8740008-8759916,3,1,2.0,CTP11,intergenic,Gm15452,Gm15452,Gm15452,Gm15452,1700034P13Rik; 2610203C22Rik; Adhfe1; Gm15452;...,1700034P13Rik; 2610203C22Rik; Adhfe1; Arfgef1;...,1700034P13Rik; 2610203C22Rik; Adhfe1; Arfgef1;...
chr1_9627926_9637875,1:9627926-9637875,1,1,3.0,CTP13,2610203C22Rik,2610203C22Rik,2610203C22Rik; Adhfe1; Gm6161; Mybl1; Rrs1,1700034P13Rik; 2610203C22Rik; Adhfe1; Gm18300;...,1700034P13Rik; 2610203C22Rik; Adhfe1; Cops5; C...,1700034P13Rik; 2610203C22Rik; Adhfe1; Arfgef1;...,1700034P13Rik; 2610203C22Rik; Adhfe1; Arfgef1;...,1700034P13Rik; 2610203C22Rik; A830018L16Rik; A...
chr1_9799472_9811359,1:9799472-9811359,2,1,4.0,CTP13,Gm6195; Sgk3,Gm6195; Sgk3,1700034P13Rik; Gm6195; Mcmdc2; Mybl1; Sgk3; Vc...,1700034P13Rik; 2610203C22Rik; Gm6195; Mcmdc2; ...,1700034P13Rik; 2610203C22Rik; Adhfe1; Arfgef1;...,1700034P13Rik; 2610203C22Rik; Adhfe1; Arfgef1;...,1700034P13Rik; 2610203C22Rik; Adhfe1; Arfgef1;...,1700034P13Rik; 2610203C22Rik; A830018L16Rik; A...


## load analyzed RNAseq annData

The AnnData object can be generated from the original source using the notebook below (after transferring the cell labels to the filtered data):

[external/scripts/sn_rna/2_prepare_and_rename_sn_rna_mop](../../../external/scripts/sn_rna/2_prepare_and_rename_sn_rna_mop.ipynb)

In [7]:
# Read the labeled AnnData object that was saved by another notebook
import os
import scanpy as sc

# L drive is Crick Pu_SSD_0
scRNA_folder =r'L:\Shiwei\DNA_MERFISH_analysis\10x_nuclei_v3_MOp_AIBS\Analysis_10X_nuclei_v3_AIBS\processed_2024'

# full path of the saved h5ad file, then load it
_adata_fname = os.path.join(scRNA_folder,r'MOp_10x_sn_labeled.h5ad')
adata = sc.read(_adata_fname)

In [11]:
# Build a separate AnnData object from the `.raw` attribute of the loaded data.
# NOTE(review): presumably `.raw` holds the un-normalized counts - confirm.
adata_ori = adata.raw.to_adata()

In [13]:
# Peek at the first 5 x 5 entries of the matrix; toarray() converts the sliced
# matrix into a dense array for display.
adata_ori.X[:5,:5].toarray()

array([[42.,  9.,  0.,  0.,  0.],
       [14.,  0.,  0.,  0.,  0.],
       [14.,  1.,  0.,  0.,  0.],
       [10.,  0.,  0.,  0.,  0.],
       [12.,  0.,  0.,  0.,  0.]], dtype=float32)

# Extract gene expression from RNA seq

In [14]:
# change index name as imaged loci to find nearby genes
import gene_to_loci as gl

# Deep copy first, so renaming the index does not touch codebook_df itself.
imaged_loci_df = codebook_df.copy(deep=True).rename_axis('Imaged_loci')

# With adjacent_gene_col=None the helper reports that it uses all existing
# adjacent-gene columns (see the message printed by this cell).
imaged_loci_df = gl.direct_get_genes_near_gene_dataframe(
    imaged_loci_df,
    codebook_df,
    adjacent_gene_col=None,
)


imaged_loci_df

Get all existing adjacent gene columns.


Unnamed: 0_level_0,name,id,chr,chr_order,library,adjacent_genes_10kb_tss,adjacent_genes_50kb_tss,adjacent_genes_100kb_tss,adjacent_genes_200kb_tss,adjacent_genes_500kb_tss,...,adjacent_genes_1500kb_tss,adjacent_genes_2000kb_tss,Adjacent_genes_10kb_tss,Adjacent_genes_50kb_tss,Adjacent_genes_100kb_tss,Adjacent_genes_200kb_tss,Adjacent_genes_500kb_tss,Adjacent_genes_1000kb_tss,Adjacent_genes_1500kb_tss,Adjacent_genes_2000kb_tss
Imaged_loci,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
chr1_3742742_3759944,1:3742742-3759944,1,1,0.0,CTP11,intergenic,intergenic,Xkr4,Xkr4,Gm18956; Gm7341; Xkr4,...,Atp6v1h; Gm16041; Gm17101; Gm18956; Gm2053; Gm...,Atp6v1h; Gm16041; Gm17101; Gm18956; Gm2053; Gm...,intergenic,intergenic,Xkr4,Xkr4,Gm18956; Gm7341; Xkr4,Gm18956; Gm2053; Gm6085; Gm6101; Gm6119; Gm734...,Atp6v1h; Gm16041; Gm17101; Gm18956; Gm2053; Gm...,Atp6v1h; Gm16041; Gm17101; Gm18956; Gm2053; Gm...
chr1_6245958_6258969,1:6245958-6258969,2,1,1.0,CTP11,intergenic,4732440D04Rik; Rb1cc1,4732440D04Rik; Rb1cc1,4732440D04Rik; Gm19026; Gm2147; Rb1cc1,4732440D04Rik; Gm19026; Gm2147; Npbwr1; Rb1cc1...,...,4732440D04Rik; Atp6v1h; Gm16041; Gm17101; Gm19...,4732440D04Rik; Atp6v1h; Gm16041; Gm17101; Gm19...,intergenic,4732440D04Rik; Rb1cc1,4732440D04Rik; Rb1cc1,4732440D04Rik; Gm19026; Gm2147; Rb1cc1,4732440D04Rik; Gm19026; Gm2147; Npbwr1; Rb1cc1...,4732440D04Rik; Gm19026; Gm2147; Gm5694; Gm7182...,4732440D04Rik; Atp6v1h; Gm16041; Gm17101; Gm19...,4732440D04Rik; Atp6v1h; Gm16041; Gm17101; Gm19...
chr1_8740008_8759916,1:8740008-8759916,3,1,2.0,CTP11,intergenic,Gm15452,Gm15452,Gm15452,Gm15452,...,1700034P13Rik; 2610203C22Rik; Adhfe1; Arfgef1;...,1700034P13Rik; 2610203C22Rik; Adhfe1; Arfgef1;...,intergenic,Gm15452,Gm15452,Gm15452,Gm15452,1700034P13Rik; 2610203C22Rik; Adhfe1; Gm15452;...,1700034P13Rik; 2610203C22Rik; Adhfe1; Arfgef1;...,1700034P13Rik; 2610203C22Rik; Adhfe1; Arfgef1;...
chr1_9627926_9637875,1:9627926-9637875,1,1,3.0,CTP13,2610203C22Rik,2610203C22Rik,2610203C22Rik; Adhfe1; Gm6161; Mybl1; Rrs1,1700034P13Rik; 2610203C22Rik; Adhfe1; Gm18300;...,1700034P13Rik; 2610203C22Rik; Adhfe1; Cops5; C...,...,1700034P13Rik; 2610203C22Rik; Adhfe1; Arfgef1;...,1700034P13Rik; 2610203C22Rik; A830018L16Rik; A...,2610203C22Rik,2610203C22Rik,2610203C22Rik; Adhfe1; Gm6161; Mybl1; Rrs1,1700034P13Rik; 2610203C22Rik; Adhfe1; Gm18300;...,1700034P13Rik; 2610203C22Rik; Adhfe1; Cops5; C...,1700034P13Rik; 2610203C22Rik; Adhfe1; Arfgef1;...,1700034P13Rik; 2610203C22Rik; Adhfe1; Arfgef1;...,1700034P13Rik; 2610203C22Rik; A830018L16Rik; A...
chr1_9799472_9811359,1:9799472-9811359,2,1,4.0,CTP13,Gm6195; Sgk3,Gm6195; Sgk3,1700034P13Rik; Gm6195; Mcmdc2; Mybl1; Sgk3; Vc...,1700034P13Rik; 2610203C22Rik; Gm6195; Mcmdc2; ...,1700034P13Rik; 2610203C22Rik; Adhfe1; Arfgef1;...,...,1700034P13Rik; 2610203C22Rik; Adhfe1; Arfgef1;...,1700034P13Rik; 2610203C22Rik; A830018L16Rik; A...,Gm6195; Sgk3,Gm6195; Sgk3,1700034P13Rik; Gm6195; Mcmdc2; Mybl1; Sgk3; Vc...,1700034P13Rik; 2610203C22Rik; Gm6195; Mcmdc2; ...,1700034P13Rik; 2610203C22Rik; Adhfe1; Arfgef1;...,1700034P13Rik; 2610203C22Rik; Adhfe1; Arfgef1;...,1700034P13Rik; 2610203C22Rik; Adhfe1; Arfgef1;...,1700034P13Rik; 2610203C22Rik; A830018L16Rik; A...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
chrX_166247682_166259932,X:166247682-166259932,1059,X,60.0,CTP11,Gpm6b,Gpm6b,Gemin8; Gpm6b,Gemin8; Gm15225; Gpm6b; Ofd1; Trappc2,Egfl6; Gemin8; Gm15223; Gm15225; Gm1720; Gpm6b...,...,Egfl6; Fancb; Gemin8; Glra2; Gm15223; Gm15225;...,Asb11; Asb9; Bmx; Egfl6; Fancb; Gemin8; Glra2;...,Gpm6b,Gpm6b,Gemin8; Gpm6b,Gemin8; Gm15225; Gpm6b; Ofd1; Trappc2,Egfl6; Gemin8; Gm15223; Gm15225; Gm1720; Gpm6b...,Egfl6; Gemin8; Glra2; Gm15223; Gm15225; Gm1523...,Egfl6; Fancb; Gemin8; Glra2; Gm15223; Gm15225;...,Asb11; Asb9; Bmx; Egfl6; Fancb; Gemin8; Glra2;...
chrX_167157164_167167452,X:167157164-167167452,990,X,61.0,CTP13,intergenic,Tmsb4x,Gm15232; Tmsb4x,Gm15232; Gm8814; Tlr7; Tmsb4x,Gm15230; Gm15232; Gm1720; Gm8814; Prps2; Tlr7;...,...,Egfl6; Frmpd4; Gemin8; Gm15223; Gm15225; Gm152...,Arhgap6; Egfl6; Frmpd4; Gemin8; Glra2; Gm15223...,intergenic,Tmsb4x,Gm15232; Tmsb4x,Gm15232; Gm8814; Tlr7; Tmsb4x,Gm15230; Gm15232; Gm1720; Gm8814; Prps2; Tlr7;...,Egfl6; Gemin8; Gm15223; Gm15230; Gm15232; Gm15...,Egfl6; Frmpd4; Gemin8; Gm15223; Gm15225; Gm152...,Arhgap6; Egfl6; Frmpd4; Gemin8; Glra2; Gm15223...
chrX_168746045_168757590,X:168746045-168757590,1060,X,62.0,CTP11,intergenic,Arhgap6,Arhgap6; Msl3,Arhgap6; Frmpd4; Msl3,Amelx; Arhgap6; Frmpd4; Msl3,...,Amelx; Arhgap6; Erdr1; Frmpd4; G530011O06Rik; ...,Amelx; Arhgap6; Asmt; Erdr1; Frmpd4; G530011O0...,intergenic,Arhgap6,Arhgap6; Msl3,Arhgap6; Frmpd4; Msl3,Amelx; Arhgap6; Frmpd4; Msl3,Amelx; Arhgap6; Frmpd4; Gm15238; Gm15240; Hccs...,Amelx; Arhgap6; Erdr1; Frmpd4; G530011O06Rik; ...,Amelx; Arhgap6; Asmt; Erdr1; Frmpd4; G530011O0...
chrX_169963295_170005197,X:169963295-170005197,991,X,63.0,CTP13,Erdr1; G530011O06Rik,Erdr1; G530011O06Rik,Erdr1; G530011O06Rik,Erdr1; G530011O06Rik,Erdr1; G530011O06Rik; Mid1,...,Amelx; Arhgap6; Asmt; Erdr1; Frmpd4; G530011O0...,Amelx; Arhgap6; Asmt; Erdr1; Frmpd4; G530011O0...,Erdr1; G530011O06Rik,Erdr1; G530011O06Rik,Erdr1; G530011O06Rik,Erdr1; G530011O06Rik,Erdr1; G530011O06Rik; Mid1,Amelx; Asmt; Erdr1; G530011O06Rik; Hccs; Mid1,Amelx; Arhgap6; Asmt; Erdr1; Frmpd4; G530011O0...,Amelx; Arhgap6; Asmt; Erdr1; Frmpd4; G530011O0...


## process for major classes

In [15]:
# Column of adata_ori.obs used to group the cells in the cells below.
groupby_adata = 'class_label_new'

# Distinct class labels, computed once (the earlier bare np.unique(...) statement was
# never displayed and only repeated the work). Labels equal to the string 'nan' are
# dropped; presumably these come from cells without a label - TODO confirm.
sel_class_to_process = [c for c in np.unique(list(adata_ori.obs[groupby_adata])) if c!='nan']
sel_class_to_process

['Astro', 'Endo', 'GABA', 'Gluta', 'Micro', 'Oligo']

In [16]:
# output_folder
# Path components are passed separately instead of hard-coding a backslash, so the
# same sub-folder layout is produced on any operating system (unchanged on Windows).
output_folder = os.path.join(output_analysis_folder, '10x_snRNA', '10x_class')
if not os.path.exists(output_folder):
    # exist_ok avoids an error if the folder appears between the check and the call
    os.makedirs(output_folder, exist_ok=True)
    print ('Generate output folder')

Generate output folder


In [17]:
%matplotlib inline

import gene_activity
import loci_1d_features
from scipy import stats
import seaborn as sns
from tqdm import tqdm

bin_size = 2000 # extend both direction, thus equivilant to ~ 4Mb
adjcent_col = f'Adjacent_genes_{bin_size}kb_tss'

activity_type = 'sum' # sum of all gene associated to a loci for each single cell

expression_res_df_dict = {}
sel_class_to_process = [c for c in np.unique(list(adata_ori.obs[groupby_adata])) if c!='nan']

# simple loop
for _group in sel_class_to_process[:]:
    
    print (f'Process RNAseq data for {_group}')
    
    expression_res_dict={}
    sorted_group_order = [_group]

    imaged_loci_df_group = loci_1d_features.codebook_chr_order_for_loci_dataframe (imaged_loci_df, 
                                               codebook_df, 
                                               sel_cols =['chr','chr_order','id'], 
                                               sort_df = True,
                                               sort_by_chr=True)

    loci_key_list = loci_1d_features.sorted_loci_keys_for_loci_dataframe(imaged_loci_df_group)
    loci_ori_ind = loci_1d_features.find_chr_loci_iloc_from_loci_keys (codebook_df, loci_key_list)

    # for loci along the chromosome, append the measurements for each single cell
    for _ind, sel_loci_ind in tqdm(enumerate(imaged_loci_df_group.index.tolist()[:])):

        # get adjacent gene expression
        sel_genes=imaged_loci_df_group.loc[sel_loci_ind][f'Adjacent_genes_{bin_size}kb_tss'].split('; ')
        sel_adata =  adata_ori[:,adata_ori.var.index.isin(sel_genes)]
        marker_expressions = gene_activity.gene_activity_raw_groups(sel_genes,
                            sel_adata, 
                            sorted_group_order,
                            groupby_adata,
                            ref_norm_list = [],
                            report_type =activity_type)

        expression_res_dict[_ind]=list(marker_expressions[_group])
        
    # convert dict to df as loci by cell
    expression_res_df = pd.DataFrame.from_dict(expression_res_dict, orient='index')
    expression_res_df['loci_name']=codebook_df.index.tolist()
    expression_res_df = expression_res_df.set_index ('loci_name')
    expression_res_df_dict[_group]=expression_res_df
    # save
    output_df_fname = os.path.join(output_folder, f'MERFISH_loci_10x_expression_2X_{bin_size}kb_for_{_group}.csv')
    expression_res_df.to_csv(output_df_fname)
    print ('=========================================================')


Process RNAseq data for Astro


1982it [01:37, 20.26it/s]


Process RNAseq data for Endo


1982it [01:34, 21.04it/s]


Process RNAseq data for GABA


1982it [06:53,  4.80it/s]


Process RNAseq data for Gluta


1982it [32:00,  1.03it/s]


Process RNAseq data for Micro


1982it [01:36, 20.61it/s]


Process RNAseq data for Oligo


1982it [01:41, 19.61it/s]




In [18]:
# Preview the table left over from the loop above, i.e. the one of the last group processed.
expression_res_df.head()

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,210,211,212,213,214,215,216,217,218,219
loci_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
chr1_3742742_3759944,1.0,1.0,0.0,5.0,5.0,11.0,0.0,1.0,2.0,3.0,...,7.0,2.0,1.0,1.0,3.0,0.0,1.0,1.0,0.0,3.0
chr1_6245958_6258969,8.0,20.0,0.0,24.0,18.0,7.0,15.0,10.0,11.0,16.0,...,33.0,8.0,16.0,7.0,8.0,8.0,5.0,7.0,4.0,2.0
chr1_8740008_8759916,0.0,5.0,5.0,2.0,8.0,17.0,1.0,2.0,6.0,0.0,...,11.0,3.0,7.0,0.0,6.0,0.0,10.0,0.0,0.0,0.0
chr1_9627926_9637875,0.0,5.0,6.0,2.0,7.0,18.0,1.0,2.0,2.0,0.0,...,11.0,2.0,4.0,0.0,5.0,0.0,5.0,0.0,0.0,1.0
chr1_9799472_9811359,0.0,5.0,6.0,2.0,7.0,18.0,1.0,2.0,2.0,0.0,...,11.0,2.0,4.0,0.0,5.0,0.0,5.0,0.0,0.0,1.0


## process for subclasses

In [19]:
# Switch the grouping column to the subclass labels; the following cells read groupby_adata.
groupby_adata = 'subclass_label_new'

# List the distinct subclass labels (shown as the cell output).
np.unique(list(adata_ori.obs[groupby_adata]))

array(['Astro', 'Endo', 'L2/3 IT', 'L4/5 IT', 'L5 ET', 'L5 IT', 'L5/6 NP',
       'L6 CT', 'L6 IT', 'L6b', 'Lamp5', 'Micro', 'OPC', 'Oligo', 'Pvalb',
       'Sncg', 'Sst', 'Vip'], dtype='<U7')

In [20]:
# output_folder
# Path components are passed separately instead of hard-coding a backslash, so the
# same sub-folder layout is produced on any operating system (unchanged on Windows).
output_folder = os.path.join(output_analysis_folder, '10x_snRNA', '10x_subclass')
if not os.path.exists(output_folder):
    # exist_ok avoids an error if the folder appears between the check and the call
    os.makedirs(output_folder, exist_ok=True)
    print ('Generate output folder')

Generate output folder


In [21]:
%matplotlib inline

import gene_activity
import loci_1d_features
from scipy import stats
import seaborn as sns
from tqdm import tqdm

bin_size =2000 # extend both direction, thus equivilant to ~ 4Mb
adjcent_col = f'Adjacent_genes_{bin_size}kb_tss'
activity_type = 'sum' # sum of all gene associated to a loci for each single cell

expression_res_df_dict = {}

expression_res_df_dict = {}
sel_class_to_process = [c for c in np.unique(list(adata_ori.obs[groupby_adata])) if c!='nan']

# simple loop
for _group in sel_class_to_process[:]:
    
    print (f'Process RNAseq data for {_group}')
    
    expression_res_dict={}
    sorted_group_order = [_group]
    
    _group_fname = _group.replace('/','_').replace(' ','_')

    imaged_loci_df_group = loci_1d_features.codebook_chr_order_for_loci_dataframe (imaged_loci_df, 
                                               codebook_df, 
                                               sel_cols =['chr','chr_order','id'], 
                                               sort_df = True,
                                               sort_by_chr=True)

    loci_key_list = loci_1d_features.sorted_loci_keys_for_loci_dataframe(imaged_loci_df_group)
    loci_ori_ind = loci_1d_features.find_chr_loci_iloc_from_loci_keys (codebook_df, loci_key_list)

    # for loci along the chromosome, append the measurements for each single cell
    for _ind, sel_loci_ind in tqdm(enumerate(imaged_loci_df_group.index.tolist()[:])):

        # get adjacent gene expression
        sel_genes=imaged_loci_df_group.loc[sel_loci_ind][f'Adjacent_genes_{bin_size}kb_tss'].split('; ')
        sel_adata =  adata_ori[:,adata_ori.var.index.isin(sel_genes)]
        marker_expressions = gene_activity.gene_activity_raw_groups(sel_genes,
                            sel_adata, 
                            sorted_group_order,
                            groupby_adata,
                            ref_norm_list = [],
                            report_type =activity_type)

        expression_res_dict[_ind]=list(marker_expressions[_group])
        
    # convert dict to df as loci by cell
    expression_res_df = pd.DataFrame.from_dict(expression_res_dict, orient='index')
    expression_res_df['loci_name']=codebook_df.index.tolist()
    expression_res_df = expression_res_df.set_index ('loci_name')
    expression_res_df_dict[_group]=expression_res_df
    # save
    output_df_fname = os.path.join(output_folder, f'MERFISH_loci_10x_expression_2X_{bin_size}kb_for_{_group_fname}.csv')
    expression_res_df.to_csv(output_df_fname)
    print ('=========================================================')


Process RNAseq data for Astro


1982it [01:40, 19.72it/s]


Process RNAseq data for Endo


1982it [01:35, 20.67it/s]


Process RNAseq data for L2/3 IT


1982it [09:32,  3.46it/s]


Process RNAseq data for L4/5 IT


1982it [06:49,  4.85it/s]


Process RNAseq data for L5 ET


1982it [03:15, 10.16it/s]


Process RNAseq data for L5 IT


1982it [08:01,  4.12it/s]


Process RNAseq data for L5/6 NP


1982it [02:51, 11.58it/s]


Process RNAseq data for L6 CT


1982it [09:00,  3.66it/s]


Process RNAseq data for L6 IT


1982it [03:51,  8.54it/s]


Process RNAseq data for L6b


1982it [02:22, 13.91it/s]


Process RNAseq data for Lamp5


1982it [02:26, 13.48it/s]


Process RNAseq data for Micro


1982it [01:37, 20.26it/s]


Process RNAseq data for OPC


1982it [01:38, 20.04it/s]


Process RNAseq data for Oligo


1982it [01:40, 19.70it/s]


Process RNAseq data for Pvalb


1982it [04:02,  8.18it/s]


Process RNAseq data for Sncg


1982it [01:48, 18.29it/s]


Process RNAseq data for Sst


1982it [03:06, 10.61it/s]


Process RNAseq data for Vip


1982it [02:23, 13.84it/s]




In [22]:
# Preview the table left over from the loop above, i.e. the one of the last group processed.
expression_res_df.head()

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,1082,1083,1084,1085,1086,1087,1088,1089,1090,1091
loci_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
chr1_3742742_3759944,16.0,24.0,14.0,17.0,9.0,13.0,8.0,15.0,14.0,7.0,...,29.0,23.0,35.0,28.0,22.0,5.0,12.0,25.0,37.0,15.0
chr1_6245958_6258969,9.0,6.0,16.0,7.0,7.0,2.0,5.0,9.0,2.0,4.0,...,0.0,6.0,6.0,6.0,9.0,6.0,5.0,0.0,6.0,0.0
chr1_8740008_8759916,40.0,7.0,9.0,35.0,51.0,42.0,12.0,11.0,29.0,5.0,...,24.0,24.0,77.0,88.0,32.0,57.0,52.0,26.0,24.0,51.0
chr1_9627926_9637875,46.0,6.0,18.0,39.0,56.0,53.0,19.0,17.0,29.0,8.0,...,28.0,21.0,129.0,109.0,58.0,60.0,57.0,31.0,22.0,58.0
chr1_9799472_9811359,46.0,6.0,18.0,39.0,56.0,53.0,19.0,17.0,29.0,8.0,...,28.0,21.0,129.0,109.0,58.0,60.0,57.0,31.0,22.0,58.0
