# Import module

ImageAnalysis3 can be obtained from its repository, [ImageAnalysis3](https://github.com/zhengpuas47/ImageAnalysis3),

or from the Zhuang lab's archived [source_tools](https://github.com/ZhuangLab/Chromatin_Analysis_2020_cell/tree/master/sequential_tracing/source).

## ImageAnalysis3 and basic modules

In [1]:
# Run the ImageAnalysis3 start-up script inside this notebook namespace (IPython magic).
# `sys`, `os` (and `np` in later cells) are used without an explicit import here,
# so they are presumably provided by this script -- TODO confirm.
%run "C:\Users\shiwei\Documents\ImageAnalysis3\required_files\Startup_py3.py"
# Put the parent folder of the ImageAnalysis3 checkout on the import path
sys.path.append(r"C:\Users\shiwei\Documents")

import ImageAnalysis3 as ia
from ImageAnalysis3 import *
from ImageAnalysis3.classes import _allowed_kwds

import h5py
import ast
import pandas as pd

# Print the process id of the running kernel
print(os.getpid())

10780


## Chromatin_analysis_tools etc

See **functions** in the repository for [AnalysisTool_Chromatin](../../README.md)

In [2]:
# Chromatin_analysis_tools (ATC)
# Get path for the py containing functions
import os
import sys
import importlib
module_path =r'C:\Users\shiwei\Documents\AnalysisTool_Chromatin'
# Only extend sys.path once, so re-running the cell does not add duplicates
if module_path not in sys.path:
    sys.path.append(module_path)
    
# import relevant modules
# Each module is reloaded right after import so that edits made to the .py files
# are picked up when this cell is re-run in a live kernel.
import gene_selection 
importlib.reload(gene_selection)
import gene_to_loci
importlib.reload(gene_to_loci)
import gene_activity
importlib.reload(gene_activity)
import loci_1d_features
importlib.reload(loci_1d_features)  

import atac_to_loci
# Last expression of the cell: the notebook displays the reloaded module object
importlib.reload(atac_to_loci)

<module 'atac_to_loci' from 'C:\\Users\\shiwei\\Documents\\AnalysisTool_Chromatin\\atac_to_loci.py'>

# Define folders

In [3]:
# Root of the post-analysis tree and the folder holding its pre-computed inputs
postanalysis_folder = r'L:\Shiwei\postanalysis_2024\v0'
input_folder = os.path.join(postanalysis_folder, 'resources_from_preprocess')

# Everything this notebook writes goes under <root>/compartment_transcription
output_main_folder = os.path.join(postanalysis_folder, 'compartment_transcription')
output_analysis_folder = os.path.join(output_main_folder, 'analysis')
output_figure_folder = os.path.join(output_main_folder, 'figures')

# Switch: allow this cell to create output folders that do not exist yet
make_output_folder = True

# Analysis folder: reuse it when present, otherwise create it if allowed
if os.path.exists(output_analysis_folder):
    print(f'Use existing analysis folder: {output_analysis_folder}.')
elif make_output_folder:
    os.makedirs(output_analysis_folder)
    print(f'Generating analysis folder: {output_analysis_folder}.')

# Figure folder: same policy as above
if os.path.exists(output_figure_folder):
    print(f'Use existing figure folder: {output_figure_folder}.')
elif make_output_folder:
    os.makedirs(output_figure_folder)
    print(f'Generating figure folder: {output_figure_folder}.')

Use existing analysis folder: L:\Shiwei\postanalysis_2024\v0\compartment_transcription\analysis.
Use existing figure folder: L:\Shiwei\postanalysis_2024\v0\compartment_transcription\figures.


# Plotting parameters

In [4]:
# Draw matplotlib figures inline in the notebook (IPython magic)
%matplotlib inline
import matplotlib
# Embed fonts as Type 42 (TrueType) in exported PDF files
matplotlib.rcParams['pdf.fonttype'] = 42
import matplotlib.pyplot as plt
# Select the 'serif' font family and set its font list to Arial
plt.rc('font', family='serif')
plt.rc('font', serif='Arial')

# Shared figure-size / font-size constants defined by ImageAnalysis3
from ImageAnalysis3.figure_tools import _double_col_width, _single_col_width, _font_size, _ticklabel_size,_ticklabel_width

import seaborn as sns
# The seaborn context uses the `_font_size` imported just above for text, titles (+1) and axis labels
sns.set_context("paper", rc={"font.size":_font_size,"axes.titlesize":_font_size+1,"axes.labelsize":_font_size})  

In [5]:
# Other required plotting parameters: figure resolution, font size and page width.
# NOTE: this rebinds `_font_size`, a name that an earlier cell imports from
# ImageAnalysis3.figure_tools.
# Page-width unit is presumably inches -- TODO confirm.
_dpi, _font_size, _page_width = 300, 7, 5.5


## cell type color-codes

In [6]:
# cell labels from RNA-MERFISH and celltype prediction
# ('other' is deliberately left out of the label list, but keeps a palette colour)
selected_cell_labels = (
    ['L2/3 IT', 'L4/5 IT', 'L5 IT', 'L6 IT', 'L5 ET', 'L5/6 NP', 'L6 CT', 'L6b']
    + ['Sst', 'Pvalb', 'Lamp5', 'Sncg', 'Vip']
    + ['Astro', 'Oligo', 'OPC', 'Micro', 'Endo', 'VLMC', 'SMC', 'Peri']
)

# cell palette from RNA-MERFISH UMAP and stats: cell label -> matplotlib colour name
celltype_palette = dict([
    ('Astro', 'lightcoral'),
    ('Endo', 'skyblue'),
    ('L2/3 IT', 'gold'),
    ('L4/5 IT', 'darkorange'),
    ('L5 ET', 'mediumseagreen'),
    ('L5 IT', 'aqua'),
    ('L5/6 NP', 'darkgreen'),
    ('L6 CT', 'brown'),
    ('L6 IT', 'magenta'),
    ('L6b', 'blue'),
    ('Lamp5', 'orange'),
    ('Micro', 'peachpuff'),
    ('OPC', 'thistle'),
    ('Oligo', 'darkviolet'),
    ('Peri', 'sandybrown'),
    ('Pvalb', 'springgreen'),
    ('SMC', 'rosybrown'),
    ('Sncg', 'darkkhaki'),
    ('Sst', 'steelblue'),
    ('VLMC', 'saddlebrown'),
    ('Vip', 'red'),
    ('other', 'slategray'),
])


In [7]:
# Plotting order of the cell types, noted from the snRNA transcriptional activity (use if needed)
sorted_cellplot_order_byRNA = [
    'Micro', 'Oligo', 'Endo', 'OPC', 'Astro', 'Vip',
    'Lamp5', 'L5/6 NP', 'Sst', 'Sncg', 'Pvalb', 'L4/5 IT',
    'L6 CT', 'L6 IT', 'L6b', 'L2/3 IT', 'L5 IT', 'L5 ET',
]

# Load data relevant information

## load and format codebook

[merged codebook](../resources/merged_codebook.csv) as in the repository (merged for all DNA-MERFISH libraries)

In [8]:
# Read the merged codebook (first CSV column becomes the index) and order its
# rows by chr and chr_order in a single step
codebook_fname = os.path.join(input_folder, 'merged_codebook.csv')
codebook_df = loci_1d_features.sort_loci_df_by_chr_order(
    pd.read_csv(codebook_fname, index_col=0)
)
# preview the first rows
codebook_df.head()

Unnamed: 0,name,id,NDB_784,NDB_755,NDB_826,NDB_713,NDB_865,NDB_725,NDB_817,NDB_710,...,NDB_479,NDB_562,NDB_608,NDB_460,NDB_563,NDB_592,NDB_368,NDB_436,NDB_629,NDB_604
0,1:3742742-3759944,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1:6245958-6258969,2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1:8740008-8759916,3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1016,1:9627926-9637875,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1017,1:9799472-9811359,2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0


In [9]:
# Format the chr loci name by
# 1. changing loci name format (e.g. '1:3742742-3759944' -> 'chr1_3742742_3759944')
# 2. extract relevant information such as id, chr, chr_order, and library etc
from gene_to_loci import loci_pos_format
loci_name_list = list(map(loci_pos_format, codebook_df['name'].tolist()))
# the first element of every formatted entry is used as the new locus name
loci_name_arr = np.array(loci_name_list)

# Keep only the annotation columns and index the table by the new locus name.
# `.assign` works on a fresh copy of the column subset, so the new column is not
# written into a slice of the original frame (avoids pandas' SettingWithCopyWarning).
codebook_df = (
    codebook_df[['name','id','chr','chr_order','library']]
    .assign(loci_name=list(loci_name_arr[:,0]))
    .set_index('loci_name')
)

codebook_df.head()

Unnamed: 0_level_0,name,id,chr,chr_order,library
loci_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
chr1_3742742_3759944,1:3742742-3759944,1,1,0.0,CTP11
chr1_6245958_6258969,1:6245958-6258969,2,1,1.0,CTP11
chr1_8740008_8759916,1:8740008-8759916,3,1,2.0,CTP11
chr1_9627926_9637875,1:9627926-9637875,1,1,3.0,CTP13
chr1_9799472_9811359,1:9799472-9811359,2,1,4.0,CTP13


# Load pre-summarized AB density medians

The data can be generated with the notebook:

[2_2_summarize_ab_density_all_loci_subclassby_ensemble](2_2_summarize_ab_density_all_loci_subclassby_ensemble.ipynb)

In [10]:
# the calculated AB density ratio for single-cell
#compartment_folder = r'\\10.245.74.158\Chromatin_NAS_8\Analyzed_data\MouseBrain_PostAnalysis'
AB_summary_fname = os.path.join(output_analysis_folder , r'Median_Summary_AB_trans_ratio_notNorm_r0.5_bymajorType_by_pm.csv')

# first CSV column is used as the index (the locus name, per the displayed table)
AB_ratio_summary = pd.read_csv(AB_summary_fname, index_col=0)

# Optional reshaping for a differently formatted (long) input table.
# The flag is False, so the branch below is skipped in this notebook.
source_from_CW = False
if source_from_CW:
    # NOTE(review): `df_refgen` is not defined in the visible part of this notebook --
    # it must exist before this branch can run.
    # NOTE(review): 'loci_name' is moved into the index and then re-created as a column
    # before pivoting on it; the branch also expects 'hyb', 'subclass' and 'AB_density'
    # columns -- confirm against the CW-format table before enabling.
    AB_ratio_summary['chr'] = AB_ratio_summary.index
    AB_ratio_summary = AB_ratio_summary.set_index('loci_name')
    # lookup: (chr, chr_order) -> loci_name
    loci_chr_id_map = {(_chr, _order):v for _chr, _order, v in zip(df_refgen['chr'],
                                                                   df_refgen['chr_order'],
                                                                   df_refgen['loci_name'],)}
    AB_ratio_summary['loci_name'] = AB_ratio_summary.apply(lambda row: loci_chr_id_map[row['chr'], row['hyb']], axis=1)
    # long -> wide: one row per locus, one column per subclass
    AB_ratio_summary = pd.pivot(AB_ratio_summary, index='loci_name', columns='subclass', values='AB_density')
    
# display the table
AB_ratio_summary

Unnamed: 0_level_0,Oligo,OPC,Micro,Astro,Endo,Peri,L2/3 IT,L4/5 IT,L5 IT,L6 IT,L5 ET,L6 CT,L5/6 NP,L6b,Vip,Pvalb,Lamp5,Sst,Sncg
loci_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
chr1_3742742_3759944,-0.430767,-0.420269,-0.255699,-0.453668,0.132135,-0.140469,-0.560620,-0.460501,-0.446779,-0.488172,-0.421529,-0.455810,-0.304009,-0.472670,-0.114805,-0.289297,-0.309272,-0.325902,-0.186869
chr1_6245958_6258969,-0.153712,-0.206244,-0.047506,-0.226636,0.387080,0.133823,-0.447946,-0.445162,-0.487868,-0.408086,-0.356980,-0.453940,-0.201882,-0.332559,-0.207538,-0.219555,-0.207729,-0.116463,-0.435847
chr1_8740008_8759916,-0.425727,-0.148089,-0.275335,-0.409980,0.080992,0.116621,-0.400666,-0.314289,-0.418091,-0.412230,-0.292372,-0.487285,-0.505100,-0.498226,-0.032649,-0.221845,-0.257861,-0.206280,-0.333840
chr1_9627926_9637875,-0.254296,0.003753,0.080439,-0.209209,0.287116,0.149472,-0.350141,-0.286409,-0.402433,-0.200084,-0.307043,-0.290176,-0.298747,-0.386660,-0.279638,-0.146983,-0.086232,-0.194100,-0.302644
chr1_9799472_9811359,-0.116000,-0.083814,0.234512,-0.073540,0.492773,0.359415,-0.246629,-0.303253,-0.354788,-0.248334,-0.232831,-0.253817,-0.417956,-0.329905,-0.355471,-0.403508,-0.072171,-0.268272,-0.355233
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
chrX_166247682_166259932,0.190443,0.086648,0.045090,0.202411,0.315556,0.383854,-0.157234,-0.122200,-0.238115,-0.214688,-0.064618,-0.077107,-0.332221,-0.080294,-0.227362,-0.265453,-0.371283,-0.219211,-0.668820
chrX_167157164_167167452,0.069587,-0.185342,0.435091,0.024904,0.626812,0.500466,-0.203417,-0.119909,-0.157937,-0.201859,-0.364895,-0.162953,0.024588,-0.201043,-0.109136,-0.048074,-0.359983,-0.203461,-0.322641
chrX_168746045_168757590,-0.138497,-0.285900,-0.019425,-0.079555,0.332414,0.382330,-0.297829,-0.237427,-0.256210,-0.317645,-0.250351,-0.389821,0.137434,-0.329275,-0.261027,-0.155011,-0.287720,-0.211167,-0.682236
chrX_169963295_170005197,-0.095872,0.023907,0.074545,0.013206,0.551413,0.311509,-0.235026,-0.220536,-0.293816,-0.380180,-0.200907,-0.360787,-0.075992,-0.169335,-0.296937,-0.053206,-0.307064,-0.229188,-0.193384


## calculate regression factor between celltypes

In [11]:
### get AB-ratio normalization dict
from scipy import stats
from sklearn.linear_model import LinearRegression

# regress_each = False: every class is regressed against one common reference class
# regress_each = True : every class is regressed against every class as reference
#                       (the downstream marker analysis then uses the marker group as ref_cls)
regress_each = False
fit_intercept=True
ab_regression_coef_dict = {}

#ref_cls = 'Gluta'


def _fit_ab_regression(ref_values, cls_values, fit_intercept=True):
    """Fit ``cls_values ~ slope * ref_values + intercept`` on the loci valid in both.

    Parameters
    ----------
    ref_values, cls_values : array-like of float
        Per-locus values of the reference class and of the class to regress;
        must have the same length. Loci where either value is NaN are ignored.
    fit_intercept : bool
        Passed through to ``sklearn.linear_model.LinearRegression``.

    Returns
    -------
    tuple
        ``(reg.coef_[0], reg.intercept_)`` of the fitted model, or
        ``(np.nan, np.nan)`` when no locus has both values.
    """
    xs = np.asarray(ref_values, dtype=float)
    ys = np.asarray(cls_values, dtype=float)
    # a locus is usable only when both classes have a value for it
    _valid = ~np.isnan(xs) & ~np.isnan(ys)
    if not _valid.any():
        return (np.nan, np.nan)
    # sklearn expects 2D inputs: (n_loci, 1)
    X = xs[_valid].reshape(-1, 1)
    y = ys[_valid].reshape(-1, 1)
    reg = LinearRegression(fit_intercept=fit_intercept).fit(X, y)
    return (reg.coef_[0], reg.intercept_)


if not regress_each:
    # use a common reference class: {class: (slope, intercept)}
    ref_cls = 'L2/3 IT'
    for _cls in AB_ratio_summary.columns:
        ab_regression_coef_dict[_cls] = _fit_ab_regression(
            AB_ratio_summary[ref_cls], AB_ratio_summary[_cls], fit_intercept=fit_intercept)
else:
    # use every class as reference: {ref_class: {class: (slope, intercept)}}
    for ref_cls in AB_ratio_summary.columns:
        ab_regression_coef_dict[ref_cls] = {}
        for _cls in AB_ratio_summary.columns:
            ab_regression_coef_dict[ref_cls][_cls] = _fit_ab_regression(
                AB_ratio_summary[ref_cls], AB_ratio_summary[_cls], fit_intercept=fit_intercept)

ab_regression_coef_dict

{'Oligo': (array([0.77329371]), array([0.12521121])),
 'OPC': (array([0.71597982]), array([0.10633413])),
 'Micro': (array([0.68456728]), array([0.21108297])),
 'Astro': (array([0.69342696]), array([0.15908295])),
 'Endo': (array([0.44780838]), array([0.49364204])),
 'Peri': (array([0.49270282]), array([0.48223804])),
 'L2/3 IT': (array([1.]), array([0.])),
 'L4/5 IT': (array([0.86112884]), array([-0.00277903])),
 'L5 IT': (array([0.77141948]), array([-0.03423645])),
 'L6 IT': (array([0.85293924]), array([-0.01805797])),
 'L5 ET': (array([0.63679152]), array([-0.04722389])),
 'L6 CT': (array([0.89872086]), array([-0.01063253])),
 'L5/6 NP': (array([0.58499424]), array([-0.0513388])),
 'L6b': (array([0.57913252]), array([-0.04630086])),
 'Vip': (array([0.46624995]), array([-0.04151498])),
 'Pvalb': (array([0.58214168]), array([-0.03671979])),
 'Lamp5': (array([0.49064784]), array([-0.01925425])),
 'Sst': (array([0.48761871]), array([-0.02805633])),
 'Sncg': (array([0.49812969]), array([

# Process all cellgroups for superenhancers

## define shared parameters

In [12]:
# Cell groups to analyse, in the order used for the result columns
sorted_group_order = [
    'L2/3 IT', 'L4/5 IT', 'L5 IT', 'L6 IT',
    'L5 ET', 'L6 CT', 'L5/6 NP', 'L6b',
    'Vip', 'Pvalb', 'Lamp5', 'Sst',
]

In [13]:
# re-import functions
import loci_1d_features as lf
import gene_selection as gs
import gene_to_loci as gl

# re_calculate_AB: recompute the AB ratio per locus (True) or read it from the
#                  pre-summarized table (False)
# unique_SE      : use SE that is unique to only one cell type
re_calculate_AB, unique_SE = False, True

In [14]:
# Load the codebook that additionally carries the 'cell_type' and 'enhancer_name'
# annotation of each locus (first CSV column is used as the index)
celltype_codebook_fname = os.path.join(input_folder,'merged_codebook_w_celltype_w_rank.csv')
celltype_codebook_df = pd.read_csv (celltype_codebook_fname, index_col=0)

# sort df by chr and chr_order
celltype_codebook_df = loci_1d_features.sort_loci_df_by_chr_order (celltype_codebook_df)

# Format the chr loci name by
# 1. changing loci name format
# 2. extract relevant information such as id, chr, chr_order, and library etc
from gene_to_loci import loci_pos_format
# here the original locus name lives in the index, so copy it into a 'name' column first
celltype_codebook_df['name'] = celltype_codebook_df.index
loci_name_list = list(map(loci_pos_format, celltype_codebook_df['name'].tolist()))
# the first element of every formatted entry is used as the new locus name
loci_name_arr = np.array(loci_name_list)

# Keep only the annotation columns and index the table by the new locus name.
# `.assign` works on a fresh copy of the column subset, so the new column is not
# written into a slice of the original frame (avoids pandas' SettingWithCopyWarning).
celltype_codebook_df = (
    celltype_codebook_df[['name','id','chr','chr_order','library','cell_type','enhancer_name']]
    .assign(loci_name=list(loci_name_arr[:,0]))
    .set_index('loci_name')
)

celltype_codebook_df.head()

Unnamed: 0_level_0,name,id,chr,chr_order,library,cell_type,enhancer_name
loci_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
chr1_3742742_3759944,1:3742742-3759944,1,1,0.0,CTP11,['None'],['None']
chr1_6245958_6258969,1:6245958-6258969,2,1,1.0,CTP11,['None'],['None']
chr1_8740008_8759916,1:8740008-8759916,3,1,2.0,CTP11,['None'],['None']
chr1_9627926_9637875,1:9627926-9637875,1,1,3.0,CTP13,['L6_CT'],1:9610471-9646458
chr1_9799472_9811359,1:9799472-9811359,2,1,4.0,CTP13,['L2_3_IT'],1:9759317-9835402


## loop to process each cell group

In [15]:
# compile result as (marker loci of group) by (median of sc for group)
#
# For every marker group in sorted_group_order:
#   1. select the SE loci annotated for that group ('upregulated')
#   2. draw 100 random SE loci annotated for other groups ('random_control')
#   3. look up the AB ratio of every selected locus in all cell groups, normalise it
#      with the regression coefficients, and collect one row per locus
# The rows of all groups are stacked into `compiled_df`, indexed by loci_name.
np.random.seed(42)

# group names whose SE annotation label differs from the label used in this notebook
_SE_GROUP_ALIASES = {'Micro': 'MicroPVM', 'L5_ET': 'L5_PT'}

# one table per (marker group, direction); concatenated once after the loop.
# The empty template comes first so the result columns start with sorted_group_order.
_compiled_parts = [pd.DataFrame(columns = sorted_group_order)]

for _marker_group in sorted_group_order[:]:
#for _marker_group in ['L5 ET','Micro']:
    print (f'Process SE loci for {_marker_group}.')
    ##############################################################################
    # 1. get specific SE-loci inds
    # modify group name consistency
    _marker_group_rename = _marker_group.replace('/','_').replace(' ','_')
    _marker_group_rename_SE = _SE_GROUP_ALIASES.get(_marker_group_rename, _marker_group_rename)

    print(_marker_group_rename_SE)

    if unique_SE:
        # loci annotated for this group only
        im_loci_df = celltype_codebook_df[celltype_codebook_df['cell_type']==f"['{_marker_group_rename_SE}']"]
    else:
        # loci whose annotation mentions this group (literal substring match)
        im_loci_df = celltype_codebook_df[celltype_codebook_df['cell_type'].str.contains(_marker_group_rename_SE, regex=False)]
    #sort loci
    im_loci_df  = lf.codebook_chr_order_for_loci_dataframe  (im_loci_df, codebook_df,sel_cols =['chr','chr_order','id'],
                                               sort_df = True,
                                               sort_by_chr= True)

    # label these loci as the marker ("upregulated") set
    im_loci_df['Expression_change']='upregulated'
    ##############################################################################
    # 2. add random ctp13 genome loci as control:
    #    loci annotated for some cell type, but not for the current group
    non_up_loci_df = celltype_codebook_df[~celltype_codebook_df['cell_type'].str.contains(_marker_group_rename_SE, regex=False)]
    non_up_loci_df = non_up_loci_df[~non_up_loci_df['cell_type'].str.contains('None', regex=False)]
    # 2' if add random ctp11 genome loci as control
    #non_up_loci_df = celltype_codebook_df[celltype_codebook_df['cell_type']=="['None']"]
    # NOTE(review): every iteration overwrites `random_loci`, so only the 100th draw is
    # used. The loop is kept as-is because removing it would change which loci the
    # seeded generator selects, and therefore the saved results.
    for _iter in range(0,100):
        random_loci = np.random.choice(non_up_loci_df.index,100, replace=False)
    random_loci_df = celltype_codebook_df.loc[random_loci].copy()
    random_loci_df['Expression_change']='random_control'

    im_loci_df=pd.concat([im_loci_df,random_loci_df])
    #drop barcode cols
    keep_cols = [_c for _c in im_loci_df.columns if 'NDB' not in _c and 'Stv' not in _c]
    im_loci_df=im_loci_df[keep_cols]

    ##############################################################################
    # 3. load AB ratio for relevant loci
    for _dir in ['upregulated','random_control'][:]:

        sel_im_loci_df = im_loci_df[im_loci_df['Expression_change']==_dir]
        # get chr and chr_order
        sel_loci_key_list = lf.sorted_loci_keys_for_loci_dataframe(sel_im_loci_df)

        # rows are collected per direction so each direction gets its own table;
        # the empty template fixes the leading column order
        _loci_rows = [pd.DataFrame(columns = sorted_group_order)]

        # use chr and chr_order to retrieve AB ratio from single cell
        for loci_key in sel_loci_key_list:
            if re_calculate_AB:
                # function to get the AB ratio for all cell group and append together
                _loci_group_df = loci_1d_features.sc_compartment_ratio_by_loci_key (AB_ratio_by_group,
                                                                               sorted_group_order,
                                                                                loci_key,
                                                                                report_type = 'median',
                                                                                average_ratios_in_cell=True,
                                                                                spot_num_th=600)
            else:
                _loci_name = codebook_df[(codebook_df['chr']==str(loci_key[0])) & (codebook_df['chr_order']==loci_key[1])].index[0]
                if 'chrX' in _loci_name:
                    # chrX loci are not read from the summary table. Emit an explicit
                    # all-NaN row so that (a) the row of the previously processed locus
                    # is never re-used and re-normalised for this locus, and (b) the
                    # rows stay aligned with the loci names attached below.
                    _loci_group_df = pd.DataFrame(np.nan, index=[0], columns=AB_ratio_summary.columns)
                else:
                    # one-row table (row label 0) with the pre-summarized value of every cell group
                    _loci_group_df = pd.DataFrame(AB_ratio_summary.loc[_loci_name]).transpose()
                    _loci_group_df = _loci_group_df.reset_index(drop=True)

            if not regress_each:
                # normalise with the common-reference regression: (value - intercept) / slope
                for _group in sorted_group_order:
                    m, b = ab_regression_coef_dict[_group]
                    _loci_group_df.loc[0, _group] = (_loci_group_df.loc[0, _group]-b)/m

            _loci_rows.append(_loci_group_df)

        # stack all loci of this direction in one go
        _marker_group_df = pd.concat(_loci_rows)

        # add loci, expression, group, gene info, etc
        _marker_group_df['loci_name'] = sel_im_loci_df.index.tolist()
        _marker_group_df['enhancer_name'] = sel_im_loci_df['enhancer_name'].tolist()
        _marker_group_df['Marker_group']=_marker_group
        _marker_group_df['Expression_change']=_dir

        # regression to normalize ABratio using the marker group as ref_cls
        if regress_each:
            for _group in sorted_group_order:
                m, b = ab_regression_coef_dict[_marker_group][_group]
                _marker_group_df.loc[:, _group] = (_marker_group_df.loc[:, _group]-b)/m

        # keep the table of this marker cell group / direction
        _compiled_parts.append(_marker_group_df)

    print('================================================================================')

# stack all marker groups and set loci_name as index
compiled_df = pd.concat(_compiled_parts)
compiled_df=compiled_df.set_index('loci_name')

Process SE loci for L2/3 IT.
L2_3_IT
Process SE loci for L4/5 IT.
L4_5_IT
Process SE loci for L5 IT.
L5_IT
Process SE loci for L6 IT.
L6_IT
Process SE loci for L5 ET.
L5_PT
Process SE loci for L6 CT.
L6_CT
Process SE loci for L5/6 NP.
L5_6_NP
Process SE loci for L6b.
L6b
Process SE loci for Vip.
Vip
Process SE loci for Pvalb.
Pvalb
Process SE loci for Lamp5.
Lamp5
Process SE loci for Sst.
Sst


In [16]:
# Preview the first 10 rows of the compiled table
compiled_df.head(10)

Unnamed: 0_level_0,L2/3 IT,L4/5 IT,L5 IT,L6 IT,L5 ET,L6 CT,L5/6 NP,L6b,Vip,Pvalb,...,Oligo,OPC,Micro,Astro,Endo,Peri,Sncg,enhancer_name,Marker_group,Expression_change
loci_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
chr1_9799472_9811359,-0.246629,-0.348931,-0.415534,-0.269979,-0.291472,-0.27059,-0.626703,-0.489705,-0.673363,-0.630066,...,-0.116,-0.083814,0.234512,-0.07354,0.492773,0.359415,-0.355233,1:9759317-9835402,L2/3 IT,upregulated
chr1_21522568_21534512,0.157988,0.177744,0.104602,0.134259,-0.301557,-0.356024,-0.0526,-0.521367,-0.094214,-0.230084,...,-0.307273,-0.311462,-0.231718,-0.260091,0.163623,0.21247,-0.053455,1:21485446-21554433,L2/3 IT,upregulated
chr1_62926942_62936078,-0.100317,-0.186968,-0.101095,-0.103074,-0.243412,-0.207737,0.019004,-0.116716,-0.104934,-0.069307,...,0.141556,0.001419,0.224659,0.189864,0.547692,0.661718,0.074894,1:62900963-62948089,L2/3 IT,upregulated
chr2_135670388_135680581,0.014333,0.050795,-0.122257,-0.000108,-0.186657,-0.017936,0.251213,-0.147799,-0.482289,-0.124177,...,0.034957,0.001804,0.086982,0.18808,0.871917,0.577283,-0.016565,2:135658540-135684141,L2/3 IT,upregulated
chr3_157181787_157191783,-0.037845,-0.052078,-0.223777,-0.208545,-0.208042,-0.158589,-0.164689,-0.004944,-0.290141,-0.352956,...,-0.112965,-0.003305,-0.247953,0.016224,0.286308,0.259769,-0.129659,3:157149588-157216143,L2/3 IT,upregulated
chr4_8587053_8598914,-0.545159,-0.434658,-0.48029,-0.515397,-0.290258,-0.348028,-0.125066,-0.507469,-0.213955,-0.422294,...,-0.107257,-0.127963,0.221188,-0.051064,0.513435,0.41479,-0.245148,4:8556598-8611298,L2/3 IT,upregulated
chr5_44692530_44701463,-0.082847,-0.078817,-0.239958,-0.089679,-0.157831,-0.173129,-0.054545,-0.223415,-0.776268,-0.468562,...,-0.172726,-0.163259,-0.026733,0.008044,0.419258,0.343656,-0.269089,5:44646554-44742497,L2/3 IT,upregulated
chr6_17135040_17145456,-0.475776,-0.641122,-0.664534,-0.397868,-0.604248,-0.598666,-0.67372,-0.620775,-0.372006,-1.304393,...,-0.383223,-0.242896,-0.176918,-0.195448,0.402019,0.229576,-0.119791,6:17102402-17171916,L2/3 IT,upregulated
chr6_118622298_118630305,0.504521,0.523629,0.481617,0.548814,0.232272,0.467074,0.38845,0.335871,0.270484,0.288255,...,0.383697,0.371156,0.201901,0.320695,0.55136,0.810908,0.135921,6:118593580-118649770,L2/3 IT,upregulated
chr8_8625672_8635363,-0.177647,-0.175592,0.010411,-0.135772,0.029133,-0.184167,-0.197762,-0.20831,-0.156238,-0.290043,...,-0.201971,-0.216499,-0.110555,-0.208502,0.445225,0.298462,-0.382071,8:8600019-8650327,L2/3 IT,upregulated


In [17]:
# Write the compiled (regressed) table to the analysis folder; the grouping level
# is encoded as a prefix of the file name
_groupby_savename = 'subclass'
_compiled_csv_name = f'{_groupby_savename}_SE_AB_trans_ratio_regressed_heatmap_pm_majorType.csv'
compiled_df_savename = os.path.join(output_analysis_folder, _compiled_csv_name)
compiled_df.to_csv(compiled_df_savename)
# display the table that was saved
compiled_df

Unnamed: 0_level_0,L2/3 IT,L4/5 IT,L5 IT,L6 IT,L5 ET,L6 CT,L5/6 NP,L6b,Vip,Pvalb,...,Oligo,OPC,Micro,Astro,Endo,Peri,Sncg,enhancer_name,Marker_group,Expression_change
loci_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
chr1_9799472_9811359,-0.246629,-0.348931,-0.415534,-0.269979,-0.291472,-0.270590,-0.626703,-0.489705,-0.673363,-0.630066,...,-0.116000,-0.083814,0.234512,-0.073540,0.492773,0.359415,-0.355233,1:9759317-9835402,L2/3 IT,upregulated
chr1_21522568_21534512,0.157988,0.177744,0.104602,0.134259,-0.301557,-0.356024,-0.052600,-0.521367,-0.094214,-0.230084,...,-0.307273,-0.311462,-0.231718,-0.260091,0.163623,0.212470,-0.053455,1:21485446-21554433,L2/3 IT,upregulated
chr1_62926942_62936078,-0.100317,-0.186968,-0.101095,-0.103074,-0.243412,-0.207737,0.019004,-0.116716,-0.104934,-0.069307,...,0.141556,0.001419,0.224659,0.189864,0.547692,0.661718,0.074894,1:62900963-62948089,L2/3 IT,upregulated
chr2_135670388_135680581,0.014333,0.050795,-0.122257,-0.000108,-0.186657,-0.017936,0.251213,-0.147799,-0.482289,-0.124177,...,0.034957,0.001804,0.086982,0.188080,0.871917,0.577283,-0.016565,2:135658540-135684141,L2/3 IT,upregulated
chr3_157181787_157191783,-0.037845,-0.052078,-0.223777,-0.208545,-0.208042,-0.158589,-0.164689,-0.004944,-0.290141,-0.352956,...,-0.112965,-0.003305,-0.247953,0.016224,0.286308,0.259769,-0.129659,3:157149588-157216143,L2/3 IT,upregulated
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
chr1_21522568_21534512,0.157988,0.177744,0.104602,0.134259,-0.301557,-0.356024,-0.052600,-0.521367,-0.094214,-0.230084,...,-0.307273,-0.311462,-0.231718,-0.260091,0.163623,0.212470,-0.053455,1:21485446-21554433,Sst,random_control
chr5_126260039_126271505,0.651816,0.697825,0.742660,0.739041,0.679804,0.730284,0.557458,0.570463,0.320638,0.750361,...,0.705887,0.529660,0.254934,0.454784,0.302275,0.517443,0.296106,5:126238032-126282172,Sst,random_control
chr12_13114005_13122283,-0.380858,-0.335854,-0.479643,-0.410447,-0.382271,-0.407790,-0.871882,-0.635019,0.030348,-0.198165,...,-0.175269,-0.178913,-0.118585,-0.089060,0.504227,0.389654,-0.156835,12:13079761-13145339,Sst,random_control
chr17_73913910_73922684,0.051063,-0.081972,-0.133891,0.078060,-0.213973,0.008236,-0.058412,-0.117514,0.255262,0.080011,...,-0.078926,-0.005074,0.136457,-0.061259,0.563861,0.429495,0.080657,17:73881483-73942612,Sst,random_control


## normalize the heatmap to relative foldchange

In [18]:
# Express every row relative to its own marker group: the value found in the
# column named by the row's 'Marker_group' is subtracted from all group columns.

# reference value of each row (aka the value from the marker group)
ref_value_list = [_row[_row['Marker_group']] for _, _row in compiled_df.iterrows()]
ref_arr = np.array(ref_value_list)[:, np.newaxis]

# subtract the per-row reference from the group columns (broadcast over columns)
mat = compiled_df[sorted_group_order].to_numpy()
norm_mat = mat-ref_arr

# rebuild a table with the same row labels, then add back the relevant info
norm_compiled_df = pd.DataFrame(norm_mat, columns=sorted_group_order)
norm_compiled_df.index = compiled_df.index
for _col in ['enhancer_name','Marker_group','Expression_change']:
    norm_compiled_df[_col]=compiled_df[_col]

norm_compiled_df

Unnamed: 0_level_0,L2/3 IT,L4/5 IT,L5 IT,L6 IT,L5 ET,L6 CT,L5/6 NP,L6b,Vip,Pvalb,Lamp5,Sst,enhancer_name,Marker_group,Expression_change
loci_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
chr1_9799472_9811359,0.000000,-0.102301,-0.168905,-0.023350,-0.044843,-0.023960,-0.380073,-0.243076,-0.426734,-0.383437,0.138778,-0.246001,1:9759317-9835402,L2/3 IT,upregulated
chr1_21522568_21534512,0.000000,0.019756,-0.053386,-0.023729,-0.459545,-0.514012,-0.210588,-0.679355,-0.252202,-0.388072,-0.056369,-0.586402,1:21485446-21554433,L2/3 IT,upregulated
chr1_62926942_62936078,0.000000,-0.086651,-0.000778,-0.002757,-0.143095,-0.107420,0.119321,-0.016398,-0.004617,0.031010,0.277695,-0.205005,1:62900963-62948089,L2/3 IT,upregulated
chr2_135670388_135680581,0.000000,0.036462,-0.136590,-0.014441,-0.200990,-0.032269,0.236881,-0.162132,-0.496622,-0.138510,0.378745,-0.249080,2:135658540-135684141,L2/3 IT,upregulated
chr3_157181787_157191783,0.000000,-0.014233,-0.185932,-0.170700,-0.170197,-0.120744,-0.126844,0.032901,-0.252296,-0.315111,-0.177691,0.108569,3:157149588-157216143,L2/3 IT,upregulated
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
chr1_21522568_21534512,0.586402,0.606159,0.533016,0.562673,0.126857,0.072390,0.375815,-0.092952,0.334201,0.198330,0.530033,0.000000,1:21485446-21554433,Sst,random_control
chr5_126260039_126271505,0.012783,0.058792,0.103627,0.100008,0.040771,0.091252,-0.081575,-0.068570,-0.318395,0.111328,0.197487,0.000000,5:126238032-126282172,Sst,random_control
chr12_13114005_13122283,-0.280377,-0.235374,-0.379163,-0.309967,-0.281791,-0.307309,-0.771401,-0.534538,0.130829,-0.097684,-0.339021,0.000000,12:13079761-13145339,Sst,random_control
chr17_73913910_73922684,-0.002295,-0.135330,-0.187249,0.024702,-0.267331,-0.045121,-0.111770,-0.170872,0.201904,0.026653,0.297093,0.000000,17:73881483-73942612,Sst,random_control


In [19]:
# Write the marker-group-normalised table next to the compiled one
_norm_csv_name = f'{_groupby_savename}_SE_AB_trans_ratio_regressed_heatmap_norm_pm_majorType.csv'
norm_compiled_df_savename = os.path.join(output_analysis_folder, _norm_csv_name)
norm_compiled_df.to_csv(norm_compiled_df_savename)