# Method functions demo
This notebook demos the `ctar` method functions. It requires the file `pbmc10k_csc.h5mu`, which can be found in `./projects/zhanglab/users/ana`.

In this notebook, you will create an AnnData object with:
- aligned RNA and ATAC-seq data
- correlations between defined peak-gene pairs
- control correlations using random peaks matched for GC content and MFA
- Monte Carlo p-values based on those controls

The latter half of this notebook features some functions for cell-type-specific (CT-specific) analysis that are still a work in progress.

In [1]:
# Importing libraries
import numpy as np
import pandas as pd 
import statsmodels.stats as stats
import matplotlib.pyplot as plt
import seaborn as sns
import scipy as sp
import scdrs
import math
import warnings
import time
import random
from tqdm import tqdm

import pybedtools
from itertools import compress
from Bio.SeqUtils import GC

import anndata as ad
import scanpy as sc
import muon as mu

import ctar

In [2]:
# Read the MuData object used throughout this demo.
# The path is relative to the notebook's working directory.
h5mu_path = 'pbmc10k_csc.h5mu'
mdata = mu.read(h5mu_path)



In [3]:
# For the purposes of the demo, take a small subset of cells.
# Slicing a MuData returns a view onto `mdata`; take an explicit copy so that
# the annotations written below (and by the ctar calls in later cells) land on
# an independent object instead of on a view of the full dataset.
# NOTE(review): a view may share state such as `.uns` with its parent —
# confirm against the installed mudata version.
mini = mdata[:50, :].copy()

# Add cell type (CT) labels: integer CT code -> human-readable name
labels = {
    0:'CD4+ naïve T', 1:'CD4+ memory T', 2:'MAIT',
    3:'CD8+ naïve T', 4:'CD8+ activated T', 5:'NK',
    6:'naïve B', 7:'memory B',
    8:'CD14 mono', 9:'intermediate mono', 10:'mDC', 11:'pDC'}

# Store the mapping on the subset so it travels with the data
mini.uns['ct_labels'] = labels

In [4]:
# Compute candidate peak-gene pairs from the MuData subset; the resulting
# table is read back from mini.uns['peak_gene_pairs']
ctar.method.find_peak_gene_pairs(mini)

# Preview the first few pairs
peak_gene_pairs = mini.uns['peak_gene_pairs']
peak_gene_pairs.head()

Using peak_annotation index as gene_name column.
Using peak as gene_ids column.


  mdata['atac'].var['index'] = range(len(atac.var))


Using rna.var index as gene_name column.


Unnamed: 0,gene_ids,index_x,gene_name,distance,peak_type,index_y
1,chr1:180599-181702,1,AL627309.5,-6738,distal,1.0
2,chr1:191168-192093,2,AL627309.5,-17307,distal,1.0
10,chr1:774742-775615,10,LINC01409,-3143,distal,4.0
11,chr1:778283-779200,11,LINC01409,0,promoter,4.0
12,chr1:816877-817776,12,FAM87B,0,promoter,5.0


In [5]:
# Assemble one AnnData in which the peak and gene of every pair are aligned
ctar_adata = ctar.method.build_adata(mini)

# Make sure all variable names are unique (AnnData method)
ctar_adata.var_names_make_unique()
# Carry the CT label mapping over to the new AnnData as well
ctar_adata.uns['ct_labels'] = labels

# Correlate the atac and rna layers for each peak-gene pair
ctar_adata = ctar.method.get_corrs(ctar_adata)

# Display a summary of the resulting object
ctar_adata

AnnData object with n_obs × n_vars = 30 × 44230
    obs: 'n_genes_by_counts', 'total_counts', 'NS', 'nucleosome_signal', 'n_counts', 'leiden', 'celltype'
    var: 'gene_ids', 'index_x', 'gene_name', 'distance', 'peak_type', 'index_y', 'corr'
    uns: 'ct_labels'
    varm: 'lowexp_ct_mask'
    layers: 'atac', 'rna', 'atac_raw', 'rna_raw'

In [6]:
# Seed both Python's and NumPy's global RNGs so the randomly drawn control
# peaks are reproducible on a fresh kernel.
# NOTE(review): which RNG ctar samples from is not visible in this notebook;
# seeding both covers either case — confirm against ctar.method.
seed = 6
random.seed(seed)
np.random.seed(seed)

# Add control peak indices to ctar_adata.varm
# We will use these to calculate control corrs for matched random ATAC peaks
# Note that fetching sequences and recording GC content takes a while.
ctar.method.create_ctrl_peaks(ctar_adata)

MFA done.
GC done.
Get_bins done.
Rand_peaks done.
Ctrl index array done.


array([[41925,   773, 15906, ...,  1154, 21427,   578],
       [22655, 19327,  6547, ..., 42976, 42968,  4243],
       [24070, 26372, 30659, ..., 21938, 10970, 21321],
       ...,
       [30148, 31861, 30684, ...,  2349,  8462, 40545],
       [11486, 35205, 24963, ..., 23562, 35284,  8145],
       [12697, 13003, 17175, ..., 42983, 16969, 22550]])

In [7]:
# Using the control peak indices stored in ctar_adata.varm.control_peaks
# (added by create_ctrl_peaks in the previous cell), calculate the control
# correlations and add them to ctar_adata.varm.
# The call is the last expression in the cell, so the array it returns is
# shown as the cell output.
ctar.method.control_corr(ctar_adata)

100%|██████████| 44230/44230 [00:48<00:00, 908.76it/s]


array([[-0.07072625,  0.35570648, -0.18710272, ..., -0.06760585,
         0.0203484 , -0.11070114],
       [-0.11630739,  0.03897239,  0.20377669, ..., -0.0617177 ,
        -0.04831771,  0.22521447],
       [-0.04961354, -0.03448276, -0.03448276, ...,  0.29227549,
        -0.03448277,  1.00000036],
       ...,
       [-0.11471381,  0.1692903 ,  0.317931  , ..., -0.16466704,
        -0.16258687, -0.16250041],
       [-0.20475599,  0.05010165,  0.06129436, ...,  0.38961723,
        -0.20574735,  0.43382499],
       [ 0.14853226, -0.14865822, -0.21542181, ...,  0.09325566,
        -0.03370514, -0.17493862]])

In [10]:
# Obtain Monte Carlo p-values by comparing each putative peak-gene corr
# against its control corrs
ctar.method.get_pvals(ctar_adata)
# Show the per-pair table, which now includes the mc_pval and mc_qval columns
ctar_adata.var

Unnamed: 0_level_0,gene_ids,index_x,gene_name,distance,peak_type,index_y,corr,mc_pval,mc_qval
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
"chr1:778283-779200 , LINC01409",chr1:778283-779200,11,LINC01409,0,promoter,4.0,-0.240365,0.058160,0.999979
"chr1:827066-827949 , LINC01128",chr1:827066-827949,16,LINC01128,0,promoter,6.0,0.275780,0.898466,0.999979
"chr1:844161-845024 , LINC01128",chr1:844161-845024,19,LINC01128,-15203,distal,6.0,-0.049096,0.509170,0.999979
"chr1:857911-858643 , LINC01128",chr1:857911-858643,21,LINC01128,-1584,distal,6.0,-0.049506,0.491017,0.999979
"chr1:955171-956100 , NOC2L",chr1:955171-956100,36,NOC2L,3156,distal,12.0,-0.061435,0.443785,0.999979
...,...,...,...,...,...,...,...,...,...
"GL000219.1:44638-45549 , AL592183.1",GL000219.1:44638-45549,143855,AL592183.1,37762,distal,26336.0,-0.114714,0.223689,0.999979
"GL000219.1:99255-100162 , AL592183.1",GL000219.1:99255-100162,143860,AL592183.1,-15945,distal,26336.0,-0.259409,0.023385,0.999979
"GL000219.1:132809-133702 , AL592183.1",GL000219.1:132809-133702,143862,AL592183.1,-49499,distal,26336.0,-0.114714,0.183113,0.999979
"GL000219.1:147576-148506 , AL592183.1",GL000219.1:147576-148506,143863,AL592183.1,-64266,distal,26336.0,-0.114714,0.193257,0.999979


## Cell-type-specific analysis

In [12]:
# Let's analyze CD4+ naïve T cells: look up key 0 of the cell type (CT)
# label mapping to confirm which cell type that key refers to
ctar_adata.uns['ct_labels'][0]

'CD4+ naïve T'

In [13]:
# CT code of CD4+ naïve T cells (key 0 in ctar_adata.uns['ct_labels'])
cd4nt_ct = 0

# CT-specific AnnData for CD4+ naïve T cells, with its peak-gene corrs
cd4nt_ctar = ctar.method.get_corrs(
    ctar.method.build_ct_adata(ctar_adata, cd4nt_ct)
)
# CT-excluded counterpart, with its peak-gene corrs
cd4nt_other = ctar.method.get_corrs(
    ctar.method.build_other_adata(ctar_adata, cd4nt_ct)
)

# Control corrs reuse the existing matched random peaks, but are now computed
# only across the cells of each CT-level object (ct=True).
# Very few cells and genes meet the requirements at this point (right CT and
# highly expressed in that CT), so this runs quickly.
ctar.method.control_corr(cd4nt_other, ct=True)
ctar.method.control_corr(cd4nt_ctar, ct=True)

  ct_adata.uns['original_atac'] = adata.layers['atac'][(adata.obs['celltype'] == ct),:]
100%|██████████| 3/3 [00:00<00:00, 447.28it/s]
100%|██████████| 3/3 [00:00<00:00, 479.84it/s]


array([[ 0.23777074, -0.1857129 , -0.01201359, ...,  0.34318352,
        -0.03300482, -0.18880543],
       [-0.34243852, -0.2996245 , -0.2098659 , ..., -0.20952232,
        -0.47171083, -0.00985467],
       [-0.35502446,  0.        , -0.3121056 , ..., -0.50428075,
        -0.19550517,  0.17241183]])

In [15]:
# Get the delta correlation between the CT-specific object (cd4nt_ctar) and
# the CT-excluded object (cd4nt_other)
# NOTE(review): the sign convention of the delta is not visible in this notebook — confirm in ctar.method
ctar.method.get_deltas(cd4nt_ctar,cd4nt_other)
# Get the delta control correlation for the same two objects; as the last
# expression in the cell, the array it returns is shown as the cell output
ctar.method.get_control_deltas(cd4nt_ctar,cd4nt_other)

array([[ 0.3163319 ,  0.08213753, -0.22716037, ...,  0.41604015,
         0.11178135, -0.63127795],
       [-0.93861723, -0.47960043, -0.20346838, ..., -0.36112483,
        -0.24382317, -0.24350478],
       [-0.28752637, -0.15350766, -0.49064864, ..., -0.45471568,
        -0.426753  , -0.01267682]])

In [18]:
# Monte Carlo p-values for the CT-specific object, this time computed on the
# delta corrs against the delta control corrs instead of the default metrics
ctar.method.get_pvals(
    cd4nt_ctar,
    metric='delta_corr',
    control_metric='delta_control_corr',
)

# Preview the updated per-pair table
cd4nt_ctar.var.head()

Unnamed: 0_level_0,gene_ids,index_x,gene_name,distance,peak_type,index_y,mc_qval,corr,delta_corr,mc_pval
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
"chr12:92145507-92146436 , BTG1",chr12:92145507-92146436,32410,BTG1,0,promoter,16043.0,0.94002,0.330448,0.306604,0.826724
"chr19:5680305-5681203 , RPL36",chr19:5680305-5681203,63231,RPL36,0,promoter,22404.0,0.94002,-0.129266,-0.099197,0.487171
"chr22:40044059-40044996 , TNRC6B",chr22:40044059-40044996,87479,TNRC6B,0,promoter,25318.0,0.94002,0.369976,0.549274,0.94002
