# NEATseq analysis and enrichment

In [1]:
import pandas as pd
import numpy as np
import ctar
import anndata as ad

### Load NEATseq data
These files were generated from the raw CD4T cell data.

`neat_mu['rna']` contains the RNA gene expression. \
`neat_mu['atac']` contains the ATAC peak accessibility.

In [2]:
import muon as mu

In [3]:
neat_directory = '/projects/zhanglab/users/ana/multiome/raw/neatseq/'
neat_mu = mu.read(neat_directory+'neat.h5mu')



### Load CRISPRi FlowFISH

The original table directly from the Nasser Nature 2021 paper can be found in `crispr_directory + 'originals/crispr-flowfish-supp5'`. The -`_edited` version simply contains the following additional columns:
1. `ensgid`: the gene Ensembl IDs mapped using BioMart API
2. `unique_id`: contains identifiers for specific enhancer-gene-celltype links
3. `hg38_chr, hg38_start, hg38_end`: lifted over hg38 enhancer coordinates

The LiftOver tool and `hg19ToHg38.over.chain.gz` file can be found in `/projects/zhanglab/users/ana/liftover/liftOver`.

In [32]:
crispr_directory = '/projects/zhanglab/users/ana/multiome/validation/'
crispr = pd.read_csv(crispr_directory+'crispr-flowfish-supp5_edited.csv',index_col=0)

According to SCENT paper,
> We defined 283 positive enhancer element–gene links when they are ‘TRUE’ for ‘Regulated’ column (that is, the element–gene pair is significant and the effect size is negative) and 5,472 negative enhancer element–gene links when they are ‘FALSE’ for ‘Regulated’ column.

In [33]:
print('Positive links: ',crispr[crispr.Regulated == True].shape[0])
print('Negative links: ',crispr[crispr.Regulated == False].shape[0])

Positive links:  283
Negative links:  5472


# CTAR

### Step 1 (optional) : Get peak-gene pairs with +/-500kb around the gene body.
I already included `peak_gene_pairs` in `neat_mu.uns` but if you would like to do it from scratch the code is here.

In [None]:
# gets gene body coords and adds +/-500kb windows
genes = ctar.data_loader.get_gene_coords(neat_mu['rna'].var)
# adjust peak to bed format
peaks = neat_mu['atac'].var.copy()
peaks[['chr','start','end']] = peaks.peak.str.split(':|-',expand=True)
# gets intersection of peaks and +/-500kb windows around genes
peak_gene_pairs = ctar.data_loader.peak_to_gene(peaks,genes)
# add this to mu_neat.uns
neat_mu.uns['peak_gene_pairs'] = peak_gene_pairs
# note: takes around 1-2min.

### Step 2: Create AnnData object for regression analysis.
Aligns peaks and genes according to `peak_gene_pairs`.

In [4]:
neat = ctar.method.build_adata(neat_mu,gene_col='gene',peak_col='peak',raw=True)

In [5]:
# filter out lowly expressed peaks or genes with < 5% cells expressing.
neat,lowexp_mask = ctar.method.filter_vars(neat)

### Step 3: Get correlation coefficient.

In [6]:
neat = ctar.method.get_poiss_coeff(neat)

100%|██████████| 63677/63677 [07:46<00:00, 136.46it/s]


### Step 4: Get control peaks.

In [7]:
ctar.method.create_ctrl_peaks(neat,num_bins=10,gc=False,peak_col='peak')

MFA done.
Get_bins done.
Rand_peaks done.
Ctrl index array done.


array([[ 9166,  4294, 55416, ..., 35125, 37664, 16963],
       [44462, 62710, 56222, ..., 59625, 41396, 47153],
       [57558, 47313, 36189, ..., 62384, 35218, 53401],
       ...,
       [53072,  6914, 33308, ..., 60992, 16825, 56812],
       [26955, 11562, 27104, ..., 10817, 21193,  6387],
       [   20, 58346, 60147, ..., 21303, 42991, 23547]])

This part takes very long. It would be better to use the script `/projects/zhanglab/users/ana/multiome/results/ctar/ctar.sbatch` which will run in batches.

In [None]:
# save data
neat.write('neat.h5ad')

In [9]:
from tqdm import tqdm

In [10]:
atac = neat.layers['atac_raw'].A
ctrl_coeff = []
for i in tqdm(np.arange(atac.shape[1])):
    ctrl_coeff_i = []
    rna = neat[:,i].layers['rna_raw'].A
    ind = neat[:,i].varm['control_peaks'][0].astype(int)
    for j in np.arange(200): # generate coefficient for all controls in b = 200
        ctrl_coeff_i.append(ctar.method.fit_poisson(atac[:,[ind[j]]],
                                                    rna,return_none=False))
    ctrl_coeff.append(ctrl_coeff_i)
    
ctrl_coeff = np.array(ctrl_coeff)

  0%|          | 5/63677 [00:08<29:05:37,  1.64s/it]

KeyboardInterrupt



In [None]:
neat.varm['ctrl_poiss_coeff'] = ctrl_coeff

### Precomputed results
My results from this dataset.

In [12]:
results_directory = '/projects/zhanglab/users/ana/multiome/results/ctar/'
neat = ad.read_h5ad(results_directory+'neat.h5ad')
neat

AnnData object with n_obs × n_vars = 8472 × 63677
    obs: 'Sample', 'TSSEnrichment', 'ReadsInTSS', 'ReadsInPromoter', 'ReadsInBlacklist', 'PromoterRatio', 'PassQC', 'NucleosomeRatio', 'nMultiFrags', 'nMonoFrags', 'nFrags', 'nDiFrags', 'BlacklistRatio', 'Clusters', 'ReadsInPeaks', 'FRIP'
    var: 'peak', 'gene', 'distance', 'index_z', 'index_x', 'index_y', 'poiss_coeff', 'mc_pval', 'mc_qval'
    varm: 'control_peaks', 'control_poiss_coeff'
    layers: 'atac_raw', 'rna_raw'

# SCENT and Signac
SCENT and Signac must be run in R. Analysis is provided in the notebook `signac-scent.ipynb`. Precomputed results from those methods on the NEATseq dataset can be found here.

In [19]:
scent_directory = '/projects/zhanglab/users/ana/multiome/results/scent/'
scent_neat = pd.read_csv(scent_directory+'myscent_neatseq.txt',index_col=0)
# FDR < 0.1 results form the paper, edited to include Ensembl IDs mapped with BioMart
scent_neat_paper = pd.read_csv(scent_directory+'scentpaper_neatseq_edited.csv',index_col=0)

In [22]:
signac_directory = '/projects/zhanglab/users/ana/multiome/results/signac/'
signac_neat = pd.read_csv(signac_directory+'signac_neatseq_links_edited.csv',index_col=0)