In [1]:
import os
import glob
import pandas as pd 
#from chromolooper import sgls
from time import sleep
import pybedtools as pbt
import coolbox
from coolbox.api import *

# pandas configuration: silence chained-assignment warnings, widen the display
pd.set_option('mode.chained_assignment', None)  # default='warn'
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 100)

# every relative path used below is resolved against this directory
os.chdir('/mnt/BioHome/jreyna/jreyna/projects/t1d-loop-catalog/')

# directory that receives the track files written by this notebook
outdir = 'results/hg38/finemapping/sgl_coolbox_visualizations/'
os.makedirs(outdir, exist_ok=True)

ModuleNotFoundError: No module named 'coolbox'

## Load all fine-mapped SNP data

In [None]:
# add meta information
causal_metadata_fn = 'workflow/qscripts/finemap/causal_db/init.gwas_study.causal_db.immune_select_samples.tsv'
causal_metadata = pd.read_table(causal_metadata_fn, header=None)

# the table has no header, so columns are integers: keep three of them by
# position and give them readable names
causal_metadata_mapper = (
    causal_metadata
    .iloc[:, [2, 8, 18]]
    .rename(columns={2: 'mesh_term', 8: 'author', 18: 'filename'})
)

FileNotFoundError: [Errno 2] No such file or directory: 'workflow/qscripts/finemap/causal_db/init.gwas_study.causal_db.immune_select_samples.tsv'

In [None]:
fns = glob.glob('results/hg38/finemapping/snps/singles/*_total_credible_set.hg38.txt')

# Build the lookup once, outside the loop; membership tests on a set are O(1)
# whereas the metadata column was previously converted to a list and scanned
# for every file.
known_causaldb_fns = set(causal_metadata_mapper.filename)

all_data = []
for fn in fns:

    # the text before the first underscore of the file name is used as the
    # causaldb identifier
    causaldb_fn = os.path.basename(fn).split('_')[0]

    # skip studies that are not listed in the selected metadata
    if causaldb_fn not in known_causaldb_fns:
        continue

    # loading the data; tables without rows are not kept
    tdf = pd.read_table(fn)
    if tdf.shape[0] > 0:
        tdf.loc[:, 'causaldb_fn'] = causaldb_fn
        all_data.append(tdf)

: 

In [None]:
# stack the per-file credible-set tables into one frame (row index is not reset)
all_df = pd.concat(all_data)

: 

In [None]:
# attach mesh_term / author to every SNP row by matching causaldb_fn to the metadata filename
all_df = all_df.merge(causal_metadata_mapper, left_on='causaldb_fn', right_on='filename')

: 

In [None]:
# Extend every (causaldb_fn, block_id) block by `slop` bp on both sides.
slop = 1000000
block_data = []
sorted_snps = all_df.sort_values(['causaldb_fn', 'block_id', 'BP'])
for _, snps in sorted_snps.groupby(['causaldb_fn', 'block_id']):

    # rows are sorted by BP within the block, so these are its outermost SNPs
    leftmost, rightmost = snps.iloc[0], snps.iloc[-1]

    block_data.append([
        'chr{}'.format(leftmost.CHR),
        max(0, leftmost.BP - slop),   # never extend below coordinate 0
        rightmost.BP + slop,
        leftmost.causaldb_fn,
        leftmost.block_id,
    ])

: 

In [None]:
# unnamed 5-column frame: chrom, start, end, causaldb_fn, block_id (the order used when filling block_data)
block_extensions = pd.DataFrame(block_data)
# the BedTool is built from the frame as it is here, i.e. before any score/strand columns exist
block_extensions_pbt = pbt.BedTool.from_dataframe(block_extensions)

: 

In [None]:
# Rebuild the frame from block_data with its column names so this cell can be
# re-executed: assigning five names in place raises a length-mismatch error
# once the frame has gained the extra columns added below.
block_extensions = pd.DataFrame(block_data, columns=['chrom', 'start', 'end', 'causaldb_fn', 'block'])
# constant BED score / strand fields for the exported block tracks
block_extensions['score'] = 1000
block_extensions['strand'] = '+'

: 

In [None]:
# preview the first extended blocks
block_extensions.head()

: 

In [None]:
# inspect the extended blocks of a single study (PH378)
block_extensions.loc[block_extensions['causaldb_fn'] == 'PH378']

: 

In [None]:
# save the blocks as BED files, one six-column file per causaldb study
bed_columns = ['chrom', 'start', 'end', 'block', 'score', 'strand']
for causal_fn, causal_df in block_extensions.groupby('causaldb_fn'):
    outfn = os.path.join(outdir, '{}_block.bed'.format(causal_fn))
    causal_df[bed_columns].to_csv(outfn, sep='\t', index=False, header=False)

: 

: 

: 

# Loading the gene data

In [None]:
gencode_df = pd.read_table('results/refs/ensembl/gencode.v30.annotation.w_genetypes.bed', header=None,
                          names=['chrom', 'start', 'end', 'strand', 'type', 'geneid', 'genename', 'genetype'])
# keep gene-level records only; the `type` column is constant afterwards
gencode_df = gencode_df.loc[gencode_df.type == 'gene'].drop('type', axis=1)
# Strip the ".<digits>" version from the gene ID (e.g. "ENSG00000223972.5" -> "ENSG00000223972").
# The dot has to be escaped: the unescaped pattern '.[0-9]*' matches every
# character of the ID and therefore replaced the whole ID with an empty string.
gencode_df.loc[:, 'geneid'] = gencode_df.loc[:, 'geneid'].str.replace(r'\.[0-9]+', '', regex=True)
# restrict to protein-coding genes
gencode_df = gencode_df.loc[gencode_df.genetype == 'protein_coding']

: 

In [None]:
# BedTool view of the protein-coding genes (7 columns: chrom, start, end, strand, geneid, genename, genetype)
gencode_pbt = pbt.BedTool.from_dataframe(gencode_df)

: 

# Intersection

In [None]:
# Overlap every extended block with the protein-coding genes; wa/wb keep the
# block record and the gene record side by side for each hit.
# No column names are supplied: later cells read causaldb_fn from `name`,
# block_id from `score` and the gene name from `blockSizes`
# (presumably pybedtools' default 12-column BED labels — TODO confirm).
block_genes = block_extensions_pbt.intersect(gencode_pbt, wa=True, wb=True, header=True).to_dataframe()

: 

In [None]:
# display the block x gene overlaps
block_genes

: 

## Annotation 

In [None]:
# loading consensus genes (one gene name per line, no header)
t1d_consensus_list_fn = '/mnt/bioadhoc-temp/Groups/vd-ay/jreyna/projects/dchallenge/results/main/gene_lists/consensus_gene_list.txt'
consensus_table = pd.read_table(t1d_consensus_list_fn, header=None, names=['genename'])
print('The number of consensus genes is: {}'.format(len(consensus_table)))

# only membership is needed downstream, so keep the names as a set
t1d_consensus_genes = set(consensus_table['genename'])

: 

In [None]:
# loading SGL genes

# Replicate names follow
#   "<cell type>_<donor><donor suffix>.<study>.Homo_Sapiens.H3K27ac.b<replicate>"
# so they are generated from that pattern instead of being listed by hand.
def _replicate_ids(cell_type, donors, donor_suffix, study, n_reps):
    """Return the sample names of one cell type, donor by donor, replicates b1..b<n_reps>."""
    return ['{}_{}{}.{}.Homo_Sapiens.H3K27ac.b{}'.format(cell_type, donor, donor_suffix, study, rep)
            for donor in donors
            for rep in range(1, n_reps + 1)]

_rh_donors = ['1800', '1814', '1815', '1816', '1829', '1831']
_nonclassical_donors = ['1786', '1800', '1831']

# group name -> list of sample names (six donors x three replicates each)
sample_grps = {
    cell_type: _replicate_ids(cell_type, _rh_donors, '-RH-1', 'phs001703v3p1', 3)
    for cell_type in ['CD4_Naive', 'CD8_Naive', 'Monocyte', 'Naive_B', 'Natural_Killer']
}
# three donors x four replicates, different study version and no "-RH-1" suffix
sample_grps['Nonclassical_Monocyte'] = _replicate_ids(
    'Nonclassical_Monocyte', _nonclassical_donors, '', 'phs001703v4p1', 4)

# two samples from other studies (GSE accessions) are grouped with these cell types
sample_grps['Monocyte'].append('THP-1-WT.GSE149420.Homo_Sapiens.H3K27ac.b1')
sample_grps['Naive_B'].append('Nalm6.GSE115492.Homo_Sapiens.H3K27ac.b1')

# inverse lookup: sample name -> group name
sample_grps2 = {s: grp for grp, grp_lists in sample_grps.items() for s in grp_lists}

fns = glob.glob('results/hg38/finemapping/sgls/*/*.finemap_sgls.tsv')

all_data = []
for fn in fns:

    # split meta information out of the path:
    # results/<genome>/finemapping/sgls/<causaldb_fn>/<file name>
    parts = fn.split('/')
    genome, causaldb_fn, file_name = parts[1], parts[4], parts[5]
    # the sample name is the file name without its last three dot-separated fields
    sample = file_name.rsplit('.', maxsplit=3)[0]

    # load sgl table and tag each row with its study, sample and sample group
    tdf = pd.read_table(fn).assign(
        causaldb_fn=causaldb_fn,
        sample=sample,
        sample_grp=sample_grps2[sample],
    )

    # append to all
    all_data.append(tdf)

all_df = pd.concat(all_data)

: 

In [None]:
# Gene names seen in any SGL table, minus names that are not wanted in the comparison:
#   - accession-style names: AC/AL/AP followed by digits, a dot and a version number
#   - names starting with HIST
# NOTE(review): assumes the AC/AL/AP prefixes were meant to catch accession-style
# names only — confirm. A bare prefix test also discards named genes such as
# ACTB, ALB or APOE, and str.startswith fails on missing (NaN) names.
sgl_genenames = pd.Series(all_df.genename.dropna().unique(), dtype=object)
is_accession = sgl_genenames.str.match(r'(?:AC|AL|AP)[0-9]+\.[0-9]+$', na=False).astype(bool)
is_histone = sgl_genenames.str.startswith('HIST', na=False).astype(bool)
sgl_genes = set(sgl_genenames[~(is_accession | is_histone)])

: 

In [None]:
# number of distinct SGL gene names kept after filtering
len(sgl_genes)

: 

In [None]:
# three-way split of the SGL genes and the consensus genes
shared = sgl_genes & t1d_consensus_genes
sgl_genes_only = sgl_genes - shared
consensus_genes_only = t1d_consensus_genes - shared

: 

In [None]:
# annotate genes
def label_genes(x):
    """Classify one row by the gene name stored in ``x.blockSizes``.

    Returns 'sgl_genes_only', 'consensus_genes_only' or 'shared' depending on
    which of the notebook-level gene sets contains the name, and 'other' when
    none of them does.
    """
    genename = x.blockSizes
    labelled_sets = (
        ('sgl_genes_only', sgl_genes_only),
        ('consensus_genes_only', consensus_genes_only),
        ('shared', shared),
    )
    # first matching set wins, checked in the order listed above
    for label, members in labelled_sets:
        if genename in members:
            return label
    return 'other'
    
# label every block x gene row; label_genes reads the gene name from the `blockSizes` column
block_genes.loc[:, 'gene_annotation'] = block_genes.apply(label_genes, axis=1)

: 

In [None]:
# display the overlaps together with their gene_annotation label
block_genes

: 

## Assign each block a type based on its gene annotations

In [None]:
# annotate each block
def _block_type(gene_annos):
    """Collapse the gene labels found in one block into a single block label.

    'other' is ignored whenever a more specific label is present: a block whose
    genes carry exactly one specific label takes that label, several specific
    labels give 'mixed', and a block with only 'other' genes stays 'other'.
    """
    specific = [x for x in gene_annos if x != 'other']
    if len(specific) == 1:
        return specific[0]
    if len(specific) == 0 and len(gene_annos) == 1:
        return gene_annos[0]
    return 'mixed'

# Drop a block_type left over from a previous execution so that re-running this
# cell does not create block_type_x / block_type_y columns in the merge below.
block_genes = block_genes.drop(columns='block_type', errors='ignore')

# `name` holds the causaldb_fn and `score` the block_id of each block
block_annos = [
    [causaldb_fn, block_id, _block_type(block_df.gene_annotation.unique())]
    for (causaldb_fn, block_id), block_df in block_genes.groupby(['name', 'score'])
]
block_annos_df = pd.DataFrame(block_annos, columns=['name', 'score', 'block_type'])

# merge information with gene-based information
block_genes = block_genes.merge(block_annos_df, on=['name', 'score'])

: 

In [None]:
# spot check: rows of blocks typed 'sgl_genes_only', narrowed to score (block id) 21
block_genes_check = block_genes.loc[block_genes.block_type == 'sgl_genes_only']
block_genes_check.loc[block_genes_check.score == 21].sort_values('gene_annotation')

: 

## Visualization

In [None]:
# create missing fields
def create_attribute(sr):
    """Build a GTF-style attribute string whose gene_name is ``sr.blockSizes``.

    NOTE(review): gene_id and gene_type are fixed literals, identical for every
    row — confirm that a placeholder is intended here.
    """
    template = 'gene_id "ENSG00000223972.5"; gene_name "{0}"; gene_type "protein_coding";'
    return template.format(sr.blockSizes)

# Build the GTF-like copy with assign(), which returns a new frame and replaces
# whole columns. Writing the string '.' into the existing `score` column through
# .loc[:, 'score'] is an in-place set of an incompatible dtype when that column
# is numeric, which recent pandas versions deprecate.
block_genes_save = block_genes.assign(
    source='block_genes',
    score='.',
    frame='.',
    feature_type='gene',
)
# one attribute string per row, built from the gene name in `blockSizes`
block_genes_save['attribute'] = block_genes_save.apply(create_attribute, axis=1)

: 

In [None]:
# preview the GTF-like table
block_genes_save.head()

: 

In [None]:
# loading the gencode gtf; it has no header, so columns are addressed by number
gencode_gtf_df = pd.read_table('results/hg38/refs/gencode/v30/gencode.v30.annotation.gtf.gz', comment='#', header=None)

feature_col, attribute_col = 2, 8

# keep gene records whose attribute string mentions protein_coding
gencode_gtf_df = gencode_gtf_df[gencode_gtf_df[feature_col] == 'gene']
gencode_gtf_df = gencode_gtf_df[gencode_gtf_df[attribute_col].str.contains('protein_coding')]

# pull the gene name out of the attribute string for later lookups
gencode_gtf_df['gene_name'] = gencode_gtf_df[attribute_col].str.extract('gene_name "(.*?)";')[0]

: 

In [None]:
# save tracks for the genes within blocks, one GTF per causaldb study
for causal_fn, causal_df in block_genes_save.groupby('name'):

    # GTF records of the genes named in this study's blocks (`blockSizes` column)
    in_blocks = gencode_gtf_df.gene_name.isin(causal_df.blockSizes.unique())
    tdf = gencode_gtf_df.loc[in_blocks].drop(columns='gene_name')

    outfn = os.path.join(outdir, '{}_block_genes.gtf'.format(causal_fn))
    # quotechar is "'" — presumably so the double quotes inside the attribute
    # column are written as they are (TODO confirm)
    tdf.to_csv(outfn, sep='\t', header=None, quotechar="'", index=False)

: 

In [None]:
# Track files for one study (PH378). block_track and genes_with_anno_track use
# the file names written into `outdir` by the earlier save loops.
full_snp_track = 'results/hg38/finemapping/snps/singles/PH378.finemapped.snps.cb.bed.bgz'
block_track = 'results/hg38/finemapping/sgl_coolbox_visualizations/PH378_block.bed'
genes_with_anno_track = 'results/hg38/finemapping/sgl_coolbox_visualizations/PH378_block_genes.gtf'
loop_track = 'results/hg38/finemapping/sgls_conserved_loops/PH378_sgl_loops.bedpe'
# sgl_track = ''

: 

: 

In [None]:
# "chrom:start-end" string for every block, used further down as the browser interval
block_extensions.loc[:, 'query'] = block_extensions.apply(lambda sr: '{}:{}-{}'.format(sr.chrom, sr.start, sr.end), axis=1)

: 

In [None]:
# blocks of study PH378 only
check_ph378_block = block_extensions.loc[block_extensions.causaldb_fn == 'PH378']

: 

In [None]:
# display the PH378 blocks and their query strings
check_ph378_block

: 

: 

In [None]:
# Compose the browser frame track by track: blocks, fine-mapped SNPs, genes, loops.
# NOTE(review): TrackHeight(2) presumably sets the default height for tracks
# created inside the context, with per-track TrackHeight(4) overriding it — confirm.
with TrackHeight(2):
    frame = XAxis()
    frame += BED(block_track, alpha=1) + Title("Expanded blocks")
    frame += Spacer()
    frame += BED(full_snp_track, alpha=1) + Title("Finemapped SNPs") + TrackHeight(4)
    frame += Spacer()
    frame += GTF(genes_with_anno_track, length_ratio_thresh=0.005) + TrackHeight(4) + Title("Genes within blocks")
    frame += Spacer()
    frame += Arcs(loop_track) + Inverted() + Title("Union of SGLs\nacross samples")

frame.properties['width'] = 60

# the first two assignments are overwritten; only the last one takes effect
# (the query string of the last PH378 block)
test_interval = "chr1:16000000-19000000"
test_interval = 'chr2:191073437-191204687'
test_interval = check_ph378_block['query'].iloc[-1]

# open the browser on the frame and jump to the chosen interval
bsr = Browser(frame, reference_genome='hg38', widgets_box='simple')
bsr.goto(test_interval)

: 

In [None]:
# render the browser at the interval selected above
bsr.show()

: 

: 

: 