# Attempting to overlap CLIP signal with conservation phastcon scores.
- careful of the regions chosen for noncoding_exon and noncoding_intron.
- Improvement over the first pass, which did not consider premature boundaries: this notebook makes sure that each plotted region lies completely within a CDS/UTR/Intron.

In [1]:
# %matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
import os
import sys
import pandas as pd
import numpy as np
from tqdm import tnrange, tqdm_notebook
sys.path.insert(0, '/home/bay001/projects/codebase/rbp-maps/maps/')
from density import Map
from density import ReadDensity
from density import normalization_functions
from density import RDPlotter
import glob
import functools
import pybedtools
from qtools import Submitter
# 14-colour seaborn 'hls' palette.
colors = sns.color_palette('hls',14)
# Directory that the cells below glob for peak BED files; derived files are written next to them.
center_dir = '/home/bay001/projects/encode/analysis/conservation_analysis/idr_peak_centers2'


because the backend has already been chosen;
matplotlib.use() must be called *before* pylab, matplotlib.pyplot,
or matplotlib.backends is imported for the first time.



# Create single point bedfiles for each peak
- each peak 'center' will be used to define the peak.

In [2]:
# Files ending in suffix_in are the inputs; each output path is the input path
# with suffix_in replaced by suffix_out.
suffix_in = 'merged.bed'
suffix_out = 'merged.center.bed'

def get_midpoint_and_create_bed(fin, fout):
    """
    Collapse every interval of a 6-column BED file onto a single base at its midpoint.

    fin : path of the tab-separated, header-less input BED
          (chrom, start, stop, name, score, strand)
    fout : path of the BED to write; start/stop become midpoint/midpoint+1,
           the remaining columns are carried over unchanged
    """
    bed_columns = ['chrom', 'start', 'stop', 'name', 'score', 'strand']
    intervals = pd.read_table(fin, sep='\t', names=bed_columns)
    # The fractional part of the mean is truncated by the integer cast.
    midpoints = ((intervals['start'] + intervals['stop'])/2).astype(int)
    intervals['mid'] = midpoints
    intervals['mid1'] = midpoints + 1
    centered = intervals[['chrom', 'mid', 'mid1', 'name', 'score', 'strand']]
    centered.to_csv(fout, sep='\t', header=False, index=False)

def run_cell_wrapper(center_dir):
    """
    Write a midpoint ('center') BED for every file in center_dir whose name ends in suffix_in.

    Wrapped in a function because this step only needs to run once; call it when needed.

    center_dir : directory that is searched for the input BED files
    """
    pattern = os.path.join(center_dir, "*{}".format(suffix_in))
    bed_paths = glob.glob(pattern)
    bar = tnrange(len(bed_paths))  # advanced manually, one tick per file
    for bed_path in bed_paths:
        get_midpoint_and_create_bed(bed_path, bed_path.replace(suffix_in, suffix_out))
        bar.update(1)
        
# One-off driver: build the center BED for every peak file found in center_dir.
run_cell_wrapper(center_dir)

          181/|/100%|| 181/181 [00:30<00:00,  9.11it/s]

# Annotate all center beds
- Runs the annotation script to generate annotations for each peak center.
- Need to annotate each center peak since no guarantee that overlapped broadpeaks will be annotated the same way.

In [3]:
# Queue one annotate.py job per center BED that does not have an annotated output yet.
suffix_in = 'merged.center.bed'
suffix_out = 'merged.center.bed.annotated'

center_beds = glob.glob(os.path.join(center_dir, "*{}".format(suffix_in)))
cmds = []
for bed in center_beds:
    annotated = bed.replace(suffix_in, suffix_out)
    if os.path.exists(annotated):
        continue  # already annotated, nothing to queue
    cmd = ''.join([
        'python /home/bay001/projects/codebase/annotator/annotate.py ',
        '--gtfdb /projects/ps-yeolab/genomes/hg19/gencode_v19/gencode.v19.annotation.gtf.db ',
        '--input {} '.format(bed),
        '--output {}'.format(annotated),
    ])
    cmds.append(cmd)

job_name = 'annotate_peakcenters'
sh = '/home/bay001/projects/encode/analysis/conservation_analysis/bash_scripts/annotate_peakcenters.sh'

# Hand the commands to qtools as an array job (submit=True).
Submitter(
    commands=cmds,
    job_name=job_name,
    sh=sh,
    array=True,
    submit=True,
    queue='home-yeo',
    ppn=4,
    walltime='4:00:00'
)

Writing 181 tasks as an array-job.
Wrote commands to /home/bay001/projects/encode/analysis/conservation_analysis/bash_scripts/annotate_peakcenters.sh.
Submitted script to queue home-yeo.
 Job ID: 8935733


<qtools.submitter.Submitter at 0x2b7d69cbab50>

# Parse center beds to create a twobed file
- Use the annotated transcript region as upper/lower boundaries for each peak center.
- 'twobed' will be a file with 12 columns: the first 6 columns will describe everything 'less than position wise' the peak center, the second 6 will describe anything 'greater than position wise' the peak center. 

In [9]:
# Input suffix (annotated center BEDs) and output suffix (the combined twobed file).
suffix_in = 'merged.center.bed.annotated'
suffix_out = 'merged.center.bed.annotated.twobed'
# annotated_to_twobed() also writes one twobed per region present in the data,
# inserting the region name in front of '.twobed', e.g.:
### 'merged.center.bed.annotated.CDS.twobed'
### 'merged.center.bed.annotated.3UTR.twobed'
### 'merged.center.bed.annotated.5UTR.twobed'
### 'merged.center.bed.annotated.intron.twobed'
### 'merged.center.bed.annotated.noncoding_intron.twobed'
### 'merged.center.bed.annotated.noncoding_exon.twobed'

def parse_feature(row, col):
    """
    Return one ':'-separated field of a row's 'priority' annotation string.

    row : mapping/Series with a 'priority' entry
    col : int
        index of the field to return (the callers in this notebook use
        1 and 2 for the start/stop boundaries and 4 for the region)

    Rows whose 'priority' contains 'INTERGENIC' have no such fields;
    the literal 'INTERGENIC' is returned for them instead.
    """
    priority = row['priority']
    if 'INTERGENIC' in priority:
        return 'INTERGENIC'
    return priority.split(':')[col]

def annotated_to_twobed(annotated, twobed_out):
    """
    Turn an annotated 'center peak' file into 12-column 'twobed' files.

    annotated : path of the annotated center BED
                (chrom, start, stop, p, f, strand, priority, annotation)
    twobed_out : path of the combined twobed file; one additional file per region
                 is written with '.<region>' inserted in front of '.twobed'

    The first six columns of a twobed row span start_boundary..stop, the last
    six span stop..stop_boundary, with the boundaries and the region taken from
    the 'priority' string of each row.
    """
    twobed_columns = [
        'chrom', 'start_boundary', 'stop', 'region', 'f', 'strand',
        'chrom', 'stop', 'stop_boundary', 'region', 'f', 'strand',
    ]
    df = pd.read_table(
        annotated,
        names=['chrom', 'start', 'stop', 'p', 'f', 'strand', 'priority', 'annotation'],
    )
    # (new column, field index inside the priority string)
    for new_column, field in (('region', 4), ('start_boundary', 1), ('stop_boundary', 2)):
        df[new_column] = df.apply(functools.partial(parse_feature, col=field), axis=1)

    # Combined file with every region.
    df[twobed_columns].to_csv(twobed_out, sep='\t', header=False, index=False)

    # One file per region.
    for region in set(df['region']):
        region_out = twobed_out.replace('.twobed', '.{}.twobed'.format(region))
        region_rows = df[df['region'] == region][twobed_columns]
        region_rows.to_csv(region_out, sep='\t', index=False, header=False)

# for all annotated center files, create a twobed file.

# Every file in center_dir ending in suffix_in is converted; the output path is
# the input path with suffix_in replaced by suffix_out.
all_annotated = glob.glob(os.path.join(center_dir, '*{}'.format(suffix_in)))
progress = tnrange(len(all_annotated))  # advanced manually, one tick per file

for annotated in all_annotated:
    twobed = annotated.replace(suffix_in, suffix_out)
    annotated_to_twobed(annotated, twobed)
    progress.update(1)

# Annotate 'full peaks' bedfiles using the annotations from the 'center' bedfiles
- re-join the annotated 'center' beds to the 'full peak' beds. we need to do this because the 'peak center annotated' bed files will not always match the 'peak annotated' bed files (due to possible overlapping features)
- This creates a fully annotated list of full peaks for each region

In [10]:
# Suffix of the full-peak BEDs and of their annotated center BEDs; the driver loop
# derives the second path from the first by swapping one suffix for the other.
full_peak_suffix_in = 'merged.bed'
center_peak_suffix_in = 'merged.center.bed.annotated'

# THESE FILES ARE CREATED BUT NOT EXPLICITLY NAMED AS full_peak_suffix_out

### full_peak_suffix_out = 'merged.bed.annotated.CDS.bed'
### full_peak_suffix_out = 'merged.bed.annotated.3UTR.bed'
### full_peak_suffix_out = 'merged.bed.annotated.5UTR.bed'
### full_peak_suffix_out = 'merged.bed.annotated.intron.bed'
### full_peak_suffix_out = 'merged.bed.annotated.noncoding_intron.bed'
### full_peak_suffix_out = 'merged.bed.annotated.noncoding_exon.bed'

# Chromosome sizes file, passed to `bedtools shuffle -g` further down.
genome = '/projects/ps-yeolab/genomes/hg19/hg19.chrom.sizes'

# Region name -> BED file with that region's intervals (used as `bedtools shuffle -incl`).
# NOTE: 'noncoding_intron' points at the same introns BED as 'intron', and
# 'noncoding_exon' at the general exons BED.
regions_bedfiles = {
    'CDS':'/home/bay001/projects/encode/analysis/conservation_analysis/region_bedfiles/hg19_v19_cds.bed',
    '3UTR':'/home/bay001/projects/encode/analysis/conservation_analysis/region_bedfiles/hg19_v19_three_prime_utrs.bed',
    '5UTR':'/home/bay001/projects/encode/analysis/conservation_analysis/region_bedfiles/hg19_v19_five_prime_utrs.bed',
    'intron':'/home/bay001/projects/encode/analysis/conservation_analysis/region_bedfiles/hg19_v19_introns.bed',
    'noncoding_intron':'/home/bay001/projects/encode/analysis/conservation_analysis/region_bedfiles/hg19_v19_introns.bed',
    'noncoding_exon':'/home/bay001/projects/encode/analysis/conservation_analysis/region_bedfiles/hg19_v19_exons.bed'
}


# dff = 'full peak' dataframe
# dfc = 'center peak' dataframe
# dfm = 'merged full + center' annotated dataframe
# dfr = 'merged full + center' annotated dataframe split into regions


def merge_and_parse_center_peaks(peak_center_bed_annotated, peak_full_bed):
    """
    Attach the annotation of each peak center to its full peak and write one BED per region.

    peak_center_bed_annotated : path of the annotated center BED
        (chrom, c_start, c_stop, name, score, strand, priority, annotation)
    peak_full_bed : path of the un-annotated full-peak BED
        (chrom, start, stop, name, score, strand)

    For every region found in the center annotations a file is written next to
    peak_full_bed, named by replacing 'merged.bed' with
    'merged.bed.annotated.<region>.bed'. Its columns are the full peak's
    chrom/start/stop, the center's priority string (in the name column),
    the center's score and the strand. Returns None.
    """
    dff = pd.read_table(peak_full_bed, names=['chrom','start','stop','name','score','strand'])
    dfc = pd.read_table(
        peak_center_bed_annotated,
        names=['chrom','c_start','c_stop','name','score','strand', 'priority', 'annotation']
    )
    dfc['region'] = dfc.apply(functools.partial(parse_feature, col=4), axis=1)

    # Recompute each full peak's center so it can serve as the join key.
    dff['c_start'] = ((dff['start'] + dff['stop'])/2).astype(int)
    dff['c_stop'] = dff['c_start'] + 1

    # 'name' and 'score' exist on both sides, so pandas suffixes them:
    # *_x comes from the center file, *_y from the full-peak file.
    # NOTE(review): this is a left join from the centers; a center without a
    # matching full peak would get NaN start/stop in the output - confirm that
    # cannot happen for the inputs used here.
    dfm = pd.merge(dfc, dff, how='left', on=['chrom','c_start','c_stop','strand'])

    for region in set(dfm['region']):
        dfr = dfm[dfm['region']==region]
        dfr = dfr[['chrom','start','stop','priority','score_x','strand']]
        dfr.to_csv(
            peak_full_bed.replace('merged.bed','merged.bed.annotated.{}.bed'.format(region)),
            sep='\t', header=False, index=False
        )

# Pair every full-peak BED with its annotated center BED and split it by region.
all_peaks = glob.glob(os.path.join(center_dir, '*{}'.format(full_peak_suffix_in)))
progress = tnrange(len(all_peaks))  # advanced manually, one tick per peak file
for peak in all_peaks:
    peak_center = peak.replace(full_peak_suffix_in, center_peak_suffix_in)
    # Peak files whose annotated center file is missing are skipped silently.
    if os.path.exists(peak) and os.path.exists(peak_center):
        merge_and_parse_center_peaks(
            peak_center, peak
        )
    progress.update(1)

# Generate random bedfiles from these 'full peak' bedfiles
- use the annotations to decide which region to bin each peak into.
- this produces the same number of random peak files for each region (region is defined by the 'center peak' locations, random peak length is defined by the 'full peak' lengths).

In [11]:
### suffix_in = '*merged.bed.annotated.{}.bed'.format(region)
### suffix_out = '*merged.bed.annotated.{}.RAND.bed'.format(region)

# Build one `bedtools shuffle` command per region-specific peak file that has
# no .RAND.bed yet, then submit all of them as an array job.
regions = ['CDS','3UTR','5UTR','intron']
cmds = []
for region in regions:
    suffix_in = '*merged.bed.annotated.{}.bed'.format(region)
    region_peaks = glob.glob(os.path.join(center_dir, suffix_in))
    region_bed = regions_bedfiles[region]
    progress = tnrange(len(region_peaks), desc=region, leave=False)

    for peak_file in region_peaks:
        # Shuffled output sits next to the input: <...>.<region>.RAND.bed
        rand_file = peak_file.replace(".{}.bed".format(region), ".{}.RAND.bed".format(region))
        if not os.path.exists(rand_file):
            cmd = ''.join([
                'bedtools shuffle ',
                '-i {} '.format(peak_file),
                '-g {} '.format(genome),
                '-chrom ',
                '-incl {} '.format(region_bed),
                '> {}'.format(rand_file),
            ])
            cmds.append(cmd)
        progress.update(1)

job_name = 'random_shuffle'
sh = '/home/bay001/projects/encode/analysis/conservation_analysis/bash_scripts/random_shuffle.sh'

Submitter(
    commands=cmds,
    job_name=job_name,
    sh=sh,
    array=True,
    submit=True,
    queue='home-yeo',
    ppn=1,
    walltime='12:00:00'
)

Writing 500 tasks as an array-job.
Wrote commands to /home/bay001/projects/encode/analysis/conservation_analysis/bash_scripts/random_shuffle1.sh.
Submitted script to queue home-yeo.
 Job ID: 8936152
Writing 209 tasks as an array-job.
Wrote commands to /home/bay001/projects/encode/analysis/conservation_analysis/bash_scripts/random_shuffle2.sh.
Submitted script to queue home-yeo.
 Job ID: 8936153


<qtools.submitter.Submitter at 0x2b7d69fedcd0>

          181/|/100%|| 181/181 [00:15<00:00, 559.02it/s]

# Now re-annotate these regions
- we need to do this to get the new boundaries for each region (cannot use the older boundaries as these are new random locations)
- we CANNOT annotate the 'random peak centers' directly: because peaks can span multiple regions, we should not expect every 'centerized' point to fall in the original region.
- use a special priority list that gives first priority to whatever region we expect. So for a '3UTR' list of random-center peaks first priority needs to be given to the 3'UTR
- I want to make this unstranded, because sometimes the annotator will not find anything when the region exists on the opposite strand, causing some random peaks to be called 'intergenic'. Right now I don't care about strandedness (these are random locations, after all); I only care that all of these peaks are annotated as 'intronic/CDS/UTR/etc.' and that I can get the proper boundaries for each region.

In [12]:
# Build one annotate.py command per shuffled (RAND) peak file that has no
# annotated output yet, using the region-specific priority list.
suffix_in = 'RAND.bed'
suffix_out = 'RAND.bed.annotated'

regions = ['CDS','3UTR','5UTR','intron']
# This is just the same priority list for every region, except that the respective region is pushed to the top.
priority_dir = '/home/bay001/projects/encode/analysis/conservation_analysis/priorty_lists/'
cmds = []
for region in regions:
    priority = os.path.join(priority_dir, '{}priority.txt'.format(region))
    center_beds = glob.glob(os.path.join(center_dir, "*{}.{}".format(region, suffix_in)))

    for bed in center_beds:
        annotated = bed.replace(suffix_in, suffix_out)
        if os.path.exists(annotated):
            continue  # already annotated, nothing to queue
        cmd = ''.join([
            'python /home/bay001/projects/codebase/annotator/annotate.py ',
            '--gtfdb /projects/ps-yeolab/genomes/hg19/gencode_v19/gencode.v19.annotation.gtf.db ',
            '--input {} '.format(bed),
            '--output {} '.format(annotated),
            '--transcript-priority-file {} '.format(priority),
            '--gene-priority-file {} '.format(priority),
            '--unstranded',
        ])
        cmds.append(cmd)

job_name = 'annotate_rand_peaks'
sh = '/home/bay001/projects/encode/analysis/conservation_analysis/bash_scripts/annotate_rand_peaks.sh'

# Called with submit=False here.
Submitter(
    commands=cmds,
    job_name=job_name,
    sh=sh,
    array=True,
    submit=False,
    queue='home-yeo',
    ppn=8,
    walltime='4:00:00'
)

Writing 461 tasks as an array-job.
Wrote commands to /home/bay001/projects/encode/analysis/conservation_analysis/bash_scripts/annotate_rand_peaks.sh.


<qtools.submitter.Submitter at 0x2b7d6a387690>

# Generate the 'center-bed' of these RAND regions
- basically make the 'center.RAND' bed files
- since the full peaks are guaranteed to be completely contained within each region, the 'centers' should also be completely contained.

In [None]:
# Annotated random peaks in, single-base 'center' versions of them out
# (run_wrapper replaces suffix_in with suffix_out in each path).
suffix_in = 'RAND.bed.annotated'
suffix_out = 'RAND.center.bed.annotated'

def get_midpoint_and_create_bed(fin, fout):
    """
    Collapse every interval of an annotated (8-column) BED file onto a single base at its midpoint.

    NOTE: this redefines the 6-column get_midpoint_and_create_bed from the
    earlier cell; here the input also carries 'priority' and 'annotation',
    and the 'priority' string is written into the name column of the output.

    fin : path of the tab-separated, header-less annotated BED
          (chrom, start, stop, name, score, strand, priority, annotation)
    fout : path of the 6-column BED to write
           (chrom, midpoint, midpoint+1, priority, score, strand)
    """
    annotated_columns = [
        'chrom', 'start', 'stop', 'name', 'score', 'strand', 'priority', 'annotation',
    ]
    intervals = pd.read_table(fin, sep='\t', names=annotated_columns)
    # The fractional part of the mean is truncated by the integer cast.
    midpoints = ((intervals['start'] + intervals['stop'])/2).astype(int)
    intervals['mid'] = midpoints
    intervals['mid1'] = midpoints + 1
    centered = intervals[['chrom', 'mid', 'mid1', 'priority', 'score', 'strand']]
    centered.to_csv(fout, sep='\t', header=False, index=False)

    
def run_wrapper(center_dir):
    """
    Write a midpoint ('center') file for every file in center_dir whose name ends in suffix_in.

    Wrapped in a function because this step only needs to run once; call it when needed.
    The number of matching files is printed before processing starts.

    center_dir : directory that is searched for the input files
    """
    pattern = os.path.join(center_dir, "*{}".format(suffix_in))
    matches = glob.glob(pattern)
    print("found {} beds".format(len(matches)))
    bar = tnrange(len(matches))  # advanced manually, one tick per file
    for src in matches:
        get_midpoint_and_create_bed(src, src.replace(suffix_in, suffix_out))
        bar.update(1)
# One-off driver: build the center file for every annotated RAND file in center_dir.
run_wrapper(center_dir)

# Then create twobed files from these center bed files

In [None]:
# Center files of the random peaks in, twobed files out
# (the driver loop replaces suffix_in with suffix_out in each path).
suffix_in = '.RAND.center.bed.annotated'
suffix_out = '.RAND.center.bed.annotated.twobed'

def parse_feature(row, col):
    """
    Return one ':'-separated field of a row's 'name' annotation string.

    NOTE: this redefines the earlier parse_feature, which reads the 'priority'
    entry instead; once this cell has run, the earlier cells will pick up
    this version if they are re-run.

    The annotation string looks like:
        transcript_id:txstart:txend:strand:region:gene_id:gene_name:coding/noncoding:overlap

    row : mapping/Series with a 'name' entry
    col : int
        index of the field to return

    Rows whose 'name' contains 'INTERGENIC' carry no other annotation;
    the literal 'INTERGENIC' is returned for them.
    """
    annotation = row['name']
    if 'INTERGENIC' in annotation:
        return 'INTERGENIC'
    return annotation.split(':')[col]

def truncate_on_boundaries(row):
    """
    Keep a position inside its region, falling back to the region's midpoint.

    row : mapping/Series with 'start', 'stop', 'start_boundary', 'stop_boundary'
          (the boundaries may be strings; they are converted with int())

    The boundaries act as hard cutoffs: if 'start' is at or below the start
    boundary, or 'stop' is at or above the stop boundary, the midpoint of the
    two boundaries is returned, which is guaranteed to be inside. Otherwise
    'start' is returned unchanged.
    """
    position = row['start']
    lower = int(row['start_boundary'])
    upper = int(row['stop_boundary'])
    outside = position <= lower or row['stop'] >= upper
    if outside:
        return int((lower + upper)/2)
    return position

def annotated_rand_to_twobed(annotated, twobed_out):
    """
    Writes a 'twobed' file using the annotated 'center peak' file created before.

    annotated : path of the 6-column center file of the random peaks
                (chrom, start, stop, name, f, strand), where 'name' holds the
                annotation string that parse_feature splits
    twobed_out : path of the 12-column twobed file to write

    * bedtools shuffle created non-completely-overlapping regions that only
    partially cover the -incl CDS/intron/UTR regions and are NOT strand specific
    (ie.
        [chr16   22111599        22111697] was created using the bedtools command:

        wd=/home/bay001/projects/encode/analysis/conservation_analysis/idr_peak_centers

        bedtools shuffle
            -i ${wd}/idr_peak_centers/693.01v02.IDR.out.0102merged.bed.annotated.CDS.bed
            -g /projects/ps-yeolab/genomes/hg19/hg19.chrom.sizes
            -chrom
            -incl ${wd}/region_bedfiles/hg19_v19_cds.bed
            > ${wd}/idr_peak_centers/693.01v02.IDR.out.0102merged.bed.annotated.CDS.RAND.bed

        where chr16:22111599-22111697 is not completely contained within any of
        the cds.bed intervals.
    )

    *** THEREFORE the center is forced back inside the annotated region: any
    center on or outside the region boundaries is replaced by the midpoint of
    the boundaries (see truncate_on_boundaries).

    A ValueError while reading or converting (e.g. a non-numeric boundary) is
    printed together with the file name and no output is written for that file.
    """
    try:
        df = pd.read_table(
            annotated,
            names=['chrom','start','stop','name','f','strand']
        )

        df['start_boundary'] = df.apply(functools.partial(parse_feature, col=1), axis=1)
        df['stop_boundary'] = df.apply(functools.partial(parse_feature, col=2), axis=1)
        df['region'] = df.apply(functools.partial(parse_feature, col=4), axis=1)

        # Both halves use the center's 'start' as the shared coordinate.
        # Take an explicit copy so the column assignments below operate on an
        # independent frame instead of a slice of df (SettingWithCopyWarning).
        twobed = df[[
            'chrom','start_boundary','start','region','f','strand',
            'chrom','start','stop_boundary','region','f','strand'
        ]].copy()
        twobed.columns = [ # rename just the start -> stop column
            'chrom','start_boundary','start','region','f','strand',
            'chrom','stop','stop_boundary','region','f','strand'
        ]
        # 'start' and 'stop' hold the same position at this point, so one
        # row-wise truncation is enough; a second pass over the updated frame
        # would return the value computed here in every case.
        twobed['start'] = twobed.apply(truncate_on_boundaries, axis=1)
        twobed['stop'] = twobed['start']

        twobed.to_csv(twobed_out, sep='\t', index=False, header=False)
    except ValueError as e:
        print(annotated, e)
# for all annotated center files, create a twobed file.

# Every file in center_dir ending in suffix_in is converted; the output path is
# the input path with suffix_in replaced by suffix_out.
all_annotated = glob.glob(os.path.join(center_dir, '*{}'.format(suffix_in)))
progress = tnrange(len(all_annotated))  # advanced manually, one tick per file

for annotated in all_annotated:
    twobed = annotated.replace(suffix_in, suffix_out)
    annotated_rand_to_twobed(annotated, twobed)
    progress.update(1)

# Generate matrices for each map
- now we have: 1) a 'centerized' bedfile annotated about the feature that contains it, and 2) a random 'centerized' bedfile annotated about its feature that contains it. Both files should contain the same number of peaks, whose original peak sizes should also be the same.
- do 50nt in, 50 nt out from center peak

In [None]:
# phastCons bigWig (hg19, 100-way per the file name); plot_matrix_and_mean passes it to create_matrix.
phastcon = '/projects/ps-yeolab/genomes/hg19/hg19.100way.phastCons.bw'

def create_matrix(twobed, phastcon):
    """
    Build the cleaned phastcon score matrix for one twobed file.

    twobed : path of the twobed file; it is also the key under which the raw matrix is looked up
    phastcon : path handed to ReadDensity.Phastcon

    The map is built with 50 positions upstream and 50 downstream and a
    minimum density threshold of 0. Returns the result of
    normalization_functions.clean() applied to the raw 'phastcon' matrix.
    # NOTE(review): output_filename is a fixed scratch path shared by every call - confirm nothing relies on it.
    """
    phastcon_map = Map.PhastconMap(
        phastcon=ReadDensity.Phastcon(phastcon),
        annotation={twobed: 'twobeds'},
        output_filename='/home/bay001/projects/encode/analysis/tests/tmp/test.txt',
        upstream_offset=50,
        downstream_offset=50,
        min_density_threshold=0,
    )
    phastcon_map.create_matrices()
    raw_matrix = phastcon_map.raw_matrices['phastcon'][twobed]
    return normalization_functions.clean(raw_matrix)

def plot_matrix_and_mean(real, random, output_file):
    """
    Takes two 'twobed' files, creates the matrices of phastcon scores, and plots the mean.

    real : path of the twobed file of the real peak centers
    random : path of the twobed file of the shuffled (random) peak centers
    output_file : path of the PNG to write; the two matrices are saved next to
                  it as '<...>.real.txt' and '<...>.random.txt'

    Returns 0 when there are fewer than 100 rows (nothing is written), otherwise
    None. When the two matrices differ in row count a warning is printed and
    nothing is written.
    """
    real_matrix = create_matrix(real, phastcon)
    random_matrix = create_matrix(random, phastcon)

    if real_matrix.shape[0] != random_matrix.shape[0]:  # we should have the same number of 'peaks'
        print("warning: event nums don't match: {}, {}".format(
            real, random
        ))
        return None
    if real_matrix.shape[0] < 100:
        return 0

    real_matrix.to_csv(output_file.replace('.png','.real.txt'), sep='\t')
    random_matrix.to_csv(output_file.replace('.png','.random.txt'), sep='\t')

    # Create the figure only when something will be drawn, and always release
    # it afterwards: this function runs once per dataset and pyplot keeps every
    # unclosed figure alive.
    f, (ax1, ax2, ax3) = plt.subplots(3, 1, figsize=(20,20))
    try:
        ax1.plot(real_matrix.mean(), label=os.path.basename(real).replace('.twobed',''), color='blue')
        ax1.plot(random_matrix.mean(), label=os.path.basename(random).replace('.twobed',''), color='red')

        sns.heatmap(real_matrix, yticklabels=False, xticklabels=False, ax=ax2)
        ax2.set_title('real matrix heatmap')
        sns.heatmap(random_matrix, yticklabels=False, xticklabels=False, ax=ax3)
        ax3.set_title('rand matrix heatmap')
        ax1.legend(loc=1)
        f.savefig(output_file)
    finally:
        plt.close(f)
    return None

In [None]:
# Plot real vs. random conservation for every dataset (uid) and region that has
# exactly one real and one random center twobed file.
output_dir = '/home/bay001/projects/encode/analysis/conservation_analysis/conservation_plots/7-3-17'
err = 0  # number of times we didn't find both real and random paired center twobed files

for region in ['CDS','3UTR','5UTR','intron']:
    all_region_beds = glob.glob(os.path.join(center_dir, "*.{}.*twobed".format(region)))

    # Group the files by uid (the text before the first '.' of the file name).
    # Grouping on the exact uid avoids a second, prefix-based glob, which
    # would also match files of any longer uid that starts with this one.
    beds_by_uid = {}
    for bed in all_region_beds:
        beds_by_uid.setdefault(os.path.basename(bed).split('.')[0], []).append(bed)
    all_region_uids = list(beds_by_uid)

    progress = tnrange(len(all_region_uids), desc=region, leave=True)
    for uid in all_region_uids:
        real_and_rand_pair = beds_by_uid[uid]
        # Sort the files into real and random by name.
        random = ''
        real = ''
        for p in real_and_rand_pair:
            if 'RAND.center.bed' in p:
                random = p
            else:
                real = p
        if len(real_and_rand_pair) == 2 and real and random:  # found both real and random center twobed files
            output_file = os.path.join(output_dir, os.path.basename(real).replace('.twobed','.png'))
            # Existing plots are kept; only missing ones are generated.
            if not os.path.exists(output_file):
                plot_matrix_and_mean(real, random, output_file)
        else:
            err += 1
        progress.update(1)
print(err)

In [None]:
# warning: event nums don't match: /home/bay001/projects/encode/analysis/conservation_analysis/idr_peak_centers/350.01v02.IDR.out.0102merged.center.bed.annotated.intron.twobed, /home/bay001/projects/encode/analysis/conservation_analysis/idr_peak_centers/350.01v02.IDR.out.0102merged.bed.annotated.intron.RAND.center.bed.annotated.twobed
