In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
import glob
import os
import pandas as pd
import numpy as np
import os
import pybedtools
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
from qtools import Submitter
from tqdm import tnrange, tqdm_notebook

In [6]:
# Directory that is globbed for the 'peaks_merged*.bed' window files.
input_dir = '/home/bay001/'
# Previous project locations, kept for reference (currently disabled):
# input_dir = '/home/bay001/projects/kris_apobec_20200121/permanent_data/final_analysis/01_SAILOR_bulk_rnaseq/outputs/combined_outputs_w_cov_info/pass6/'
# output_dir = '/home/bay001/projects/kris_apobec_20200121/permanent_data/final_analysis/01_SAILOR_bulk_rnaseq/outputs/combined_outputs_w_cov_info/pass6/'
# Directory that receives the binned BED files and the motif-analysis outputs.
output_dir = '/home/bay001/scratch/apobec'
# Directory where the job-submission shell script is written.
bash_script_dir = '/home/bay001/projects/kris_apobec_20200121/permanent_data/final_analysis/01_SAILOR_bulk_rnaseq/bash_scripts/'

In [15]:
# Gather every merged-peak BED file from the input directory, in sorted order,
# then report how many were found and preview the first three paths.
peaks_pattern = os.path.join(input_dir, 'peaks_merged*.bed')
all_windows = glob.glob(peaks_pattern)
all_windows.sort()
print(len(all_windows))
all_windows[:3]

16


['/home/bay001/peaks_merged_0.0.bed',
 '/home/bay001/peaks_merged_0.00949667616334283.bed',
 '/home/bay001/peaks_merged_0.01899335232668566.bed']

# Read in one of them and check on IGV

In [None]:
def choose_bins(df, n_bins):
    """Compute evenly spaced lower bin edges for the 'score' column.

    The top 0.5% of rows (by score) are ignored when picking the upper end of
    the range, so a handful of extreme values cannot stretch the bins.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a numeric 'score' column.
    n_bins : int
        Number of edges/labels to produce (must be >= 1).

    Returns
    -------
    (list, list)
        ``bins``: exactly ``n_bins`` edges starting at 0 and spaced
        ``max_score / n_bins`` apart (``max_score`` itself is excluded).
        ``ticklabels``: the same edges rounded to 4 decimal places.

    Raises
    ------
    ValueError
        If ``n_bins`` is smaller than 1, or if the trimmed maximum score is
        not a finite positive number (e.g. empty or all-zero input), because
        no increasing set of edges can be built in that case.
    """
    if n_bins < 1:
        raise ValueError("n_bins must be at least 1, got {}".format(n_bins))

    # Number of top-scoring rows excluded when choosing the upper bound.
    n_trimmed = int(df.shape[0] * 0.005)

    min_score = 0
    max_score = df['score'].sort_values(ascending=False).iloc[n_trimmed:].max()
    if not np.isfinite(max_score) or max_score <= min_score:
        raise ValueError(
            "Cannot choose bins: trimmed maximum score is {!r}".format(max_score)
        )

    # linspace guarantees exactly n_bins edges; arange with a float step can
    # return one element too many because of floating-point rounding.
    edges = np.linspace(min_score, max_score, n_bins, endpoint=False)
    ticklabels = [round(edge, 4) for edge in edges]
    return list(edges), ticklabels

    
def calculate_stuff(
    window_file, 
    conf, 
    n_bins=10,
    score_column='edited_over_all_c', 
):
    """Load a window file, keep one window per overlapping cluster, and bin by score.

    Steps:
      1. Read the tab-separated ``window_file`` (with a header row) and split
         its 'name' column on '|' into a gene id (first field) and a region
         (second field).
      2. Keep rows whose region is 'CDS' or '3utr' and whose 'score' column is
         greater than ``conf``. Note the filter uses the file's 'score'
         column, not ``score_column``.
      3. Run ``bedtools cluster`` and keep only the highest-scoring window of
         each cluster so the remaining windows do not overlap.
      4. Assign each remaining window to a bin using ``choose_bins``.

    Parameters
    ----------
    window_file : str
        Path to a tab-separated file containing at least the columns
        'chrom', 'start', 'end', 'name', 'score', 'strand' and ``score_column``.
    conf : float
        Exclusive lower bound applied to the file's 'score' column.
    n_bins : int, optional
        Number of bins passed to ``choose_bins``.
    score_column : str, optional
        Column whose values become the 'score' of the returned windows.

    Returns
    -------
    pandas.DataFrame
        Columns 'chrom', 'start', 'end', 'name', 'score', 'strand', 'bin',
        where 'name' holds the gene id and 'score' holds the ``score_column``
        values. 'bin' is a float: the label of the bin the score falls in, or
        1 for scores above the last bin edge.

    Raises
    ------
    AssertionError
        If a strand-aware merge of the selected windows collapses any of them,
        i.e. the selected windows still overlap.
    """
    bed6_columns = ['chrom','start','end','name','score','strand']

    # Filter and subset our main dataframe.
    windows = pd.read_csv(window_file, sep='\t')
    name_fields = windows['name'].str.split('|')
    # Rows whose name has no second '|' field get a missing region and are
    # dropped by the region filter below.
    windows = windows.assign(
        region=name_fields.str[1],
        geneid=name_fields.str[0],
    )
    windows = windows[windows['region'].isin(['CDS', '3utr'])]
    windows = windows[windows['score'] > conf]
    windows = windows.sort_values(by=['chrom','start','end','strand'], ascending=True)
    dfbt = pybedtools.BedTool.from_dataframe(
        windows[['chrom','start','end','geneid',score_column,'strand']]
    )

    # bedtools cluster performs a 'merge' but instead of merging, assigns each merged group with an ID (thickStart).
    # We can ensure non-overlapping windows by sorting by 'score' (highest to lowest) and removing duplicate cluster IDs, keeping the first one only.
    nonoverlapping = (
        dfbt.cluster().to_dataframe()
        .sort_values(by=['score'], ascending=False)
        .drop_duplicates(['thickStart'], keep='first')
        .sort_values(by=['chrom','start','end','strand'])
    )

    # We no longer need the cluster ID. Copy so that adding the 'bin' column
    # below writes to an independent frame rather than a slice.
    nonoverlapping = nonoverlapping[bed6_columns].copy()

    # Sanity check: a strand-aware merge must not collapse any window.
    merged = pybedtools.BedTool.from_dataframe(nonoverlapping).sort().merge(
        s=True, c=(4,5,6), o=('collapse','collapse','distinct')
    ).to_dataframe()
    assert merged.shape[0] == nonoverlapping.shape[0], \
        "selected windows still overlap after cluster de-duplication"

    bins, ticklabels = choose_bins(nonoverlapping, n_bins=n_bins)

    # pd.cut yields a categorical whose categories are the tick labels; convert
    # to float so that the fill value 1 (not one of the labels) is accepted and
    # the column supports numeric comparisons. Scores above the last edge fall
    # outside every interval and are the ones filled with 1.
    binned = pd.cut(nonoverlapping['score'], ([-1] + bins), labels=ticklabels)
    nonoverlapping['bin'] = binned.astype(float).fillna(1)

    return nonoverlapping

In [None]:
# For every window file, write one BED file per score-bin threshold. File
# number i contains all windows whose bin is at or above the i-th smallest bin
# value, sorted by descending score. Existing output files are left untouched.
os.makedirs(output_dir, exist_ok=True)  # to_csv fails if the directory is missing
bed6_columns = ['chrom','start','end','name','score','strand']

# Wrapping the iterable keeps the bar in step with the loop and closes it when
# the loop finishes, instead of updating a detached counter by hand.
progress = tqdm_notebook(all_windows)
for windows in progress:
    motifs_df = calculate_stuff(windows, conf=0.5, score_column='edited_over_all_c')
    window_prefix = os.path.splitext(os.path.basename(windows))[0]
    for i, threshold in enumerate(sorted(set(motifs_df['bin']))):
        output_file = os.path.join(
            output_dir, 
            window_prefix + ".bin{}.bed".format(i)
        )
        if os.path.exists(output_file):
            continue
        # sort_values returns a new frame, so the filtered slice is never
        # modified in place.
        selected = motifs_df[motifs_df['bin']>=threshold].sort_values(
            by='score', ascending=False
        )
        selected[bed6_columns].to_csv(
            output_file, 
            sep='\t', 
            header=False, 
            index=False
        )

# Call motifs

In [None]:
# List every BED file currently in the output directory (sorted) and show the count.
# NOTE(review): the cells visible below build commands from all_windows, not
# from all_bedfiles - confirm which list is intended for motif calling.
bed_pattern = os.path.join(output_dir, '*.bed')
all_bedfiles = glob.glob(bed_pattern)
all_bedfiles.sort()
len(all_bedfiles)

In [16]:
# Build one analyze_motifs shell command per window file. A file is skipped
# when a path named after it (extension removed) already exists in output_dir,
# which is the same path used as the HOMER output directory.
genome_fasta = '/projects/ps-yeolab3/bay001/annotations/hg19/hg19.fa'
cmds = []
for bed in all_windows:
    out_homer_dir = os.path.join(output_dir, os.path.splitext(os.path.basename(bed))[0])
    if os.path.exists(out_homer_dir):
        continue
    out_file = out_homer_dir + '.svg'
    out_pickle_file = out_homer_dir + '.pickle'
    # Adjacent literals are concatenated, so every command keeps the exact
    # option order and the trailing space of the piecewise construction.
    cmd = (
        'module load eclipanalysis/0.0.3a;analyze_motifs '
        '--peaks {} '
        '--out_pickle_file {} '
        '--out_file {} '
        '--species hg19 '
        '--out_homer_dir {} '
        '--genome_fasta {} '
    ).format(bed, out_pickle_file, out_file, out_homer_dir, genome_fasta)
    cmds.append(cmd)

In [17]:
# Display the generated command strings for a visual check before submission.
cmds

['module load eclipanalysis/0.0.3a;analyze_motifs --peaks /home/bay001/peaks_merged_0.0.bed --out_pickle_file /home/bay001/scratch/apobec/peaks_merged_0.0.pickle --out_file /home/bay001/scratch/apobec/peaks_merged_0.0.svg --species hg19 --out_homer_dir /home/bay001/scratch/apobec/peaks_merged_0.0 --genome_fasta /projects/ps-yeolab3/bay001/annotations/hg19/hg19.fa ',
 'module load eclipanalysis/0.0.3a;analyze_motifs --peaks /home/bay001/peaks_merged_0.00949667616334283.bed --out_pickle_file /home/bay001/scratch/apobec/peaks_merged_0.00949667616334283.pickle --out_file /home/bay001/scratch/apobec/peaks_merged_0.00949667616334283.svg --species hg19 --out_homer_dir /home/bay001/scratch/apobec/peaks_merged_0.00949667616334283 --genome_fasta /projects/ps-yeolab3/bay001/annotations/hg19/hg19.fa ',
 'module load eclipanalysis/0.0.3a;analyze_motifs --peaks /home/bay001/peaks_merged_0.01899335232668566.bed --out_pickle_file /home/bay001/scratch/apobec/peaks_merged_0.01899335232668566.pickle --ou

In [18]:
# Hand the command list to qtools' Submitter. According to the recorded output
# of this cell, the commands are written to the given .sh file as an array job
# and the script is submitted to the queue.
# NOTE(review): the job name ('motif_analysis3') and the script file name
# ('motif_analysis5.sh') carry different numeric suffixes - confirm this is intended.
# NOTE(review): with submit=True, re-running this cell presumably submits the
# job again - confirm before re-executing.
Submitter(
    cmds, 
    'motif_analysis3', 
    sh=os.path.join(bash_script_dir, 'motif_analysis5.sh'), 
    nodes=1, 
    ppn=1, 
    array=True, 
    submit=True, 
    walltime='40:00:00'
)

Writing 16 tasks as an array-job.
Wrote commands to /home/bay001/projects/kris_apobec_20200121/permanent_data/final_analysis/01_SAILOR_bulk_rnaseq/bash_scripts/motif_analysis5.sh.
Submitted script to queue home.
 Job ID: 21186489


<qtools.submitter.Submitter at 0x2ba84b4d6350>