# Takes in STAMP "peaks" (windows_51.txt in the paper) and generates the motif analysis commands. Also applies two filters (an escore cutoff, implemented here as the `edited_over_all_c` / editC threshold, and a re-calculation of the confidence score across the entire window) and performs motif calling on each of these filtered lists.

In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
import glob
import os
import pandas as pd
import numpy as np
import os
import pybedtools
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
from qtools import Submitter
from tqdm import tnrange, tqdm_notebook

from scipy.special import betainc

In [2]:
# Directory holding the *.annotated.windows_51.txt window files read by this notebook.
input_dir = '/home/bay001/projects/kris_apobec_20200121/permanent_data/final_analysis/01_SAILOR_bulk_rnaseq/window_outputs/'
# Directory where the filtered BED files, diagnostic SVGs and motif-analysis outputs are written.
output_dir = '/home/bay001/projects/kris_apobec_20200121/permanent_data/final_analysis/figure_2/outputs/motifs/'
# Directory where the generated job script (motif_analysis.sh) is written.
bash_script_dir = '/home/bay001/projects/kris_apobec_20200121/permanent_data/final_analysis/01_SAILOR_bulk_rnaseq/bash_scripts/'

In [3]:
# Genome FASTA handed to analyze_motifs through --genome_fasta.
genome_fasta = '/projects/ps-yeolab3/bay001/annotations/hg19/hg19.fa'
# Minimum edited_over_all_c a window needs to survive the editC filter.
editc_threshold = 0.0
# Passed as cov_margin when recomputing window confidence; also embedded in output file names.
e = 0.001
# Minimum recomputed confidence a window needs to survive the conf filter.
conf_threshold = 0.99

# Accumulates the motif-analysis shell commands that are later handed to Submitter.
# NOTE(review): re-running later cells without re-running this one appends duplicates.
cmds = []

In [4]:
# Single window file used for the worked example below.
txt_file = os.path.join(
    input_dir, 
    'SLBP_1000_72hr_merged_a0_b0_e0.01.annotated.windows_51.txt'
)
# Output BED for the editC filter: <input basename without extension>.editc_<threshold>.bed
output_editc_filtered_bed = os.path.join(
    output_dir, 
    '{}.editc_{}.bed'.format(os.path.splitext(os.path.basename(txt_file))[0], editc_threshold)
)
# Output BED for the confidence filter: <input basename without extension>.e<e>_conf<threshold>.bed
output_conf_filtered_bed = os.path.join(
    output_dir, 
    '{}.e{}_conf{}.bed'.format(os.path.splitext(os.path.basename(txt_file))[0], e, conf_threshold)
)

In [5]:
# Fail early if the expected window file is missing.
assert os.path.exists(txt_file)

# Preview the first lines of the window file (IPython shell escape; $txt_file expands the Python variable).
! head $txt_file

chrom	start	end	name	score	strand	edit_coverage	editable_coverage	edited_over_edited_c	all_c_coverage	edited_over_all_c
chr1	14480	14531	ENSG00000227232.4|noncoding_exon	0.739700373	-	9	102	0.08823529411764706	452	0.01991150442477876
chr1	14487	14538	ENSG00000227232.4|noncoding_exon	0.9997464509999999	-	9	102	0.08823529411764706	504	0.017857142857142856
chr1	14496	14547	ENSG00000227232.4|noncoding_exon	0.999437382	-	9	102	0.08823529411764706	581	0.01549053356282272
chr1	17690	17741	ENSG00000227232.4|noncoding_exon	0.99999999	-	4	5	0.8	42	0.09523809523809523
chr1	18539	18590	ENSG00000227232.4|proxnoncoding_intron500	0.851457771	-	1	17	0.058823529411764705	306	0.0032679738562091504
chr1	18617	18668	ENSG00000227232.4|proxnoncoding_intron500	0.999734695	-	3	14	0.21428571428571427	147	0.02040816326530612
chr1	20098	20149	ENSG00000227232.4|distnoncoding_intron500	0.80163059	-	5	44	0.11363636363636363	372	0.013440860215053764
chr1	20118	20169	ENSG00000227232.4|distnoncoding_intron500	0.999957

# Filter windows on a minimum editC (`edited_over_all_c`) value; the cutoff is chosen from the plot.

In [6]:
def write_bedfile_from_txt_file(txt_file, editc_threshold, output_bed):
    """
    Filter a tab-separated window file on edited_over_all_c and save the
    surviving windows as a sorted, strand-aware merged BED file.

    :param txt_file: path to the tab-separated window file (must contain
        chrom, start, end, name, edited_over_all_c and strand columns).
    :param editc_threshold: windows with edited_over_all_c below this
        value are dropped.
    :param output_bed: path the merged BED file is saved to.
    :return: 0 on success; 1 if any exception was raised (the exception
        is printed rather than propagated).
    """
    bed_columns = ['chrom','start','end','name','edited_over_all_c','strand']
    try:
        windows = pd.read_csv(txt_file, sep='\t')
        n_before = windows.shape[0]

        kept = windows[windows['edited_over_all_c']>=editc_threshold]
        n_after = kept.shape[0]
        print("Filtering sites: {} -> {}".format(n_before, n_after))

        # Merge overlapping same-strand windows: names are collapsed,
        # scores averaged, and the strand column kept as its distinct value.
        merged = (
            pybedtools.BedTool.from_dataframe(kept[bed_columns])
            .sort()
            .merge(s=True, c=(4,5,6), o=('collapse','mean','distinct'))
        )
        merged.saveas(output_bed)
    except Exception as err:
        print(err)
        return 1
    return 0

# Write the editC-filtered, merged BED for txt_file; the return code (0 = success, 1 = error) is shown as the cell output.
write_bedfile_from_txt_file(txt_file, editc_threshold, output_editc_filtered_bed)  # 28189 for RBFOX2-1000 (NOTE(review): refers to a different sample than txt_file here; confirm or remove)

Filtering sites: 95030 -> 95030


0

In [7]:
def write_motif_analysis_cmd(bed_file, genome_fasta=genome_fasta):
    """
    Build the shell command that runs analyze_motifs on a BED file.

    All outputs share one prefix: output_dir joined with the BED file's
    basename minus its extension. The prefix itself is the HOMER output
    directory; prefix + '.svg' and prefix + '.pickle' are the figure and
    pickle outputs.

    :param bed_file: path to the BED file passed as --peaks.
    :param genome_fasta: genome FASTA passed as --genome_fasta.
    :return: the command string, or None if the .svg output already exists.
    """
    prefix = os.path.join(
        output_dir,
        os.path.splitext(os.path.basename(bed_file))[0]
    )
    svg_path = prefix + '.svg'

    # Skip work that has already produced its figure.
    if os.path.exists(svg_path):
        return None

    pieces = [
        'module load eclipanalysis/0.0.3a;analyze_motifs ',
        '--peaks {} '.format(bed_file),
        '--out_pickle_file {} '.format(prefix + '.pickle'),
        '--out_file {} '.format(svg_path),
        '--species hg19 ',
        '--out_homer_dir {} '.format(prefix),
        '--genome_fasta {} '.format(genome_fasta),
    ]
    return ''.join(pieces)
    
# Queue the motif analysis for the editC-filtered BED. write_motif_analysis_cmd
# returns None when the .svg output already exists, in which case nothing is queued.
cmd = write_motif_analysis_cmd(output_editc_filtered_bed)
if cmd is not None:
    cmds.append(cmd)

# What if we try filtering based on conf values? We can grab all txt files now if we want.

In [8]:
# Collect every SRSF* window file in input_dir (sorted for a stable order),
# report how many were found, and preview the first three paths.
all_txt_files = sorted(glob.glob(os.path.join(input_dir, 'SRSF*.annotated.windows_51.txt')))
print(len(all_txt_files))
all_txt_files[:3]

5


['/home/bay001/projects/kris_apobec_20200121/permanent_data/final_analysis/01_SAILOR_bulk_rnaseq/window_outputs/SRSF1_13_S21_L002_R1_001.fastqTr.sorted.STARUnmapped.out.sorted.STARAligned.out.sorted_a0_b0_e0.01.annotated.windows_51.txt',
 '/home/bay001/projects/kris_apobec_20200121/permanent_data/final_analysis/01_SAILOR_bulk_rnaseq/window_outputs/SRSF1_14_S22_L002_R1_001.fastqTr.sorted.STARUnmapped.out.sorted.STARAligned.out.sorted_a0_b0_e0.01.annotated.windows_51.txt',
 '/home/bay001/projects/kris_apobec_20200121/permanent_data/final_analysis/01_SAILOR_bulk_rnaseq/window_outputs/SRSF1_15_S23_L002_R1_001.fastqTr.sorted.STARUnmapped.out.sorted.STARAligned.out.sorted_a0_b0_e0.01.annotated.windows_51.txt']

In [9]:
def compute_confidence(row, cov_margin):
    """
    Taken from rank_edits.py from SAILOR

    Compute a window's confidence as ``1 - betainc(G, A, cov_margin)``, where
    G is the window's 'edit_coverage' and A its 'all_c_coverage' (each plus a
    pseudocount that is currently 0).

    :param row: mapping-like object (e.g. a pandas Series) providing
        'edit_coverage' and 'all_c_coverage'.
    :param cov_margin: point at which the regularized incomplete beta
        function is evaluated.
    :return: 0 when G + A is zero; otherwise 1 - betainc(G, A, cov_margin).
        The result can still be NaN if betainc returns NaN for the given
        inputs; callers are expected to handle that.
    """
    # Default, don't use pseudocounts for now.
    alfa = 0
    beta = 0

    # calc smoothed counts
    G = row['edit_coverage'] + alfa
    # NOTE(review): the second Beta shape parameter is the full
    # all_c_coverage value, not all_c_coverage minus edit_coverage. Confirm
    # this matches the intended SAILOR formulation.
    A = row['all_c_coverage'] + beta

    # No coverage at all: report zero confidence. An explicit test is used
    # instead of catching ZeroDivisionError, because NumPy integer scalars do
    # not raise on division by zero and would otherwise fall through to
    # betainc and yield NaN.
    if G + A == 0:
        return 0

    ########  MOST IMPORTANT LINE  ########
    # betainc(G, A, x) is the Beta(G, A) CDF at x, so this is
    # P( theta > cov_margin | A, G ) with theta ~ Beta(G, A)
    confidence = 1 - betainc(G, A, cov_margin)
    return confidence


def read_and_filter_conf(rep_file, cov_margin, conf, output_file):
    """
    Reads and computes conf across a window. Saves and returns both the
    unfiltered and filtered dataframe for downstream analysis. 
    Saves the filtered output to output_file.

    Side effects: writes three files that share output_file's name minus
    its extension: '<prefix>.editc_dist.svg' (edited_over_all_c
    distribution of all windows), output_file itself (sorted, strand-aware
    merged BED of the windows passing the filter) and
    '<prefix>.peak_lengths.svg' (length distribution of the merged
    intervals). Also prints a one-line before/after count.

    :param rep_file: path to the tab-separated window file.
    :param cov_margin: forwarded to compute_confidence.
    :param conf: minimum confidence a window needs to be kept.
    :param output_file: path of the merged BED file to write.
    :return: (rep, rep_filtered): the full frame with an added 'conf'
        column, and the subset with conf >= ``conf`` (before merging).

    NOTE(review): behaviour when no window passes the filter is untested;
    the merge / length-plot steps may fail on an empty BED.
    """
    rep = pd.read_csv(rep_file, sep='\t')
    out_prefix = os.path.splitext(output_file)[0]

    plot_editc_distribution(
        rep=rep,
        output_svg=out_prefix + '.editc_dist.svg'
    )

    rep['conf'] = rep.apply(compute_confidence, args=(cov_margin,), axis=1)
    # Assign the filled column back instead of calling fillna(inplace=True) on
    # the rep['conf'] selection: the chained in-place form is deprecated and
    # does not update the frame once pandas copy-on-write is active.
    rep['conf'] = rep['conf'].fillna(0)

    rep_filtered = rep[rep['conf'] >= conf]

    # Merge overlapping same-strand windows: names are collapsed, scores
    # averaged, and the strand column kept as its distinct value.
    bedtool = pybedtools.BedTool.from_dataframe(
        rep_filtered[['chrom','start','end','name','edited_over_all_c','strand']]
    )
    bedtool = bedtool.sort().merge(s=True, c=(4,5,6), o=('collapse','mean','distinct'))
    bedtool.saveas(output_file)

    # Plot the distribution of "peak" lengths
    _filtered = bedtool.to_dataframe()
    _filtered['length'] = _filtered['end'] - _filtered['start']
    plot_length_distribution(
        filtered_list=_filtered,
        output_svg=out_prefix + '.peak_lengths.svg'
    )

    # Counts are window counts before merging.
    print("Filtered {} {} -> {}".format(os.path.basename(rep_file), rep.shape[0], rep_filtered.shape[0]))

    return rep, rep_filtered

def plot_editc_distribution(rep, output_svg):
    """
    Draw the distribution of the 'edited_over_all_c' column of ``rep``
    (50 bins) and save the figure to ``output_svg``. The figure is closed
    afterwards so it is not rendered inline or kept in memory.
    """
    figure, axis = plt.subplots()
    sns.distplot(rep['edited_over_all_c'], bins=50, ax=axis)
    figure.savefig(output_svg)
    # The figure just created is the current one, so this closes the same figure.
    plt.close(figure)
    
def plot_length_distribution(filtered_list, output_svg):
    """
    Draw the distribution of the 'length' column of ``filtered_list``
    (50 bins) and save the figure to ``output_svg``. The figure is closed
    afterwards so it is not rendered inline or kept in memory.
    """
    figure, axis = plt.subplots()
    sns.distplot(filtered_list['length'], bins=50, ax=axis)
    figure.savefig(output_svg)
    # The figure just created is the current one, so this closes the same figure.
    plt.close(figure)
    

In [10]:
# Recompute window confidences for the single txt_file, keep windows with
# conf >= conf_threshold, and write the merged BED to output_conf_filtered_bed.
# NOTE(review): no motif-analysis command is queued for this BED in this cell; confirm that is intended.
rep, filtered = read_and_filter_conf(
    txt_file, 
    cov_margin=e, 
    conf=conf_threshold, 
    output_file=output_conf_filtered_bed
)

# Preview the merged, confidence-filtered BED (IPython shell escape).
! head $output_conf_filtered_bed

Filtered SLBP_1000_72hr_merged_a0_b0_e0.01.annotated.windows_51.txt 95030 -> 4499
chr1	14480	14547	ENSG00000227232.4|noncoding_exon,ENSG00000227232.4|noncoding_exon,ENSG00000227232.4|noncoding_exon	0.01775306028	-
chr1	17690	17741	ENSG00000227232.4|noncoding_exon	0.09523809524	-
chr1	18617	18668	ENSG00000227232.4|proxnoncoding_intron500	0.02040816327	-
chr1	20098	20169	ENSG00000227232.4|distnoncoding_intron500,ENSG00000227232.4|distnoncoding_intron500	0.01384293723	-
chr1	146460	146513	ENSG00000241860.2|noncoding_exon,ENSG00000241860.2|noncoding_exon	0.01914971596	-
chr1	158559	158610	ENSG00000241860.2|distnoncoding_intron500	0.07317073171	-
chr1	327477	327544	ENSG00000250575.1|noncoding_exon,ENSG00000250575.1|noncoding_exon	0.022062657	+
chr1	564520	564594	ENSG00000225972.1|noncoding_exon,ENSG00000225972.1|noncoding_exon,ENSG00000225972.1|noncoding_exon	0.02541942078	+
chr1	567165	567220	ENSG00000237973.1|noncoding_exon,ENSG00000237973.1|noncoding_exon	0.01961451247	+
chr1	569848	5698

In [11]:
# Apply the confidence filter to every file in all_txt_files and queue a motif
# analysis for each resulting BED.
# Note: the loop rebinds the module-level names txt_file, output_conf_filtered_bed,
# rep and filtered, so after this cell they refer to the last file processed.
for txt_file in all_txt_files:
    # Same naming scheme as the single-file example: <basename>.e<e>_conf<threshold>.bed
    output_conf_filtered_bed = os.path.join(
        output_dir, 
        '{}.e{}_conf{}.bed'.format(os.path.splitext(os.path.basename(txt_file))[0], e, conf_threshold)
    )
    rep, filtered = read_and_filter_conf(
        txt_file, 
        cov_margin=e, 
        conf=conf_threshold, 
        output_file=output_conf_filtered_bed
    )
    # write_motif_analysis_cmd returns None when the .svg output already exists.
    cmd = write_motif_analysis_cmd(bed_file=output_conf_filtered_bed)
    if cmd is not None:
        cmds.append(cmd)
        
# RBFOX2-50 172150 -> 76889  (NOTE(review): refers to a sample not among the SRSF* files processed here; confirm or remove)

Filtered SRSF1_13_S21_L002_R1_001.fastqTr.sorted.STARUnmapped.out.sorted.STARAligned.out.sorted_a0_b0_e0.01.annotated.windows_51.txt 15753 -> 1294
Filtered SRSF1_14_S22_L002_R1_001.fastqTr.sorted.STARUnmapped.out.sorted.STARAligned.out.sorted_a0_b0_e0.01.annotated.windows_51.txt 16524 -> 1586
Filtered SRSF1_15_S23_L002_R1_001.fastqTr.sorted.STARUnmapped.out.sorted.STARAligned.out.sorted_a0_b0_e0.01.annotated.windows_51.txt 25494 -> 4363
Filtered SRSF1_16_S24_L002_R1_001.fastqTr.sorted.STARUnmapped.out.sorted.STARAligned.out.sorted_a0_b0_e0.01.annotated.windows_51.txt 23024 -> 4026
Filtered SRSF1_S43_merged.fastqTr.sorted.STARUnmapped.out.sorted.STARAligned.out.sorted_a0_b0_e0.01.annotated.windows_51.txt 21034 -> 1994


In [12]:
# Hand the accumulated commands to qtools' Submitter, which writes them to
# motif_analysis.sh in bash_script_dir. With submit=True the job is submitted
# as a side effect of running this cell (see the captured output below).
# presumably array=True yields one array task per command; verify against qtools.
Submitter(
    cmds, 
    'motif_analysis', 
    sh=os.path.join(bash_script_dir, 'motif_analysis.sh'), 
    nodes=1, 
    ppn=1, 
    array=True, 
    submit=True, 
    walltime='40:00:00'
)

Writing 6 tasks as an array-job.
Wrote commands to /home/bay001/projects/kris_apobec_20200121/permanent_data/final_analysis/01_SAILOR_bulk_rnaseq/bash_scripts/motif_analysis.sh.
Submitted script to queue home.
 Job ID: 21705797


<qtools.submitter.Submitter at 0x2ad932316d50>

In [13]:
# Display the commands that were handed to Submitter, for the record.
cmds

['module load eclipanalysis/0.0.3a;analyze_motifs --peaks /home/bay001/projects/kris_apobec_20200121/permanent_data/final_analysis/figure_2/outputs/motifs/SLBP_1000_72hr_merged_a0_b0_e0.01.annotated.windows_51.editc_0.0.bed --out_pickle_file /home/bay001/projects/kris_apobec_20200121/permanent_data/final_analysis/figure_2/outputs/motifs/SLBP_1000_72hr_merged_a0_b0_e0.01.annotated.windows_51.editc_0.0.pickle --out_file /home/bay001/projects/kris_apobec_20200121/permanent_data/final_analysis/figure_2/outputs/motifs/SLBP_1000_72hr_merged_a0_b0_e0.01.annotated.windows_51.editc_0.0.svg --species hg19 --out_homer_dir /home/bay001/projects/kris_apobec_20200121/permanent_data/final_analysis/figure_2/outputs/motifs/SLBP_1000_72hr_merged_a0_b0_e0.01.annotated.windows_51.editc_0.0 --genome_fasta /projects/ps-yeolab3/bay001/annotations/hg19/hg19.fa ',
 'module load eclipanalysis/0.0.3a;analyze_motifs --peaks /home/bay001/projects/kris_apobec_20200121/permanent_data/final_analysis/figure_2/outputs/