In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
import glob
import os
import pandas as pd
import numpy as np
import os
import pybedtools
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
from qtools import Submitter
from tqdm import tnrange, tqdm_notebook

from scipy.special import betainc

In [2]:
# Directory holding the per-window edit tables (*.txt) that are globbed below.
input_dir = '/home/bay001/projects/kris_apobec_20200121/permanent_data2/07_scRNA_groups/window_outputs_deep'
# Destination for the filtered BED files, diagnostic SVGs and motif-analysis outputs.
output_dir = '/home/bay001/projects/kris_apobec_20200121/permanent_data2/07_scRNA_groups/motifs_deep'
# Directory where the job script handed to Submitter is written.
bash_script_dir = '/home/bay001/projects/kris_apobec_20200121/permanent_data2/07_scRNA_groups/bash_scripts'

In [3]:
# Genome FASTA passed to analyze_motifs via --genome_fasta.
genome_fasta = '/projects/ps-yeolab3/bay001/annotations/hg19/hg19.fa'
# Minimum edited_over_all_c cutoff; only referenced by the commented-out
# write_bedfile_from_txt_file call.
editc_threshold = 0.0
# Passed as cov_margin to the confidence calculation (the x argument of betainc).
e = 0.001
# Rows whose computed conf is below this value are filtered out.
conf_threshold = 0.99

# Accumulates the analyze_motifs command lines that are submitted further down.
cmds = []

# Filter on an editC cutoff chosen from the plot. *unused code*
- Let's instead use a more uniform way of choosing a cutoff: how about we re-calculate the "conf" scores per window instead of per site?

In [4]:
def write_bedfile_from_txt_file(txt_file, editc_threshold, output_bed):
    """
    Filter a tab-separated table on edited_over_all_c and save it as a merged BED.

    Rows with edited_over_all_c >= editc_threshold are kept, converted to a
    BedTool, sorted, merged with s=True (names collapsed, scores averaged,
    strands made distinct) and written to output_bed.

    Returns 0 on success. Any exception is printed and reported as 1
    instead of being raised.
    """
    status = 1
    bed_columns = ['chrom','start','end','name','edited_over_all_c','strand']
    try:
        sites = pd.read_csv(txt_file, sep='\t')
        n_before = len(sites)

        passing = sites.loc[sites['edited_over_all_c'] >= editc_threshold]
        n_after = len(passing)
        print("Filtering sites: {} -> {}".format(n_before, n_after))

        (
            pybedtools.BedTool.from_dataframe(passing[bed_columns])
            .sort()
            .merge(s=True, c=(4,5,6), o=('collapse','mean','distinct'))
            .saveas(output_bed)
        )
        status = 0
    except Exception as err:
        print(err)
    return status

# write_bedfile_from_txt_file(txt_file, editc_threshold, output_editc_filtered_bed)  # 28189 for RBFOX2-1000

In [5]:
def write_motif_analysis_cmd(bed_file, genome_fasta=genome_fasta):
    """
    Build the analyze_motifs command line (eclipanalysis module) for one BED file.

    All outputs share a prefix made of output_dir and the BED file's basename
    without its extension: <prefix>.svg, <prefix>.pickle and <prefix> itself
    as the HOMER directory.

    Returns the command string, or None when <prefix>.svg already exists.
    """
    bed_stem = os.path.splitext(os.path.basename(bed_file))[0]
    prefix = os.path.join(output_dir, bed_stem)
    svg_path = prefix + '.svg'
    pickle_path = prefix + '.pickle'

    # Nothing to do when the final figure is already on disk.
    if os.path.exists(svg_path):
        return None

    pieces = [
        'module load eclipanalysis/0.0.3a;analyze_motifs ',
        '--peaks {} '.format(bed_file),
        '--out_pickle_file {} '.format(pickle_path),
        '--out_file {} '.format(svg_path),
        '--species hg19 ',
        '--out_homer_dir {} '.format(prefix),
        '--genome_fasta {} '.format(genome_fasta),
    ]
    return ''.join(pieces)
    
## Testing the function
# cmd = write_motif_analysis_cmd(output_editc_filtered_bed)
# if cmd is not None:
#     cmds.append(cmd)

# What if we try filtering based on conf values? We can grab all txt files now if we want.

In [6]:
# Collect every .txt table in input_dir; sorting keeps the processing order stable.
all_txt_files = sorted(glob.glob(os.path.join(input_dir, '*.txt')))
print(len(all_txt_files))
# Preview the first ten paths.
all_txt_files[:10]

634


['/home/bay001/projects/kris_apobec_20200121/permanent_data2/07_scRNA_groups/window_outputs_deep/APOBEC-STAMP_possorted_genome_bam-APOBEC_STAMP_Apo_filtered_lenti_common_expression_barcodes.txt.fx.annotated.windows_51.txt',
 '/home/bay001/projects/kris_apobec_20200121/permanent_data2/07_scRNA_groups/window_outputs_deep/APOBEC-STAMP_possorted_genome_bam_MD-1_barcodes.tsv.fx.annotated.windows_51.txt',
 '/home/bay001/projects/kris_apobec_20200121/permanent_data2/07_scRNA_groups/window_outputs_deep/APOBEC-STAMP_possorted_genome_bam_MD-2_barcodes.tsv.fx.annotated.windows_51.txt',
 '/home/bay001/projects/kris_apobec_20200121/permanent_data2/07_scRNA_groups/window_outputs_deep/APOBEC-STAMP_possorted_genome_bam_MD-5_barcodes.tsv.fx.annotated.windows_51.txt',
 '/home/bay001/projects/kris_apobec_20200121/permanent_data2/07_scRNA_groups/window_outputs_deep/APOBEC-STAMP_possorted_genome_bam_MD-APOBEC_STAMP_Apo_filtered_lenti_common_expression_barcodes.txt.fx.annotated.windows_51.txt',
 '/home/bay0

In [7]:
def compute_confidence(row, cov_margin):
    """
    Taken from rank_edits.py from SAILOR.

    Computes 1 - betainc(G, A, cov_margin) for one row, where G is the row's
    'edit_coverage' and A its 'all_c_coverage' (no pseudocounts are added).

    Parameters
    ----------
    row : mapping or pandas.Series
        Must provide 'edit_coverage' and 'all_c_coverage'.
    cov_margin : float
        Passed to betainc as its x argument.

    Returns
    -------
    0 when G + A is zero, otherwise 1 - betainc(G, A, cov_margin).
    """
    # Default, don't use pseudocounts for now.
    alfa = 0
    beta = 0

    # Smoothed counts.
    G = row['edit_coverage'] + alfa
    A = row['all_c_coverage'] + beta

    # Rows without any coverage get zero confidence. This is checked explicitly
    # because NumPy scalars (what a DataFrame row yields) do not raise
    # ZeroDivisionError on division by zero, so an exception-based guard is
    # never triggered for them and NaN would be returned instead.
    if G + A == 0:
        return 0

    ########  MOST IMPORTANT LINE  ########
    # calculates the confidence of theta as
    # P( theta < cov_margin | A, G) ~ Beta_theta(G, A)
    confidence = 1 - betainc(G, A, cov_margin)
    return confidence

def read_and_filter_conf(rep_file, cov_margin, conf, output_file):
    """
    Reads and computes conf across a window. Saves the filtered output to
    output_file and returns both the unfiltered and filtered dataframes for
    downstream analysis.

    Steps:
      1. Read the tab-separated rep_file.
      2. Save the edited_over_all_c distribution to
         <output_file stem>.editc_dist.svg.
      3. Compute a 'conf' column row by row with compute_confidence
         (missing values become 0) and keep rows with conf >= conf.
      4. Sort and merge the kept rows with s=True (names collapsed, scores
         averaged, strands made distinct) and save them to output_file.
      5. Save the merged interval length distribution to
         <output_file stem>.peak_lengths.svg.

    Parameters
    ----------
    rep_file : str
        Tab-separated input table. Must contain chrom, start, end, name,
        edited_over_all_c and strand, plus the columns compute_confidence reads.
    cov_margin : float
        Forwarded to compute_confidence.
    conf : float
        Minimum confidence a row needs to be kept.
    output_file : str
        Path of the merged BED file to write.

    Returns
    -------
    (rep, rep_filtered) : the full table including 'conf', and the rows that
    passed the confidence filter (before merging).
    """
    rep = pd.read_csv(rep_file, sep='\t')
    plot_editc_distribution(
        rep=rep,
        output_svg=os.path.splitext(output_file)[0] + '.editc_dist.svg'
    )
    rep['conf'] = rep.apply(compute_confidence, args=(cov_margin,), axis=1)
    # Assign the filled column back. A chained rep['conf'].fillna(..., inplace=True)
    # acts on a temporary Series under pandas Copy-on-Write and would leave
    # rep unchanged, so NaN confidences would survive.
    rep['conf'] = rep['conf'].fillna(0)

    rep_filtered = rep[rep['conf'] >= conf]
    # rep_filtered['log2_editc'] = np.log2(rep_filtered['edited_over_all_c'])
    # rep_filtered[['chrom','start','end','name','log2_editc','strand']].to_csv(output_file, sep='\t', index=False, header=False)
    bedtool = pybedtools.BedTool.from_dataframe(
        rep_filtered[['chrom','start','end','name','edited_over_all_c','strand']]
    )
    bedtool = bedtool.sort().merge(s=True, c=(4,5,6), o=('collapse','mean','distinct'))
    bedtool.saveas(output_file)

    # Plot the distribution of "peak" lengths
    _filtered = bedtool.to_dataframe()
    _filtered['length'] = _filtered['end'] - _filtered['start']
    plot_length_distribution(
        filtered_list=_filtered,
        output_svg=os.path.splitext(output_file)[0] + '.peak_lengths.svg'
    )

    print("Filtered {} {} -> {}".format(os.path.basename(rep_file), rep.shape[0], rep_filtered.shape[0]))

    return rep, rep_filtered

def plot_editc_distribution(rep, output_svg):
    """
    Plot the distribution of rep['edited_over_all_c'] (50 bins) and save the
    figure to output_svg.
    """
    fig, ax = plt.subplots()
    try:
        sns.distplot(rep['edited_over_all_c'], bins=50, ax=ax)
        fig.savefig(output_svg)
    finally:
        # Close this figure explicitly, also when plotting or saving fails,
        # so figures do not pile up when this is called once per input file.
        plt.close(fig)
    
def plot_length_distribution(filtered_list, output_svg):
    """
    Plot the distribution of filtered_list['length'] (50 bins) and save the
    figure to output_svg.
    """
    fig, ax = plt.subplots()
    try:
        sns.distplot(filtered_list['length'], bins=50, ax=ax)
        fig.savefig(output_svg)
    finally:
        # Close this figure explicitly, also when plotting or saving fails,
        # so figures do not pile up when this is called once per input file.
        plt.close(fig)
    

In [8]:
# Start from an empty list so that re-running this cell does not append the
# same commands again (which would submit duplicate jobs).
cmds = []

for txt_file in all_txt_files:
    # Output name encodes the parameters used, e.g. <input stem>.e0.001_conf0.99.bed
    output_conf_filtered_bed = os.path.join(
        output_dir, 
        '{}.e{}_conf{}.bed'.format(os.path.splitext(os.path.basename(txt_file))[0], e, conf_threshold)
    )
    # Only (re)compute the filtered BED when it is not on disk yet.
    if not os.path.exists(output_conf_filtered_bed):
        rep, filtered = read_and_filter_conf(
            txt_file, 
            cov_margin=e, 
            conf=conf_threshold, 
            output_file=output_conf_filtered_bed
        )
    # write_motif_analysis_cmd returns None when the motif output already exists.
    cmd = write_motif_analysis_cmd(bed_file=output_conf_filtered_bed)
    if cmd is not None:
        cmds.append(cmd)
        
# RBFOX2-50 172150 -> 76889

Filtered For_Motif_Background_APO_plus_RBFOX2-TIA1_APO_edits_barcodes_4_5.fx.annotated.windows_51.txt 544061 -> 43470
Filtered RPS2-STAMP_possorted_genome_bam-RPS2_STAMP_Apo_filtered_lenti_common_expression_barcodes.txt.fx.annotated.windows_51.txt 432453 -> 188023
Filtered RPS2-STAMP_possorted_genome_bam_MD-RPS2_STAMP_Apo_filtered_lenti_common_expression_barcodes.txt.fx.annotated.windows_51.txt 403829 -> 169371


In [9]:
# Number of motif-analysis commands queued for submission.
len(cmds)

3

In [10]:
# Hand the queued commands to qtools: per the recorded output they are written
# to motif_analysis.sh in bash_script_dir as an array job and, because
# submit=True, sent to the queue as soon as this cell runs.
# NOTE(review): nodes/ppn/walltime are presumably per-task scheduler
# resources; confirm against the qtools Submitter documentation.
Submitter(
    cmds, 
    'motif_analysis', 
    sh=os.path.join(bash_script_dir, 'motif_analysis.sh'), 
    nodes=1, 
    ppn=1, 
    array=True, 
    submit=True, 
    walltime='40:00:00'
)

Writing 3 tasks as an array-job.
Wrote commands to /home/bay001/projects/kris_apobec_20200121/permanent_data2/07_scRNA_groups/bash_scripts/motif_analysis.sh.
Submitted script to queue home.
 Job ID: 22083755


<qtools.submitter.Submitter at 0x2b8fecdc00d0>

In [None]:
# Re-check how many commands are queued (this cell has no recorded execution).
len(cmds)