# Plots the confidence scores with respect to depth. Also re-formats the SAILOR outputs into a BED format with the columns: 

'chrom','start','end','conf','edit_frac','strand'

In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
import os
import pandas as pd
import numpy as np
import glob
from tqdm import tnrange, tqdm_notebook

In [2]:
input_dir = '/home/bay001/projects/kris_apobec_20200121/permanent_data/final_analysis/01_SAILOR_bulk_rnaseq/outputs/combined_outputs_w_cov_info'
output_dir = '/home/bay001/projects/kris_apobec_20200121/permanent_data/final_analysis/01_SAILOR_bulk_rnaseq/outputs/bedgraphs'
tmp_dir = '/home/bay001/projects/kris_apobec_20200121/permanent_data/final_analysis/01_SAILOR_bulk_rnaseq/tmp/tmp_bedgraph_files'

In [3]:
all_bed_files = sorted(glob.glob(os.path.join(input_dir, '*e0.01.bed')))
print(len(all_bed_files))
all_bed_files

130


['/home/bay001/projects/kris_apobec_20200121/permanent_data/final_analysis/01_SAILOR_bulk_rnaseq/outputs/combined_outputs_w_cov_info/ABCE1-0_S1_L001_R1_001.fastqTr.sorted.STARUnmapped.out.sorted.STARAligned.out.sorted_a0_b0_e0.01.bed',
 '/home/bay001/projects/kris_apobec_20200121/permanent_data/final_analysis/01_SAILOR_bulk_rnaseq/outputs/combined_outputs_w_cov_info/ABCE1-1000_S15_L002_R1_001.fastqTr.sorted.STARUnmapped.out.sorted.STARAligned.out.sorted_a0_b0_e0.01.bed',
 '/home/bay001/projects/kris_apobec_20200121/permanent_data/final_analysis/01_SAILOR_bulk_rnaseq/outputs/combined_outputs_w_cov_info/ABCE1-50_S8_L001_R1_001.fastqTr.sorted.STARUnmapped.out.sorted.STARAligned.out.sorted_a0_b0_e0.01.bed',
 '/home/bay001/projects/kris_apobec_20200121/permanent_data/final_analysis/01_SAILOR_bulk_rnaseq/outputs/combined_outputs_w_cov_info/APOBEC_only_0_merged_R1.fastqTr.sorted.STARUnmapped.out.sorted.STARAligned.out.sorted_a0_b0_e0.01.bed',
 '/home/bay001/projects/kris_apobec_20200121/perma

In [5]:
def recompute_edit_fraction(row):
    edit, cov = row['frac'].split(',')
    return int(edit)/float(cov)

def filter_bed_and_convert_to_bg(input_bed, output_bg, strand, conf):
    edit_head = ['chrom','start','end','conf','frac','strand']
    df = pd.read_table(input_bed, names=edit_head)
    df.sort_values(by=['chrom','start','end'], inplace=True)
    df = df[(df['conf']>=conf) & (df['strand']==strand)]
    df['name'] = df.apply(recompute_edit_fraction, axis=1)
    df[['chrom','start','end','name']].to_csv(output_bg, sep='\t', header=False, index=False)

confs = [0.5, 0.9, 0.99, 0.999, 1]      
genome = '/projects/ps-yeolab3/bay001/annotations/hg19/hg19.chrom.sizes'
progress = tnrange(len(all_bed_files)*len(confs)*2)
for conf in confs:
    for bed in all_bed_files:
        try:
            for strand, label in zip(['+', '-'],['pos','neg']):
                filter_fn = os.path.join(tmp_dir, os.path.basename(bed) + ".{}".format(conf))
                output_bg = os.path.join(output_dir, os.path.basename(bed) + ".{}.{}.bedgraph".format(conf, label))
                if not os.path.exists(output_bg):
                    filter_bed_and_convert_to_bg(bed, output_bg, strand, conf)
                progress.update(1)
        except ValueError:
            print(bed)

HBox(children=(IntProgress(value=0, max=1300), HTML(value='')))

  import sys


/home/bay001/projects/kris_apobec_20200121/permanent_data/final_analysis/01_SAILOR_bulk_rnaseq/outputs/combined_outputs_w_cov_info/RBFOX2-T_1_S32_L003_R1_001.fastqTr.sorted.STARUnmapped.out.sorted.STARAligned.out.sorted_a0_b0_e0.01.bed
/home/bay001/projects/kris_apobec_20200121/permanent_data/final_analysis/01_SAILOR_bulk_rnaseq/outputs/combined_outputs_w_cov_info/RBFOX2-T_1_S32_L003_R1_001.fastqTr.sorted.STARUnmapped.out.sorted.STARAligned.out.sorted_a0_b0_e0.01.bed
