# Reads BED files of edit sites and converts them to per-strand bedGraph files, one per confidence threshold.

In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
import os
import pandas as pd
import numpy as np
import glob
from tqdm import tnrange, tqdm_notebook

In [2]:
# Directory searched for the input '*.fx.bed' files.
input_dir = '/home/bay001/projects/kris_apobec_20200121/permanent_data2/07_scRNA_groups/sailor_outputs_groups/combined_outputs_w_cov_info/'
# Directory that receives the generated '.bedgraph' files.
output_dir = '/home/bay001/projects/kris_apobec_20200121/permanent_data2/07_scRNA_groups/sailor_outputs_groups/bedgraphs/'
# Scratch directory; in the visible code it is only used to build a path that is never written.
tmp_dir = '/home/bay001/projects/kris_apobec_20200121/temporary_data/07_scRNA_groups/sailor_outputs_groups/tmp'

In [3]:
# Collect every '*.fx.bed' file in input_dir, sorted by path so the processing order is reproducible.
all_bed_files = sorted(glob.glob(os.path.join(input_dir, '*.fx.bed')))
print(len(all_bed_files))
# Bare expression: displays the list as the notebook cell's output.
all_bed_files

45


['/home/bay001/projects/kris_apobec_20200121/permanent_data2/07_scRNA_groups/sailor_outputs_groups/combined_outputs_w_cov_info/APOBEC-STAMP_possorted_genome_bam-APOBEC_STAMP_Apo_filtered_lenti_common_expression_barcodes.txt.fx.bed',
 '/home/bay001/projects/kris_apobec_20200121/permanent_data2/07_scRNA_groups/sailor_outputs_groups/combined_outputs_w_cov_info/APOBEC-STAMP_possorted_genome_bam_MD-1_barcodes.tsv.fx.bed',
 '/home/bay001/projects/kris_apobec_20200121/permanent_data2/07_scRNA_groups/sailor_outputs_groups/combined_outputs_w_cov_info/APOBEC-STAMP_possorted_genome_bam_MD-2_barcodes.tsv.fx.bed',
 '/home/bay001/projects/kris_apobec_20200121/permanent_data2/07_scRNA_groups/sailor_outputs_groups/combined_outputs_w_cov_info/APOBEC-STAMP_possorted_genome_bam_MD-5_barcodes.tsv.fx.bed',
 '/home/bay001/projects/kris_apobec_20200121/permanent_data2/07_scRNA_groups/sailor_outputs_groups/combined_outputs_w_cov_info/APOBEC-STAMP_possorted_genome_bam_MD.fx.bed',
 '/home/bay001/projects/kris_a

In [4]:
def recompute_edit_fraction(row):
    """Return the ratio encoded in a row's 'frac' field.

    ``row['frac']`` must be a string of exactly two comma-separated
    values, ``"<edit>,<cov>"`` (presumably the edited-read count and the
    total coverage at the site -- confirm against the upstream output).

    Parameters
    ----------
    row : mapping
        Anything indexable by ``'frac'`` (e.g. a DataFrame row or a dict).

    Returns
    -------
    float
        ``int(edit) / float(cov)``.

    Raises
    ------
    ValueError
        If the field does not split into exactly two parts, or the parts
        are not numeric.
    ZeroDivisionError
        If the second value is zero.
    """
    num_edited, coverage = row['frac'].split(',')
    numerator = int(num_edited)
    denominator = float(coverage)
    return numerator / denominator

def filter_bed_and_convert_to_bg(input_bed, output_bg, strand, conf):
    """Filter a 6-column edit BED file and write the survivors as a bedgraph.

    The input is read as tab-separated, header-less text with the columns
    ``chrom, start, end, conf, frac, strand``. Rows are kept when their
    ``conf`` value is at least ``conf`` and their ``strand`` equals
    ``strand``. For each kept row the ``frac`` field (``"<edit>,<cov>"``)
    is converted to ``int(edit) / float(cov)`` and written as the fourth
    bedgraph column.

    Parameters
    ----------
    input_bed : str
        Path of the BED file to read.
    output_bg : str
        Path of the bedgraph file to write (tab-separated, no header, no
        index; columns chrom, start, end, fraction). Overwritten if present.
    strand : str
        Strand value to keep (compared for equality with the 'strand' column).
    conf : float
        Inclusive lower bound on the 'conf' column.

    Raises
    ------
    ValueError
        If a kept 'frac' field is not two comma-separated numbers.
    ZeroDivisionError
        If a kept 'frac' field has a zero second value.
    """
    edit_head = ['chrom', 'start', 'end', 'conf', 'frac', 'strand']
    df = pd.read_table(input_bed, names=edit_head)

    # Filter first so only the surviving rows are sorted; the result is
    # the same rows in the same (chrom, start, end) order.
    keep = (df['conf'] >= conf) & (df['strand'] == strand)
    df = df[keep].sort_values(by=['chrom', 'start', 'end'])

    # Compute the fractions from the 'frac' strings directly instead of a
    # row-wise DataFrame.apply: on an empty selection apply(axis=1) returns
    # a DataFrame (not a Series), which cannot be assigned to one column.
    fractions = []
    for frac in df['frac']:
        edit, cov = frac.split(',')
        fractions.append(int(edit) / float(cov))

    # assign() builds a new frame, avoiding SettingWithCopyWarning on the
    # filtered slice; an explicit float dtype keeps the empty case well-typed.
    df = df.assign(name=pd.Series(fractions, index=df.index, dtype=float))
    df[['chrom', 'start', 'end', 'name']].to_csv(
        output_bg, sep='\t', header=False, index=False
    )

# Confidence cutoffs: one bedgraph is written per (cutoff, input file, strand).
confs = [0.5, 0.9, 0.99, 0.999, 1]
# Not referenced anywhere in the visible code; kept in case other cells use it.
genome = '/projects/ps-yeolab3/bay001/annotations/hg19/hg19.chrom.sizes'
# (strand symbol in the BED file, label used in the output file name)
strands = [('+', 'pos'), ('-', 'neg')]

# The context manager closes the progress bar even if a conversion fails.
with tqdm_notebook(total=len(all_bed_files) * len(confs) * len(strands)) as progress:
    for conf in confs:
        for bed in all_bed_files:
            for strand, label in strands:
                output_bg = os.path.join(
                    output_dir,
                    os.path.basename(bed) + ".{}.{}.bedgraph".format(conf, label)
                )
                filter_bed_and_convert_to_bg(bed, output_bg, strand, conf)
                progress.update(1)

HBox(children=(IntProgress(value=0, max=450), HTML(value='')))

  import sys
