# SAILOR produces results split by strand, so let's join them here.

In [1]:
import pandas as pd
import numpy as np
import os
import glob
from tqdm import tnrange, tqdm_notebook

In [2]:
# Common path prefix of the SAILOR output directories. The merge loop appends
# '_<nt><nt2>' (e.g. '_AA') to this prefix to locate each split's directory.
input_dir_prefix = '/home/bay001/projects/kris_apobec_20200121/permanent_data2/04_scRNA_RBFOX2_TIA/sailor_outputs_individual_barcodes'
# Directory that receives one merged (fwd + rev) bed file per fwd/rev pair.
bed_output_dir = '/home/bay001/projects/kris_apobec_20200121/permanent_data2/04_scRNA_RBFOX2_TIA/sailor_outputs_individual_barcodes_merged_bedfiles/'

In [3]:
def filter_bed(df, conf_level):
    """
    Keep only the rows of a bed dataframe whose 'conf' value is at least conf_level.

    This score is mostly a leftover: sites are now scored by edit fraction instead.
    """
    passes_threshold = df['conf'] >= conf_level
    return df[passes_threshold]

def get_rev(fwd_file):
    """
    Derive the (rev) bed file path from its (fwd) counterpart.

    The two files share a name apart from the 'fwd' / 'rev' tag in the suffix,
    so swapping the suffix is enough. A path that does not contain the fwd
    suffix is returned unchanged.
    """
    fwd_suffix = '.fwd.sorted.rmdup.readfiltered.formatted.varfiltered.snpfiltered.ranked.bed'
    rev_suffix = '.rev.sorted.rmdup.readfiltered.formatted.varfiltered.snpfiltered.ranked.bed'
    return fwd_file.replace(fwd_suffix, rev_suffix)

def get_combined_bedfile_name(fwd_file):
    """
    Build the name of the merged bed file from the (fwd) file name.

    The long SAILOR suffix, including the 'fwd' tag, is collapsed to a plain
    '.bed' extension. A name without that suffix is returned unchanged.
    """
    fwd_suffix = '.fwd.sorted.rmdup.readfiltered.formatted.varfiltered.snpfiltered.ranked.bed'
    merged_suffix = '.bed'
    return fwd_file.replace(fwd_suffix, merged_suffix)

def concat_bedfiles(fwd_file, rev_file):
    """
    Read a native SAILOR fwd/rev bed file pair and stack them into one dataframe.

    Both files are parsed as headerless, tab-separated tables with the columns
    chrom, start, end, info, conf, strand. The fwd rows come first and each
    file's original row index is kept as-is.
    """
    columns = ['chrom','start','end','info','conf','strand']
    per_strand = [
        pd.read_csv(path, sep='\t', names=columns)
        for path in (fwd_file, rev_file)
    ]
    return pd.concat(per_strand)

def get_number_edited_reads(row):
    """
    Compute the number of edited reads for one bed row.

    The 'info' field is expected to hold exactly three '|'-separated values:
    total coverage, edit type and edit fraction. The edited-read count is
    coverage * fraction, rounded to the nearest integer.
    """
    coverage, _edit_type, edit_fraction = row['info'].split('|')
    edited = int(coverage) * float(edit_fraction)
    return round(edited)

def label_cov_info(row):
    """
    Format a row's 'num_edited' and 'total_coverage' values as the string
    '<num_edited>,<total_coverage>'.
    """
    edited = row['num_edited']
    coverage = row['total_coverage']
    return "{},{}".format(edited, coverage)

In [4]:
# Merge every fwd/rev SAILOR bed pair into a single bed file in bed_output_dir.
# Existing merged files are left untouched, so the cell can be re-run safely.
errors = set()          # fwd files whose merge raised an exception
# Number of fwd bed files visited (counted even when the merged file already
# existed, the pair had no rows, or the merge failed). Compare this against the
# number of merged files on disk to spot outputs that were never created.
all_final_outputs = 0
no_edit_barcodes = []   # fwd files whose combined fwd + rev table had no rows
progress = tnrange(16)  # one tick per dinucleotide split (4 x 4)
for nt in ['A', 'C', 'G', 'T']:
    for nt2 in ['A', 'C', 'G', 'T']:
        # Inputs are split into one directory per dinucleotide, e.g. '<prefix>_AA'.
        split_output = input_dir_prefix + '_{}{}'.format(nt, nt2)
        all_fwd_files = sorted(glob.glob(os.path.join(split_output, "*/results/*.fwd.sorted.rmdup.readfiltered.formatted.varfiltered.snpfiltered.ranked.bed")))
        progress_inner = tnrange(len(all_fwd_files), leave=False)
        for f in all_fwd_files:
            try:
                output_file = os.path.join(
                    bed_output_dir,
                    os.path.basename(get_combined_bedfile_name(f))
                )
                if not os.path.exists(output_file):
                    df = concat_bedfiles(f, get_rev(f))
                    if df.shape[0] > 0:
                        # 'info' is '<total_coverage>|<edit_type>|<edit_fraction>'.
                        df['total_coverage'] = df['info'].apply(lambda x: int(x.split('|')[0]))
                        df['num_edited'] = df.apply(get_number_edited_reads, axis=1)
                        df['name_col'] = df.apply(label_cov_info, axis=1)
                        # Written column order: chrom, start, end, conf,
                        # '<num_edited>,<total_coverage>', strand.
                        df[['chrom','start','end','conf','name_col','strand']].to_csv(
                            output_file,
                            sep='\t',
                            index=False,
                            header=False
                        )
                    else:
                        no_edit_barcodes.append(f)
            except Exception as e:
                # Bind the exception so it can be reported. Without the binding,
                # print(e) raised NameError and aborted the whole run on the first
                # bad file. Keep going and record the failing file instead.
                print(e)
                errors.add(f)
            all_final_outputs += 1
            progress_inner.update(1)
        progress.update(1)

HBox(children=(IntProgress(value=0, max=16), HTML(value='')))

HBox(children=(IntProgress(value=0, max=1092), HTML(value='')))

HBox(children=(IntProgress(value=0, max=1166), HTML(value='')))

HBox(children=(IntProgress(value=0, max=1489), HTML(value='')))

HBox(children=(IntProgress(value=0, max=1280), HTML(value='')))

HBox(children=(IntProgress(value=0, max=1551), HTML(value='')))

HBox(children=(IntProgress(value=0, max=1025), HTML(value='')))

HBox(children=(IntProgress(value=0, max=876), HTML(value='')))

HBox(children=(IntProgress(value=0, max=1448), HTML(value='')))

HBox(children=(IntProgress(value=0, max=1384), HTML(value='')))

HBox(children=(IntProgress(value=0, max=931), HTML(value='')))

HBox(children=(IntProgress(value=0, max=1037), HTML(value='')))

HBox(children=(IntProgress(value=0, max=1532), HTML(value='')))

HBox(children=(IntProgress(value=0, max=1022), HTML(value='')))

HBox(children=(IntProgress(value=0, max=1910), HTML(value='')))

HBox(children=(IntProgress(value=0, max=1482), HTML(value='')))

HBox(children=(IntProgress(value=0, max=1392), HTML(value='')))

In [5]:
errors

set()

In [6]:
all_final_outputs

20617

In [7]:
no_edit_barcodes[:3]

['/home/bay001/projects/kris_apobec_20200121/permanent_data2/04_scRNA_RBFOX2_TIA/sailor_outputs_individual_barcodes_AA/possorted_genome_bam_MD-AAACCCAAGAAGCGAA-1/results/possorted_genome_bam_MD-AAACCCAAGAAGCGAA-1.fwd.sorted.rmdup.readfiltered.formatted.varfiltered.snpfiltered.ranked.bed',
 '/home/bay001/projects/kris_apobec_20200121/permanent_data2/04_scRNA_RBFOX2_TIA/sailor_outputs_individual_barcodes_AA/possorted_genome_bam_MD-AAACCCAAGAGCATTA-1/results/possorted_genome_bam_MD-AAACCCAAGAGCATTA-1.fwd.sorted.rmdup.readfiltered.formatted.varfiltered.snpfiltered.ranked.bed',
 '/home/bay001/projects/kris_apobec_20200121/permanent_data2/04_scRNA_RBFOX2_TIA/sailor_outputs_individual_barcodes_AA/possorted_genome_bam_MD-AAACCCAAGCTATCTG-1/results/possorted_genome_bam_MD-AAACCCAAGCTATCTG-1.fwd.sorted.rmdup.readfiltered.formatted.varfiltered.snpfiltered.ranked.bed']