# SAILOR produces results split by strand, so let's join them here.

In [1]:
import pandas as pd
import numpy as np
import os
import glob
from tqdm import tnrange, tqdm_notebook

In [2]:
input_dir_prefix = '/home/bay001/projects/kris_apobec_20200121/permanent_data2/03_scRNA/sailor_outputs_individual_barcodes'
bed_output_dir = '/home/bay001/projects/kris_apobec_20200121/permanent_data2/03_scRNA/sailor_outputs_individual_barcodes_merged_bedfiles/'
bedgraph_output_dir = '/home/bay001/projects/kris_apobec_20200121/permanent_data2/03_scRNA/sailor_outputs_individual_barcodes_merged_bedgraphs/'
bigwig_output_dir = '/home/bay001/projects/kris_apobec_20200121/permanent_data2/03_scRNA/sailor_outputs_individual_barcodes_merged_bigwigs/'

In [3]:
def filter_bed(df, conf_level):
    """
    Keep only the rows of a bed dataframe whose 'conf' value is at least conf_level.
    This score is mostly unused now that sites are scored by edit fraction instead.
    """
    passes_threshold = df['conf'] >= conf_level
    return df[passes_threshold]

def get_rev(fwd_file):
    """
    Given the path of a SAILOR (fwd) bed file, return the path of its (rev) partner.
    The two names differ only in the 'fwd' / 'rev' tag that precedes the shared suffix,
    so swapping that tag is all that is needed.
    """
    shared_suffix = '.sorted.rmdup.readfiltered.formatted.varfiltered.snpfiltered.ranked.bed'
    fwd_suffix = '.fwd' + shared_suffix
    rev_suffix = '.rev' + shared_suffix
    return fwd_file.replace(fwd_suffix, rev_suffix)

def get_combined_bedfile_name(fwd_file):
    """
    Build the name used for the merged (fwd + rev) bed file.
    The long SAILOR forward suffix, including the 'fwd' tag, is collapsed to a plain '.bed'.
    """
    fwd_suffix = '.fwd.sorted.rmdup.readfiltered.formatted.varfiltered.snpfiltered.ranked.bed'
    merged_suffix = '.bed'
    return fwd_file.replace(fwd_suffix, merged_suffix)

def concat_bedfiles(fwd_file, rev_file):
    """
    Read a SAILOR fwd bed file and its rev counterpart (tab-separated, no header row)
    and return them stacked into one dataframe, fwd rows first.
    """
    columns = ['chrom','start','end','info','conf','strand']
    per_strand = [
        pd.read_csv(path, sep='\t', names=columns)
        for path in (fwd_file, rev_file)
    ]
    return pd.concat(per_strand)

def get_number_edited_reads(row):
    """
    Compute the number of edited reads for one bed row.
    The 'info' field is split on '|' into three parts: total coverage, edit type and
    edit fraction. The result is coverage * fraction, rounded to the nearest integer.
    """
    info_fields = row['info'].split('|')
    # Exactly three fields are expected; anything else raises ValueError on unpacking.
    coverage, _edit_type, edit_fraction = info_fields
    edited_reads = int(coverage) * float(edit_fraction)
    return round(edited_reads)

def label_cov_info(row):
    """
    Format a row's 'num_edited' and 'total_coverage' values as the string
    "<num_edited>,<total_coverage>".
    """
    edited = row['num_edited']
    coverage = row['total_coverage']
    return "{edited},{coverage}".format(edited=edited, coverage=coverage)

# Let's break this up and convert according to the barcode prefix.
- Iterate over every two-nucleotide barcode prefix (AA, AC, ..., TT) with a nested for loop, appending `_<prefix>` to `input_dir_prefix` to get the correct input path for each group of barcodes.

In [10]:
errors = set()  # forward bed files whose merge raised an exception
# Number of forward bed files visited (merged, skipped because the output already
# exists, empty, or failed). Compare this against the number of files actually
# present in bed_output_dir to find merged outputs that were not created.
all_final_outputs = 0

progress = tnrange(16)  # one tick per two-nucleotide barcode prefix (4 x 4)
for nt in ['A', 'C', 'G', 'T']:
    for nt2 in ['A', 'C', 'G', 'T']:
        # Each barcode prefix has its own SAILOR output directory: <input_dir_prefix>_<nt><nt2>
        split_output = input_dir_prefix + '_{}{}'.format(nt, nt2)
        all_fwd_files = sorted(glob.glob(os.path.join(split_output, "*/results/*.fwd.sorted.rmdup.readfiltered.formatted.varfiltered.snpfiltered.ranked.bed"))) # ALL sailor forward bed files have this suffix.
        progress_inner = tnrange(len(all_fwd_files), leave=False)
        for f in all_fwd_files:
            try:
                output_file = os.path.join(
                    bed_output_dir,
                    os.path.basename(get_combined_bedfile_name(f))
                )
                # Never overwrite an existing merged file, so the cell can be re-run safely.
                if not os.path.exists(output_file):
                    df = concat_bedfiles(f, get_rev(f)) # use this function to get the correct path to the reverse
                    # Nothing is written when both strands are empty.
                    if df.shape[0] > 0:
                        df['total_coverage'] = df['info'].apply(lambda x: int(x.split('|')[0]))
                        df['num_edited'] = df.apply(get_number_edited_reads, axis=1)
                        df['name_col'] = df.apply(label_cov_info, axis=1)
                        df[['chrom','start','end','conf','name_col','strand']].to_csv(
                            output_file,
                            sep='\t',
                            index=False,
                            header=False
                        )
            except Exception as e:
                # Best-effort batch job: report the problem, remember the file, keep going.
                # The exception must be bound with 'as e'; otherwise print(e) itself raises
                # NameError and aborts the whole run.
                print(e)
                errors.add(f)
            all_final_outputs += 1
            progress_inner.update(1)
        # Manually updated bars are not closed automatically; release the widget here.
        progress_inner.close()
        progress.update(1)
progress.close()

HBox(children=(IntProgress(value=0, max=16), HTML(value='')))

HBox(children=(IntProgress(value=0, max=1012), HTML(value='')))

HBox(children=(IntProgress(value=0, max=1080), HTML(value='')))

HBox(children=(IntProgress(value=0, max=1446), HTML(value='')))

HBox(children=(IntProgress(value=0, max=1222), HTML(value='')))

HBox(children=(IntProgress(value=0, max=1496), HTML(value='')))

HBox(children=(IntProgress(value=0, max=974), HTML(value='')))

HBox(children=(IntProgress(value=0, max=928), HTML(value='')))

HBox(children=(IntProgress(value=0, max=1402), HTML(value='')))

HBox(children=(IntProgress(value=0, max=1320), HTML(value='')))

HBox(children=(IntProgress(value=0, max=793), HTML(value='')))

HBox(children=(IntProgress(value=0, max=1128), HTML(value='')))

HBox(children=(IntProgress(value=0, max=1488), HTML(value='')))

HBox(children=(IntProgress(value=0, max=1077), HTML(value='')))

HBox(children=(IntProgress(value=0, max=1689), HTML(value='')))

HBox(children=(IntProgress(value=0, max=1436), HTML(value='')))

HBox(children=(IntProgress(value=0, max=1378), HTML(value='')))

### The code block above tries to concatenate each fwd file with its rev counterpart. If that fails for any reason, the error is printed above and the forward file that could not be merged is added to `errors`.

In [11]:
errors

set()