# Re-formats the native SAILOR outputs to report reads edited per site and total coverage per site.
- We'll use the 'reads edited per site' 

In [1]:
import glob
import os
import glob
import pandas as pd
import numpy as np
from tqdm import tnrange, tqdm_notebook

In [2]:
input_dir = '/home/bay001/projects/kris_apobec_20200121/permanent_data2/07_scRNA_groups/sailor_outputs_groups_deep/'
output_dir = '/home/bay001/projects/kris_apobec_20200121/permanent_data2/07_scRNA_groups/sailor_outputs_groups_deep/combined_outputs_w_cov_info'

### Here I'm grabbing the fwd BEDfile final outputs from SAILOR, and using these files to find its corresponding rev BEDfile final outputs. Merge them together.

- I'm also using the total coverage and the edit fraction to re-compute the edited coverage at the position. **This is critical for downstream analysis!**
- eg. 15|C>T|0.066666667 -> 1,15 # where 1 is the number of reads that were C>T converted and 15 is the total coverage at that edit site.
- **Yes I know the rand0 bed files error out, it is because 0-barcodes have no edits**

In [4]:
fwd = sorted(glob.glob(os.path.join(input_dir, '*rand*trial*/results/*.fwd.*.bed')))

print(len(fwd))
fwd[:10]

590


['/home/bay001/projects/kris_apobec_20200121/permanent_data2/07_scRNA_groups/sailor_outputs_groups_deep/RBFOX2-TIA1-STAMP_round2E_read1_feature_FB_possorted_genome_bam_MD-34_barcodes.rand0.trial-0.txt/results/RBFOX2-TIA1-STAMP_round2E_read1_feature_FB_possorted_genome_bam_MD-34_barcodes.rand0.trial-0.txt.fwd.sorted.rmdup.readfiltered.formatted.varfiltered.snpfiltered.ranked.bed',
 '/home/bay001/projects/kris_apobec_20200121/permanent_data2/07_scRNA_groups/sailor_outputs_groups_deep/RBFOX2-TIA1-STAMP_round2E_read1_feature_FB_possorted_genome_bam_MD-34_barcodes.rand0.trial-1.txt/results/RBFOX2-TIA1-STAMP_round2E_read1_feature_FB_possorted_genome_bam_MD-34_barcodes.rand0.trial-1.txt.fwd.sorted.rmdup.readfiltered.formatted.varfiltered.snpfiltered.ranked.bed',
 '/home/bay001/projects/kris_apobec_20200121/permanent_data2/07_scRNA_groups/sailor_outputs_groups_deep/RBFOX2-TIA1-STAMP_round2E_read1_feature_FB_possorted_genome_bam_MD-34_barcodes.rand0.trial-2.txt/results/RBFOX2-TIA1-STAMP_round2E

In [5]:
def filter_bed(df, conf_level):
    """
    Filters the bed file given a conf level. We don't really use this score anymore since we're now scoring by edit fraction.
    """
    return df[df['conf'] >= conf_level]

def get_rev(fwd_file):
    """
    (fwd) and (rev) bed files are identically named except for the 'fwd' and 'rev' annotation.
    So it's easy to, given the fwd_file name, return the corresponding rev_file 
    """
    return fwd_file.replace(
        '.fwd.sorted.rmdup.readfiltered.formatted.varfiltered.snpfiltered.ranked.bed',
        '.rev.sorted.rmdup.readfiltered.formatted.varfiltered.snpfiltered.ranked.bed'
    )

def get_combined_bedfile_name(fwd_file):
    """
    Really just re-formats the name to be shorter and remove the 'fwd' annotation. 
    We'll be using the returned string as the new 'merged' name
    """
    return fwd_file.replace('.fwd.sorted.rmdup.readfiltered.formatted.varfiltered.snpfiltered.ranked.bed','.bed')

def concat_bedfiles(fwd_file, rev_file):
    """
    Combines two native SAILOR fwd and rev bedfiles
    """
    common_header = ['chrom','start','end','info','conf','strand']
    fwd = pd.read_csv(fwd_file, sep='\t', names=common_header)
    rev = pd.read_csv(rev_file, sep='\t', names=common_header)
    return pd.concat([fwd, rev])

def get_number_edited_reads(row):
    """
    SAILOR reports the total coverage and edit fraction in the 'info' column.
    Use these two numbers to get the number of edited reads.
    """
    total_reads, edit_type, fraction = row['info'].split('|')
    return round(int(total_reads) * float(fraction))

def label_cov_info(row):
    """
    returns the num_edited and total_coverage as a concatenated string.
    """
    return "{},{}".format(row['num_edited'], row['total_coverage'])

progress = tnrange(len(fwd))
for f in fwd:
    try:
        output_file = os.path.join(
            output_dir, 
            os.path.basename(get_combined_bedfile_name(f))
        )
        if not os.path.exists(output_file):
            
            df = concat_bedfiles(f, get_rev(f))
            df['total_coverage'] = df['info'].apply(lambda x: int(x.split('|')[0]))
            df['num_edited'] = df.apply(get_number_edited_reads, axis=1)
            df['name_col'] = df.apply(label_cov_info, axis=1)
            df[['chrom','start','end','conf','name_col','strand']].to_csv(
                output_file, 
                sep='\t', 
                index=False, 
                header=False
            )
            
    except ValueError as e:
        print(f, e)
    progress.update(1)

HBox(children=(IntProgress(value=0, max=590), HTML(value='')))

/home/bay001/projects/kris_apobec_20200121/permanent_data2/07_scRNA_groups/sailor_outputs_groups_deep/RBFOX2-TIA1-STAMP_round2E_read1_feature_FB_possorted_genome_bam_MD-34_barcodes.rand0.trial-0.txt/results/RBFOX2-TIA1-STAMP_round2E_read1_feature_FB_possorted_genome_bam_MD-34_barcodes.rand0.trial-0.txt.fwd.sorted.rmdup.readfiltered.formatted.varfiltered.snpfiltered.ranked.bed Cannot set a frame with no defined index and a value that cannot be converted to a Series
/home/bay001/projects/kris_apobec_20200121/permanent_data2/07_scRNA_groups/sailor_outputs_groups_deep/RBFOX2-TIA1-STAMP_round2E_read1_feature_FB_possorted_genome_bam_MD-34_barcodes.rand0.trial-1.txt/results/RBFOX2-TIA1-STAMP_round2E_read1_feature_FB_possorted_genome_bam_MD-34_barcodes.rand0.trial-1.txt.fwd.sorted.rmdup.readfiltered.formatted.varfiltered.snpfiltered.ranked.bed Cannot set a frame with no defined index and a value that cannot be converted to a Series
/home/bay001/projects/kris_apobec_20200121/permanent_data2/07_

# Merge the (readfiltered) BAM files
### Here I'm grabbing the SAILOR intermediate outputs ```*.readfiltered.bam``` which are, for fwd and rev strands, the BAM files post read filtering.

In [None]:
fwd_bam_files = sorted(glob.glob(os.path.join(input_dir, '*/results/*.fwd.*readfiltered.bam')))
print(len(fwd_bam_files))
fwd_bam_files[:3]

In [None]:
cmds = []
for fwd in fwd_bam_files:
    
    rev = fwd.replace(
        '.fwd.sorted.rmdup.readfiltered.bam',
        '.rev.sorted.rmdup.readfiltered.bam'
    )
    merged = os.path.join(
        output_dir, os.path.basename(fwd).replace(
            '.fwd.sorted.rmdup.readfiltered.bam',
            '.merged.sorted.rmdup.readfiltered.bam'
        )
    )
    merged_sorted = os.path.join(
        output_dir, os.path.basename(fwd).replace(
            '.fwd.sorted.rmdup.readfiltered.bam',
            '.merged.sorted.rmdup.readfiltered.sorted.bam'
        )
    )
    if not os.path.exists(merged_sorted):
        assert os.path.exists(rev)
        cmd = 'module load samtools;samtools merge -f {} {} {};'.format(
            merged,
            fwd,
            rev
        )
        cmd += 'samtools sort {} > {};'.format(merged, merged_sorted)
        cmd += 'samtools index {}'.format(merged_sorted)
        cmds.append(cmd)
        
len(cmds)

In [None]:
progress = tnrange(len(cmds))
for cmd in cmds:
    ! $cmd
    progress.update(1)