# SAILOR produces results split by strand, so let's join them here.

In [1]:
import pandas as pd
import numpy as np
import os
import glob
from tqdm import tnrange, tqdm_notebook

In [2]:
input_dir = '/home/bay001/projects/kris_apobec_20200121/permanent_data2/05_scRNA_RBFOX2_APO_transient/inputs/'
input_dir_prefix = '/home/bay001/projects/kris_apobec_20200121/permanent_data2/05_scRNA_RBFOX2_APO_transient/sailor_outputs_individual_barcodes'
bed_output_dir = '/home/bay001/projects/kris_apobec_20200121/permanent_data2/05_scRNA_RBFOX2_APO_transient/sailor_outputs_individual_barcodes_merged_bedfiles/'

In [3]:
def filter_bed(df, conf_level):
    """
    Keeps only the rows of a bed dataframe whose 'conf' value is >= conf_level.
    This score is mostly legacy now that sites are scored by edit fraction instead.
    """
    passes_threshold = df['conf'] >= conf_level
    return df[passes_threshold]

def get_rev(fwd_file):
    """
    Given the name of a (fwd) bed file, returns the name of the matching (rev) bed file.
    The two names differ only in the 'fwd'/'rev' tag in front of a shared suffix, so the
    fwd suffix is simply swapped for the rev one. A name that does not contain the fwd
    suffix is returned unchanged.
    """
    shared_suffix = '.sorted.rmdup.readfiltered.formatted.varfiltered.snpfiltered.ranked.bed'
    fwd_suffix = '.fwd' + shared_suffix
    rev_suffix = '.rev' + shared_suffix
    return fwd_file.replace(fwd_suffix, rev_suffix)

def get_combined_bedfile_name(fwd_file):
    """
    Builds the name used for the merged (fwd + rev) bed file.
    The long SAILOR fwd suffix, including the 'fwd' tag, is collapsed to a plain '.bed'.
    A name that does not contain the fwd suffix is returned unchanged.
    """
    fwd_suffix = '.fwd.sorted.rmdup.readfiltered.formatted.varfiltered.snpfiltered.ranked.bed'
    merged_suffix = '.bed'
    return fwd_file.replace(fwd_suffix, merged_suffix)

def concat_bedfiles(fwd_file, rev_file):
    """
    Reads a native SAILOR fwd bedfile and its rev counterpart (tab-separated, no header)
    and stacks them into one dataframe, fwd rows first. Each file's own row index is kept.
    """
    columns = ['chrom','start','end','info','conf','strand']
    per_strand = [
        pd.read_csv(bed_path, sep='\t', names=columns)
        for bed_path in (fwd_file, rev_file)
    ]
    return pd.concat(per_strand)

def get_number_edited_reads(row):
    """
    Computes the number of edited reads for one row.
    The 'info' column holds three '|'-separated fields: total coverage, edit type and
    edit fraction. The result is coverage * fraction, rounded to the nearest integer.
    """
    # Unpacking into exactly three names keeps the failure on malformed 'info' values.
    coverage, _edit_type, edit_fraction = row['info'].split('|')
    edited = int(coverage) * float(edit_fraction)
    return round(edited)

def label_cov_info(row):
    """
    Builds the string '<num_edited>,<total_coverage>' from the row's
    'num_edited' and 'total_coverage' values.
    """
    num_edited = row['num_edited']
    total_coverage = row['total_coverage']
    return "{},{}".format(num_edited, total_coverage)

In [4]:
# Merge every barcode's fwd and rev SAILOR bed files into one bed file in bed_output_dir.
# Inputs are sharded into 16 directories, one per two-letter suffix (AA, AC, ..., TT)
# appended to input_dir_prefix.
errors = set()          # fwd files whose merge raised an exception
all_final_outputs = 0   # number of fwd files visited (whether written, skipped, empty or failed);
                        # compare against the progress bar totals to spot unprocessed inputs.
no_edit_barcodes = []   # fwd files whose combined fwd + rev table had no rows
nucleotides = ['A', 'C', 'G', 'T']
progress = tnrange(len(nucleotides) ** 2)
for nt in nucleotides:
    for nt2 in nucleotides:
        split_output = input_dir_prefix + '_{}{}'.format(nt, nt2)
        all_fwd_files = sorted(glob.glob(os.path.join(split_output, "*/results/*.fwd.sorted.rmdup.readfiltered.formatted.varfiltered.snpfiltered.ranked.bed")))
        progress_inner = tnrange(len(all_fwd_files), leave=False)
        for f in all_fwd_files:
            try:
                output_file = os.path.join(
                    bed_output_dir,
                    os.path.basename(get_combined_bedfile_name(f))
                )
                # Existing outputs are left alone so the cell can be re-run to resume.
                if not os.path.exists(output_file):
                    df = concat_bedfiles(f, get_rev(f))
                    if df.shape[0] > 0:
                        df['total_coverage'] = df['info'].apply(lambda x: int(x.split('|')[0]))
                        df['num_edited'] = df.apply(get_number_edited_reads, axis=1)
                        # The name column carries "num_edited,total_coverage".
                        df['name_col'] = df.apply(label_cov_info, axis=1)
                        df[['chrom','start','end','conf','name_col','strand']].to_csv(
                            output_file,
                            sep='\t',
                            index=False,
                            header=False
                        )
                    else:
                        # No edits on either strand: nothing is written for this barcode.
                        no_edit_barcodes.append(f)
            except Exception as e:
                # Best-effort: record the failing file and keep going. The exception must be
                # bound to a name here, otherwise reporting it raises NameError and stops the run.
                print("{}: {}".format(f, e))
                errors.add(f)
            all_final_outputs += 1
            progress_inner.update(1)
        # Manually-updated bars are not closed automatically.
        progress_inner.close()
        progress.update(1)
progress.close()

HBox(children=(IntProgress(value=0, max=16), HTML(value='')))

HBox(children=(IntProgress(value=0, max=920), HTML(value='')))

HBox(children=(IntProgress(value=0, max=1036), HTML(value='')))

HBox(children=(IntProgress(value=0, max=1417), HTML(value='')))

HBox(children=(IntProgress(value=0, max=1179), HTML(value='')))

HBox(children=(IntProgress(value=0, max=1491), HTML(value='')))

HBox(children=(IntProgress(value=0, max=958), HTML(value='')))

HBox(children=(IntProgress(value=0, max=807), HTML(value='')))

HBox(children=(IntProgress(value=0, max=1329), HTML(value='')))

HBox(children=(IntProgress(value=0, max=1282), HTML(value='')))

HBox(children=(IntProgress(value=0, max=721), HTML(value='')))

HBox(children=(IntProgress(value=0, max=988), HTML(value='')))

HBox(children=(IntProgress(value=0, max=1486), HTML(value='')))

HBox(children=(IntProgress(value=0, max=1050), HTML(value='')))

HBox(children=(IntProgress(value=0, max=1676), HTML(value='')))

HBox(children=(IntProgress(value=0, max=1465), HTML(value='')))

HBox(children=(IntProgress(value=0, max=1350), HTML(value='')))

In [5]:
# fwd files whose merge raised an exception in the loop above (an empty set means none failed).
errors

set()

In [6]:
# Total number of fwd files the merge loop iterated over.
all_final_outputs

19155

In [7]:
# Peek at the first few fwd files whose combined fwd + rev table had no rows.
no_edit_barcodes[:3]

['/home/bay001/projects/kris_apobec_20200121/permanent_data2/05_scRNA_RBFOX2_APO_transient/sailor_outputs_individual_barcodes_AA/APOBEC_transient_possorted_genome_bam_MD-AAACCCACACTAACCA-1/results/APOBEC_transient_possorted_genome_bam_MD-AAACCCACACTAACCA-1.fwd.sorted.rmdup.readfiltered.formatted.varfiltered.snpfiltered.ranked.bed',
 '/home/bay001/projects/kris_apobec_20200121/permanent_data2/05_scRNA_RBFOX2_APO_transient/sailor_outputs_individual_barcodes_AA/APOBEC_transient_possorted_genome_bam_MD-AAACGCTTCCGATGTA-1/results/APOBEC_transient_possorted_genome_bam_MD-AAACGCTTCCGATGTA-1.fwd.sorted.rmdup.readfiltered.formatted.varfiltered.snpfiltered.ranked.bed',
 '/home/bay001/projects/kris_apobec_20200121/permanent_data2/05_scRNA_RBFOX2_APO_transient/sailor_outputs_individual_barcodes_AA/APOBEC_transient_possorted_genome_bam_MD-AAAGGTAGTCGATGCC-1/results/APOBEC_transient_possorted_genome_bam_MD-AAAGGTAGTCGATGCC-1.fwd.sorted.rmdup.readfiltered.formatted.varfiltered.snpfiltered.ranked.bed'

In [8]:
# How many barcodes had no rows in either strand's bed file.
len(no_edit_barcodes)

693

In [9]:
# Load the table of barcodes listed in 'Barcodes_missing_from_edits_matrix.csv';
# its first column is used as the index.
rbfox2_missing_from_final_dataframe = pd.read_csv(
    os.path.join(input_dir, 'Barcodes_missing_from_edits_matrix.csv'),
    index_col=0
)
rbfox2_missing_from_final_dataframe.head()

Unnamed: 0_level_0,n_edits,n_genes,ex_n_genes,ex_percent_mito,ex_n_counts,ex_louvain_02,cell,predicted_doublets
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
AAACGCTCAGTAACCT-1,,,446,0.100334,598.0,0,HEK,singlet
AAACGCTTCCGACATA-1,,,654,0.039377,1219.0,0,HEK,singlet
AACCACATCAGACCCG-1,,,506,0.033708,712.0,0,HEK,singlet
AAGACTCCACGTTCGG-1,,,995,0.037203,1559.0,0,HEK,singlet
AAGGTAAGTCAGTCCG-1,,,459,0.042466,730.0,0,HEK,singlet


In [10]:
# Column names applied to the headerless SAILOR ranked bed files when they are read.
bed_head = [
    'chrom',
    'start',
    'end',
    'name',
    'score',
    'strand',
]

def get_path_from_barcode(barcode, input_dir_prefix=input_dir_prefix, sample="RBFOX2"):
    """
    Returns the SAILOR 'results' directory path for a single cell barcode.

    The path is built as:
        <input_dir_prefix>_<first two characters of barcode>/
            <sample>_transient_possorted_genome_bam_MD-<barcode>/results

    barcode: cell barcode string, e.g. 'AAACGCTCAGTAACCT-1'.
    input_dir_prefix: path prefix shared by the two-letter-sharded output directories.
    sample: label that starts each per-barcode directory name. Defaults to "RBFOX2",
        the value that used to be hard-coded, so existing calls are unaffected.

    The path is only constructed here; it is not checked for existence.
    """
    shard_dir = "{}_{}".format(input_dir_prefix, barcode[:2])
    barcode_subdir = '{}_transient_possorted_genome_bam_MD-{}'.format(sample, barcode)
    return os.path.join(shard_dir, barcode_subdir, "results")

def get_num_edits(barcode_dir):
    """
    Reports whether a barcode's SAILOR results directory contains any edits.

    Reads the two '*.ranked.bed' files found in barcode_dir and prints a one-line
    summary if either of them has rows. Prints nothing when both are empty.
    Always returns None.

    barcode_dir: directory expected to hold exactly one fwd and one rev ranked bed file.

    Raises ValueError if barcode_dir does not contain exactly two '*.ranked.bed' files
    (for example when the directory does not exist).
    """
    ranked_beds = sorted(glob.glob(os.path.join(barcode_dir, '*.ranked.bed')))
    if len(ranked_beds) != 2:
        raise ValueError(
            "expected exactly 2 '*.ranked.bed' files in {}, found {}".format(barcode_dir, len(ranked_beds))
        )
    # Sorted order is relied on to put the 'fwd' file ahead of the 'rev' file
    # (assumes the two names differ only in that tag).
    fwd, rev = ranked_beds
    fwd_df = pd.read_csv(fwd, sep='\t', names=bed_head)
    rev_df = pd.read_csv(rev, sep='\t', names=bed_head)
    num_fwd, num_rev = fwd_df.shape[0], rev_df.shape[0]
    # Explicit check rather than assert, so it still runs under `python -O`.
    if num_fwd != 0 or num_rev != 0:
        print("{} -> has {} fwd and {} rev edits".format(os.path.basename(barcode_dir), num_fwd, num_rev))
    
# Try the two helpers on a single barcode: show its results directory, then check it for edits.
example_results_dir = get_path_from_barcode('AAACGCTCAGTAACCT-1')
print(example_results_dir)
get_num_edits(example_results_dir)

/home/bay001/projects/kris_apobec_20200121/permanent_data2/05_scRNA_RBFOX2_APO_transient/sailor_outputs_individual_barcodes_AA/RBFOX2_transient_possorted_genome_bam_MD-AAACGCTCAGTAACCT-1/results


In [11]:
# Run the edit check on every barcode in the missing-barcodes table;
# get_num_edits prints a line for any barcode that has fwd or rev edits.
missing_barcodes = rbfox2_missing_from_final_dataframe.index
progress = tnrange(len(missing_barcodes))
for missing_barcode in missing_barcodes:
    results_dir = get_path_from_barcode(missing_barcode)
    get_num_edits(results_dir)
    progress.update(1)

HBox(children=(IntProgress(value=0, max=304), HTML(value='')))