# Join all the scored files together

In [1]:
import glob
import os
import pandas as pd
from tqdm import tnrange, tqdm_notebook

In [2]:
# Directory that is globbed below for the per-barcode scored files
# (names matching '*possorted_genome_bam_MD*.exons.merged.txt').
input_dir = '/home/bay001/projects/kris_apobec_20200121/permanent_data2/05_scRNA_RBFOX2_APO_transient/sailor_outputs_individual_barcodes_merged_scores_no_APO_filter'
# Directory that receives the merged score tables (one .tsv per sample prefix).
# NOTE(review): nothing in this notebook creates it - presumably it already exists; confirm before a fresh run.
output_dir = '/home/bay001/projects/kris_apobec_20200121/permanent_data2/05_scRNA_RBFOX2_APO_transient/outputs'

In [3]:
# Glob patterns for the scored files of each sample prefix inside input_dir.
rbfox2_pattern = os.path.join(input_dir, 'RBFOX2*possorted_genome_bam_MD*.exons.merged.txt')
apobec_pattern = os.path.join(input_dir, 'APOBEC*possorted_genome_bam_MD*.exons.merged.txt')

# Sort the matches so the file order (and therefore the merged column order) is stable between runs.
all_rbfox2_scored = sorted(glob.glob(rbfox2_pattern))
all_apobec_scored = sorted(glob.glob(apobec_pattern))

# Sanity check: number of files found per prefix, plus the first few paths of each.
print(len(all_rbfox2_scored), len(all_apobec_scored))
all_rbfox2_scored[:3], all_apobec_scored[:3]

9169 9293


(['/home/bay001/projects/kris_apobec_20200121/permanent_data2/05_scRNA_RBFOX2_APO_transient/sailor_outputs_individual_barcodes_merged_scores_no_APO_filter/RBFOX2_transient_possorted_genome_bam_MD-AAACCCAAGGCTAAAT-1.fx.bed.annotated.exons.merged.txt',
  '/home/bay001/projects/kris_apobec_20200121/permanent_data2/05_scRNA_RBFOX2_APO_transient/sailor_outputs_individual_barcodes_merged_scores_no_APO_filter/RBFOX2_transient_possorted_genome_bam_MD-AAACCCAAGGTGCTAG-1.fx.bed.annotated.exons.merged.txt',
  '/home/bay001/projects/kris_apobec_20200121/permanent_data2/05_scRNA_RBFOX2_APO_transient/sailor_outputs_individual_barcodes_merged_scores_no_APO_filter/RBFOX2_transient_possorted_genome_bam_MD-AAACCCAAGTGCTCAT-1.fx.bed.annotated.exons.merged.txt'],
 ['/home/bay001/projects/kris_apobec_20200121/permanent_data2/05_scRNA_RBFOX2_APO_transient/sailor_outputs_individual_barcodes_merged_scores_no_APO_filter/APOBEC_transient_possorted_genome_bam_MD-AAACCCAAGAGAGGTA-1.fx.bed.annotated.exons.merged.t

In [4]:
def format_name(fn):
    """
    Turns a scored-file path into a short column label: the directory part is
    dropped and the '.fx.bed.annotated.exons.merged.txt' text is removed from
    the remaining file name.
    """
    scored_suffix = '.fx.bed.annotated.exons.merged.txt'
    file_name = os.path.basename(fn)
    return file_name.replace(scored_suffix, '')

# The merge below runs in two tiers: the scored files are first merged in fixed-size groups, and the per-group results are then merged with each other. Merging by tiers speeds things up quite a bit.

In [5]:
def chunker(seq, size):
    """
    Lazily splits a sliceable sequence into consecutive pieces of at most
    (size) items; the final piece holds whatever is left over.
    """
    # Start offset of every piece: 0, size, 2 * size, ...
    starts = range(0, len(seq), size)
    return (seq[start:start + size] for start in starts)

def merge_scores(all_scored, output_file, score, groupsize=500):
    """
    Merges one score column from many scored files into a single table and
    writes that table to (output_file).

    Every file in (all_scored) is read as a tab-separated table whose first
    column becomes the index. Only the (score) column is kept, and it is
    renamed to format_name(file). The columns are outer-joined on the index in
    two tiers: files are first merged in groups of (groupsize), then the group
    tables are merged with each other. Index entries missing from a file end
    up as 0 in that file's column.

    Parameters
    ----------
    all_scored : list of str
        Paths of the scored files to merge.
    output_file : str
        Path of the tab-separated file to write (header and index included).
    score : str
        Name of the column to extract from every scored file.
    groupsize : int, optional
        Number of scored files merged per first-tier group (default 500).

    Returns
    -------
    None. The merged table is only written to disk.
    """
    all_merged = []
    # The bar is advanced by hand from the inner loop, so it is created with
    # total= rather than wrapped around an iterable. The with-block closes the
    # widget even if a file fails to load.
    with tqdm_notebook(total=len(all_scored)) as progress:
        for group in chunker(all_scored, groupsize):
            merged = pd.DataFrame()
            for scored in group:
                # Double brackets keep a one-column DataFrame (not a Series).
                df = pd.read_csv(scored, index_col=0, sep='\t')[[score]]
                df.columns = [format_name(scored)]
                merged = pd.merge(merged, df, how='outer', left_index=True, right_index=True)
                progress.update(1)
            all_merged.append(merged)
    # Number of first-tier group tables that still have to be combined.
    print(len(all_merged))

    all_all_merged = pd.DataFrame()
    with tqdm_notebook(total=len(all_merged)) as progress:
        for merged in all_merged:
            all_all_merged = pd.merge(all_all_merged, merged, how='outer', left_index=True, right_index=True)
            progress.update(1)

    # Gaps left by the outer joins are written out as 0.
    all_all_merged = all_all_merged.fillna(0)
    all_all_merged.to_csv(output_file, sep='\t', header=True, index=True)

In [6]:
# Column pulled out of every scored file; it is also embedded in the output file names.
score = 'edited_over_all_c'

# One merge job per sample prefix: (scored files, output file name template).
merge_jobs = [
    (all_rbfox2_scored, 'RBFOX2_transient_possorted_genome_bam_MD.exons.merged.{}.tsv'),
    (all_apobec_scored, 'APOBEC_transient_possorted_genome_bam_MD.exons.merged.{}.tsv'),
]
for scored_files, name_template in merge_jobs:
    merge_scores(
        all_scored=scored_files,
        output_file=os.path.join(output_dir, name_template.format(score)),
        score=score
    )

HBox(children=(IntProgress(value=0, max=9169), HTML(value='')))

19


HBox(children=(IntProgress(value=0, max=19), HTML(value='')))

HBox(children=(IntProgress(value=0, max=9293), HTML(value='')))

19


HBox(children=(IntProgress(value=0, max=19), HTML(value='')))