# Now we should have editC scores from all cells (minus the cells that did not have any edits). Join all the scored files together in this notebook.
- Some cells may have an editC score of zero across the board (in every row of the merged table), even though they were found to have edits. This is worth checking; when I did, these were just cells where edits existed only within introns. There should be only a few cases like this, and intron-only edits should explain ALL cells with all-zero editC scores.
- **This is a critical time to check that ALL jobs have run properly for ALL cells**. 

In [1]:
import glob
import os
import pandas as pd
from tqdm import tnrange, tqdm_notebook

In [2]:
# Directory that the per-cell '*.exons.merged.txt' score files are globbed from.
input_dir = (
    '/home/bay001/projects/kris_apobec_20200121/permanent_data2/03_scRNA/'
    'sailor_outputs_individual_barcodes_merged_scores_cds_only/'
)
# Directory that the merged per-sample score tables are written to.
output_dir = (
    '/home/bay001/projects/kris_apobec_20200121/permanent_data2/'
    '03_scRNA_APOBEC_RPS2/outputs'
)

In [3]:
# Gather the scored files of each sample from input_dir, ordered by path.
all_apo_scored = glob.glob(os.path.join(input_dir, 'Apo_Control*.exons.merged.txt'))
all_apo_scored.sort()
all_rps2_scored = glob.glob(os.path.join(input_dir, 'RPS2*.exons.merged.txt'))
all_rps2_scored.sort()

# Show how many files matched for each sample, then preview a few RPS2 paths.
print(len(all_apo_scored), len(all_rps2_scored))
all_rps2_scored[:3]

8616 10995


['/home/bay001/projects/kris_apobec_20200121/permanent_data2/03_scRNA/sailor_outputs_individual_barcodes_merged_scores_cds_only/RPS2_possorted_genome_bam_MD-AAACCCACAGGATCTT-1.fx.bed.annotated.exons.merged.txt',
 '/home/bay001/projects/kris_apobec_20200121/permanent_data2/03_scRNA/sailor_outputs_individual_barcodes_merged_scores_cds_only/RPS2_possorted_genome_bam_MD-AAACCCACAGGTACGA-1.fx.bed.annotated.exons.merged.txt',
 '/home/bay001/projects/kris_apobec_20200121/permanent_data2/03_scRNA/sailor_outputs_individual_barcodes_merged_scores_cds_only/RPS2_possorted_genome_bam_MD-AAACCCACAGTTCACA-1.fx.bed.annotated.exons.merged.txt']

In [4]:
def format_name(fn):
    """
    Turns a scored-file path into a short column name: takes the file name
    (directories dropped) and removes the
    '.fx.bed.annotated.exons.merged.txt' text from it.
    """
    suffix = '.fx.bed.annotated.exons.merged.txt'
    _, filename = os.path.split(fn)
    return filename.replace(suffix, '')

# I'm doing these merges in batches since merging over 20k cells takes too long
- better to merge 500 at a time and then merge the resulting per-batch dataframes together (18 batches for Apo_Control and 22 for RPS2 in the runs shown below).

In [5]:
def chunker(seq, size):
    """
    Splits a long sequence into consecutive slices of at most (size) items.
    Slices are produced lazily; the last one holds whatever is left over.
    """
    starts = range(0, len(seq), size)
    return (seq[start:start + size] for start in starts)

# Build one merged table per score type for the Apo_Control files: one column
# per scored file (named via format_name), rows aligned on the files' index
# column, and combinations that are missing from a file filled with 0.
for score in ['edited_over_all_c', 'edited_over_edited_c']:
    all_merged = []
    groupsize = 500 # 500 sailor runs per batch.

    # The context manager closes the progress bar even if a file fails to load.
    with tnrange(len(all_apo_scored)) as progress:
        for group in chunker(all_apo_scored, groupsize):
            # Gather the batch's single-column frames and combine them with ONE
            # concat. Growing a frame with pd.merge once per file re-copies the
            # accumulated frame every time, which is what made this step slow.
            batch = []
            for scored in group:
                df = pd.read_csv(scored, index_col=0, sep='\t')[[score]]
                df.columns = [format_name(scored)]
                batch.append(df)
                progress.update(1)
            # join='outer' keeps the union of row labels, as the outer index
            # merges did; sort=True asks for the rows in sorted label order.
            # NOTE(review): assumes row labels are unique within each file --
            # concat raises on misaligned duplicate labels instead of silently
            # multiplying rows the way a merge would. TODO confirm.
            merged = pd.concat(batch, axis=1, join='outer', sort=True)
            all_merged.append(merged)

    # Combine the per-batch frames in a single pass. With no input files there
    # is nothing to concatenate, so fall back to an empty frame (an empty table
    # is still written, as before).
    if all_merged:
        all_all_merged = pd.concat(all_merged, axis=1, join='outer', sort=True)
    else:
        all_all_merged = pd.DataFrame()

    print(all_all_merged.shape)
    # Rows absent from a given file come out of the outer join as NaN; store 0.
    all_all_merged = all_all_merged.fillna(0)
    all_all_merged.to_csv(os.path.join(output_dir, 'Apo_Control_possorted_genome_bam_MD.exons.merged.{}.cds_only.tsv'.format(score)), sep='\t', header=True, index=True)

HBox(children=(IntProgress(value=0, max=8616), HTML(value='')))

HBox(children=(IntProgress(value=0, max=18), HTML(value='')))

(2287, 8616)


HBox(children=(IntProgress(value=0, max=8616), HTML(value='')))

HBox(children=(IntProgress(value=0, max=18), HTML(value='')))

(2287, 8616)


# Do the same with RPS2

In [6]:
# Build one merged table per score type for the RPS2 files: one column per
# scored file (named via format_name), rows aligned on the files' index
# column, and combinations that are missing from a file filled with 0.
for score in ['edited_over_all_c', 'edited_over_edited_c']:
    all_merged = []
    groupsize = 500 # 500 sailor runs per batch.

    # The context manager closes the progress bar even if a file fails to load.
    with tnrange(len(all_rps2_scored)) as progress:
        for group in chunker(all_rps2_scored, groupsize):
            # Gather the batch's single-column frames and combine them with ONE
            # concat. Growing a frame with pd.merge once per file re-copies the
            # accumulated frame every time, which is what made this step slow.
            batch = []
            for scored in group:
                df = pd.read_csv(scored, index_col=0, sep='\t')[[score]]
                df.columns = [format_name(scored)]
                batch.append(df)
                progress.update(1)
            # join='outer' keeps the union of row labels, as the outer index
            # merges did; sort=True asks for the rows in sorted label order.
            # NOTE(review): assumes row labels are unique within each file --
            # concat raises on misaligned duplicate labels instead of silently
            # multiplying rows the way a merge would. TODO confirm.
            merged = pd.concat(batch, axis=1, join='outer', sort=True)
            all_merged.append(merged)

    # Combine the per-batch frames in a single pass. With no input files there
    # is nothing to concatenate, so fall back to an empty frame (an empty table
    # is still written, as before).
    if all_merged:
        all_all_merged = pd.concat(all_merged, axis=1, join='outer', sort=True)
    else:
        all_all_merged = pd.DataFrame()

    print(all_all_merged.shape)
    # Rows absent from a given file come out of the outer join as NaN; store 0.
    all_all_merged = all_all_merged.fillna(0)
    all_all_merged.to_csv(os.path.join(output_dir, 'RPS2_possorted_genome_bam_MD.exons.merged.{}.cds_only.tsv'.format(score)), sep='\t', header=True, index=True)

HBox(children=(IntProgress(value=0, max=10995), HTML(value='')))

HBox(children=(IntProgress(value=0, max=22), HTML(value='')))

(2464, 10995)


HBox(children=(IntProgress(value=0, max=10995), HTML(value='')))

HBox(children=(IntProgress(value=0, max=22), HTML(value='')))

(2464, 10995)
