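# We need bigwig files for fast access to coverage (used by the score_edits_* scripts). This notebook writes the per-cell bedgraph files (presumably the input for the bigwig conversion).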

In [1]:
import glob
import pandas as pd
import os
from tqdm import tnrange, tqdm_notebook

In [2]:
# Directory searched by index_to_bed() for the per-barcode "<barcode>.fx.bed" files.
bed_file_dir = '/home/bay001/projects/kris_apobec_20200121/permanent_data2/03_scRNA_APOBEC_RPS2/sailor_outputs_individual_barcodes_merged_bedfiles'
# Directory holding the barcode/expression CSV tables read below.
input_dir = '/home/bay001/projects/kris_apobec_20200121/permanent_data2/03_scRNA_APOBEC_RPS2/inputs/'
# Directory the generated bedgraph files are written to.
output_dir = '/home/bay001/projects/kris_apobec_20200121/permanent_data2/03_scRNA_APOBEC_RPS2/bedgraph_files'

In [3]:
# Number of barcodes kept per table after sorting by lenti_common_normalized (passed to .head()).
top = 10

In [4]:
# Load the per-barcode expression tables (one row per barcode) for both samples.
apo_barcodes = pd.read_csv(os.path.join(input_dir, 'APOBEC_STAMP_Apo_filtered_lenti_common_expression.csv'))
rps2_barcodes = pd.read_csv(os.path.join(input_dir, 'RPS2_STAMP_Apo_filtered_lenti_common_expression.csv'))

# Preview the first rows of the APOBEC table.
apo_barcodes.head()

Unnamed: 0,index,batch,lenti_common_normalized,lenti_common,new_clusters
0,Apo_Control_possorted_genome_bam_MD-AAACCCAAGC...,Apo-Edits,-0.809566,0.0,APOBEC_STAMP
1,Apo_Control_possorted_genome_bam_MD-AAACCCAAGG...,Apo-Edits,-0.782604,0.0,APOBEC_STAMP
2,Apo_Control_possorted_genome_bam_MD-AAACCCACAA...,Apo-Edits,-0.434219,1.0,APOBEC_STAMP
3,Apo_Control_possorted_genome_bam_MD-AAACCCACAC...,Apo-Edits,-0.545899,1.0,APOBEC_STAMP
4,Apo_Control_possorted_genome_bam_MD-AAACCCACAG...,Apo-Edits,1.697777,28.0,APOBEC_STAMP


In [5]:
# Rank each table by lenti_common_normalized (highest first) and keep only
# the first `top` barcodes of each.
rps2_barcodes = rps2_barcodes.sort_values(by=['lenti_common_normalized'], ascending=False).head(top)
apo_barcodes = apo_barcodes.sort_values(by=['lenti_common_normalized'], ascending=False).head(top)

# Display the selected RPS2 barcodes.
rps2_barcodes

Unnamed: 0,index,batch,lenti_common_normalized,lenti_common,new_clusters
22,RPS2_possorted_genome_bam_MD-AAAGTCCCACGTACAT-...,RPS2-Edits,4.660247,180.0,RPS2_STAMP
1401,RPS2_possorted_genome_bam_MD-CGGCAGTGTGCGTGCT-...,RPS2-Edits,3.951416,149.0,RPS2_STAMP
753,RPS2_possorted_genome_bam_MD-ATGAGGGCAATAGAGT-...,RPS2-Edits,3.776582,108.0,RPS2_STAMP
189,RPS2_possorted_genome_bam_MD-ACACAGTTCGTCTAAG-...,RPS2-Edits,3.666702,16.0,RPS2_STAMP
2461,RPS2_possorted_genome_bam_MD-GTGCAGCTCACGATAC-...,RPS2-Edits,3.622875,82.0,RPS2_STAMP
1144,RPS2_possorted_genome_bam_MD-CCAATTTAGAGTTGCG-...,RPS2-Edits,3.516732,134.0,RPS2_STAMP
3231,RPS2_possorted_genome_bam_MD-TGCTTCGAGATGACCG-...,RPS2-Edits,3.499275,76.0,RPS2_STAMP
3554,RPS2_possorted_genome_bam_MD-TTGTTTGGTCGGATTT-...,RPS2-Edits,3.45946,100.0,RPS2_STAMP
584,RPS2_possorted_genome_bam_MD-AGTCAACAGTGGACGT-...,RPS2-Edits,3.453263,67.0,RPS2_STAMP
193,RPS2_possorted_genome_bam_MD-ACACCAATCCTAAGTG-...,RPS2-Edits,3.398006,37.0,RPS2_STAMP


In [6]:
def index_to_bed(row, to_replace):
    """
    Return the path of the BED file that belongs to one barcode row.

    Every substring listed in ``to_replace`` is removed from the row's
    'index' value; the result plus ".fx.bed" is then looked up inside the
    module-level ``bed_file_dir``.

    :param row: row (Series or mapping) with a string 'index' entry.
    :param to_replace: iterable of substrings to strip from the index value.
    :return: path of the existing BED file.
    :raises FileNotFoundError: if the expected BED file does not exist.
    """
    barcode = row['index']
    for replace in to_replace:
        barcode = barcode.replace(replace, "")

    bed = os.path.join(bed_file_dir, barcode + ".fx.bed")
    # Raise explicitly instead of asserting: asserts are stripped under
    # `python -O`, and a bare assert gives no hint which file is missing.
    if not os.path.exists(bed):
        raise FileNotFoundError(
            "BED file not found for index {!r}: {}".format(row['index'], bed)
        )
    return bed

# Resolve every selected barcode to its BED file, stripping the
# sample-specific batch suffix from the index first.
rps2_barcodes['bed'] = rps2_barcodes.apply(lambda row: index_to_bed(row, ['-RPS2-Edits']), axis=1)
apo_barcodes['bed'] = apo_barcodes.apply(lambda row: index_to_bed(row, ['-Apo-Edits']), axis=1)
# Preview the RPS2 table with the new 'bed' column.
rps2_barcodes.head()

Unnamed: 0,index,batch,lenti_common_normalized,lenti_common,new_clusters,bed
22,RPS2_possorted_genome_bam_MD-AAAGTCCCACGTACAT-...,RPS2-Edits,4.660247,180.0,RPS2_STAMP,/home/bay001/projects/kris_apobec_20200121/per...
1401,RPS2_possorted_genome_bam_MD-CGGCAGTGTGCGTGCT-...,RPS2-Edits,3.951416,149.0,RPS2_STAMP,/home/bay001/projects/kris_apobec_20200121/per...
753,RPS2_possorted_genome_bam_MD-ATGAGGGCAATAGAGT-...,RPS2-Edits,3.776582,108.0,RPS2_STAMP,/home/bay001/projects/kris_apobec_20200121/per...
189,RPS2_possorted_genome_bam_MD-ACACAGTTCGTCTAAG-...,RPS2-Edits,3.666702,16.0,RPS2_STAMP,/home/bay001/projects/kris_apobec_20200121/per...
2461,RPS2_possorted_genome_bam_MD-GTGCAGCTCACGATAC-...,RPS2-Edits,3.622875,82.0,RPS2_STAMP,/home/bay001/projects/kris_apobec_20200121/per...


In [7]:
def calculate_edit_fraction(row):
    """
    Return edited/total for a row whose 'score' is "<edited>,<total>".

    :param row: row (Series or mapping) with a 'score' string holding exactly
        two comma-separated numbers.
    :return: the first number (parsed as int) divided by the second (parsed
        as float).
    """
    fields = row['score'].split(',')
    # Unpacking enforces exactly two comma-separated fields.
    edited_count, total_count = fields
    return int(edited_count) / float(total_count)


def convert_to_bedgraph(input_bed, output_dir, conf):
    """
    Build a bedgraph table of edit fractions from one six-column BED file.

    The file is read as tab-separated columns chrom, start, end, conf, score,
    strand. Rows whose 'conf' is below ``conf`` are dropped, and each
    remaining "<edited>,<total>" score becomes edited/total.

    :param input_bed: path of the tab-separated BED file (no header line).
    :param output_dir: directory used to build the output path; nothing is
        written here, the caller saves the table.
    :param conf: minimum 'conf' value (inclusive) a row needs to be kept.
    :return: tuple (table, path). The table has columns chrom, start, end,
        editfrac. The path is ``output_dir`` joined with the input basename
        where '.bed' is replaced by '.conf<conf>.bedgraph'.
    """
    df = pd.read_csv(input_bed, sep='\t', names=['chrom','start','end','conf','score','strand'])
    # Copy the filtered rows so the new column is added to an independent
    # frame (no SettingWithCopyWarning).
    df = df[df['conf'] >= conf].copy()

    # Iterate the score column directly instead of DataFrame.apply(axis=1):
    # on an empty frame apply() returns a DataFrame, which cannot be assigned
    # to a single column, so a threshold that removes every row used to crash.
    fractions = []
    for score in df['score']:
        edited, total = score.split(',')
        fractions.append(int(edited) / float(total))
    df['editfrac'] = pd.Series(fractions, index=df.index, dtype=float)

    bedgraph_name = os.path.basename(input_bed).replace('.bed', '.conf{}.bedgraph'.format(conf))
    return df[['chrom','start','end','editfrac']], os.path.join(output_dir, bedgraph_name)

In [8]:
# Minimum 'conf' thresholds; one bedgraph file is produced per BED file and threshold.
confs = [0.0, 0.5, 0.9, 0.99, 0.999, 1]

In [9]:
# Write one bedgraph file for every (threshold, RPS2 BED file) combination,
# walking all BED files for the first threshold before the next one.
progress = tnrange(len(confs) * len(rps2_barcodes['bed']))
for conf, bed in ((c, b) for c in confs for b in rps2_barcodes['bed']):
    df, bedgraph_file = convert_to_bedgraph(bed, output_dir, conf)
    # Plain bedgraph: tab-separated, no header line and no index column.
    df.to_csv(bedgraph_file, sep='\t', index=False, header=False)
    progress.update(1)

HBox(children=(IntProgress(value=0, max=60), HTML(value='')))

In [10]:
# Write one bedgraph file for every (threshold, APOBEC BED file) combination.
progress = tnrange(len(apo_barcodes['bed'])*len(confs))
for conf in confs:
    # Iterate the APOBEC BED files. This loop previously walked
    # rps2_barcodes['bed'] again, so no APOBEC bedgraph was ever written.
    for bed in apo_barcodes['bed']:
        df, bedgraph_file = convert_to_bedgraph(bed, output_dir, conf)
        # Plain bedgraph: tab-separated, no header line and no index column.
        df.to_csv(bedgraph_file, sep='\t', header=False, index=False)
        progress.update(1)

HBox(children=(IntProgress(value=0, max=60), HTML(value='')))

In [11]:
# NOTE(review): this cell repeats the RPS2 conversion from cells In [8]/In [9]
# with the same thresholds, so it rewrites the same output files. Confirm
# whether it is still needed.
confs = [0.0, 0.5, 0.9, 0.99, 0.999, 1]
progress = tnrange(len(confs) * len(rps2_barcodes['bed']))
for conf, bed in ((c, b) for c in confs for b in rps2_barcodes['bed']):
    df, bedgraph_file = convert_to_bedgraph(bed, output_dir, conf)
    # Plain bedgraph: tab-separated, no header line and no index column.
    df.to_csv(bedgraph_file, sep='\t', index=False, header=False)
    progress.update(1)

HBox(children=(IntProgress(value=0, max=60), HTML(value='')))