In [1]:
import pandas as pd
import numpy as np
import os
import glob

In [2]:
# Excel workbook holding the barcode tables; its 'Sheet1' and 'Sheet2' sheets are
# read with pd.read_excel further down.
barcodes_file = '/home/bay001/projects/kris_apobec_20200121/permanent_data2/03_scRNA_APOBEC_RPS2/inputs/Top_100_apo_rps2_barcodes.xlsx'

# NOTE(review): tmp_dir is not referenced by the cells that follow — confirm it is still needed.
tmp_dir = '/home/bay001/projects/kris_apobec_20200121/permanent_data2/03_scRNA_APOBEC_RPS2/tmp'
# Directory searched for each barcode's '<barcode>.bed' file (default input_dir of
# get_sailor_bed_from_barcode_index).
bed_dir = '/home/bay001/projects/kris_apobec_20200121/permanent_data2/03_scRNA_APOBEC_RPS2/sailor_outputs_individual_barcodes_merged_bedfiles/'
# Destination of the filtered '.bedgraph' and '.bed' files written by the export loops.
bedgraph_dir = '/home/bay001/projects/kris_apobec_20200121/permanent_data2/03_scRNA_APOBEC_RPS2/sailor_outputs_individual_barcodes_merged_bedgraphs'
# NOTE(review): output_dir is not referenced by the cells that follow — confirm it is still needed.
output_dir = '/home/bay001/projects/kris_apobec_20200121/permanent_data2/03_scRNA_APOBEC_RPS2/outputs'

In [3]:
# Threshold handed to bed_to_bedgraph: rows whose 'name' value is below it are
# dropped. The value is also embedded in every output file name ('...conf0.5').
conf = 0.5
# NOTE(review): `top` is not referenced by the cells that follow; presumably the
# number of barcodes per sheet — confirm.
top = 100

In [4]:
# Load each sheet, order its rows from highest to lowest n_edits, renumber the
# index from 0, and record the 1-based position as 'rank'.
apo_barcodes = (
    pd.read_excel(barcodes_file, sheet_name='Sheet1')
    .sort_values(by=['n_edits'], ascending=False)
    .reset_index(drop=True)
)
apo_barcodes['rank'] = apo_barcodes.index + 1

rps2_barcodes = (
    pd.read_excel(barcodes_file, sheet_name='Sheet2')
    .sort_values(by=['n_edits'], ascending=False)
    .reset_index(drop=True)
)
rps2_barcodes['rank'] = rps2_barcodes.index + 1

rps2_barcodes.head()

Unnamed: 0,barcode,class,n_edits,batch,rank
0,RPS2_possorted_genome_bam_MD-CGGCAGTGTGCGTGCT-...,RPS2-Edits,5.986034,RPS2-fSTAMP,1
1,RPS2_possorted_genome_bam_MD-ACTGTGATCAGACCGC-...,RPS2-Edits,4.031267,RPS2-fSTAMP,2
2,RPS2_possorted_genome_bam_MD-GACCCAGCACTTCAAG-...,RPS2-Edits,3.897882,RPS2-fSTAMP,3
3,RPS2_possorted_genome_bam_MD-CTCATCGGTCGAGCAA-...,RPS2-Edits,3.826919,RPS2-fSTAMP,4
4,RPS2_possorted_genome_bam_MD-ACAGCCGAGCCTGGAA-...,RPS2-Edits,3.793303,RPS2-fSTAMP,5


In [5]:
def get_sailor_bed_from_barcode_index(row, input_dir=bed_dir):
    """
    Return the path of the BED file that belongs to one barcode row.

    The file name is the row's 'barcode' value with any '-Apo-Edits' or
    '-RPS2-Edits' substring removed, followed by '.bed'.

    Parameters
    ----------
    row : mapping-like (e.g. a pandas Series)
        Must provide a string under the key 'barcode'.
    input_dir : str
        Directory the BED file is expected in. The default is the value
        ``bed_dir`` had when this function was defined.

    Returns
    -------
    str
        Path to the BED file, which is guaranteed to exist.

    Raises
    ------
    AssertionError
        If the expected BED file does not exist.
    """
    barcode = row['barcode']
    file_stem = barcode.replace('-Apo-Edits', '').replace('-RPS2-Edits', '')
    bed = os.path.join(input_dir, '{}.bed'.format(file_stem))
    # Name the barcode and the path in the failure so a missing file can be
    # tracked down without re-running the lookup by hand.
    assert os.path.exists(bed), 'BED file not found for barcode {!r}: {}'.format(barcode, bed)
    return bed

In [6]:
# Resolve the BED file of every barcode and store it in a new 'bed' column;
# the lookup asserts that each file exists.
for barcode_table in (apo_barcodes, rps2_barcodes):
    barcode_table['bed'] = barcode_table.apply(get_sailor_bed_from_barcode_index, axis=1)

rps2_barcodes.head()

Unnamed: 0,barcode,class,n_edits,batch,rank,bed
0,RPS2_possorted_genome_bam_MD-CGGCAGTGTGCGTGCT-...,RPS2-Edits,5.986034,RPS2-fSTAMP,1,/home/bay001/projects/kris_apobec_20200121/per...
1,RPS2_possorted_genome_bam_MD-ACTGTGATCAGACCGC-...,RPS2-Edits,4.031267,RPS2-fSTAMP,2,/home/bay001/projects/kris_apobec_20200121/per...
2,RPS2_possorted_genome_bam_MD-GACCCAGCACTTCAAG-...,RPS2-Edits,3.897882,RPS2-fSTAMP,3,/home/bay001/projects/kris_apobec_20200121/per...
3,RPS2_possorted_genome_bam_MD-CTCATCGGTCGAGCAA-...,RPS2-Edits,3.826919,RPS2-fSTAMP,4,/home/bay001/projects/kris_apobec_20200121/per...
4,RPS2_possorted_genome_bam_MD-ACAGCCGAGCCTGGAA-...,RPS2-Edits,3.793303,RPS2-fSTAMP,5,/home/bay001/projects/kris_apobec_20200121/per...


In [7]:
def calculate_edit_fraction(row):
    """
    Turn a row's 'score' string of the form '<edited>,<total>' into a fraction.

    The text before the comma is parsed as an int and the text after it as a
    float; the result is their quotient. A 'score' that does not split into
    exactly two comma-separated parts raises ValueError, and a total of zero
    raises ZeroDivisionError.
    """
    parts = row['score'].split(',')
    n_edited, n_total = parts
    return int(n_edited) / float(n_total)

def bed_to_bedgraph(bed, bedgraph_output_file, bed_output_file, conf):
    """
    Filter one six-column, tab-separated file and write it out in two layouts.

    The input is read without a header row as chrom, start, end, name, score,
    strand. Two columns are derived for every row: 'chromfx' ('chr' prepended
    to the chrom value) and 'editfrac' (calculate_edit_fraction applied to the
    row). Only rows whose 'name' value is >= ``conf`` are written.
    NOTE(review): 'name' presumably carries a confidence score — confirm
    against the producer of these files.

    Parameters
    ----------
    bed : str
        Path of the input file.
    bedgraph_output_file : str
        Receives chromfx, start, end, editfrac.
    bed_output_file : str
        Receives chromfx, start, end, name, editfrac, strand.
    conf : float
        Minimum 'name' value a row needs to be kept.

    Both outputs are tab-separated with no header and no index column.
    """
    input_columns = ['chrom','start','end','name','score','strand']
    sites = pd.read_csv(bed, sep='\t', names=input_columns)
    sites['chromfx'] = 'chr' + sites['chrom'].astype(str)
    sites['editfrac'] = sites.apply(calculate_edit_fraction, axis=1)

    keep = sites['name'] >= conf
    passing = sites[keep]

    bed_columns = ['chromfx','start','end','name','editfrac','strand']
    bedgraph_columns = ['chromfx','start','end','editfrac']
    passing[bed_columns].to_csv(bed_output_file, sep='\t', index=False, header=False)
    passing[bedgraph_columns].to_csv(bedgraph_output_file, sep='\t', index=False, header=False)

In [8]:
# Write one filtered .bedgraph/.bed pair per barcode into bedgraph_dir.
# The two tables go through the same steps and differ only in the label that
# starts the file name, so a single nested loop handles both.
for label, barcode_table in (('APOBEC', apo_barcodes), ('RPS2', rps2_barcodes)):
    # iterrows() yields (index, row) pairs; only the row is needed here.
    for _, row in barcode_table.iterrows():
        output_file_prefix = os.path.join(
            bedgraph_dir,
            '{}-rank-{}.{}.n_edits{}.conf{}'.format(
                label, row['rank'], row['barcode'], row['n_edits'], conf
            )
        )
        bedgraph_output_file = output_file_prefix + '.bedgraph'
        bed_output_file = output_file_prefix + '.bed'

        bed_to_bedgraph(row['bed'], bedgraph_output_file, bed_output_file, conf)

In [9]:
# Display the RPS2 table, which now carries the 'rank' and 'bed' columns added in the earlier cells.
rps2_barcodes

Unnamed: 0,barcode,class,n_edits,batch,rank,bed
0,RPS2_possorted_genome_bam_MD-CGGCAGTGTGCGTGCT-...,RPS2-Edits,5.986034,RPS2-fSTAMP,1,/home/bay001/projects/kris_apobec_20200121/per...
1,RPS2_possorted_genome_bam_MD-ACTGTGATCAGACCGC-...,RPS2-Edits,4.031267,RPS2-fSTAMP,2,/home/bay001/projects/kris_apobec_20200121/per...
2,RPS2_possorted_genome_bam_MD-GACCCAGCACTTCAAG-...,RPS2-Edits,3.897882,RPS2-fSTAMP,3,/home/bay001/projects/kris_apobec_20200121/per...
3,RPS2_possorted_genome_bam_MD-CTCATCGGTCGAGCAA-...,RPS2-Edits,3.826919,RPS2-fSTAMP,4,/home/bay001/projects/kris_apobec_20200121/per...
4,RPS2_possorted_genome_bam_MD-ACAGCCGAGCCTGGAA-...,RPS2-Edits,3.793303,RPS2-fSTAMP,5,/home/bay001/projects/kris_apobec_20200121/per...
5,RPS2_possorted_genome_bam_MD-GGGAAGTTCCCATACC-...,RPS2-Edits,3.773452,RPS2-fSTAMP,6,/home/bay001/projects/kris_apobec_20200121/per...
6,RPS2_possorted_genome_bam_MD-AATAGAGCAGCTACAT-...,RPS2-Edits,3.760434,RPS2-fSTAMP,7,/home/bay001/projects/kris_apobec_20200121/per...
7,RPS2_possorted_genome_bam_MD-GCCAGCATCCTACGGG-...,RPS2-Edits,3.665949,RPS2-fSTAMP,8,/home/bay001/projects/kris_apobec_20200121/per...
8,RPS2_possorted_genome_bam_MD-ATGGTTGAGGTTCTTG-...,RPS2-Edits,3.652080,RPS2-fSTAMP,9,/home/bay001/projects/kris_apobec_20200121/per...
9,RPS2_possorted_genome_bam_MD-ACGTAGTAGCTTCGTA-...,RPS2-Edits,3.645473,RPS2-fSTAMP,10,/home/bay001/projects/kris_apobec_20200121/per...
