# reads bed file outputs containing edits and converts to bedgraph based on various conf levels.

In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
import os
import pandas as pd
import numpy as np
import glob
from tqdm import tnrange, tqdm_notebook

In [2]:
input_dir = '/home/bay001/projects/kris_apobec_20200121/permanent_data2/05_scRNA_RBFOX2_APO_transient/sailor_outputs_individual_barcodes_merged_bedfiles'
output_dir = '/home/bay001/projects/kris_apobec_20200121/permanent_data2/05_scRNA_RBFOX2_APO_transient/sailor_outputs_individual_barcodes_merged_bedgraphs'
ref_dir = '/home/bay001/projects/kris_apobec_20200121/permanent_data2/05_scRNA_RBFOX2_APO_transient/inputs/'
tmp_dir = '/home/bay001/projects/kris_apobec_20200121/permanent_data2/05_scRNA_RBFOX2_APO_transient/tmp/'

# Read in expression values from Alex to help us decide which barcodes to look for/split.

original sources: 
```
RPS2_STAMP_Apo_filtered_lenti_common_expression.csv
APOBEC_STAMP_Apo_filtered_lenti_common_expression.csv
```

In [3]:
rbfox2_bcs = pd.read_csv(os.path.join(ref_dir, 'barcodes_celltype_edits_plasmid_expression_ranked_kb.csv'), index_col=0)
del rbfox2_bcs['Unnamed: 4']
del rbfox2_bcs['Unnamed: 7']
rbfox2_bcs.head()

Unnamed: 0_level_0,HEK top 10 n_edits,HEK APOBEC_RBFOX2_transient_normalized,NPC_DCX top 10 n_edits,NPC_DCX APOBEC_RBFOX2_transient_normalized,NPC_SOX2 top 10 n_edits,NPC_SOX2 APOBEC_RBFOX2_transient_normalized
rank,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,GCACGTGAGAAGTATC-1,CACGGGTTCTAAGAAG-1,CACAACATCGCCTTGT-1,ACTTTCACATTCGGGC-1,CATCAAGCAACTGCCG-1,GGGTTATGTAGTCACT-1
2,AGTGCCGGTACGGCAA-1,CCCTAACCAAGGTACG-1,TCGATTTTCCGAGGCT-1,CACAACATCGCCTTGT-1,CATGAGTGTGCATTTG-1,ACGTAACCACGCGCTA-1
3,CGATGCGTCCATATGG-1,TGTTTGTAGCGGTAAC-1,AGGGCCTGTGCTCTCT-1,ACCAAACCATTGTCGA-1,GGTGGCTGTTAATGAG-1,GTTCGCTCAGGGACTA-1
4,GAGTTTGGTGGCTCTG-1,TTGACCCGTCTGTGCG-1,ACGTACAAGTGGTCAG-1,CATGAGTTCGCGCCAA-1,TAGCACAGTTGCCGCA-1,CAGCACGTCACGGACC-1
5,TGGATCAGTGCATTAC-1,AGTGCCGCAGTGAGCA-1,GGATCTACACGGAAGT-1,AGCTTCCTCGTGCACG-1,GGACGTCCAGGTAGTG-1,GGACGTCCAGGTAGTG-1


# From our list of barcodes, get the path to the corresponding bed file

In [4]:
def barcode_to_bedfile(barcode, bedfile_dir, rbp='RBFOX2'):
    fn = os.path.join(bedfile_dir, '{}_transient_possorted_genome_bam_MD-{}.bed'.format(rbp, barcode))
    assert os.path.exists(fn)
    return fn
# test
barcode_to_bedfile(
    barcode='GCACGTGAGAAGTATC-1',
    bedfile_dir=input_dir,
    rbp='RBFOX2'
)

'/home/bay001/projects/kris_apobec_20200121/permanent_data2/05_scRNA_RBFOX2_APO_transient/sailor_outputs_individual_barcodes_merged_bedfiles/RBFOX2_transient_possorted_genome_bam_MD-GCACGTGAGAAGTATC-1.bed'

In [5]:
progress = tnrange(len(rbfox2_bcs.columns)*rbfox2_bcs.shape[0])
all_bed_files = []
for column in rbfox2_bcs.columns:
    for bc in rbfox2_bcs[column]:
        all_bed_files.append(
            barcode_to_bedfile(
                bc,
                bedfile_dir=input_dir,
                rbp='RBFOX2'
            )
        )
        progress.update(1)

HBox(children=(IntProgress(value=0, max=60), HTML(value='')))

In [6]:
def recompute_edit_fraction(row):
    edit, cov = row['frac'].split(',')
    return int(edit)/float(cov)

def filter_bed_and_convert_to_bg(input_bed, output_bg, conf):
    edit_head = ['chrom','start','end','conf','frac','strand']
    df = pd.read_csv(input_bed, names=edit_head, sep='\t')
    try:
        df.sort_values(by=['chrom','start','end'], inplace=True)
        df = df[(df['conf']>=conf)]
        df['name'] = df.apply(recompute_edit_fraction, axis=1)
        df[['chrom','start','end','name']].to_csv(output_bg, sep='\t', header=False, index=False)
    except ValueError as e:
        print(e, input_bed, conf)
    



In [7]:
confs = [0.5, 0.9, 0.99, 0.999, 1]      
genome = '/projects/ps-yeolab3/bay001/annotations/hg19/hg19.chrom.sizes'

In [8]:
progress = tnrange(len(all_bed_files)*len(confs))
for conf in confs:
    for bed in all_bed_files:
        output_bg = os.path.join(output_dir, os.path.basename(bed) + ".{}.bedgraph".format(conf))
        filter_bed_and_convert_to_bg(bed, output_bg, conf)
        progress.update(1)

HBox(children=(IntProgress(value=0, max=300), HTML(value='')))

Cannot set a frame with no defined index and a value that cannot be converted to a Series /home/bay001/projects/kris_apobec_20200121/permanent_data2/05_scRNA_RBFOX2_APO_transient/sailor_outputs_individual_barcodes_merged_bedfiles/RBFOX2_transient_possorted_genome_bam_MD-CACGGGTTCTAAGAAG-1.bed 0.9
Cannot set a frame with no defined index and a value that cannot be converted to a Series /home/bay001/projects/kris_apobec_20200121/permanent_data2/05_scRNA_RBFOX2_APO_transient/sailor_outputs_individual_barcodes_merged_bedfiles/RBFOX2_transient_possorted_genome_bam_MD-CCCTAACCAAGGTACG-1.bed 0.9
Cannot set a frame with no defined index and a value that cannot be converted to a Series /home/bay001/projects/kris_apobec_20200121/permanent_data2/05_scRNA_RBFOX2_APO_transient/sailor_outputs_individual_barcodes_merged_bedfiles/RBFOX2_transient_possorted_genome_bam_MD-CACGGGTTCTAAGAAG-1.bed 0.99
Cannot set a frame with no defined index and a value that cannot be converted to a Series /home/bay001/pr

# Get topN edits for APOBEC

In [9]:
apobec_bcs = pd.read_excel(os.path.join(ref_dir, 'APOBEC_HEKvsNPC_edits_all_info_sorted.xlsx'), index_col=0)
apobec_bcs.head()

Unnamed: 0,n_edits,n_genes,ex_n_genes,ex_n_counts,cell,ex_transgene
TTCCACGGTGACGCCT-1,0.180739,83,7009,46806,NPC,0.055914
TCCGAAATCTCGACCT-1,0.172029,82,6979,48881,NPC,0.058439
TTACGTTAGGTCATAA-1,0.169845,68,6413,40534,NPC,0.031523
CACGAATGTGTCATTG-1,0.1695,75,6445,37356,NPC,0.009722
AGCATCACAGCGTGCT-1,0.16684,64,6370,37880,NPC,-0.00078


In [10]:
top = 10

apobec_bcs_npc = list(apobec_bcs[apobec_bcs['cell']=='NPC'].sort_values(by=['n_edits'], ascending=False).iloc[:top].index) # sorted by rank
apobec_bcs_hek = list(apobec_bcs[apobec_bcs['cell']=='HEK'].sort_values(by=['n_edits'], ascending=False).iloc[:top].index) # sorted by rank

apobec_bcs_hek

['ATCGTCCAGCAAGTGC-1',
 'GTGAGCCAGCAGCAGT-1',
 'GAAGCCCCACAGCCTG-1',
 'GTGCACGTCCACACCT-1',
 'GTTCGCTCAAACTAAG-1',
 'CACCAAACAGTTAGAA-1',
 'CCGGACAAGGATAATC-1',
 'TTCGCTGGTTATAGCC-1',
 'GTAGAAACAGCTATAC-1',
 'GACCAATGTTTCAGAC-1']

In [11]:
bc_groups = [apobec_bcs_npc, apobec_bcs_hek]

progress = tnrange(len(bc_groups)*top)

for bcs in bc_groups: # NPC, HEK
    rank = 1
    for bc in bcs: # for each barcode, should be already ranked by n_edits
        bed = barcode_to_bedfile(
            bc,
            bedfile_dir=input_dir,
            rbp='APOBEC'
        )
        for conf in confs: # 
            output_bg = os.path.join(output_dir, os.path.basename(bed) + ".rank-{}.{}.bedgraph".format(rank, conf))
            filter_bed_and_convert_to_bg(bed, output_bg, conf)
        rank += 1
        progress.update(1)

HBox(children=(IntProgress(value=0, max=20), HTML(value='')))