In [1]:
import pandas as pd
from pybedtools import BedTool
import os
import glob
from tqdm import tnrange, tqdm_notebook
import numpy as np
import math
from scipy.stats import binom
from scipy.special import betainc

In [2]:
# Let pandas render up to 999 columns before truncating wide frames.
pd.set_option('display.max_columns', 999)

In [3]:
def total(df):
    """Score one row from its mismatch and match counts.

    Reads the ``DP``, ``total_miss`` and ``total_match`` fields of ``df``
    (any mapping-like row, e.g. what ``DataFrame.apply(..., axis=1)`` passes).

    Returns 0 when ``DP`` is below 1. Otherwise returns one minus the
    regularized incomplete beta function evaluated at 0.05 with
    ``a=total_miss`` and ``b=total_match``.
    """
    # Guard clause: rows with DP < 1 are scored 0 without calling betainc.
    if df['DP'] < 1:
        return 0
    beta_at_cutoff = betainc(df['total_miss'], df['total_match'], 0.05)
    return 1 - beta_at_cutoff

In [4]:
# Directory that is globbed for the input CSV files.
input_dir= '/Path to save directory/'
# Directory that is globbed for the intermediate BED / bedgraph files.
output_dir= '/Path to save directory/tmp_bedgraphs/'

In [2]:
# Collect the input CSV files under input_dir in sorted order.
beds = glob.glob(os.path.join(input_dir, '*Filename.csv'))
beds.sort()

NameError: name 'input_dir' is not defined

In [12]:
##Imports csv files, C>T and G>A filters and assigns confidence score
# The filtered BED files are written into output_dir (not next to the input
# CSVs), because output_dir is where the '*filtered.bed' glob looks for them.
os.makedirs(output_dir, exist_ok=True)
for bed in beds:
    df = pd.read_csv(bed)
    # Keep rows where column '3' is 'C' with miss_base 'T', or 'G' with
    # miss_base 'A'. The two subsets are concatenated in that order.
    tmp1 = df[(df['3'] == 'C') & (df['miss_base'] == 'T')]
    tmp2 = df[(df['3'] == 'G') & (df['miss_base'] == 'A')]
    df = pd.concat([tmp1, tmp2])
    # Single-position interval: stop is start + 1.
    df['stop'] = df['1'] + 1
    df['total_match'] = df['fwd'] + df['rev']
    df['total_miss'] = df['fwd_miss'] + df['rev_miss']
    df['total_fraction'] = df['total_miss'] / (df['total_miss'] + df['total_match'])
    df['total_conf'] = df.apply(total, axis=1)
    # Only sites scoring at least 0.5 are written out.
    df = df[df['total_conf'] >= 0.5]
    SP = os.path.join(output_dir, os.path.basename(bed) + '.conf0.5_TA_filtered.bed')
    df[['0', '1', 'stop', 'total_fraction', 'total_conf', '3']].to_csv(SP, sep='\t', header=None, index=None)

In [26]:
# Collect the filtered BED files under output_dir in sorted order.
beds = glob.glob(os.path.join(output_dir, '*filtered.bed'))
beds.sort()

In [27]:
# Display the current list of matched files.
beds

['/oasis/tscc/scratch/d2lorenz/nanopore/promethion/mpileup/tmp_bedgraphs/APOBEC_control_df_hg19.csv.conf0.5_TA_filtered.bed',
 '/oasis/tscc/scratch/d2lorenz/nanopore/promethion/mpileup/tmp_bedgraphs/RBFOX_df_hg19.csv.conf0.5_TA_filtered.bed',
 '/oasis/tscc/scratch/d2lorenz/nanopore/promethion/mpileup/tmp_bedgraphs/RPS2_df_hg19.csv.conf0.5_TA_filtered.bed',
 '/oasis/tscc/scratch/d2lorenz/nanopore/promethion/mpileup/tmp_bedgraphs/SRSF_df_hg19.csv.conf0.5_TA_filtered.bed']

In [28]:
##Intersects sites to get strand info based on gene
gtf = BedTool('/projects/ps-yeolab3/bay001/annotations/hg19/gencode_v19/gencode.v19.annotation.gtf')
for bed in beds:
    df = BedTool(bed)
    # wa + wb: each output line carries the site record followed by the
    # overlapping GTF record.
    df1 = df.intersect(gtf, wa=True, wb=True)
    # Take the file name with os.path.basename instead of a fixed '/'-split
    # index, so this works regardless of how deep output_dir is.
    SP = os.path.join(output_dir, os.path.basename(bed) + '.annotated.bed')
    df1.saveas(SP)

In [29]:
# Rebind the loop variables to 0 so the objects they referenced can be released.
df = df1 = SP = 0

In [30]:
# Collect the annotated BED files under output_dir in sorted order.
beds = glob.glob(os.path.join(output_dir, '*.annotated.bed'))
beds.sort()

In [33]:
def stranded_f(df):
    """Flag a row whose column-5 letter agrees with its column-12 sign.

    Returns 1 when (``df[12]``, ``df[5]``) is ('+', 'C') or ('-', 'G'),
    and 0 for every other combination.

    NOTE(review): column 12 is presumably the GTF strand and column 5 the
    base letter of the site -- confirm against the annotated BED layout.
    """
    strand, base = df[12], df[5]
    plus_c = (strand == '+') & (base == 'C')
    minus_g = (strand == '-') & (base == 'G')
    return 1 if (plus_c | minus_g) else 0

In [35]:
##Assign strands based on gene and applies GC filter. Removes ambiguous sites
for bed in beds:
    annotated = pd.read_csv(bed, sep='\t', header=None)
    exon_hits = annotated[annotated[8] == 'exon']
    # One row per (column 0, column 1, column 12) combination.
    per_strand = exon_hits.drop_duplicates([0, 1, 12])
    print(len(per_strand))
    # keep=False drops every (column 0, column 1) pair that still occurs
    # more than once after the step above.
    unambiguous = per_strand.drop_duplicates([0, 1], keep=False)
    print(len(unambiguous))
    flagged = unambiguous.assign(filter=unambiguous.apply(stranded_f, axis=1))
    df = flagged[flagged['filter'] == 1]
    print(len(df))
    SP = bed + '.exon_strand.bed'
    df[[0, 1, 2, 3, 4, 12]].to_csv(SP, sep='\t', header=None, index=None)

1666182
1565278
856785
1526687
1438227
862585
1467828
1381112
853284
1327692
1241108
717628


In [36]:
# Collect the exon/strand BED files under output_dir in sorted order.
beds = glob.glob(os.path.join(output_dir, '*.exon_strand.bed'))
beds.sort()

In [38]:
# Load the first matched file as a tab-separated table with no header row.
df=pd.read_csv(beds[0], sep='\t', header=None)

In [41]:
##Makes bedgraphs with no filters
def _sample_label(path):
    """Return the part of the file name (not the full path) before its first underscore."""
    return os.path.basename(path).split('_')[0]


def _write_bedgraph_pair(sites, bed, conf_label, filter_label):
    """Write the '.fraction.' and '.sailor_conf.' bedgraphs for one table.

    Both files hold columns 0-5 of ``sites``. The sailor_conf file swaps
    columns 3 and 4 so that column 4 lands in the fourth output field.
    Output names are ``bed`` + '.<conf_label>sailor.<kind>.<filter_label>.bedgraph'.
    """
    stem = bed + '.' + conf_label + 'sailor.'
    tail = '.' + filter_label + '.bedgraph'
    sites[[0, 1, 2, 3, 4, 5]].to_csv(stem + 'fraction' + tail, sep='\t', header=None, index=None)
    sites[[0, 1, 2, 4, 3, 5]].to_csv(stem + 'sailor_conf' + tail, sep='\t', header=None, index=None)


# Each file is read once: the unfiltered ('none') bedgraphs are written
# straight away and the table is kept for the cross-file step below.
frames = []
for bed in beds:
    tmp = pd.read_csv(bed, sep='\t', header=None)
    _write_bedgraph_pair(tmp, bed, '0.5', 'none')
    _write_bedgraph_pair(tmp[tmp[4] >= 0.99], bed, '0.99', 'none')
    tmp['gene'] = _sample_label(bed)
    frames.append(tmp)

# Concatenate once instead of growing the frame inside the loop. `df` is
# left as the combined table with its 'gene' column for later cells.
if frames:
    df = pd.concat(frames)
else:
    df = pd.DataFrame(columns=[0, 1, 2, 3, 4, 5, 'gene'])

##makes bedgraphs filtering for unique sites (across all CSVs)
for conf_label, cutoff in (('0.5', 0.5), ('0.99', 0.99)):
    # keep=False removes every (column 0, column 1, column 5) combination
    # that appears more than once in the combined table at this cutoff.
    df1 = df[df[4] >= cutoff].drop_duplicates([0, 1, 5], keep=False).copy()
    for bed in beds:
        gene = _sample_label(bed)
        _write_bedgraph_pair(df1[df1['gene'] == gene], bed, conf_label, 'all')


In [42]:
##Makes bedgraphs filtering for APO sites
# Relies on `df` being the combined per-file table with a 'gene' column.
for bed in beds:
    # Label = file name up to the first underscore; basename avoids
    # depending on how many directories precede the file.
    gene = os.path.basename(bed).split('_')[0]
    tmp = df[df['gene'].isin(['APOBEC', gene])].copy()
    for conf_label, cutoff in (('0.5', 0.5), ('0.99', 0.99)):
        # keep=False drops every (column 0, column 1, column 5) combination
        # seen more than once among the 'APOBEC' rows and this file's rows;
        # only this file's surviving rows are then written.
        tmp2 = tmp[tmp[4] >= cutoff].drop_duplicates([0, 1, 5], keep=False)
        tmp2 = tmp2[tmp2['gene'] == gene].copy()
        SP = bed + '.' + conf_label + 'sailor.fraction.APO.bedgraph'
        tmp2[[0, 1, 2, 3, 4, 5]].to_csv(SP, sep='\t', header=None, index=None)
        SP = bed + '.' + conf_label + 'sailor.sailor_conf.APO.bedgraph'
        tmp2[[0, 1, 2, 4, 3, 5]].to_csv(SP, sep='\t', header=None, index=None)

In [43]:
# Destination directory for the coordinate-shifted bedgraphs.
final_dir='/Path to save directory/tmp_bedgraphs/0_based/'
# Collect every bedgraph directly under output_dir in sorted order.
beds = glob.glob(os.path.join(output_dir, '*.bedgraph'))
beds.sort()

In [44]:
##Makes bedgraphs 0 instead of 1 based
os.makedirs(final_dir, exist_ok=True)
for bed in beds:
    df = pd.read_csv(bed, sep='\t', header=None)
    # Work on the file name itself rather than a fixed '/'-split index.
    name = os.path.basename(bed)
    gene = name.split('_')[0]
    # Everything after the last 'bed.' in the file name is kept as the suffix.
    FN = name.rsplit('bed.', 1)[-1]
    # Shift both coordinate columns down by one.
    df[1] = df[1] - 1
    df[2] = df[2] - 1
    SP = os.path.join(final_dir, gene + '_0-based.' + FN)
    df.to_csv(SP, sep='\t', header=None, index=None)

In [45]:
# Collect the bedgraphs under final_dir in sorted order.
beds = glob.glob(os.path.join(final_dir, '*.bedgraph'))
beds.sort()

In [None]:
##Removes SNPs
SNPS = BedTool('/projects/ps-yeolab3/bay001/annotations/hg19/hg19.commonSNPs147.bed3')
for bed in beds:
    name = os.path.basename(bed)
    # Outputs are written into final_dir with a '.bedgraph' suffix, so a
    # re-globbed file list would include them; skip those already processed.
    if name.endswith('rmSNP.bedgraph'):
        continue
    df = BedTool(bed)
    df1 = df.subtract(SNPS)
    # File name up to the first 'bedgraph', then the 'rmSNP.bedgraph' suffix.
    SP = os.path.join(final_dir, name.split('bedgraph')[0] + 'rmSNP.bedgraph')
    df1.saveas(SP)

In [None]:
##site_bed files (EditC)
# NOTE: this cell rebinds input_dir and output_dir for the site-bed step.
input_dir= '/Path to save directory/'
output_dir= '/Path to save directory/cDNA_hg19/'
beds2_dir='/Path to save directory//d-cDNA/'
beds = sorted(glob.glob(os.path.join(input_dir, '*hg19.csv')))
for bed in beds:
    df = pd.read_csv(bed)
    # Label = file name up to the first underscore; basename avoids
    # depending on how many directories precede the file.
    data = os.path.basename(bed).split('_')[0]
    df2 = pd.read_csv(beds2_dir + str(data) + '_0-based.0.5sailor.fraction.APO.rmSNP.bedgraph', header=None, sep='\t', names=[0, 1, 2, 3, 4, 'strand'])
    # Join key "<column 0>:<position>": the CSV uses its column '1', the
    # bedgraph uses its column 2.
    df['key'] = df['0'] + ":" + df['1'].astype(str)
    df2['key'] = df2[0] + ":" + df2[2].astype(str)
    # Number of distinct keys in the bedgraph.
    print(len(set(df2['key'])))
    # Plain dict lookup; if a key repeats, the last strand seen wins.
    mapping = dict(df2[['key', 'strand']].values)
    df['strand_real'] = df.key.map(mapping)
    print(len(df))
    print(len(df.dropna()))
    df['total_match'] = df['fwd'] + df['rev']
    df['total_miss'] = df['fwd_miss'] + df['rev_miss']
    df['total'] = df['total_match'] + df['total_miss']
    df['miss,total'] = df['total_miss'].astype(str) + ',' + df['total'].astype(str)
    df['total_confidence'] = df.apply(total, axis=1)
    SP = output_dir + data + "_cDNA_hg19_sites.bed"
    df['stop'] = df['1'] + 1
    # Rows without a mapped strand (and any other missing values) become 0.
    df = df.fillna(0)
    df[['0', '1', 'stop', 'total_confidence', 'miss,total', 'strand_real']].to_csv(SP, sep='\t', header=None, index=None)

848624
19277560
848624
