In [2]:
import pandas as pd
from pybedtools import BedTool
import os
import glob
from tqdm import tnrange, tqdm_notebook
import numpy as np
import math
from scipy.stats import binom
from scipy.special import betainc

In [3]:
# Show up to 999 columns when a DataFrame is displayed, so wide tables are not elided.
pd.set_option('display.max_columns', 999)

In [4]:
def total(df):
    """Score one site (one row) with a confidence value.

    Parameters
    ----------
    df : mapping-like row (e.g. the Series handed over by ``DataFrame.apply(..., axis=1)``)
        Must provide the keys ``'DP'``, ``'total_miss'`` and ``'total_match'``.

    Returns
    -------
    ``0`` when ``DP`` is below 1, otherwise
    ``1 - betainc(total_miss, total_match, 0.05)``, where ``betainc`` is
    scipy's regularized incomplete beta function.
    """
    # Guard clause: rows with DP < 1 get a score of zero.
    if df['DP'] < 1:
        return 0
    mismatches = df['total_miss']
    matches = df['total_match']
    return 1 - betainc(mismatches, matches, 0.05)

In [5]:
# Directory searched for the per-sample input csv files.
# NOTE(review): this looks like a placeholder path — set it to the real location before running.
input_dir= '/Path to save directory/'
# Directory that receives the filtered .bed files and the bedgraphs derived from them.
# The trailing '/' matters: later cells build file paths by plain string concatenation.
output_dir= '/Path to save directory/tmp_bedgraphs_cDNA/'

In [6]:
# Every input csv matching the pattern, sorted so each run visits the files in the same order.
beds = glob.glob(os.path.join(input_dir, '*Filename.csv'))
beds.sort()

In [10]:
## Imports the csv files, keeps C>T mismatches and assigns a confidence score.
## NOTE(review): the original header also mentioned a G>A filter, but only the
## C>T filter (column '3' == 'C', miss_base == 'T') is applied below — confirm
## whether the G>A case is intentionally left out.
for bed in beds:
    sites = pd.read_csv(bed)
    # Explicit copy: the column assignments below must write to an independent
    # frame, not to a slice of the frame that was read (avoids SettingWithCopyWarning).
    sites = sites[(sites['3'] == 'C') & (sites['miss_base'] == 'T')].copy()
    # One-position interval: stop is the position in column '1' plus one.
    sites['stop'] = sites['1'] + 1
    sites['total_match'] = sites['fwd'] + sites['rev']
    sites['total_miss'] = sites['fwd_miss'] + sites['rev_miss']
    sites['total_fraction'] = sites['total_miss'] / (sites['total_miss'] + sites['total_match'])
    sites['total_conf'] = sites.apply(total, axis=1)
    # Keep only sites whose confidence score reaches 0.5.
    sites = sites[sites['total_conf'] >= 0.5].copy()
    sites['strand'] = '+'
    # Name the output after the input file itself. The file name is taken with
    # os.path.basename instead of a fixed '/'-split index, so the cell no longer
    # depends on how many directories deep input_dir happens to be.
    out_path = os.path.join(output_dir, os.path.basename(bed) + '.cDNA.conf0.5_TA_filtered.bed')
    sites[['0', '1', 'stop', 'total_fraction', 'total_conf', 'strand']].to_csv(
        out_path, sep='\t', header=False, index=False)

In [11]:
# Re-point `beds` at the filtered .bed files found in output_dir, in sorted order.
beds = glob.glob(os.path.join(output_dir, '*.cDNA.conf0.5_TA_filtered.bed'))
beds.sort()

In [12]:
## Makes bedgraphs with no filter ("none") and bedgraphs restricted to sites
## that occur in exactly one sample ("all").
def _read_bed(path):
    """Read one headerless, tab-separated bed file.

    Returns an empty frame with columns 0-5 when the file has no content,
    because ``pd.read_csv`` raises ``EmptyDataError`` for such a file.
    """
    try:
        return pd.read_csv(path, sep='\t', header=None)
    except pd.errors.EmptyDataError:
        return pd.DataFrame(columns=range(6))


def _sample_name(path):
    """Sample/gene label: the file name up to its first underscore."""
    # basename instead of a fixed '/'-split index, so directory depth does not matter.
    return os.path.basename(path).split('_')[0]


def _write_bedgraph_pair(sites, bed, conf_label, filter_label):
    """Write the two bedgraph variants of ``sites`` next to ``bed``.

    The 'fraction' file keeps the column order 0-5; the 'sailor_conf' file
    swaps columns 3 and 4 so the other score sits in the fourth column.
    """
    prefix = bed + '.' + conf_label + 'sailor.'
    sites[[0, 1, 2, 3, 4, 5]].to_csv(
        prefix + 'fraction.' + filter_label + '.bedgraph', sep='\t', header=False, index=False)
    sites[[0, 1, 2, 4, 3, 5]].to_csv(
        prefix + 'sailor_conf.' + filter_label + '.bedgraph', sep='\t', header=False, index=False)


# Each bed is read once: the unfiltered bedgraphs are written straight away and
# the frame is kept for the cross-sample uniqueness filter further down.
per_sample = []
for bed in beds:
    sites = _read_bed(bed)
    # The 0.5 "none" output is the file as read (no extra threshold is applied here).
    _write_bedgraph_pair(sites, bed, '0.5', 'none')
    _write_bedgraph_pair(sites[sites[4] >= 0.99], bed, '0.99', 'none')
    if not sites.empty:
        per_sample.append(sites.assign(gene=_sample_name(bed)))

# Single concat instead of growing the frame inside the loop. The name `df` is
# kept on purpose: the APO cell further down reads this combined frame.
if per_sample:
    df = pd.concat(per_sample)
else:
    df = pd.DataFrame(columns=[0, 1, 2, 3, 4, 5, 'gene'])

for threshold, conf_label in ((0.5, '0.5'), (0.99, '0.99')):
    # keep=False discards every copy of a (column 0, column 1, column 5) key that
    # occurs more than once, leaving only sites seen in a single sample.
    unique_sites = df[df[4] >= threshold].drop_duplicates([0, 1, 5], keep=False)
    for bed in beds:
        own_sites = unique_sites[unique_sites['gene'] == _sample_name(bed)]
        _write_bedgraph_pair(own_sites, bed, conf_label, 'all')


In [13]:
##Makes bedgraphs filtering for APO sites
# Relies on `df` (all samples combined, with a 'gene' column) and on `beds`
# (the filtered .bed paths) as left behind by the preceding cells.
for bed in beds:
    # basename instead of a fixed '/'-split index, so directory depth does not matter.
    gene = os.path.basename(bed).split('_')[0]
    # Pool this sample with the rows labelled 'APOBEC'.
    pooled = df[df['gene'].isin(['APOBEC', gene])]
    for threshold, conf_label in ((0.5, '0.5'), (0.99, '0.99')):
        confident = pooled[pooled[4] >= threshold]
        # keep=False removes every copy of a (column 0, column 1, column 5) key that
        # appears more than once in the pool; what remains for `gene` are its
        # sites that do not also occur in the 'APOBEC' rows.
        sample_only = confident.drop_duplicates([0, 1, 5], keep=False)
        sample_only = sample_only[sample_only['gene'] == gene]
        prefix = bed + '.' + conf_label + 'sailor.'
        sample_only[[0, 1, 2, 3, 4, 5]].to_csv(
            prefix + 'fraction.APO.bedgraph', sep='\t', header=False, index=False)
        # Same rows with columns 3 and 4 swapped.
        sample_only[[0, 1, 2, 4, 3, 5]].to_csv(
            prefix + 'sailor_conf.APO.bedgraph', sep='\t', header=False, index=False)

In [14]:
# Destination for the coordinate-shifted bedgraphs.
final_dir = '/Path to save directory/mpileup/tmp_bedgraphs_cDNA/0_based/'
# From here on `beds` lists every bedgraph found in output_dir, in sorted order.
beds = glob.glob(os.path.join(output_dir, '*.bedgraph'))
beds.sort()

In [18]:
### TODO: double check if this conversion is needed.
## Makes bedgraphs 0 instead of 1 based (subtracts 1 from the start and stop columns)
for bed in beds:
    # basename instead of a fixed '/'-split index, so directory depth does not matter.
    file_name = os.path.basename(bed)
    gene = file_name.split('_')[0]
    # Everything after the first 'bed.' in the file name, i.e. the bedgraph suffix.
    FN = file_name.split('bed.')[1]
    out_path = os.path.join(final_dir, gene + '_0-based.' + FN)
    try:
        # Own variable name: the combined frame `df` used by the APO cell is left untouched.
        shifted = pd.read_csv(bed, sep='\t', header=None)
    except pd.errors.EmptyDataError:
        # A bedgraph without any rows cannot be parsed; emit a matching empty file.
        open(out_path, 'w').close()
        continue
    shifted[1] = shifted[1] - 1
    shifted[2] = shifted[2] - 1
    shifted.to_csv(out_path, sep='\t', header=False, index=False)

In [21]:
##site_bed files (Used for EditC fractions)
# NOTE(review): this cell rebinds input_dir, output_dir and beds; cells above that
# are re-run afterwards will pick up these values instead of their original ones.
input_dir= '/Path to save directory/'
output_dir= '/Path to save directory/site_files/cDNA_cDNA/'
beds = sorted(glob.glob(os.path.join(input_dir, '*cDNA_df_2020_03_26.csv')))
for bed in beds:
    sites = pd.read_csv(bed)
    sites['strand'] = '+'
    sites['total'] = sites['total_match'] + sites['total_miss']
    # "<mismatches>,<total>" packed into a single text column.
    sites['miss,total'] = sites['total_miss'].astype(str) + ',' + sites['total'].astype(str)
    # Sample label = file name up to the first underscore. basename replaces the
    # fixed '/'-split index, so the cell works whatever the depth of input_dir.
    sample = os.path.basename(bed).split('_')[0]
    out_path = os.path.join(output_dir, sample + "_cDNA_ENSEMBL_sites.bed")
    sites[['0', '1', 'stop', 'total_confidence', 'miss,total', 'strand']].to_csv(
        out_path, sep='\t', header=False, index=False)