In [1]:
import pandas as pd
import numpy as np
import math
from scipy.stats import binom
import os
import glob

In [4]:
def strand(df):
    """Infer the strand of a site from its forward and reverse read counts.

    Parameters
    ----------
    df : mapping-like
        A single row (as passed by ``DataFrame.apply(..., axis=1)``), not a
        whole DataFrame. Must provide the keys ``'fwd'`` and ``'rev'``.

    Returns
    -------
    str
        ``'+'`` when ``fwd`` is more than twice ``rev``, ``'-'`` when ``rev``
        is more than twice ``fwd``, and ``'?'`` when neither strand dominates.
    """
    forward, reverse = df['fwd'], df['rev']
    if forward > reverse * 2:
        return '+'
    if reverse > forward * 2:
        return '-'
    # Neither count exceeds twice the other: strand is ambiguous.
    return '?'
def edit_match(df):
    """Return the mismatch fraction of a site on its assigned strand.

    Parameters
    ----------
    df : mapping-like
        A single row (as passed by ``DataFrame.apply(..., axis=1)``). Must
        provide ``'strand'`` plus the count keys of that strand
        (``'fwd'``/``'fwd_miss'`` for ``'+'``, ``'rev'``/``'rev_miss'`` for ``'-'``).

    Returns
    -------
    float or str
        ``miss / (match + miss)`` for the assigned strand, or the string
        ``'?'`` when the strand is neither ``'+'`` nor ``'-'``.
    """
    orientation = df['strand']
    if orientation == '+':
        matched, missed = df['fwd'], df['fwd_miss']
    elif orientation == '-':
        matched, missed = df['rev'], df['rev_miss']
    else:
        # Unknown strand: propagate the same sentinel that `strand` produces.
        return '?'
    return missed / (matched + missed)
from scipy.special import betainc
def mins(df):
    """Strand-aware confidence score for a single site.

    Computes ``1 - betainc(miss, match, 0.05)`` using the mismatch and match
    counts of the strand the site was assigned to (``betainc`` is the
    regularized incomplete beta function from ``scipy.special``).

    Parameters
    ----------
    df : mapping-like
        A single row (as passed by ``DataFrame.apply(..., axis=1)``). Must
        provide ``'strand'`` and the count keys of that strand.

    Returns
    -------
    float or None
        The score for ``'+'`` or ``'-'`` sites; ``None`` for any other strand
        value (for example ``'?'``).
    """
    # strand -> (mismatch-count key, match-count key)
    count_keys = {'+': ('fwd_miss', 'fwd'), '-': ('rev_miss', 'rev')}
    selected = count_keys.get(df['strand'])
    if selected is None:
        return None
    miss_key, match_key = selected
    return 1 - betainc(df[miss_key], df[match_key], 0.05)
def total(df):
    """Strand-agnostic confidence score for a single site.

    Computes ``1 - betainc(total_miss, total_match, 0.05)``, where ``betainc``
    is the regularized incomplete beta function from ``scipy.special``.

    The score does not depend on the strand, so no ``'strand'`` key is read:
    the previous '+', '-' and fallback branches all evaluated this same
    expression.

    Parameters
    ----------
    df : mapping-like
        A single row (as passed by ``DataFrame.apply(..., axis=1)``). Must
        provide ``'total_miss'`` and ``'total_match'``.

    Returns
    -------
    float
        The confidence score for the site.
    """
    return 1 - betainc(df['total_miss'], df['total_match'], 0.05)

In [None]:
###Import pileup files (Split by chromosome and filtered for C or G in reference position using awk)

In [2]:
# Folder holding the GC-filtered mpileup files (placeholder - edit before running).
input_dir = '/Path to directory containing GC filtered mpileups'
# Collect every entry directly inside input_dir, then order the paths lexicographically.
beds = glob.glob(os.path.join(input_dir, '*'))
beds.sort()

In [1]:
### For each pileup file: keep sites whose reference base (column 3) is C or G (redundant
### with the upstream awk filter), drop INDEL records, keep sites with read depth DP >= 10,
### then create the match and mismatch columns from the I16 tag of the INFO column (column 7).
### The per-file frames are collected in a list and concatenated into a single DF once,
### instead of re-copying the growing DF on every iteration.
frames = []
for i in beds:
    df_tmp = pd.read_csv(i, sep='\t', header=None)
    df_tmp = df_tmp[df_tmp[3].isin(['C', 'G'])]

    # Split the INFO column on ';' once rather than re-parsing it for every derived column.
    info = df_tmp[7].astype(str).str.split(';')

    # INDEL records carry the bare flag "INDEL" as their first INFO entry; drop them.
    # .copy() makes the filtered frame independent so the column assignments below
    # do not trigger SettingWithCopyWarning.
    not_indel = info.str[0] != 'INDEL'
    df_tmp, info = df_tmp[not_indel].copy(), info[not_indel]

    # The first INFO entry is expected to be "DP=<depth>"; a malformed entry
    # makes astype(int) fail loudly.
    df_tmp['DP'] = info.str[0].str.split('=').str[1].astype(int)
    deep_enough = df_tmp['DP'] >= 10
    df_tmp, info = df_tmp[deep_enough].copy(), info[deep_enough]

    # The second INFO entry is expected to be "I16=<v0>,<v1>,<v2>,<v3>,...".
    # v0/v1 are used as the forward/reverse match counts and v2/v3 as the
    # forward/reverse mismatch counts.
    i16 = info.str[1].str.split('=').str[1].str.split(',')
    df_tmp['fwd'] = i16.str[0].astype(int)
    df_tmp['fwd_miss'] = i16.str[2].astype(int)
    df_tmp['rev'] = i16.str[1].astype(int)
    df_tmp['rev_miss'] = i16.str[3].astype(int)

    frames.append(df_tmp)

# pd.concat raises on an empty list, so fall back to an empty frame when no files were found.
df = pd.concat(frames) if frames else pd.DataFrame()

NameError: name 'pd' is not defined

In [6]:
## Checkpoint: saves the concatenated, filtered DF to csv for subsequent analysis, so the
## pileup parsing does not have to be repeated if the notebook crashes.
## NOTE: the path is a placeholder. The later "shift to 0-based" cell writes to this same
## placeholder path, so give the two outputs different filenames to keep both.
## to_csv is called with its defaults, so the row index is written as the first column.
df.to_csv('/Path to save directory/Filename.csv')

In [7]:
## Assigns a strand to every site by applying `strand` row-wise (axis=1):
## '+' when fwd > 2*rev, '-' when rev > 2*fwd, otherwise '?'.
df['strand'] = df.apply(strand, axis=1)

In [8]:
## Calculates the edit fraction on the assigned strand by applying `edit_match` row-wise:
## miss / (match + miss).
## NOTE: despite its name, 'fraction_match' holds the mismatch fraction. Rows whose strand
## is '?' receive the string '?', so the column can mix floats and strings.
df['fraction_match'] = df.apply(edit_match, axis=1)

In [9]:
## Mismatch base per site: the first comma-separated entry of column 4
## (described in the original note as the most common base mutation per site)
df['miss_base'] = [str(alt_entry).split(',')[0] for alt_entry in df[4]]

In [10]:
## Builds a per-site identifier "<column 0>:<column 1>:<strand>".
## NOTE: in notebook order this runs before column 1 is shifted to 0-based,
## so the position embedded in the key is the unshifted one.
df['key']=df[0]+":"+df[1].astype(str)+":"+df['strand']

In [12]:
## Replaces every missing value currently in the DF with 0, in place.
## NOTE: only columns that exist when this cell runs are affected; columns created by
## later cells are not filled by this call.
df.fillna(0, inplace=True)

In [11]:
##Assign SAILOR confidence score per base WITH strand information (use for dRNA)
## `mins` returns None for sites whose strand is neither '+' nor '-'. Those missing scores
## are set to 0 here, so the result does not depend on whether the notebook's separate
## fillna(0) cell happens to be executed before or after this one.
df['confidence'] = df.apply(mins, axis=1).fillna(0)

In [17]:
## SAILOR confidence score per base WITHOUT strand info (use for cDNA):
## counts from both strands are pooled before scoring.
### Strand info is gathered at the end of the pipeline to reduce intersecting file size
df['total_match'] = df['fwd'].add(df['rev'])
df['total_miss'] = df['fwd_miss'].add(df['rev_miss'])
# Pooled mismatch fraction: miss / (miss + match)
df['total_fraction'] = df['total_miss'].div(df['total_miss'].add(df['total_match']))
# Row-wise score from the pooled counts (see `total`)
df['total_confidence'] = df.apply(total, axis=1)

In [20]:
##Shift Base locations 0-based and Save (Use for subsequent notebooks)
## The shift is guarded so that re-running this cell does not move the coordinates a second
## time: the presence of the 'stop' column marks a DF that has already been converted.
if 'stop' not in df.columns:
    df[1] = df[1].astype(int) - 1   # position -> 0-based start
    df['stop'] = df[1] + 1          # stop = start + 1 (single-base interval)
## NOTE: placeholder path; it is the same placeholder used by the earlier checkpoint cell,
## so use a different filename here to avoid overwriting that file.
df.to_csv('/Path to save directory/Filename.csv')

In [15]:
##Saves a temp. bedgraph file: column 0, column 1 (start), stop, total_confidence
## The pileups are read with header=None, so the first two columns carry the integer
## labels 0 and 1 (as used elsewhere in this notebook). Selecting the strings '0' and '1'
## raises KeyError on this DF.
## header=False / index=False write neither a header row nor the row index.
df[[0, 1, 'stop', 'total_confidence']].to_csv('/Path to save directory/Sample_0.05_sailor.bedgraph', sep='\t', header=False, index=False)