In [1]:
from pathlib import Path
import pandas as pd
from pybedtools import BedTool
import matplotlib.pyplot as plt
import seaborn as sns


In [51]:
from scipy.stats import binom
fname = '/home/hsher/scratch/circ_nextera_iter13/output/edits/RBM15_STAMP.dp4.neg.vcf.tsv'
f_handle = open('/home/hsher/scratch/circ_nextera_iter13/output/edits/RBM15_STAMP.dp4.neg.vcf.snpfilter.tsv',
               'w')
chunk = 0
for total_df in pd.read_csv(fname,
            sep = '\t', chunksize = 1000000):
    df = total_df.loc[(total_df['TYPE']=='MIXED')] # only mixed had alt allele
    ad_col = df.columns[df.columns.str.endswith('.AD')][0]
    
    n_ref = df[ad_col].apply(lambda s: int(s.split(',')[0]))
    n_alt = df[ad_col].apply(lambda s: int(s.split(',')[1]))
    total = n_ref+n_alt
    
    stat = pd.DataFrame([total, n_alt], index = ['total', 'alt']).T
    
    stat['p_hetero']=stat.apply(lambda row: binom.pmf(n=row['total'],
                                                      k=row['alt'], p = 0.5), axis = 1)
    
    stat['p_homo']=stat.apply(lambda row: binom.pmf(n=row['total'],
                                                      k=row['alt'], p = 0.99), axis = 1) # 1% seq error
    
    # remove those look like SNPs
    to_remove = stat.loc[(stat['p_hetero']>0.05)|(stat['p_homo']>0.05)].index
    
    
    total_df.drop(to_remove, axis = 0, inplace = True)
    
    if chunk==0:
        total_df.to_csv(f_handle, sep = '\t', header = True, index = False)
    else:
        total_df.to_csv(f_handle, sep = '\t', header = False, index = False)
        
    chunk += 1

In [32]:
df['ALT'].value_counts()

<*>        999353
A,<*>         411
T,<*>         127
C,<*>         103
C,A,<*>         3
T,A,<*>         2
A,C,<*>         1
Name: ALT, dtype: int64

In [4]:
df['CHROM'].iloc[-1]

'chr1:226286333|226297813'

In [26]:
def aggregate_counts(df):
    # the reference is repeating the sequence twice to represent BSJ
    df['length']=df['CHROM'].apply(lambda string: int(string.split('|')[1])-int(string.split('|')[0].split(':')[1])+1)
    df['POS']=df['POS']%df['length']

    ad_col = df.columns[df.columns.str.endswith('.AD')][0]

    df['n_ref'] = df[ad_col].apply(lambda s: int(s.split(',')[0]))
    df['n_alt'] = df[ad_col].apply(lambda s: int(s.split(',')[1]))
    df['pos_id']=df['CHROM']+':'+df['POS'].astype(str)

    aggregated_df = df.groupby(by = 'pos_id')[['n_ref', 'n_alt']].sum()
    aggregated_df = aggregated_df.loc[aggregated_df.sum(axis = 1)>0]
    return aggregated_df

In [34]:
df

Unnamed: 0,CHROM,POS,REF,ALT,FILTER,TYPE,DP,output/circ/RBM15_STAMP_denovo.sorted.bam.AD,output/circ/RBM15_STAMP_denovo.sorted.bam.ADF,output/circ/RBM15_STAMP_denovo.sorted.bam.ADR,length,n_ref,n_alt,pos_id
41000000,chr1:172068819|172084707,9829,G,<*>,PASS,SYMBOLIC,0,00,00,00,15889,0,0,chr1:172068819|172084707:9829
41000001,chr1:172068819|172084707,9835,G,<*>,PASS,SYMBOLIC,0,00,00,00,15889,0,0,chr1:172068819|172084707:9835
41000002,chr1:172068819|172084707,9841,G,<*>,PASS,SYMBOLIC,0,00,00,00,15889,0,0,chr1:172068819|172084707:9841
41000003,chr1:172068819|172084707,9849,G,<*>,PASS,SYMBOLIC,0,00,00,00,15889,0,0,chr1:172068819|172084707:9849
41000004,chr1:172068819|172084707,9852,G,<*>,PASS,SYMBOLIC,0,00,00,00,15889,0,0,chr1:172068819|172084707:9852
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
40999995,chr1:172068819|172084707,9810,G,<*>,PASS,SYMBOLIC,0,00,00,00,15889,0,0,chr1:172068819|172084707:9810
40999996,chr1:172068819|172084707,9811,G,<*>,PASS,SYMBOLIC,0,00,00,00,15889,0,0,chr1:172068819|172084707:9811
40999997,chr1:172068819|172084707,9820,G,<*>,PASS,SYMBOLIC,0,00,00,00,15889,0,0,chr1:172068819|172084707:9820
40999998,chr1:172068819|172084707,9825,G,<*>,PASS,SYMBOLIC,0,00,00,00,15889,0,0,chr1:172068819|172084707:9825


In [33]:
aggregate_counts(df)

Unnamed: 0_level_0,n_ref,n_alt
pos_id,Unnamed: 1_level_1,Unnamed: 2_level_1
chr1:100196214|100206636:10,1,0
chr1:100196214|100206636:100,32,0
chr1:100196214|100206636:10024,8,0
chr1:100196214|100206636:10031,8,0
chr1:100196214|100206636:10032,7,0
...,...,...
chr1:99990903|100017815:9537,2,0
chr1:99990903|100017815:9542,2,0
chr1:99990903|100017815:9543,2,0
chr1:99990903|100017815:9545,2,0


In [39]:
outf_nonzero

'/home/hsher/scratch/circ_nextera_iter13/output/edits/RBM15_STAMP.dp4.neg.combined.vcf.nonzero.tsv'

In [37]:
import numpy as np
from scipy.stats import binom
from collections import defaultdict
import pandas as pd
fname = '/home/hsher/scratch/circ_nextera_iter13/output/edits/RBM15_STAMP.dp4.neg.vcf.tsv'
outf = '/home/hsher/scratch/circ_nextera_iter13/output/edits/RBM15_STAMP.dp4.neg.combined.vcf.tsv'
outf_nonzero = outf.replace('.tsv', '.nonzero.tsv')
alt = 'A'
aggregated_counts = defaultdict(lambda: np.array([0,0]))


previous_chunk = pd.DataFrame()
i = 0
with open(outf, 'w') as fhandle:
    with open(outf_nonzero, 'w') as fhandle_nonzero:
        for df in pd.read_csv(fname,
                    sep = '\t', chunksize = 1000000):
            
            df = df.loc[(df['ALT']=='<*>')|(df['ALT']==f'{alt},<*>')]

            last_circ = df['CHROM'].iloc[-1]
            process_next_chunk = df.loc[df['CHROM']==last_circ].copy()

            # process the last circular RNA in next chunck, and add back the ones from previous chunk
            df = df.loc[df['CHROM']!=last_circ]
            print(df.shape[0], previous_chunk.shape[0], process_next_chunk.shape[0])
            df = pd.concat([df,previous_chunk], axis = 0)
            print(df.shape[0])

            aggregated_df = aggregate_counts(df)
            if i == 0:
                aggregated_df.to_csv(fhandle, sep = '\t', index = True, header = True)
                aggregated_df.loc[aggregated_df['n_alt']>0].to_csv(
                    fhandle_nonzero, sep = '\t', index = True, header = True)
            else:
                aggregated_df.to_csv(fhandle, sep = '\t', index = True, header = False)
                aggregated_df.loc[aggregated_df['n_alt']>0].to_csv(
                    fhandle_nonzero, sep = '\t', index = True, header = False)

            previous_chunk = process_next_chunk.copy()

            if i>10:
                break
            i+=1

        # process the last chunk
        aggregated_df = aggregate_counts(previous_chunk)
        aggregated_df.to_csv(fhandle, sep = '\t', index = True, header = False)
        aggregated_df.loc[aggregated_df['n_alt']>0].to_csv(
            fhandle_nonzero, sep = '\t', index = True, header = False)

SYMBOLIC    999353
MIXED          647
Name: TYPE, dtype: int64
<*>        999353
A,<*>         411
T,<*>         127
C,<*>         103
C,A,<*>         3
T,A,<*>         2
A,C,<*>         1
Name: ALT, dtype: int64
997781 0 1983
997781
SYMBOLIC    999023
MIXED          975
SNP              2
Name: TYPE, dtype: int64
<*>        999023
A,<*>         576
T,<*>         202
C,<*>         186
A,C,<*>         3
T,A,<*>         2
C,A,<*>         2
T,C,<*>         2
C,T,<*>         1
C,A,T           1
T,A,C           1
A,T,<*>         1
Name: ALT, dtype: int64
998517 1983 1082
1000500
SYMBOLIC    999267
MIXED          733
Name: TYPE, dtype: int64
<*>        999267
A,<*>         469
T,<*>         154
C,<*>         103
C,A,<*>         3
T,C,<*>         2
A,C,<*>         1
A,T,<*>         1
Name: ALT, dtype: int64
991670 1082 8066
992752
SYMBOLIC    999219
MIXED          781
Name: TYPE, dtype: int64
<*>        999219
A,<*>         485
T,<*>         158
C,<*>         136
C,A,<*>         2
Name: ALT, 

In [40]:
last_circ

'chr1:35381259|35399576'

In [31]:
df.loc[df['TYPE']=='MIXED']

Unnamed: 0,CHROM,POS,REF,ALT,FILTER,TYPE,DP,output/circ/RBM15_STAMP_denovo.sorted.bam.AD,output/circ/RBM15_STAMP_denovo.sorted.bam.ADF,output/circ/RBM15_STAMP_denovo.sorted.bam.ADR,length,n_ref,n_alt,pos_id


In [25]:
aggregated_df['n_alt'].max()

0

In [16]:
df.groupby(by = 'pos_id')[['n_ref', 'n_alt']].sum()

Unnamed: 0_level_0,n_ref,n_alt
pos_id,Unnamed: 1_level_1,Unnamed: 2_level_1
chr1:100058666|100061949:0,20,0
chr1:100058666|100061949:1,21,0
chr1:100058666|100061949:10,23,0
chr1:100058666|100061949:1014,0,0
chr1:100058666|100061949:1015,0,0
...,...,...
chr1:99715586|99729748:9855,4,0
chr1:99715586|99729748:9869,4,0
chr1:99715586|99729748:989,1,0
chr1:99715586|99729748:990,1,0


In [9]:
last_circ_data

Unnamed: 0,CHROM,POS,REF,ALT,FILTER,TYPE,DP,output/circ/RBM15_STAMP_denovo.sorted.bam.AD,output/circ/RBM15_STAMP_denovo.sorted.bam.ADF,output/circ/RBM15_STAMP_denovo.sorted.bam.ADR
998017,chr1:29160757|29168638,728,G,<*>,PASS,SYMBOLIC,1,10,10,00
998018,chr1:29160757|29168638,735,G,<*>,PASS,SYMBOLIC,1,10,10,00
998019,chr1:29160757|29168638,740,G,<*>,PASS,SYMBOLIC,1,10,10,00
998020,chr1:29160757|29168638,742,G,<*>,PASS,SYMBOLIC,1,10,10,00
998021,chr1:29160757|29168638,750,G,<*>,PASS,SYMBOLIC,1,10,10,00
...,...,...,...,...,...,...,...,...,...,...
999995,chr1:29160757|29168638,14564,G,<*>,PASS,SYMBOLIC,2,20,00,20
999996,chr1:29160757|29168638,14569,G,<*>,PASS,SYMBOLIC,2,20,00,20
999997,chr1:29160757|29168638,14570,G,<*>,PASS,SYMBOLIC,2,20,00,20
999998,chr1:29160757|29168638,14580,G,<*>,PASS,SYMBOLIC,0,00,00,00


In [None]:
import pickle



with open(outf, 'wb') as handle:
    pickle.dump(aggregated_counts, handle)

In [None]:
aggregated_counts['GL000195.1:48955|49986:329']

array([2, 3])

KeyboardInterrupt: 

In [4]:
ythdf2.loc[ythdf2['TYPE']=='MIXED']

Unnamed: 0,CHROM,POS,REF,ALT,FILTER,TYPE,DP,AD,ADF,ADR
382,chr5:134910335|134970092,10715,G,"T,<*>",PASS,MIXED,1,010,010,000
443,chr5:134910335|134970092,13050,G,"A,<*>",PASS,MIXED,3,030,030,000
452,chr5:134910335|134970092,13068,G,"A,<*>",PASS,MIXED,5,050,050,000
497,chr5:134910335|134970092,13299,G,"T,<*>",PASS,MIXED,4,040,040,000
516,chr5:134910335|134970092,13401,G,"T,<*>",PASS,MIXED,8,080,080,000
...,...,...,...,...,...,...,...,...,...,...
4036,chr5:134910335|134970092,77354,G,"A,<*>",PASS,MIXED,22,0220,0220,000
4052,chr5:134910335|134970092,77410,G,"A,<*>",PASS,MIXED,260,02410,01130,01280
4095,chr5:134910335|134970092,77568,G,"A,<*>",PASS,MIXED,4,040,000,040
4116,chr5:134910335|134970092,77619,G,"A,<*>",PASS,MIXED,60,2520,0190,2330
