In [1]:
%load_ext autoreload
%autoreload 2

In [1]:
import pandas as pd
import sys
sys.path.insert(0, '../../')

from seqtables import seqtables
from seqtables.utils.insilica_sequences import generate_sequence
from seqtables.io import create_scratch_data

In [2]:
from seqtables.internals import sam_to_arr

go2o2


In [3]:
from seqtables.utils import sam_tools

In [5]:
# Smoke-test of the compiled helper on a short literal.
# NOTE(review): the output recorded for this cell is 'aac' although the input
# 'aacca' has five characters -- looks truncated; confirm tounicode's length handling.
sam_to_arr.tounicode('aacca')

'aac'

In [4]:
# SAM file used as input for the rest of this notebook (path is relative to the
# notebook's working directory).
sample_sam_file = './fake_algn_data.sam'
# The result is iterated and concatenated with pd.concat in a later cell, so
# read_sam presumably yields DataFrame pieces rather than one frame -- TODO confirm
# what chunks=None means for the piece size.
df = sam_tools.read_sam(sample_sam_file, chunks=None)

In [7]:
# Materialise every piece produced by the reader and stack them into one frame.
dft = pd.concat(list(df))

In [8]:
from seqtables.internals import constructor_ops
# Convert the base and quality string columns (in that order) to byte arrays.
arr, qt = (
    constructor_ops.strseries_to_bytearray(dft[column])
    for column in ('seq', 'qual')
)

In [5]:
import numpy as np

In [13]:
def sam_to_nparray(ref, cigar, pos, name, seqs_np_array, quals_np_array, min_pos = 1, max_pos = None):
    """Expand reads sharing one CIGAR and start position into gapped, position-indexed arrays.

    Parameters
    ----------
    ref : unused; kept so the signature matches the SAM columns passed by callers.
    cigar : CIGAR string. It is parsed with ``sam_tools.cigar_breakdown`` and the
        first element of that result is treated as a sequence of ``(op, length)``
        pairs. Only the ops S (at either end), M, D and I are supported.
    pos : position of the first aligned base, in the same coordinates as
        ``min_pos`` / ``max_pos``.
    name : label stored alongside every recorded insertion.
    seqs_np_array, quals_np_array : byte arrays of bases / qualities. After
        ``.view('S1')`` they are indexed as ``[:, i:j]``, i.e. one row per read and
        one column per query character (assumes every row follows ``cigar``).
    min_pos : first position to report (must be >= 1).
    max_pos : last position to report; defaults to the last aligned position.

    Returns
    -------
    (aligned_seqs, aligned_quals, ins_seq, ins_qual, column_names)
        ``aligned_*`` are 2-D 'S1' arrays restricted to ``[min_pos, max_pos]``;
        positions not covered by the read hold ``b'-'`` (bases) and ``b'!'``
        (qualities). ``ins_seq`` / ``ins_qual`` are lists of
        ``{position: [name, array]}`` dicts, one per insertion inside the window.
        ``column_names`` holds the position of each returned column.

    Raises
    ------
    AssertionError : if ``min_pos < 1``.
    Exception : if the CIGAR contains an op other than S (terminal), M, D or I.
    """
    seqs_np_array = seqs_np_array.view('S1')
    quals_np_array = quals_np_array.view('S1')
    assert min_pos >= 1, 'Error: currently only allow base positions >= 1'

    # Work on a copy: the insert/append below must not mutate the helper's list.
    c = list(sam_tools.cigar_breakdown(cigar)[0])

    # Soft-clipped bases are present in the read but are not part of the
    # alignment, so drop them column-wise (axis 1 = position within the read).
    if c and c[0][0] == 'S':
        lead = c[0][1]
        seqs_np_array = seqs_np_array[:, lead:]
        quals_np_array = quals_np_array[:, lead:]
        c = c[1:]
    if c and c[-1][0] == 'S':
        tail = c[-1][1]
        # Computed from the width so that a zero-length clip keeps every column.
        seqs_np_array = seqs_np_array[:, :seqs_np_array.shape[1] - tail]
        quals_np_array = quals_np_array[:, :quals_np_array.shape[1] - tail]
        c = c[:-1]
    nums = seqs_np_array.shape[0]

    if pos > min_pos:
        # add gaps at beginning of sequence
        c.insert(0, ('D', pos - min_pos))
        pos = min_pos

    # pend is THE LAST base position covered (not the next base position);
    # only M and D advance along the reference.
    pend = pos + sum(nume for (ctype, nume) in c if ctype == 'M' or ctype == 'D') - 1
    if max_pos is None:
        max_pos = pend
    if pend < max_pos:
        # add gaps at end of sequence
        c.append(('D', max_pos - pend))

    p = pos   # reference position of the next column
    i = 0     # index of the next unread query character
    algn_seq_arrs = []
    algn_qual_arrs = []
    ins_seq = []
    ins_qual = []

    for (ctype, nume) in c:
        if ctype == 'M':
            algn_seq_arrs.append(seqs_np_array[:, i: i + nume])
            algn_qual_arrs.append(quals_np_array[:, i: i + nume])
            p += nume
            i += nume
        elif ctype == 'D':
            algn_seq_arrs.append(np.full((nums, nume), b'-', dtype='S1'))
            algn_qual_arrs.append(np.full((nums, nume), b'!', dtype='S1'))
            p += nume
        elif ctype == 'I':
            # Inserted bases consume the read but not the reference; they are
            # recorded separately, and only when they fall inside the window.
            if min_pos <= p <= max_pos:
                ins_seq.append({p: [name, seqs_np_array[:, i: i + nume]]})
                ins_qual.append({p: [name, quals_np_array[:, i: i + nume]]})
            i += nume
        else:
            raise Exception('Unexpected cigar string: ' + cigar)
    # p should be the last base position, not the next base position
    p -= 1
    algn_seq_arrs = np.hstack(algn_seq_arrs)
    algn_qual_arrs = np.hstack(algn_qual_arrs)

    column_names = np.arange(pos, p + 1)

    # Keep only the columns that fall inside [min_pos, max_pos].
    in_window = (column_names >= min_pos) & (column_names <= max_pos)
    algn_seq_arrs = algn_seq_arrs[:, in_window]
    algn_qual_arrs = algn_qual_arrs[:, in_window]
    column_names = column_names[in_window]

    return algn_seq_arrs, algn_qual_arrs, ins_seq, ins_qual, column_names
            
            
            
    

In [14]:
def sam_to_nparray_2(ref, cigar, pos, seq, qual, min_pos = 1, max_pos = None):
    """Expand a single read into gapped sequence and quality strings.

    String-based counterpart of ``sam_to_nparray`` for one read at a time.

    Parameters
    ----------
    ref : unused; kept so the signature matches the SAM columns passed by callers.
    cigar : CIGAR string. It is parsed with ``sam_tools.cigar_breakdown`` and the
        first element of that result is treated as a sequence of ``(op, length)``
        pairs. Only the ops S (at either end), M, D and I are supported.
    pos : position of the first aligned base, in the same coordinates as
        ``min_pos`` / ``max_pos``.
    seq, qual : the read's bases and qualities as strings of equal length.
    min_pos : position the output should start at when the read starts later;
        the gap is filled with ``'-'`` (bases) and ``'!'`` (qualities).
    max_pos : position the output should reach when the read ends earlier;
        defaults to the last aligned position (no trailing padding).

    Returns
    -------
    (s, q) : aligned base string and aligned quality string. Inserted bases are
        skipped and not reported.

    Notes
    -----
    Unlike ``sam_to_nparray`` the result is only padded, never trimmed: columns
    before ``min_pos`` (when ``pos < min_pos``) or after ``max_pos`` are kept.

    Raises
    ------
    Exception : if the CIGAR contains an op other than S (terminal), M, D or I.
    """
    # Work on a copy: the insert/append below must not mutate the helper's list.
    c = list(sam_tools.cigar_breakdown(cigar)[0])

    # Soft-clipped characters are in the read but not in the alignment.
    if c and c[0][0] == 'S':
        lead = c[0][1]
        seq = seq[lead:]
        qual = qual[lead:]
        c = c[1:]
    if c and c[-1][0] == 'S':
        tail = c[-1][1]
        # Drop the LAST `tail` characters; computed from the length so that a
        # zero-length clip keeps the whole string.
        seq = seq[:len(seq) - tail]
        qual = qual[:len(qual) - tail]
        c = c[:-1]

    if pos > min_pos:
        # add gaps at beginning of sequence
        c.insert(0, ('D', pos - min_pos))
        pos = min_pos

    # pend is THE LAST base position covered (not the next base position);
    # only M and D advance along the reference.
    pend = pos + sum(nume for (ctype, nume) in c if ctype == 'M' or ctype == 'D') - 1
    if max_pos is None:
        max_pos = pend
    if pend < max_pos:
        # add gaps at end of sequence
        c.append(('D', max_pos - pend))

    i = 0  # index of the next unread character of seq/qual
    seq_parts = []
    qual_parts = []

    for (ctype, nume) in c:
        if ctype == 'M':
            seq_parts.append(seq[i:i + nume])
            qual_parts.append(qual[i:i + nume])
            i += nume
        elif ctype == 'D':
            seq_parts.append('-' * nume)
            qual_parts.append('!' * nume)
        elif ctype == 'I':
            # Insertions consume the read without producing an aligned column.
            i += nume
        else:
            raise Exception('Unexpected cigar string: ' + cigar)

    return ''.join(seq_parts), ''.join(qual_parts)
            
            
            
    

In [18]:
%timeit dft[['rname', 'cigar', 'pos', 'seq', 'qual']].apply(lambda x: sam_to_nparray_2(x[0], x[1], x[2], x[3], x[4]), axis=1)

1 loop, best of 3: 730 ms per loop


In [23]:
import time

# Same conversion, but with the reads first grouped by (rname, cigar, pos), to
# compare against the plain row-wise apply.
# perf_counter() is a monotonic, high-resolution clock intended for measuring
# intervals; time.time() can jump if the system clock is adjusted.
t1 = time.perf_counter()
dft[['rname', 'cigar', 'pos', 'seq', 'qual']].groupby(by=['rname', 'cigar', 'pos']).apply(
    # *row passes the columns in order without integer-key positional lookup
    # on a label-indexed Series.
    lambda g: g.apply(lambda row: sam_to_nparray_2(*row), axis=1)
)
t2 = time.perf_counter()
# Elapsed seconds, shown as the cell output.
t2 - t1

4.8242881298065186