In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import pandas as pd
import sys
sys.path.insert(0, '../../')

from seqtables import seqtables
from seqtables.utils.insilica_sequences import generate_sequence
from seqtables.io import create_scratch_data


The pandas.tslib module is deprecated and will be removed in a future version.



In [3]:
from seqtables.utils import sam_tools

In [4]:
sample_sam_file = './fake_algn_data.sam'
df = sam_tools.read_sam(sample_sam_file, chunks=None)

In [5]:
dft = pd.concat([d for d in df])

In [6]:
from seqtables.internals import constructor_ops

In [7]:
arr = constructor_ops.strseries_to_bytearray(dft['seq'])

In [8]:
qt = constructor_ops.strseries_to_bytearray(dft['qual'])

In [9]:
import numpy as np

In [10]:
def sam_to_nparray(ref, cigar, pos, name, seqs_np_array, quals_np_array, min_pos = 1, max_pos = None):
    seqs_np_array = seqs_np_array.view('S1')
    quals_np_array = quals_np_array.view('S1')
    assert min_pos >= 1, 'Error: currently only allow base positions >= 1'
    c = sam_tools.cigar_breakdown(cigar)[0]
    if c[0][0] == 'S':
        seqs_np_array = seqs_np_array[c[0][1]:]
        quals_np_array = quals_np_array[c[0][1]:]
        c = c[1:]
    if c[-1][0] == 'S':
        seqs_np_array = seqs_np_array[:c[1][1]]
        quals_np_array = quals_np_array[:c[-1][1]]
        c = c[:-1]
    nums = seqs_np_array.shape[0]
    
    if pos > min_pos:        
        # add gaps at beginning of sequence
        c.insert(0, ('D', pos - min_pos))
        pos = min_pos

    pend = pos
    
    for (ctype, nume) in c:
        # determine end alignment position
        if ctype == 'M' or ctype == 'D':
            pend += nume
    # pend should be THE LAST base position (not the next base position)
    pend -= 1
    if max_pos is None:
        max_pos = pend
    if pend < max_pos:
        # add gaps at end of sequence
        c.append(('D', max_pos - pend))

    p = pos    
    i = 0
    algn_seq_arrs = []
    ins_seq = []
    ins_qual = []
    algn_qual_arrs = []
    
    for (ctype, nume) in c:                    
        if ctype == 'M':         
            algn_seq_arrs.append(seqs_np_array[:, i: i + nume])
            algn_qual_arrs.append(quals_np_array[:, i: i + nume])            
            p += nume
            i += nume
        elif ctype == 'D':
            algn_seq_arrs.append(np.repeat('-'.encode(), nums * nume).reshape(nums, nume))
            algn_qual_arrs.append(np.repeat('!'.encode(), nums * nume).reshape(nums, nume))
            p += nume            
        elif ctype == 'I':
            if p < min_pos or p > max_pos:
                # no need to store these insertions
                i += nume
                continue
            ins_seq.append(
                {
                    p: [name, seqs_np_array[:, i: i + nume]]
                }
            )
            ins_qual.append(
                {
                    p: [name, quals_np_array[:, i: i + nume]]
                }
            )
            i += nume
        else:
            raise Exception('Unexpected cigar string: ' + cigar)
    # p should be the last base position, not the next base position
    p -= 1
    algn_seq_arrs = np.hstack(algn_seq_arrs)
    algn_qual_arrs = np.hstack(algn_qual_arrs)
            
    column_names = np.arange(pos, p + 1)
    
    slice_these_cols = [i for i, pind in enumerate(column_names) if pind >= min_pos and pind <= max_pos]
    algn_seq_arrs = algn_seq_arrs[:, slice_these_cols]
    algn_qual_arrs = algn_qual_arrs[:, slice_these_cols]
    column_names = column_names[slice_these_cols]
    
    return algn_seq_arrs, algn_qual_arrs, ins_seq, ins_qual, column_names
            
            
            
    

In [39]:
def sam_to_nparray_2(ref, cigar, pos, seq, qual, min_pos = 1, max_pos = None):    
    c = sam_tools.cigar_breakdown(cigar)[0]
    s = ''
    q = ''
        
    if c[0][0] == 'S':
        seq = seq[c[0][1]:]
        qual = qual[c[0][1]:]
        c = c[1:]
    if c[-1][0] == 'S':
        seq = seq[:c[1][1]]
        qualy = qual[:c[-1][1]]
        c = c[:-1]
    
    if pos > min_pos:        
        # add gaps at beginning of sequence
        c.insert(0, ('D', pos - min_pos))
        pos = min_pos
        
    pend = pos
    
    for (ctype, nume) in c:
        # determine end alignment position
        if ctype == 'M' or ctype == 'D':
            pend += nume
            
    # pend should be THE LAST base position (not the next base position)
    pend -= 1
    if max_pos is None:
        max_pos = pend
    if pend < max_pos:
        # add gaps at end of sequence
        c.append(('D', max_pos - pend))

    p = pos    
    i = 0
    algn_seq_arrs = []
    ins_seq = []
    ins_qual = []
    algn_qual_arrs = []
    q = ''
    s = ''
    
    for (ctype, nume) in c:                    
        if ctype == 'M':   
            s += seq[i:i+nume]
            q += qual[i:i+nume]            
            p += nume
            i += nume
        elif ctype == 'D':
            s += '-'*nume
            q += '!'*nume            
            p += nume            
        elif ctype == 'I':
            if p < min_pos or p > max_pos:
                # no need to store these insertions
                i += nume
                continue
            #ins_seq.append(
            #    {
            #        p: [name, seqs_np_array[:, i: i + nume]]
            #    }
            #)
            #ins_qual.append(
            #    {
            #        p: [name, quals_np_array[:, i: i + nume]]
            #    }
            #)
            i += nume
        else:
            raise Exception('Unexpected cigar string: ' + cigar)
    # p should be the last base position, not the next base position
    # p -= 1
    # algn_seq_arrs = np.hstack(algn_seq_arrs)
    # algn_qual_arrs = np.hstack(algn_qual_arrs)
            
    # column_names = np.arange(pos, p + 1)
    
    # slice_these_cols = [i for i, pind in enumerate(column_names) if pind >= min_pos and pind <= max_pos]
    # algn_seq_arrs = algn_seq_arrs[:, slice_these_cols]
    # algn_qual_arrs = algn_qual_arrs[:, slice_these_cols]
    # column_names = column_names[slice_these_cols]
    
    return s, q #algn_seq_arrs, algn_qual_arrs, ins_seq, ins_qual, column_names
            
            
            
    

100

In [44]:
%timeit dft[['rname', 'cigar', 'pos', 'seq', 'qual']].apply(lambda x: sam_to_nparray_2(x[0], x[1], x[2], x[3], x[4]), axis=1)

1 loop, best of 3: 1.26 s per loop


In [None]:
dft.groupby(by=['rname', 'cigar', 'pos', 'seq', 'qual']).apply(
    lambda g: g.apply(lambda x: sam_to_nparray_2(x[0], x[1], x[2], x[3], x[4]), axis=1)
)

In [26]:
delme = np.arange(1, 500)
import time
t1 = time.time()
for i in range(1000000):
    delme[1:490]
t2 = time.time()
t2-t1
    

1.4191420078277588

In [None]:
import warnings
def sam_to_nparray(ref, cigar, pos, name, min_pos = None, max_pos = None, adjust_softclipping_to_pos=False):
    # Return:
        # indices of aligned sequences
        # indices of inserted bases
        # column names
    
    cb = sam_tool.cigar_breakdown(cigar)
    c = cb[0]
    c_evt = cb[1]
    
    
    if min_pos is None:
        min_pos = pos

    sp = pos
    ep = pos + c_evt['D'] - c_evt['I']
    ind_p = sum(c_evt.values()) - c_evt['D']
    
        
    if max_pos is None:
        max_pos = ep

    col_names = np.arange(min_pos, max_pos + 1)
    ind_names =np.arrange(0, ind_p + 1)
    
    p = sp
    ind = 0
    
    pos_ind = []
    arr_ind = []
    
    for e, (ctype, nume) in enumerate(c):
        if ctype == 'S':
            # softclipping, remove bases from sequence of interest, dont change reference position
            ind += nume
        elif ctype == 'M':
            # match or mismatch bases, add to the indicies, adjust reference position
            stop_p = p + num_e
            
            
            if p >= min_pos:
            elif p + nume >= min_pos:
                
            pos_ind.append(
                
            )
            p += nume
            ind += nume            
        elif ctype == 'D':
            # deletions 
    
    
    
    
    
        
    
    
    
    pos_ind = []
    arr_ind = []
    
    p = pos  # index wrt reference sequence
    ind = 0  # index wrt read
    
    if p < min_pos:
    
    for e, (ctype, nume) in enumerate(c):
        if ctype == 'S':
            if e == 0:
                # 5' softclipping
                if adjust_softclipping_to_pos and pos > min_pos:
                    # need to adjust softclipping to add more bases that should align to reference and not be softclipped
                    adjusted_softclipping = min((pos - min_pos), nume)
                    p = pos - adjusted_softclipping
                
                    
                else:
                    # skip these indices
                    ind += nume
                
                    
            elif e == len(c):
            else:
                warnings.warn('Unusual place for softclipping?? ' + cigar)
                continue
        elif 
        
        elif ctype == 'S' and e == len(c):
        elif ctype == 'S'
    
    3
    1 5 => 4
    
    if c[0][0] == 'S':
        if adjust_softclipping_to_pos and pos > min_pos:
            # need to adjust softclipping to add more bases that should align to reference and not be softclipped
            adjusted_softclipping = min((pos - min_pos), c[0][1])
            pos -= adjusted_softclipping
            c = [('M', adjusted_softclipping)] + c[1:]
        else:
            c = c[1:]
            
    if c[-1][0] == 'S':
        seqs_np_array = seqs_np_array[:c[1][1]]
        quals_np_array = quals_np_array[:c[-1][1]]
        c = c[:-1]
    nums = seqs_np_array.shape[0]
    
    if pos > min_pos:        
        # add gaps at beginning of sequence
        c.insert(0, ('D', pos - min_pos))
        pos = min_pos

    pend = pos
    
    for (ctype, nume) in c:
        # determine end alignment position
        if ctype == 'M' or ctype == 'D':
            pend += nume
    # pend should be THE LAST base position (not the next base position)
    pend -= 1
    if max_pos is None:
        max_pos = pend
    if pend < max_pos:
        # add gaps at end of sequence
        c.append(('D', max_pos - pend))

    p = pos    
    i = 0
    algn_seq_arrs = []
    ins_seq = []
    ins_qual = []
    algn_qual_arrs = []
    
    for (ctype, nume) in c:                    
        if ctype == 'M':         
            algn_seq_arrs.append(seqs_np_array[:, i: i + nume])
            algn_qual_arrs.append(quals_np_array[:, i: i + nume])            
            p += nume
            i += nume
        elif ctype == 'D':
            algn_seq_arrs.append(np.repeat('-'.encode(), nums * nume).reshape(nums, nume))
            algn_qual_arrs.append(np.repeat('!'.encode(), nums * nume).reshape(nums, nume))
            p += nume            
        elif ctype == 'I':
            if p < min_pos or p > max_pos:
                # no need to store these insertions
                i += nume
                continue
            ins_seq.append(
                {
                    p: [name, seqs_np_array[:, i: i + nume]]
                }
            )
            ins_qual.append(
                {
                    p: [name, quals_np_array[:, i: i + nume]]
                }
            )
            i += nume
        else:
            raise Exception('Unexpected cigar string: ' + cigar)
    # p should be the last base position, not the next base position
    p -= 1
    algn_seq_arrs = np.hstack(algn_seq_arrs)
    algn_qual_arrs = np.hstack(algn_qual_arrs)
            
    column_names = np.arange(pos, p + 1)
    
    slice_these_cols = [i for i, pind in enumerate(column_names) if pind >= min_pos and pind <= max_pos]
    algn_seq_arrs = algn_seq_arrs[:, slice_these_cols]
    algn_qual_arrs = algn_qual_arrs[:, slice_these_cols]
    column_names = column_names[slice_these_cols]
    
    return algn_seq_arrs, algn_qual_arrs, ins_seq, ins_qual, column_names
            
            
            
    

In [307]:
import xarray as xr
arrdf = pd.DataFrame(arr.view(np.uint8), index=dft.index)
qualdf = pd.DataFrame(qt.view(np.uint8), index=dft.index)

In [309]:
tmp = pd.concat([dft[['rname', 'cigar', 'pos']], arrdf, qualdf], axis=1, keys=['sam', 'seq', 'qual'])

In [373]:
import time
t1 = time.time()
tmp.groupby(by=[('sam', 'rname'), ('sam', 'cigar'), ('sam', 'pos')]).apply(
    lambda grp: sam_to_nparray(
        grp.name[0], grp.name[1], grp.name[2], 
        grp.index, grp['seq'].values, grp['qual'].values
    )
)
t2 = time.time()
t2-t1

29.32545018196106

In [389]:
t1 = time.time()
sam_to_nparray(
    'wtref',
    '10S190M3D1DI395M',
    1,
    tmp.index,
    tmp['seq'].values,
    tmp['qual'].values
) 
t2= time.time()
t2-t1

0.032501220703125

In [396]:
t1 = time.time()
tmp.apply(
    lambda x: sam_to_nparray(
        x[('sam', 'rname')],
        x[('sam', 'cigar')],
        x[('sam', 'pos')],
        x.index,
        x['seq'].values,
        x['qual'].values
    ),
    axis=1
)
t2 = time.time()
t2-t1

TypeError: ('Cannot change data-type for object array.', 'occurred at index 0')

In [337]:
test.index

Int64Index([774, 1789, 2721, 4984, 6809, 8091, 8878, 9569], dtype='int64')

In [369]:
ttt = tmp.groupby(by=[('sam', 'rname'), ('sam', 'cigar'), ('sam', 'pos')])
test = ttt.get_group(list(ttt.groups.keys())[5])
sam_to_nparray(
    'wtref',
    '202M1D95M',
    1,
    test.index,
    test['seq'].values,
    test['qual'].values
)[0].shape

(1, 298)

In [None]:

sam_to_nparray(
    
)

In [297]:
pd.Dqt

(10000, 297)

In [286]:
a1 = sam_to_nparray(
    tmp['rname'], tmp['cigar'], tmp['pos'], 
    tmp.name, arr[0].reshape(1, -1), qt[0].reshape(1, -1),
    max_pos=350, min_pos=290
)

In [287]:
a1[0]

array([[b'A', b'T', b'A', b'C', b'G', b'G', b'T', b'C', b'T', b'G', b'T',
        b'-', b'-', b'-', b'-', b'-', b'-', b'-', b'-', b'-', b'-', b'-',
        b'-', b'-', b'-', b'-', b'-', b'-', b'-', b'-', b'-', b'-', b'-',
        b'-', b'-', b'-', b'-', b'-', b'-', b'-', b'-', b'-', b'-', b'-',
        b'-', b'-', b'-', b'-', b'-', b'-', b'-', b'-', b'-', b'-', b'-',
        b'-', b'-', b'-', b'-', b'-', b'-']], 
      dtype='|S1')

In [221]:
a1[0]

array([[b'C', b'C', b'G', b'C', b'A', b'A', b'A', b'A', b'G', b'C', b'C',
        b'C', b'A', b'A', b'A', b'A', b'A', b'A', b'G', b'C', b'T', b'C',
        b'T', b'A', b'C', b'T', b'T', b'T', b'T', b'T', b'G', b'G', b'C',
        b'A', b'A', b'C', b'G', b'T', b'G', b'C', b'T', b'T', b'G', b'C',
        b'T', b'C', b'G', b'C', b'C', b'T', b'C', b'T', b'T', b'T', b'C',
        b'G', b'G', b'T', b'C', b'T', b'A', b'C', b'C', b'T', b'C', b'G',
        b'G', b'C', b'A', b'C', b'T', b'A', b'T', b'A', b'A', b'G', b'G',
        b'T', b'C', b'T', b'C', b'G', b'A', b'G', b'C', b'C', b'G', b'G',
        b'G', b'C', b'A', b'T', b'T', b'C', b'C', b'T', b'G', b'G', b'G',
        b'G', b'A', b'T', b'C', b'G', b'A', b'G', b'T', b'T', b'C', b'A',
        b'T', b'C', b'C', b'T', b'C', b'A', b'-']], 
      dtype='|S1')

In [151]:
a1[0][2].shape

(1, 67)

In [50]:
%timeit seqtables.seq_tables.SeqTable(dft['seq'])

10 loops, best of 3: 20.7 ms per loop


In [56]:
1e6/dft.shape[0]*20.7/1000

2.07

(5000, 10)