### Examples for working with seqtables with xarray ###

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import os
import sys
sys.path.insert(0, '../../')
import pandas as pd
import numpy as np
from seqtables import seq_tables, read_sequences, insilica_sequences

  return f(*args, **kwds)


In [3]:
from sys import getsizeof
import time

In [4]:
def utf8len(s):
    return len(s.encode('utf-8'))

In [5]:
st, wt_seq = read_sequences.create_scratch_data(10000, 300, ss_pos=[5, 10, 15])

In [6]:
sequences = list(st.seq_arr.view('S300').astype('U').squeeze())
qualities = list(st.qual_arr.view('S300').astype('U').squeeze())

In [7]:
sequences[2] += 'N'
qualities[2] +=' !'
sequences[3] += 'ACCAAGA' + 'N' * 100
qualities[3] += '!!!!!!!' + '!' * 100

In [8]:
seq_arr = np.array(sequences, dtype='S').view('S1').reshape(len(sequences), -1).view(np.uint8)
qual_arr = np.array(qualities, dtype='S').view('S1').reshape(len(qualities), -1).view(np.uint8)
seq_arr[seq_arr == 0] = ord('N')
qual_arr[qual_arr == 0] = 33

In [9]:
st.seq_arr.astype('U').view('U1')

array([['A', 'A', 'G', ..., 'G', 'C', 'C'],
       ['A', 'A', 'G', ..., 'G', 'C', 'C'],
       ['A', 'A', 'G', ..., 'G', 'C', 'C'],
       ..., 
       ['A', 'A', 'G', ..., 'G', 'C', 'C'],
       ['A', 'A', 'G', ..., 'G', 'C', 'C'],
       ['A', 'A', 'G', ..., 'G', 'C', 'C']], 
      dtype='<U1')

In [10]:
seq_qual_arr = np.dstack([seq_arr, qual_arr - 33])

In [11]:
import xarray as xr

In [24]:
seq_xarray = xr.DataArray(
    seq_arr, 
    dims=['read', 'position'], 
    coords={
        'position': np.arange(1, seq_qual_arr.shape[1] + 1),
    }
                    
)
qual_xarray = xr.DataArray(
    qual_arr, 
    dims=['read', 'position'], 
    coords={
        'position': np.arange(100, 100 + qual_arr.shape[1])
    }
                    
)

In [29]:
p1 = seq_xarray.to_dataframe(name='seqs')

p2 = qual_xarray.to_dataframe(name='quals')
p3 = pd.concat([p1, p2]).stack()

In [132]:
midx = np.arange(1, seq_qual_arr.shape[1] + 1)
midx2 = pd.MultiIndex.from_product([['B'], np.arange(0, qual_arr.shape[1])])

In [141]:
np.zeros(0)

array([], dtype=float64)

In [144]:
mda = xr.DataArray(seq_arr, coords={'ref.position': midx, 'ref.read': []}, dims=['ref.read', 'ref.position'])
mdb = xr.DataArray(qual_arr, coords={'ref.position': midx}, dims=['ref.read', 'ref.position'])
seq_qual_xarray_set = xr.Dataset(
    data_vars = {
        'ref.sequence_table': mda,
        'ref.quality_table': mdb,
        'ref.insertion_table': []
    }
)

ValueError: dimensions ('ref.read', 'ref.position') must have the same length as the number of data dimensions, ndim=1

In [134]:
seq_qual_xarray_set

<xarray.Dataset>
Dimensions:              (ref.insertion_table: 0, ref.position: 407, ref.read: 10000)
Coordinates:
  * ref.position         (ref.position) int32 1 2 3 4 5 6 7 8 9 10 11 12 13 ...
  * ref.insertion_table  (ref.insertion_table) float64 
Dimensions without coordinates: ref.read
Data variables:
    ref.quality_table    (ref.read, ref.position) uint8 68 68 58 68 73 68 68 ...
    ref.sequence_table   (ref.read, ref.position) uint8 65 65 71 67 65 84 84 ...

In [18]:
seq_qual_xarray_set = xr.Dataset(
    data_vars = {
        'seq_array': seq_xarray,
        'qual_array': qual_xarray,
        'insertion_table': []
    }
)

In [24]:
seq_qual_xarray_set = xr.Dataset(
    data_vars = {
        'reference_table': seq_qual_xarray, 
        'insertion_table': []
    }
)

In [50]:
tmp = np.array([]).reshape(0,0)
tmp

array([], shape=(0, 0), dtype=float64)

In [21]:
seq_qual_xarray_set

<xarray.Dataset>
Dimensions:          (insertion_table: 0, position: 407, read: 10000, seqlet_qual: 2)
Coordinates:
  * seqlet_qual      (seqlet_qual) <U13 'seq_letter' 'quality_score'
  * position         (position) int32 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 ...
  * insertion_table  (insertion_table) float64 
Dimensions without coordinates: read
Data variables:
    reference_table  (read, position, seqlet_qual) uint8 71 35 71 35 65 30 ...

In [122]:
seq_qual_xarray[0:3, 0:5, 0].values.view('S1')

array([[b'G', b'G', b'G', b'T', b'T'],
       [b'G', b'G', b'G', b'T', b'C'],
       [b'G', b'G', b'G', b'T', b'G']], 
      dtype='|S1')

In [52]:
seq_arr.nbytes

5140000

In [41]:
%timeit seq_arr[seq_arr.view(np.uint8) == 0] = 'X'

100 loops, best of 3: 6.05 ms per loop


In [42]:
seq_arr

array([[b'G', b'G', b'G', ..., b'X', b'X', b'X'],
       [b'G', b'G', b'G', ..., b'X', b'X', b'X'],
       [b'G', b'G', b'G', ..., b'X', b'X', b'X'],
       ..., 
       [b'G', b'G', b'G', ..., b'X', b'X', b'X'],
       [b'G', b'G', b'G', ..., b'X', b'X', b'X'],
       [b'G', b'G', b'G', ..., b'X', b'X', b'X']], 
      dtype='|S1')

In [27]:
%timeit seq_arr.view(np.uint8)

The slowest run took 38.49 times longer than the fastest. This could mean that an intermediate result is being cached.
1000000 loops, best of 3: 661 ns per loop


In [37]:
seqarr.view('S1').reshape(len(sequences), -1)[2:5]

array([[b'A', b'C', b'C', b'G', b'A', b'C', b'C', b'G', b'T', b'A', b'C',
        b'T', b'G', b'C', b'T', b'G', b'T', b'C', b'T', b'T', b'A', b'A',
        b'A', b'T', b'A', b'T', b'T', b'A', b'T', b'A', b'T', b'A', b'T',
        b'C', b'A', b'T', b'G', b'A', b'C', b'T', b'T', b'G', b'C', b'T',
        b'A', b'A', b'A', b'T', b'G', b'C', b'G', b'G', b'T', b'A', b'T',
        b'C', b'A', b'T', b'T', b'A', b'C', b'G', b'A', b'A', b'T', b'C',
        b'C', b'A', b'G', b'T', b'G', b'G', b'G', b'C', b'G', b'G', b'G',
        b'A', b'A', b'C', b'C', b'G', b'T', b'G', b'T', b'G', b'T', b'A',
        b'A', b'G', b'G', b'C', b'G', b'C', b'G', b'C', b'C', b'A', b'C',
        b'C', b'C', b'G', b'G', b'G', b'T', b'T', b'A', b'T', b'A', b'G',
        b'A', b'G', b'C', b'G', b'C', b'A', b'G', b'C', b'T', b'A', b'T',
        b'T', b'T', b'G', b'G', b'C', b'T', b'T', b'A', b'G', b'G', b'G',
        b'C', b'T', b'G', b'A', b'C', b'C', b'A', b'G', b'G', b'T', b'C',
        b'C', b'T', b'G', b'C', b'A', 