### Imports

In [2]:
#!/usr/bin/env python
import itertools
import logging
import os
import sys

from configparser import ConfigParser
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq
import seaborn as sns
import scipy

gitpath=os.path.expanduser("~/git/mapseq-processing")
sys.path.append(gitpath)

from mapseq.core import *
from mapseq.barcode import *
from mapseq.utils import *
from mapseq.stats import *

gitpath=os.path.expanduser("~/git/mapseq-analysis")
sys.path.append(gitpath)

from msanalysis.analysis import *

print('Done')

Done


### Configuration

In [11]:
logging.getLogger().setLevel(logging.INFO)
vbctable = os.path.expanduser('~/project/mapseq/M282.nextseq/vbctable.out/M282.vbctable.tsv')
conffile = os.path.expanduser('~/project/mapseq/M282.nextseq/M282.nextseq.conf' )
cp = ConfigParser()
cp.read(conffile)

<configparser.ConfigParser at 0x33906ce90>

### Load data and config

In [12]:
vbcdf = load_mapseq_df( vbctable, fformat='vbctable', use_dask=False)
vbcdf

INFO:root:loading /Users/hover/project/mapseq/M282.nextseq/vbctable.out/M282.vbctable.tsv as format vbctable use_dask=False chunksize=50000000


Unnamed: 0,vbc_read_col,label,type,umi_count,read_count,brain,region,site
0,AAAAAAACAGCTAAAGAATCCTTGTTCACC,BC206,lone,1,4,,,target-lone
1,AAAAAAACCGGCCTTGTACTTGGTTCTCTT,BC203,real,3,44,6.0,,target
2,AAAAAAACCTGGGCCCGTTAAGTCACGTTT,BC206,lone,13,156,,,target-lone
3,AAAAAAACTATCTATGAACTATTGTTATTA,BC206,lone,8,86,,,target-lone
4,AAAAAAAGAGGAACATTGTGCTTCTAGCAA,BC206,lone,3,31,,,target-lone
...,...,...,...,...,...,...,...,...
644949,TTTTTTTGTGAACCCGGCCTTTTCAAGTAT,BC206,lone,6,76,,,target-lone
644950,TTTTTTTGTGGTAGAATTCTAATCCGAACA,BC206,real,1,6,,,target-lone
644951,TTTTTTTGTGTATTAGTCGCCGTGTTGGTC,BC199,real,1,10,6.0,,target
644952,TTTTTTTGTGTATTAGTCGCCGTGTTGGTC,BC203,real,23,230,6.0,,target


In [13]:
require_injection = cp.getboolean('matrices','require_injection')
inj_min_umi = int(cp.get('matrices','inj_min_umi'))
target_min_umi = int(cp.get('matrices','target_min_umi'))
target_min_umi_absolute = int(cp.get('matrices','target_min_umi_absolute'))
use_target_negative=cp.getboolean('matrices','use_target_negative')
use_target_water_control=cp.getboolean('matrices','use_target_water_control')
print(f'target_min_umi={target_min_umi}\ninj_min_umi={inj_min_umi} \nuse_target_negative={use_target_negative}\nuse_target_water_control={use_target_water_control}')


target_min_umi=2
inj_min_umi=10 
use_target_negative=False
use_target_water_control=False


### Characterize data

In [22]:
tn = len(vbcdf)
print(f'total vbcs: {tn}')
print('\noverall type counts:')
vtypes = list( vbcdf['type'].unique())
for t in vtypes:
    n = len(vbcdf[ vbcdf['type'] == t ])
    p = n / tn
    print(f'{t}\t:\t{n} ({p:.2f}) ')

print('\noverall site counts:')
for s in vbcdf['site'].unique(): 
    n = len( vbcdf[ vbcdf['site'] == s ])
    p = n / tn
    print(f'{s}\t:\t{n} ({p:.2f}) ')

print('\ntype-specific counts:')
for vt in vtypes:
    sdf = vbcdf[ vbcdf['type'] == vt ]
    sn = len(sdf)
    print(f'{vt}: {sn}')
    for s in sdf['site'].unique(): 
        n = len( sdf[ sdf['site'] == s ])
        p = n / sn
        print(f'      {s} :\t{n} ({p:.2f}) ')
    print('')

total vbcs: 644954

overall type counts:
lone	:	537550 (0.83) 
real	:	97807 (0.15) 
spike	:	9597 (0.01) 

overall site counts:
target-lone	:	529407 (0.82) 
target	:	105610 (0.16) 
target-negative-bio	:	5245 (0.01) 
target-negative	:	4028 (0.01) 
target-water-control	:	664 (0.00) 

type-specific counts:
lone: 537550
      target-lone :	524843 (0.98) 
      target :	10197 (0.02) 
      target-negative-bio :	1148 (0.00) 
      target-negative :	1254 (0.00) 
      target-water-control :	108 (0.00) 

real: 97807
      target :	88133 (0.90) 
      target-negative-bio :	3226 (0.03) 
      target-negative :	2347 (0.02) 
      target-lone :	3799 (0.04) 
      target-water-control :	302 (0.00) 

spike: 9597
      target :	7280 (0.76) 
      target-lone :	765 (0.08) 
      target-negative-bio :	871 (0.09) 
      target-negative :	427 (0.04) 
      target-water-control :	254 (0.03) 



In [26]:
print(f'target_min_umi = {target_min_umi}')
tdf = vbcdf[vbcdf['umi_count'] > target_min_umi]
tdf.groupby(['label','site'], observed=True).count()['umi_count']

target_min_umi = 2


label  site                
BC193  target                     583
BC194  target                     316
BC195  target                     499
BC196  target                     577
BC197  target                   22785
BC198  target                    1049
BC199  target                     904
BC200  target                     657
BC201  target                     526
BC202  target                    1375
BC203  target                    7079
BC204  target-negative-bio        655
BC205  target-negative            532
BC206  target-lone             259423
BC207  target-water-control        20
Name: umi_count, dtype: int64