*Note: this notebook needs to run in Python 2.7; something in the demultiplexing stage fails in Python 3.6.*

# Demultiplexing Raw Sequence Files for Uploading to NCBI

### Set up workspace and read in mapping file.

* The mapping file must contain at least these 3 columns:
    * Sample name (`#sampleID`)
    * Index 1 barcode (`rev_barcode`)
    * Index 2 barcode (`fwd_barcode`)

In [1]:
# Notebook-wide path and run settings.
# Root directory for demultiplexed output. The demultiplexing cells below
# hard-code this same prefix instead of reading workDir.
workDir = '/home/bryan/RhizCG/data/v4demultiplexed/'
# NOTE(review): the cells below read raw reads from '/home/seq_data/...',
# not from this '/var/seq_data/...' location -- confirm which one is current.
varSeqDir = '/var/seq_data/RhizCG/150520_run1/raw/'
# Not referenced in the visible part of this notebook -- presumably used by
# later cells; verify before removing.
databaseDir = '/home/bryan/RhizCG/data/databases/'

# Not referenced in the visible part of this notebook -- presumably the merged
# read file and process count for a later step; verify.
seqFile = 'pear_merged-2015-06-01.assembled.dmult.fastq'
nprocs = 20

In [2]:
import pandas as pd
import screed
#from itertools import izip
# yoavram: Bug fix for Python 3 as suggested in https://github.com/nschloe/matplotlib2tikz/issues/20
try:
    from itertools import izip
except ImportError:
    izip = zip
from collections import Counter
import gzip
import matplotlib.pyplot as plt
#import seaborn as sns
import pickle

# from bokeh.io import output_notebook, show
# from bokeh.plotting import figure
# from bokeh.models.glyphs import Circle
# from bokeh.models import ColumnDataSource
# from bokeh.models import HoverTool, BoxZoomTool, ResetTool, CustomJS, CrosshairTool
# output_notebook()

%matplotlib inline

In [3]:
# Load the Library 1 demultiplexing mapping file (tab-delimited).
ir_df = pd.read_csv("/home/bryan/RhizCG/data/MiSeq_amplicon/DeMultiplexMappingFile-Lib1.txt", sep="\t")

In [4]:
# Preview the first five rows of the mapping table.
ir_df.head(n=5)

Unnamed: 0,#sampleID,Library,fwd_barcode,rev_barcode
0,RhizCG-Neg_Control,1,GGATATCT,TCGCTATA
1,RhizCG-Neg_Control_b,1,GATCGTGT,TCATAGAC
2,RhizCG-S1-1.27_Nneg,1,ATCGTACG,AACTCTCG
3,RhizCG-S1-1.27_Npos,1,ACTATCTG,AACTCTCG
4,RhizCG-S1-2.05_Nneg,1,ATCGTACG,GTCGTAGT


In [5]:
# rev-comp functions
def complement(seq):
    """Return the base-by-base complement of a nucleotide sequence.

    A/T and C/G are swapped and N maps to itself; the case of every
    character is preserved. The order of the bases is NOT reversed.

    Raises KeyError if `seq` contains any character other than
    A, C, G, T, N (upper or lower case).
    """
    pairs = {'A': 'T', 'C': 'G', 'G': 'C', 'T': 'A', 'N': 'N'}
    # Mirror every upper-case pairing for lower-case input.
    for base, partner in list(pairs.items()):
        pairs[base.lower()] = partner.lower()

    complemented = []
    for base in seq:
        complemented.append(pairs[base])
    return ''.join(complemented)

def revComp(seq):
    """Return the reverse complement of `seq`.

    The sequence is reversed and then passed to complement(), so the same
    alphabet restrictions (and KeyError on unknown characters) apply.
    """
    return complement(seq[::-1])

In [7]:
# Reverse-complement the reverse barcodes so they can be matched against the
# index-read sequences during demultiplexing.
#
# The untouched barcodes are kept in "rev_barcode_raw" and always used as the
# input, so re-running this cell is idempotent. Previously a second run
# reverse-complemented "rev_barcode" back to its original orientation and
# silently broke barcode matching.
if "rev_barcode_raw" not in ir_df.columns:
    ir_df["rev_barcode_raw"] = ir_df["rev_barcode"]

Barcode_R = [revComp(y).upper() for y in ir_df["rev_barcode_raw"]]
ir_df["rev_barcode"] = Barcode_R
ir_df.head()

Unnamed: 0,#sampleID,Library,fwd_barcode,rev_barcode
0,RhizCG-Neg_Control,1,GGATATCT,TATAGCGA
1,RhizCG-Neg_Control_b,1,GATCGTGT,GTCTATGA
2,RhizCG-S1-1.27_Nneg,1,ATCGTACG,CGAGAGTT
3,RhizCG-S1-1.27_Npos,1,ACTATCTG,CGAGAGTT
4,RhizCG-S1-2.05_Nneg,1,ATCGTACG,ACTACGAC


# Library1-run1

## Check that sequences are in the same order between read and index files

Sequence names begin with the '@' symbol in each file. Check that the characters following the '@' symbol are identical across the files.

In [8]:
# Print the first 10 lines of the Library 1 / run 1 index-1 FASTQ to inspect its record names.
!zcat /home/seq_data/RhizCG/150520_run1/raw/RhizCG-Library1_run1_index1.fq.gz | head

@M02465:150:000000000-AE9NC:1:1101:13459:1675 1:N:0:0
TTCTTTTT
+
1>1>1@1@
@M02465:150:000000000-AE9NC:1:1101:15650:1731 1:N:0:0
TCTACGAC
+
3>AABBBC
@M02465:150:000000000-AE9NC:1:1101:13843:1740 1:N:0:0
TTTCCTAC

gzip: stdout: Broken pipe


In [9]:
# Print the first 10 lines of the Library 1 / run 1 read-1 FASTQ; its record names should match the index file.
!zcat /home/seq_data/RhizCG/150520_run1/raw/RhizCG-Library1_run1_read1.fq.gz | head

@M02465:150:000000000-AE9NC:1:1101:13459:1675 1:N:0:0
TTCTGTCTTTTCGTATGCAGGGCGTTGTGTTCGATAATGGTGATATGTATGTTGACGGCCATAAGGCTGCTTCTGACGTTCGTGATGAGTTTGTATCTGTTACTGAGAAGTTAATGGATGAATTGGCACAGATCGGAAGAGCGGTTCAGCAGGAATGCCGAGACCGATCTCGTATGCCGTCTTCTGCTTGTATATAAACATTTTTCTCTTTTTTTTCTTTTTTTTTCTTTCTCTCTTCTTTTTACTTTTTT
+
A11>1B@FFFFFEFA11331110A0A000BB2A0DA/D21AA1D2DFB2A22BA21///A/BB111AGACHHFGF21B?FCGFG0FD2B@FGHGD@GHGHFHGHH2B1>BFGGDBGGBGGB1FHHHE1E0F<FFF///C0FFCCECGFBD1FG/0?GF1<>>-.<@-<<C=C.<:;00-;-G:0C00;CF.00;0;000000000/9000009---/9////---9/////;/;/9//;////-9/9//;-
@M02465:150:000000000-AE9NC:1:1101:15650:1731 1:N:0:0
TCCTTCGGTGGCAAGCGTTGTTCGGAATTACTGGGCGTAAAGGGCGTGTAGGCGGCCTTCTAAGTCAGACGTGAAATCCCCCGGCTTAACCTGGGAACTGCGTCTGATACTGGGAGGCTAGAGTATGGGAGAGGGAGGTAGACTTCCAGGTGTAGCGGTGAAATGCGTAGATATCTGGAGGACCACCGGTGGCGAAGGCGGCATCCTGGACCGGTATTGACGCTGAGGCGCGAAAGCCAGGGGAGCAAACG
+
>1A1>@>AAA?B1B1A1EEAFCD00B0BABADDFGFEFA/FGDF///AABD1AAE@/E1@BB@FGE2B11>F//FGBGCFFEEC>>>11BDGHGGCGFHHECGGGGFFGGHHGHFFCC?1?1?FH

### Create new R1 and R2 files for each sample based on index barcodes

This block will read in your read1, read2, index1, and index2 files, compare barcode sequences to the provided mapping file, and create new files based on the sample names provided in the mapping file.

In [10]:
# Build the barcode-pair -> sample-name lookup.
# Key = upper-cased rev_barcode followed by upper-cased fwd_barcode.
idx_d = {}
for _, row in ir_df.iterrows():
    barcode_pair = row["rev_barcode"].upper() + row["fwd_barcode"].upper()
    idx_d[barcode_pair] = row["#sampleID"]

In [11]:
# Demultiplex Library 1, run 1.
#
# The index1 / index2 / read1 / read2 FASTQ files are walked in lock-step.
# Each record's barcode pair (index-1 sequence + index-2 sequence) is looked
# up in the mapping table and the read pair is appended to that sample's
# gzipped R1 / R2 FASTQ file. Records whose barcode pair is not in the
# mapping file are counted in bad_bc_count and skipped.
out_dir = "/home/bryan/RhizCG/data/v4demultiplexed/run1/"
path = "/home/seq_data/RhizCG/150520_run1/raw/"

# Barcode pair -> sample name. The key order (rev_barcode + fwd_barcode) must
# match idx_key below (index-1 sequence + index-2 sequence).
idx_d = dict([(row["rev_barcode"].upper() + row["fwd_barcode"].upper(), row["#sampleID"]) for i, row in ir_df.iterrows()])

ir1 = screed.open(path + "RhizCG-Library1_run1_index1.fq.gz")
ir2 = screed.open(path + "RhizCG-Library1_run1_index2.fq.gz")
r1 = screed.open(path + "RhizCG-Library1_run1_read1.fq.gz")
r2 = screed.open(path + "RhizCG-Library1_run1_read2.fq.gz")

# One binary gzip handle per sample and read direction.
read1_fhs = dict([(s, gzip.open(out_dir + "%s.R1.fq.gz" % s, "wb")) for s in ir_df["#sampleID"]])
read2_fhs = dict([(s, gzip.open(out_dir + "%s.R2.fq.gz" % s, "wb")) for s in ir_df["#sampleID"]])

seq_cnt = Counter()   # read pairs assigned to each sample
bc_cnt = Counter()    # occurrences of every observed barcode pair
bad_bc_count = 0      # records whose barcode pair is not in idx_d

try:
    # The loop variables get their own names so the reader objects
    # (ir1, ir2, r1, r2) are no longer overwritten by individual records.
    # NOTE: izip/zip stops at the shortest input, so a truncated file
    # shortens the run without raising.
    for idx1_rec, idx2_rec, read1_rec, read2_rec in izip(ir1, ir2, r1, r2):
        idx_key = idx1_rec.sequence + idx2_rec.sequence
        bc_cnt[idx_key] += 1

        sample_name = idx_d.get(idx_key)
        if sample_name is None:
            bad_bc_count += 1
            continue

        seq_cnt[sample_name] += 1

        # The handles are binary, so encode the record text. Encoding the
        # dictionary key instead (as before) raises KeyError on Python 3.
        fastq1 = "@%s\n%s\n+\n%s\n" % (read1_rec.name, read1_rec.sequence, read1_rec.quality)
        read1_fhs[sample_name].write(fastq1.encode())

        fastq2 = "@%s\n%s\n+\n%s\n" % (read2_rec.name, read2_rec.sequence, read2_rec.quality)
        read2_fhs[sample_name].write(fastq2.encode())
finally:
    # Always flush and close the outputs, even if the loop fails part-way.
    for fh in list(read1_fhs.values()) + list(read2_fhs.values()):
        fh.close()
    # Close the input readers when the installed screed version supports it.
    for reader in (ir1, ir2, r1, r2):
        close_reader = getattr(reader, "close", None)
        if close_reader is not None:
            close_reader()

In [16]:
# Spot-check the demultiplexed output: print the first 8 lines of one sample's R1 file.
!zcat /home/bryan/RhizCG/data/v4demultiplexed/run1/RhizCG-S1-1.27_Nneg.R1.fq.gz | head -8

@M02465:150:000000000-AE9NC:1:1101:15879:1880 1:N:0:0
TACGTAGGGTGCGAGCGTTAATCGGAATTACTGGGCGTAAAGCGTGCGCAGGCGGTTTTGTAAGACAGGTGTGAAATCCCCGGGCTTAACCTGGGAACTGCGCTTGTGACTGCAAGACTCGAGTACGGCAGAGGGGGGTAGAATTCCACGTGTAGCAGTGAAATGCGTAGAGATGGGGAGGAATAGCGATGGCGAAGGCAGCCCCCTGGGGCGATACTGGCGCTCGTGCACGAAAGCGGGGGGAGCAAACG
+
AAA?ABC?A>>AA2AEAAE225DAEA2AFBDGBBEGEAA1ABGE1B?A00A>B>?>>BF1244@4BB2?B@3?4?4FGF3FC?E/</B2BBBFAA02/2FG///?F?F11?F0?DB0??1?<ADHHGC--<..E?CFA9.9//9FFBFFDEF/:99/99B//;BBBD.;@.9/;//;9.9-..////--99..9---..9.99..-9.?.-.9--:9;B/;B.;9-9;..;9..;./.-9;->AF..:/9.
@M02465:150:000000000-AE9NC:1:1101:11677:2022 1:N:0:0
TACGTAGGGTGCGAGCGTTAATCGGAATTACTGGGCGTAAAGCGTGCGCAGGCGGTTCGTTGTGTCCGCGGTGAAATCCCCCGGCTCCACCCCGGACTGGCGCGTGAAACTGTCCGACTGGAGTGCGGGCCAGGGGGGGGGAATTCCCCGTGGAGCGGGGGAATGCGTAGAGAGTGGGAGGAAGACCCGCGGGGAAGGCGGCCCCCCGGGCCGGCTCTGGCGGCCAGGGTCGGAGGCCGGGGGCTCACGAC
+
>1AA1BB?AFA>1AAA0AE00FD00A0ABEFFFFEAEAEEFHFF/A/EA//>EE@>>/EEG0/?FB1>//>EE/112BEGC/<//>/<?FG//>@//>.><-.-<-..<0<<000..-::;/:/:

# Library 1 run1-rerun

In [17]:
# Demultiplex Library 1, run 1 rerun.
#
# The index1 / index2 / read1 / read2 FASTQ files are walked in lock-step.
# Each record's barcode pair (index-1 sequence + index-2 sequence) is looked
# up in the mapping table and the read pair is appended to that sample's
# gzipped R1 / R2 FASTQ file. Records whose barcode pair is not in the
# mapping file are counted in bad_bc_count and skipped.
out_dir = "/home/bryan/RhizCG/data/v4demultiplexed/run1-rerun/"
path = "/home/seq_data/RhizCG/150522_run1_rerun/raw/"

# Barcode pair -> sample name. The key order (rev_barcode + fwd_barcode) must
# match idx_key below (index-1 sequence + index-2 sequence).
idx_d = dict([(row["rev_barcode"].upper() + row["fwd_barcode"].upper(), row["#sampleID"]) for i, row in ir_df.iterrows()])

ir1 = screed.open(path + "RhizCG-Library1_rerun_index1.fq.gz")
ir2 = screed.open(path + "RhizCG-Library1_rerun_index2.fq.gz")
r1 = screed.open(path + "RhizCG-Library1_rerun_read1.fq.gz")
r2 = screed.open(path + "RhizCG-Library1_rerun_read2.fq.gz")

# One binary gzip handle per sample and read direction.
read1_fhs = dict([(s, gzip.open(out_dir + "%s.R1.fq.gz" % s, "wb")) for s in ir_df["#sampleID"]])
read2_fhs = dict([(s, gzip.open(out_dir + "%s.R2.fq.gz" % s, "wb")) for s in ir_df["#sampleID"]])

seq_cnt = Counter()   # read pairs assigned to each sample
bc_cnt = Counter()    # occurrences of every observed barcode pair
bad_bc_count = 0      # records whose barcode pair is not in idx_d

try:
    # The loop variables get their own names so the reader objects
    # (ir1, ir2, r1, r2) are no longer overwritten by individual records.
    # NOTE: izip/zip stops at the shortest input, so a truncated file
    # shortens the run without raising.
    for idx1_rec, idx2_rec, read1_rec, read2_rec in izip(ir1, ir2, r1, r2):
        idx_key = idx1_rec.sequence + idx2_rec.sequence
        bc_cnt[idx_key] += 1

        sample_name = idx_d.get(idx_key)
        if sample_name is None:
            bad_bc_count += 1
            continue

        seq_cnt[sample_name] += 1

        # The handles are binary, so encode the record text. Encoding the
        # dictionary key instead (as before) raises KeyError on Python 3.
        fastq1 = "@%s\n%s\n+\n%s\n" % (read1_rec.name, read1_rec.sequence, read1_rec.quality)
        read1_fhs[sample_name].write(fastq1.encode())

        fastq2 = "@%s\n%s\n+\n%s\n" % (read2_rec.name, read2_rec.sequence, read2_rec.quality)
        read2_fhs[sample_name].write(fastq2.encode())
finally:
    # Always flush and close the outputs, even if the loop fails part-way.
    for fh in list(read1_fhs.values()) + list(read2_fhs.values()):
        fh.close()
    # Close the input readers when the installed screed version supports it.
    for reader in (ir1, ir2, r1, r2):
        close_reader = getattr(reader, "close", None)
        if close_reader is not None:
            close_reader()

In [18]:
# Report how many records had a barcode pair that is absent from the mapping file.
# Written as a single parenthesised argument so it runs on both Python 2 and 3.
print("We found %s barcodes that did not match the mapping file." % bad_bc_count)

We found 3249535 barcodes that did not match the mapping file.


# Library 2

In [6]:
# Load the Library 2 demultiplexing mapping file (tab-delimited); this replaces the Library 1 table in ir_df.
ir_df = pd.read_csv("/home/bryan/RhizCG/data/MiSeq_amplicon/DeMultiplexMappingFile-Lib2.txt", sep="\t")

In [7]:
# Preview the first five rows of the Library 2 mapping table.
ir_df.head(n=5)

Unnamed: 0,#sampleID,Library,fwd_barcode,rev_barcode
0,RhizCG-S4-4.27_Npos,2,ATCGTACG,AACTCTCG
1,RhizCG-S3-4.27_Npos,2,ACTATCTG,AACTCTCG
2,RhizCG-S5-2.05_Npos,2,TAGCGAGT,AACTCTCG
3,RhizCG-S1-1.06_Nneg,2,CTGCGTGT,AACTCTCG
4,RhizCG-S4-1.06_Npos,2,TCATCGAG,AACTCTCG


In [8]:
# Reverse-complement the Library 2 reverse barcodes so they can be matched
# against the index-read sequences during demultiplexing.
#
# The untouched barcodes are kept in "rev_barcode_raw" and always used as the
# input, so re-running this cell is idempotent. Previously a second run
# reverse-complemented "rev_barcode" back to its original orientation and
# silently broke barcode matching.
if "rev_barcode_raw" not in ir_df.columns:
    ir_df["rev_barcode_raw"] = ir_df["rev_barcode"]

Barcode_R = [revComp(y).upper() for y in ir_df["rev_barcode_raw"]]
ir_df["rev_barcode"] = Barcode_R
ir_df.head()

Unnamed: 0,#sampleID,Library,fwd_barcode,rev_barcode
0,RhizCG-S4-4.27_Npos,2,ATCGTACG,CGAGAGTT
1,RhizCG-S3-4.27_Npos,2,ACTATCTG,CGAGAGTT
2,RhizCG-S5-2.05_Npos,2,TAGCGAGT,CGAGAGTT
3,RhizCG-S1-1.06_Nneg,2,CTGCGTGT,CGAGAGTT
4,RhizCG-S4-1.06_Npos,2,TCATCGAG,CGAGAGTT


In [9]:
# Print the first 10 lines of the Library 2 / run 1 index-1 FASTQ to inspect its record names.
!zcat /home/seq_data/RhizCG/150615_run2/raw/RhizCG-Library2_run1_Index1.fq.gz | head

@M01032:290:000000000-AFRDE:1:1101:15801:1331 1:N:0:0
TATAGCGA
+
CCCDDFED
@M01032:290:000000000-AFRDE:1:1101:15566:1331 1:N:0:0
CGAGAGTT
+
CCCBCCFF
@M01032:290:000000000-AFRDE:1:1101:15506:1337 1:N:0:0
CGAGAATT

gzip: stdout: Broken pipe


In [10]:
# Print the first 10 lines of the Library 2 / run 1 read-1 FASTQ; its record names should match the index file.
!zcat /home/seq_data/RhizCG/150615_run2/raw/RhizCG-Library2_run1_read1.fq.gz | head

@M01032:290:000000000-AFRDE:1:1101:15801:1331 1:N:0:0
TACGGCGGGTGCAAGCGTTAATCGGAATTACTGGGCGTAAAGCGCACGCACGCGGTCTGTCAAGTCGGATGTGAAATCCCCGGGCTCAACCTGGGAACTGCATTCGACCCTGGCAGGCTAGAGTCTTGTAGAGGGGGGTAGACTTCCAGGTGTAGCGGTGACATGCGTAGAGACATGGAGGACTACCGGTGGCGCAGCGGGCCCCCTGGTCAACGCCTGACGCGCAGGAGTGAAAGCGAGGGGAGTGACCA
+
11A111>>D/>>BA1///A/A2D/B/ABB1BDF1A/>/>//@@////>////>//?<EGEEB>DGFEGCC?GF221B>1FC></>>/<?<??00>FFEHF1>11>11.--<G.</.-..0;0:;C00G00<0;/C---9-99/;/;;;/B://:BB--;;EF//:9;--;-9/-;/;B/9---99/-@;@@A->-@=--------9@-E9//9/----;-:-;9--------/9//9--;-9--;--//9/
@M01032:290:000000000-AFRDE:1:1101:15566:1331 1:N:0:0
TCCGTAGGGGGCAAGCGTTGTCCGGAATTATTGGGCGTAAAGCCCGTGTAGGCGGCCAGATAAGTCCGCTGTGAATTCTCGAGGATCACCTTCCAGCTGTCGGCGGCAACTGTCTGGCTAGAGTCCGGAAGAGGCGAATGGAGTTCCCGGTGTAGCGGTGCGATGCGCAGATATCCGGACGTACACCAGTGGCGCCGGAGGTTCTCTGGGACGGACATGTCGATGCTCACAGAAAGACTGGTGATGGGACG
+
111>ACAAAA@00A00AEEEEDA00//AB1DA11/BAA///210AAEAFBDGE>/>E/>01@1B@B1?///?0222BBB>F1///B<000BBC0110BG11?C/</---.>C=DFBC0<DG0<=<

In [None]:
# Demultiplex Library 2, run 1.
#
# The index1 / index2 / read1 / read2 FASTQ files are walked in lock-step.
# Each record's barcode pair (index-1 sequence + index-2 sequence) is looked
# up in the mapping table and the read pair is appended to that sample's
# gzipped R1 / R2 FASTQ file. Records whose barcode pair is not in the
# mapping file are counted in bad_bc_count and skipped.
out_dir = "/home/bryan/RhizCG/data/v4demultiplexed/lib2run1/"
path = "/home/seq_data/RhizCG/150615_run2/raw/"

# Barcode pair -> sample name. The key order (rev_barcode + fwd_barcode) must
# match idx_key below (index-1 sequence + index-2 sequence).
idx_d = dict([(row["rev_barcode"].upper() + row["fwd_barcode"].upper(), row["#sampleID"]) for i, row in ir_df.iterrows()])

ir1 = screed.open(path + "RhizCG-Library2_run1_Index1.fq.gz")
ir2 = screed.open(path + "RhizCG-Library2_run1_Index2.fq.gz")
r1 = screed.open(path + "RhizCG-Library2_run1_read1.fq.gz")
# BUG FIX: this previously opened the read1 file again, so every *.R2.fq.gz
# output received read-1 data.
# NOTE(review): the read2 file name follows the pattern of the other runs --
# confirm it exists under `path`.
r2 = screed.open(path + "RhizCG-Library2_run1_read2.fq.gz")

# One binary gzip handle per sample and read direction.
read1_fhs = dict([(s, gzip.open(out_dir + "%s.R1.fq.gz" % s, "wb")) for s in ir_df["#sampleID"]])
read2_fhs = dict([(s, gzip.open(out_dir + "%s.R2.fq.gz" % s, "wb")) for s in ir_df["#sampleID"]])

seq_cnt = Counter()   # read pairs assigned to each sample
bc_cnt = Counter()    # occurrences of every observed barcode pair
bad_bc_count = 0      # records whose barcode pair is not in idx_d

try:
    # The loop variables get their own names so the reader objects
    # (ir1, ir2, r1, r2) are no longer overwritten by individual records.
    # NOTE: izip/zip stops at the shortest input, so a truncated file
    # shortens the run without raising.
    for idx1_rec, idx2_rec, read1_rec, read2_rec in izip(ir1, ir2, r1, r2):
        idx_key = idx1_rec.sequence + idx2_rec.sequence
        bc_cnt[idx_key] += 1

        sample_name = idx_d.get(idx_key)
        if sample_name is None:
            bad_bc_count += 1
            continue

        seq_cnt[sample_name] += 1

        # The handles are binary, so encode the record text. Encoding the
        # dictionary key instead (as before) raises KeyError on Python 3.
        fastq1 = "@%s\n%s\n+\n%s\n" % (read1_rec.name, read1_rec.sequence, read1_rec.quality)
        read1_fhs[sample_name].write(fastq1.encode())

        fastq2 = "@%s\n%s\n+\n%s\n" % (read2_rec.name, read2_rec.sequence, read2_rec.quality)
        read2_fhs[sample_name].write(fastq2.encode())
finally:
    # Always flush and close the outputs, even if the loop fails part-way.
    for fh in list(read1_fhs.values()) + list(read2_fhs.values()):
        fh.close()
    # Close the input readers when the installed screed version supports it.
    for reader in (ir1, ir2, r1, r2):
        close_reader = getattr(reader, "close", None)
        if close_reader is not None:
            close_reader()

In [29]:
# Report how many records had a barcode pair that is absent from the mapping file.
# Written as a single parenthesised argument so it runs on both Python 2 and 3.
print("We found %s barcodes that did not match the mapping file." % bad_bc_count)

We found 1906597 barcodes that did not match the mapping file.


# Library 2 rerun

In [11]:
# Print the first 10 lines of the Library 2 rerun index-1 FASTQ to inspect its record names.
!zcat /home/seq_data/RhizCG/150622_run2_rerun/raw/RhizCG-Library2_rerun_Index1.fq.gz | head

@M01032:293:000000000-AG0VY:1:1101:16244:1345 1:N:0:0
TAGTCTCC
+
CCCCCFFF
@M01032:293:000000000-AG0VY:1:1101:16047:1346 1:N:0:0
TATAGCGA
+
BBAABBDB
@M01032:293:000000000-AG0VY:1:1101:15548:1347 1:N:0:0
CGAGAGTT

gzip: stdout: Broken pipe


In [12]:
# Print the first 10 lines of the Library 2 rerun read-1 FASTQ; its record names should match the index file.
!zcat /home/seq_data/RhizCG/150622_run2_rerun/raw/RhizCG-Library2_rerun_read1.fq.gz | head

@M01032:293:000000000-AG0VY:1:1101:16244:1345 1:N:0:0
TACGGAGGGTGCGAGCGTTGTCCGGAATCACTGGGCGTAAAGGGCGCGTAGGCGGCGCGGGGAGTCGGGGGTGAAAGGCCGGGGCTCAACCCCGGATCGGCGGTCGAGACTGCCGGGCTGGAGGACGGGAGGGGAGGGGGGGATGCTGGGTGGGGGGGGGGAATGCGAAGAGAGAGGGAGGTACAACGGGGGCGAGGGCGGGTCGTGGGGGGGGGGATGAAGGCGGGGGGCGAAAGAGGGGGGGGGGGAGG
+
AAAAAAADD>>>EEEAEFGGGF1BAEEF1GFGHHH/EFGGGHHGGGGGEEGGGGE/>/>///>>C<AC@-<;A.:CG.;;C@AG@9EFGF:..;@@-9;A@--;;;@-;-/99B;-99-AAAA9-9@@-9@-@@-99--;@---;-/9:;/9----;;9>9---9/;--;AA----:B-;9-/9///9@-;@-9--;;9;99--;--;-;--9-----://;/-;-;;-9-9--;A/9-9=@@-;---99-
@M01032:293:000000000-AG0VY:1:1101:16047:1346 1:N:0:0
GACGGAGGGTGCAAGCGTTAATCGGAATTACTGGGCGTAAAGCGCACGCAGGCGGTCTGTCAAGTCGGATGTGAAATCCCGGGGCTCAACCTGGGAACTGCATTCGAAACTGGCAGGCTAGAGTCCGGTAGAGGGGGGTAGAAATCCAGGTGGCGCGGGGAAATGCGTAGAGCTCTGGAGGAACACCGGGGGCGAAGGCGGCAGGCTGGAGAAGAAATGACGCAGCGGGGAGATAGCGGGGGGAGCGAAAC
+
1AAAAAADD>>AEGGGGGCGAFGGFBCFBFGHHHHHGGGGHH1BE/EE/AEEGGG///BB2FFGGGAEE>>BF>FF2FCG/>EE/CHHHBH0<FFF1/?GG11?@@F0?<FGB/?ACFFGFHF><

In [13]:
# Demultiplex Library 2, rerun.
#
# The index1 / index2 / read1 / read2 FASTQ files are walked in lock-step.
# Each record's barcode pair (index-1 sequence + index-2 sequence) is looked
# up in the mapping table and the read pair is appended to that sample's
# gzipped R1 / R2 FASTQ file. Records whose barcode pair is not in the
# mapping file are counted in bad_bc_count and skipped.
out_dir = "/home/bryan/RhizCG/data/v4demultiplexed/lib2rerun/"
path = "/home/seq_data/RhizCG/150622_run2_rerun/raw/"

# Barcode pair -> sample name. The key order (rev_barcode + fwd_barcode) must
# match idx_key below (index-1 sequence + index-2 sequence).
idx_d = dict([(row["rev_barcode"].upper() + row["fwd_barcode"].upper(), row["#sampleID"]) for i, row in ir_df.iterrows()])

ir1 = screed.open(path + "RhizCG-Library2_rerun_Index1.fq.gz")
ir2 = screed.open(path + "RhizCG-Library2_rerun_Index2.fq.gz")
r1 = screed.open(path + "RhizCG-Library2_rerun_read1.fq.gz")
r2 = screed.open(path + "RhizCG-Library2_rerun_read2.fq.gz")

# One binary gzip handle per sample and read direction.
read1_fhs = dict([(s, gzip.open(out_dir + "%s.R1.fq.gz" % s, "wb")) for s in ir_df["#sampleID"]])
read2_fhs = dict([(s, gzip.open(out_dir + "%s.R2.fq.gz" % s, "wb")) for s in ir_df["#sampleID"]])

seq_cnt = Counter()   # read pairs assigned to each sample
bc_cnt = Counter()    # occurrences of every observed barcode pair
bad_bc_count = 0      # records whose barcode pair is not in idx_d

try:
    # The loop variables get their own names so the reader objects
    # (ir1, ir2, r1, r2) are no longer overwritten by individual records.
    # NOTE: izip/zip stops at the shortest input, so a truncated file
    # shortens the run without raising.
    for idx1_rec, idx2_rec, read1_rec, read2_rec in izip(ir1, ir2, r1, r2):
        idx_key = idx1_rec.sequence + idx2_rec.sequence
        bc_cnt[idx_key] += 1

        sample_name = idx_d.get(idx_key)
        if sample_name is None:
            bad_bc_count += 1
            continue

        seq_cnt[sample_name] += 1

        # The handles are binary, so encode the record text. Encoding the
        # dictionary key instead (as before) raises KeyError on Python 3.
        fastq1 = "@%s\n%s\n+\n%s\n" % (read1_rec.name, read1_rec.sequence, read1_rec.quality)
        read1_fhs[sample_name].write(fastq1.encode())

        fastq2 = "@%s\n%s\n+\n%s\n" % (read2_rec.name, read2_rec.sequence, read2_rec.quality)
        read2_fhs[sample_name].write(fastq2.encode())
finally:
    # Always flush and close the outputs, even if the loop fails part-way.
    for fh in list(read1_fhs.values()) + list(read2_fhs.values()):
        fh.close()
    # Close the input readers when the installed screed version supports it.
    for reader in (ir1, ir2, r1, r2):
        close_reader = getattr(reader, "close", None)
        if close_reader is not None:
            close_reader()

In [34]:
# Report how many records had a barcode pair that is absent from the mapping file.
# Written as a single parenthesised argument so it runs on both Python 2 and 3.
print("We found %s barcodes that did not match the mapping file." % bad_bc_count)

We found 3146059 barcodes that did not match the mapping file.


In [56]:
# Persist the per-sample and per-barcode counters from the most recent
# demultiplexing run.
# The files are opened in binary mode (required by pickle on Python 3) inside
# `with` blocks so the handles are flushed and closed deterministically.
# NOTE(review): these paths point at the PennRhiz project although the
# counters in this notebook come from RhizCG runs -- confirm the destination.
with open("/home/bryan/PennRhiz/data/run1/v4bac_lib1_demult/seq_cnt.pickle", "wb") as seq_cnt_fh:
    pickle.dump(seq_cnt, seq_cnt_fh)
with open("/home/bryan/PennRhiz/data/run1/v4bac_lib1_demult/bc_cnt.pickle", "wb") as bc_cnt_fh:
    pickle.dump(bc_cnt, bc_cnt_fh)

In [57]:
# Reload the per-sample read counter from disk.
# Binary mode is required by pickle on Python 3, and the `with` block closes
# the handle. Only unpickle files you created yourself: pickle.load can
# execute arbitrary code from an untrusted file.
with open("/home/bryan/PennRhiz/data/run1/v4bac_lib1_demult/seq_cnt.pickle", "rb") as seq_cnt_fh:
    seq_cnt = pickle.load(seq_cnt_fh)

### Check sequence counts per sample

At this point, the demultiplexing is complete, but you might want to check the total number of sequences obtained for each sample.

In [58]:
# Rank samples by demultiplexed read count and draw an interactive
# rank-abundance plot (log-scaled y axis) with Bokeh.
#
# NOTE(review): figure, ColumnDataSource, Circle, CustomJS, HoverTool,
# BoxZoomTool, ResetTool, CrosshairTool and show come from the bokeh imports
# that are commented out in the import cell; re-enable them before running
# this cell on a fresh kernel.

# One row per sample, most abundant first. DataFrame.sort(columns=...) no
# longer exists in pandas, so sort_values is used; each step returns a new
# frame instead of mutating in place.
df = pd.DataFrame.from_dict(dict(seq_cnt.most_common()), orient="index")
df = df.rename(columns={0 : "count"})
df = df.sort_values(by="count", ascending=False)
df["color"] = "#F8766D"
df = df.reset_index()
df = df.rename(columns={"index" : "SampleID"})
# 1-based rank of each sample along the x axis.
df["x"] = list(range(1, len(df) + 1))

p = figure(width=800, height=400, y_axis_type = "log", 
           tools="", toolbar_location="left", 
           y_axis_label = "Seq count", x_axis_label = "Sample")

p.xaxis.axis_line_width = 3
p.yaxis.axis_line_width = 3
p.outline_line_color = None
p.grid.grid_line_color = None

source = ColumnDataSource(df)

# Half-transparent marker used for unselected points.
invisible_circle = Circle(x='x', y='count', 
                          fill_color='color', 
                          fill_alpha=0.5, 
                          line_color="color", 
                          line_alpha = 0.5, size=8)

# Fully opaque marker used for the selected point.
visible_circle = Circle(x='x', y='count', 
                        fill_color='color', 
                        fill_alpha=1.0, 
                        line_color="color")

cr = p.add_glyph(source, 
                 invisible_circle, 
                 selection_glyph=visible_circle, 
                 nonselection_glyph=invisible_circle)

l = p.line(x = df["x"], 
           y = df["count"], 
           line_width=3, 
           color='#F8766D')

# JavaScript run by the hover tool: mark the hovered point as selected.
code = "source.set('selected', cb_data['index']);"
callback = CustomJS(args={'source': source}, code=code)

tooltip = """
    <div>
        <span style="font-size: 17px; font-weight: bold;">@SampleID </span>
    </div>
    <div>
        <span style="font-size: 17px; font-weight: bold;">@count </span>
    </div>
"""

p.add_tools(HoverTool(tooltips=tooltip, callback=callback, renderers=[cr]),
            BoxZoomTool(dimensions=["width"]),
            ResetTool(),
            CrosshairTool(dimensions = ["height"]))

# Hide the x tick labels by drawing them in white.
p.xaxis.major_label_text_color = "white"

# The stray trailing comma that turned `sh` into a one-element tuple is removed.
sh = show(p)

  app.launch_new_instance()


In [59]:
# List (sample, read count) pairs from seq_cnt, highest count first, capped at 192 entries.
seq_cnt.most_common(192)

[('Penn-T4_1-2b_Npos', 68417),
 ('Penn-T1_1-4c_Nneg', 63274),
 ('Penn-T4_1-1b_Npos', 60554),
 ('Penn-T3_3-1c_Nneg', 58384),
 ('Penn-T3_1-4a_Nneg', 56278),
 ('Penn-T3_1-3c_Npos', 55617),
 ('Penn-T2_4-3c_Nneg', 53250),
 ('Penn-T3_1-4a_Npos', 53046),
 ('Penn-T4_3-4a_Npos', 52122),
 ('Penn-T4_1-4a_Npos', 51600),
 ('Penn-T1_3-1c_Nneg', 50959),
 ('Penn-T3_3-5a_Nneg', 49058),
 ('Penn-T3_1-4c_Nneg', 48928),
 ('Penn-T4_3-1b_Nneg', 48143),
 ('Penn-T1_1-1b_Nneg', 45768),
 ('Penn-T3_3-5a_Npos', 43265),
 ('Penn-T2_3-2c_Nneg', 41856),
 ('Penn-T2_1-5a_Nneg', 41766),
 ('Penn-T2_1-2a_Nneg', 40830),
 ('Penn-T1_1-3c_Nneg', 40249),
 ('Penn-T3_1-1b_Nneg', 39773),
 ('Penn-T3_3-1b_Npos', 39664),
 ('Penn-T4_3-2b_Nneg', 39629),
 ('Penn-T1_1-3a_Nneg', 39362),
 ('MockCommunity-1', 38978),
 ('Penn-T3_1-3a_Npos', 38793),
 ('Penn-T3_1-3b_Npos', 38436),
 ('Penn-T1_1-4b_Nneg', 38129),
 ('Penn-T3_4-4b_Npos', 37591),
 ('Penn-T3_1-3c_Nneg', 37308),
 ('Penn-T4_3-5a_Nneg', 37229),
 ('Penn-T4_1-3b_Nneg', 37044),
 ('Penn-T4

In [60]:
# Shape (rows, columns) of the subset of samples with fewer than 200000 reads.
df.loc[df["count"] < 200000].shape

(192, 4)