# Demultiplexing ERA Raw Sequence Files for DADA2 Pipeline

### Set up workspace and read in mapping file.

* Mapping file could contain 3 columns:
    * Sample Name
    * Index 1 barcode 
    * Index 2 barcode

In [47]:
import pandas as pd
import screed
from itertools import izip
from collections import Counter
import gzip
import matplotlib.pyplot as plt
#import seaborn as sns
import pickle

from bokeh.io import output_notebook, show
from bokeh.plotting import figure
from bokeh.models.glyphs import Circle
from bokeh.models import ColumnDataSource
from bokeh.models import HoverTool, BoxZoomTool, ResetTool, CustomJS, CrosshairTool
output_notebook()

%matplotlib inline

In [48]:
ir_df = pd.read_table("/home/bryan/PennRhiz/data/run1/Penn_v4bac_lib1_mappingfile.txt")

In [49]:
ir_df.head()

Unnamed: 0,#sampleID,SampleNumericID,Plate,primerFR_ID_byPlate,primerFR_ID_total,PrimerWell,FwdPrimerID,RevPrimerID,barcode_ID,fwd_barcode,rev_barcode,Experiment,TimePoint,PlotID
0,Penn-T1_1-4c_Nneg,D-105,1,1,1,A1,1,1,1,ATCGTACG,AACTCTCG,Penn,T1,1-4c_Nneg
1,Penn-T3_1-3a_Npos,318,1,2,2,B1,2,1,2,ACTATCTG,AACTCTCG,Penn,T3,1-3a_Npos
2,Penn-T4_1-2b_Npos,505,1,3,3,C1,3,1,3,TAGCGAGT,AACTCTCG,Penn,T4,1-2b_Npos
3,Penn-T1_1-3a_Nneg,D-81,1,4,4,D1,4,1,4,CTGCGTGT,AACTCTCG,Penn,T1,1-3a_Nneg
4,Penn-T3_1-4c_Nneg,341,1,5,5,E1,5,1,5,TCATCGAG,AACTCTCG,Penn,T3,1-4c_Nneg


In [50]:
# rev-comp functions
def complement(seq): 
    idx = {'A': 'T', 'C': 'G', 'G': 'C', 'T': 'A', 'N':'N'} 
    idx.update( {x.lower():y.lower() for x,y in idx.items()} )
    return ''.join(idx[x] for x in list(seq))

def revComp(seq):
    seq = seq[::-1]
    return complement(seq)  

In [51]:
Barcode_R = [(revComp(y)).upper() for y \
                         in ir_df.rev_barcode]
ir_df.rev_barcode = Barcode_R
ir_df.head()

Unnamed: 0,#sampleID,SampleNumericID,Plate,primerFR_ID_byPlate,primerFR_ID_total,PrimerWell,FwdPrimerID,RevPrimerID,barcode_ID,fwd_barcode,rev_barcode,Experiment,TimePoint,PlotID
0,Penn-T1_1-4c_Nneg,D-105,1,1,1,A1,1,1,1,ATCGTACG,CGAGAGTT,Penn,T1,1-4c_Nneg
1,Penn-T3_1-3a_Npos,318,1,2,2,B1,2,1,2,ACTATCTG,CGAGAGTT,Penn,T3,1-3a_Npos
2,Penn-T4_1-2b_Npos,505,1,3,3,C1,3,1,3,TAGCGAGT,CGAGAGTT,Penn,T4,1-2b_Npos
3,Penn-T1_1-3a_Nneg,D-81,1,4,4,D1,4,1,4,CTGCGTGT,CGAGAGTT,Penn,T1,1-3a_Nneg
4,Penn-T3_1-4c_Nneg,341,1,5,5,E1,5,1,5,TCATCGAG,CGAGAGTT,Penn,T3,1-4c_Nneg


## Check that sequences are in the same order between read and index files

Sequence names begin with the '@' symbol in each file. Check that the characters following the '@' symbol are identical between each file

In [52]:
!zcat /var/seq_data/PennRhiz/run1/index1.fq.gz  | head

@M02465:330:000000000-B2KV7:1:1101:15470:1337 1:N:0:0
TAGTCTCC
+
CDCCEFFF
@M02465:330:000000000-B2KV7:1:1101:15877:1342 1:N:0:0
TGAGTACG
+
BAABCFFC
@M02465:330:000000000-B2KV7:1:1101:16315:1343 1:N:0:0
ACGCTACT

gzip: stdout: Broken pipe


In [53]:
!zcat /var/seq_data/PennRhiz/run1/read1.fq.gz | head

@M02465:330:000000000-B2KV7:1:1101:15470:1337 1:N:0:0
TACAGAGGGGGCAAGCGTTGTTCGGAATTACTGGGCGTAAAGGGCGCGTAGGCGGCCGCCTAAGTCAGACGTGAAATCCCTCGGCTCAACCGGGGAACTGCGTCTGATACTGGGGGGCTTGAATTCGGGAGAGGGATGCGGAATTCCAGGTGTAGCGGTGAAATGCGCAGATGTCAGGAGGAACACCCGTGGCGAAGGCGGCTCTCTGGGACGGTACTGACGCTGAGGCGCGAAAGCGTGGTGAGCGAACG
+
ABAAA5BAA2>DEFGFFGGGAGHFGGAFHHHGAFGHGGGGHHFFAA@EEEFGFGGEEEE@EGFFF44B33>CDCFHFHGHBC>DEGGHG?<>@DGG.FGFGC.A<C0DGG0=C<-=9B9A@A/99BEFDBBDDF;..;;;;;9;:;F/B;9;;FFFBB;;A/;FFFFFBB;F//99;F/.;;FFFFF.;;D?BBB;@DFBB-@./9/9;..;@?.9ABFBFADFFAA..;-;-9@ABA..-.9/:99-;9-
@M02465:330:000000000-B2KV7:1:1101:15877:1342 1:N:0:0
TACGAAGGGGGCTAGCGTTGCTCGGAATCACTGGGCGTAAAGCGCACGTAGGCGGATTGCTAAGTCAGGGGTGAAATCCTGGAGCTCAACTCCAGAACTGCCTTTGATACTGGCGACCTTGAGTCCGGGAGAGGTGAGTGGAACTGCGAGTGTAGAGGTGAAATTCGTAGATATTCGCAAGAACACCAGTGGCGAAGGCGGCTCACGGGCCCGGGACTGACGCTGAGGTGCGAAAGCGTGGGGAGCGAACG
+
3>AB3AAA2DDDGGGFEGGE?GGGGGEGHHGGFHBFEEGGHHBEEAE?GGHEB>E//?BGFFGGFFFBEFEGGHHHHHHEGG0BBGBGFFFGG1B10GGFG?1GHHB?GGFHHEG//<CGGFHF1

### Create new R1 and R2 files for each sample based on index barcodes

This block will read in your read1, read2, index1, and index2 files, compare barcode sequences to the provided mapping file, and create new files based on the sample names provided in the mapping file.

In [54]:
read1_fhs = dict([(s, gzip.open("/home/bryan/PennRhiz/data/run1/v4bac_lib1_demult/%s.R1.fq.gz"%s, "w")) for s in ir_df["#sampleID"]])
read2_fhs = dict([(s, gzip.open("/home/bryan/PennRhiz/data/run1/v4bac_lib1_demult/%s.R2.fq.gz"%s, "w")) for s in ir_df["#sampleID"]])

idx_d = dict([(row["rev_barcode"].upper() + row["fwd_barcode"].upper(), row["#sampleID"]) for i, row in ir_df.iterrows()])

path = "/var/seq_data/PennRhiz/run1/"
ir1 = screed.open(path + "index1.fq.gz")
ir2 = screed.open(path + "index2.fq.gz")
r1 = screed.open(path + "read1.fq.gz")
r2 = screed.open(path + "read2.fq.gz")

seq_cnt = Counter()
bc_cnt = Counter()
bad_bc_count = 0

for i, recs in enumerate(izip(ir1, ir2, r1, r2)):
    ir1 = recs[0]
    ir2 = recs[1]
    r1 = recs[2]
    r2 = recs[3]
        
    idx_key = ir1.sequence + ir2.sequence
    bc_cnt[idx_key] += 1
    
    try:
        sample_name = idx_d[idx_key]
    except KeyError:
        bad_bc_count += 1
        continue
    
    seq_cnt[sample_name] += 1
    
    fastq1 = "@%s\n%s\n+\n%s\n"%(r1.name, r1.sequence, r1.quality)
    read1_fhs[sample_name].write(fastq1)

    fastq2 = "@%s\n%s\n+\n%s\n"%(r2.name, r2.sequence, r2.quality)
    read2_fhs[sample_name].write(fastq2)
    
for i, j in zip(read1_fhs.itervalues(), read2_fhs.itervalues()):
    i.close(), j.close()

# not edited below here

In [55]:
print "We found %s barcodes that did not match the mapping file."%str(bad_bc_count)

We found 3095828 barcodes that did not match the mapping file.


In [56]:
pickle.dump(seq_cnt, open("/home/bryan/PennRhiz/data/run1/v4bac_lib1_demult/seq_cnt.pickle", "w"))
pickle.dump(bc_cnt, open("/home/bryan/PennRhiz/data/run1/v4bac_lib1_demult/bc_cnt.pickle", "w"))

In [57]:
seq_cnt = pickle.load(open("/home/bryan/PennRhiz/data/run1/v4bac_lib1_demult/seq_cnt.pickle"))

### Check Sequence counts per samples

At this point, the demultiplexing is complete, but you might want to check the total number of sequences obtained for each sample.

In [58]:
df = pd.DataFrame.from_dict(dict(seq_cnt.most_common()), orient="index")
df.rename(columns={0 : "count"}, inplace=True)
df.sort(columns="count", inplace=True, ascending=False)
df["color"] = "#F8766D"
df.reset_index(inplace=True)
df.rename(columns={"index" : "SampleID"}, inplace=True)
df["x"] = [i + 1 for i, s in enumerate(df["SampleID"])]
    
p = figure(width=800, height=400, y_axis_type = "log", 
           tools="", toolbar_location="left", 
           y_axis_label = "Seq count", x_axis_label = "Sample")

p.xaxis.axis_line_width = 3
p.yaxis.axis_line_width = 3
p.outline_line_color = None
p.grid.grid_line_color = None

source = ColumnDataSource(df)

invisible_circle = Circle(x='x', y='count', 
                          fill_color='color', 
                          fill_alpha=0.5, 
                          line_color="color", 
                          line_alpha = 0.5, size=8)

visible_circle = Circle(x='x', y='count', 
                        fill_color='color', 
                        fill_alpha=1.0, 
                        line_color="color")

cr = p.add_glyph(source, 
                 invisible_circle, 
                 selection_glyph=visible_circle, 
                 nonselection_glyph=invisible_circle)

l = p.line(x = df["x"], 
           y = df["count"], 
           line_width=3, 
           color='#F8766D')

code = "source.set('selected', cb_data['index']);"
callback = CustomJS(args={'source': source}, code=code)

tooltip = """
    <div>
        <span style="font-size: 17px; font-weight: bold;">@SampleID </span>
    </div>
    <div>
        <span style="font-size: 17px; font-weight: bold;">@count </span>
    </div>
"""

p.add_tools(HoverTool(tooltips=tooltip, callback=callback, renderers=[cr]),
            BoxZoomTool(dimensions=["width"]),
            ResetTool(),
            CrosshairTool(dimensions = ["height"]))

p.xaxis.major_label_text_color = "white"

sh = show(p), 

  app.launch_new_instance()


In [59]:
seq_cnt.most_common(192)

[('Penn-T4_1-2b_Npos', 68417),
 ('Penn-T1_1-4c_Nneg', 63274),
 ('Penn-T4_1-1b_Npos', 60554),
 ('Penn-T3_3-1c_Nneg', 58384),
 ('Penn-T3_1-4a_Nneg', 56278),
 ('Penn-T3_1-3c_Npos', 55617),
 ('Penn-T2_4-3c_Nneg', 53250),
 ('Penn-T3_1-4a_Npos', 53046),
 ('Penn-T4_3-4a_Npos', 52122),
 ('Penn-T4_1-4a_Npos', 51600),
 ('Penn-T1_3-1c_Nneg', 50959),
 ('Penn-T3_3-5a_Nneg', 49058),
 ('Penn-T3_1-4c_Nneg', 48928),
 ('Penn-T4_3-1b_Nneg', 48143),
 ('Penn-T1_1-1b_Nneg', 45768),
 ('Penn-T3_3-5a_Npos', 43265),
 ('Penn-T2_3-2c_Nneg', 41856),
 ('Penn-T2_1-5a_Nneg', 41766),
 ('Penn-T2_1-2a_Nneg', 40830),
 ('Penn-T1_1-3c_Nneg', 40249),
 ('Penn-T3_1-1b_Nneg', 39773),
 ('Penn-T3_3-1b_Npos', 39664),
 ('Penn-T4_3-2b_Nneg', 39629),
 ('Penn-T1_1-3a_Nneg', 39362),
 ('MockCommunity-1', 38978),
 ('Penn-T3_1-3a_Npos', 38793),
 ('Penn-T3_1-3b_Npos', 38436),
 ('Penn-T1_1-4b_Nneg', 38129),
 ('Penn-T3_4-4b_Npos', 37591),
 ('Penn-T3_1-3c_Nneg', 37308),
 ('Penn-T4_3-5a_Nneg', 37229),
 ('Penn-T4_1-3b_Nneg', 37044),
 ('Penn-T4

In [60]:
df[df["count"] < 200000].shape

(192, 4)