*Note that this notebook needs to run in phython 2.7, something in the demultiplexing stage fails in python 3.6

# Demultiplexing Raw Sequence Files
## Set up workspace and read in mapping file.
Mapping file could contain 3 columns:
SampleID
    rev_barcode
fwd_barcode

In [5]:
import pandas as pd
import screed
#from itertools import izip
# yoavram: Bug fix for Python 3 as suggested in https://github.com/nschloe/matplotlib2tikz/issues/20
try:
    from itertools import izip
except ImportError:
    izip = zip
from collections import Counter
import gzip
import matplotlib.pyplot as plt
#import seaborn as sns
import pickle


In [2]:
# temporary home for sequence data in home directory


In [6]:
workDir = '/home/be68/Hyphosphere/data/MiSeq/20181126/demultiplexed'
seqDir = '/home/be68/Hyphosphere/data/MiSeq/20181126/'
#varSeqDir = '/var/seq_data/RhizCG/150520_run1/raw/'
#databaseDir = '/home/bryan/RhizCG/data/databases/'


nprocs = 3

In [7]:
ir_df = pd.read_table("/home/be68/Hyphosphere/Exp10/Exp10MappingFile.txt")

In [8]:
ir_df.head()

Unnamed: 0,SampleID,Sample,Note,Primer_link,fwd_barcode,rev_barcode
0,1_3.1RH,3.1RH,Kozich Primer Plate 1 (Primer 1-96) must be used!,1,ATCGTACG,AACTCTCG
1,2_3.1BP,3.1BP,Kozich Primer Plate 1 (Primer 1-96) must be used!,2,ACTATCTG,AACTCTCG
2,3_2.1CH,2.1CH,Kozich Primer Plate 1 (Primer 1-96) must be used!,3,TAGCGAGT,AACTCTCG
3,4_2.1BP,2.1BP,Kozich Primer Plate 1 (Primer 1-96) must be used!,4,CTGCGTGT,AACTCTCG
4,5_1.1RH,1.1RH,Kozich Primer Plate 1 (Primer 1-96) must be used!,5,TCATCGAG,AACTCTCG


In [9]:
# rev-comp functions
def complement(seq): 
    idx = {'A': 'T', 'C': 'G', 'G': 'C', 'T': 'A', 'N':'N'} 
    idx.update( {x.lower():y.lower() for x,y in idx.items()} )
    return ''.join(idx[x] for x in list(seq))

def revComp(seq):
    seq = seq[::-1]
    return complement(seq)

In [10]:
Barcode_R = [(revComp(y)).upper() for y \
                         in ir_df.rev_barcode]
ir_df.rev_barcode = Barcode_R
ir_df.head()

Unnamed: 0,SampleID,Sample,Note,Primer_link,fwd_barcode,rev_barcode
0,1_3.1RH,3.1RH,Kozich Primer Plate 1 (Primer 1-96) must be used!,1,ATCGTACG,CGAGAGTT
1,2_3.1BP,3.1BP,Kozich Primer Plate 1 (Primer 1-96) must be used!,2,ACTATCTG,CGAGAGTT
2,3_2.1CH,2.1CH,Kozich Primer Plate 1 (Primer 1-96) must be used!,3,TAGCGAGT,CGAGAGTT
3,4_2.1BP,2.1BP,Kozich Primer Plate 1 (Primer 1-96) must be used!,4,CTGCGTGT,CGAGAGTT
4,5_1.1RH,1.1RH,Kozich Primer Plate 1 (Primer 1-96) must be used!,5,TCATCGAG,CGAGAGTT


# Check that sequences are in the same order between read and index files
* Sequence names begin with the '@' symbol in each file. Check that the characters following the '@' symbol are identical between each file



In [11]:
!zcat /home/be68/Hyphosphere/data/MiSeq/20181126/86058_C4M53_Exp10_16Slib1_S1_I1_001.fastq.gz | head

@M02465:481:000000000-C4M53:1:2105:15715:1330 1:Y:0:1
NTCTTTCT
+
#11>>133
@M02465:481:000000000-C4M53:1:2105:15528:1332 1:Y:0:1
NTTTTTTT
+
#1111111
@M02465:481:000000000-C4M53:1:2105:15763:1333 1:Y:0:1
NTTTCTTC

gzip: stdout: Broken pipe


In [12]:
!zcat /home/be68/Hyphosphere/data/MiSeq/20181126/86058_C4M53_Exp10_16Slib1_S1_R1_001.fastq.gz | head

@M02465:481:000000000-C4M53:1:2105:15715:1330 1:Y:0:1
TCCTTCTTTTTCTCTCTTTTTTCTTCTTTCCTTTGCTTCCCGCTTTCGTCTGCGTTTCTTTCCGTCCCTTTTTCCCTCCCTCCGCTCCCCCTTGGCCTTTCCTTTTCTCCTTCTTTTCTTTCGTTCTTTCGCTGTTTCTGTCCTTCCTCTTTTCTCTTTGCCCTTCTTCTCTCTCCTCCCTCCCCCCCTTTTCTCCTTCTTCTTTCTTTTCCTTTCCTTCCTCTTCTTTTCTCCCTCTTTTTTCTCCCCCC
+
>1>11BB1311B133B1BF110F1333DB31D331DDG1F000///1/0///1///A121121/B0//111B1011B>//00//BB///?//001011B1B11B212>2BF>11111BF112/<00/>12////<112222212@10111@1?11?11?1111?<<1?11<>1>10100.00.<.<<--;00;00000;0000009;0000000;00;9090;C009000;000//:/00//-//9//9;-
@M02465:481:000000000-C4M53:1:2105:15528:1332 1:Y:0:1
TCCTTCTTTTTCTCTCTTTTTTCTTCTTTCCTTTTCTTCCCTCTCTCTTCTTCTTTTTTTTCCTTCTTTTTTTCCCTCTCTCTTCTCCCCCTTTTCCTTTCTTTTTCTCCTTTCTTTCTTTCTTTCTTTCTTTTTTTTCTTCCTTCCTTTTTTCTCTTTTCCCTTCTTCTCTCTTCTTTTTCCCCCCCTTTTCTCCTTCTTCTTTCTTTTCCTTTCTTTCCTCTTCTTTTCTCCCTCTTTTTTCTCCCCCC
+
>>A111BB@11B1B3BAFG110BDFG3D3D1DDBFDGHGGGH0B011B2DFF2DF111/AAAADB11111BEE0BF@>00111@BB1@>?//011211>B>2>>>10>BBF121211BF1B>2B1

## Create new R1 and R2 files for each sample based on index barcodes
This block will read in your read1, read2, index1, and index2 files, compare barcode sequences to the provided mapping file, and create new files based on the sample names provided in the mapping file.

In [13]:
idx_d = dict([(row["rev_barcode"].upper() + row["fwd_barcode"].upper(), row["SampleID"]) for i, row in ir_df.iterrows()])

In [15]:
!ls /home/be68/Hyphosphere/data/MiSeq/20181126/

86058_C4M53_Exp10_16Slib1_S1_I1_001.fastq.gz
86058_C4M53_Exp10_16Slib1_S1_I2_001.fastq.gz
86058_C4M53_Exp10_16Slib1_S1_R1_001.fastq.gz
86058_C4M53_Exp10_16Slib1_S1_R2_001.fastq.gz
download.sh


In [None]:
read1_fhs = dict([(s, gzip.open("/home/be68/Hyphosphere/data/MiSeq/20181126/%s.R1.fq.gz"%s, "w")) for s in ir_df["SampleID"]])
read2_fhs = dict([(s, gzip.open("/home/be68/Hyphosphere/data/MiSeq/20181126/%s.R2.fq.gz"%s, "w")) for s in ir_df["SampleID"]])

idx_d = dict([(row["rev_barcode"].upper() + row["fwd_barcode"].upper(), row["SampleID"]) for i, row in ir_df.iterrows()])

path = "/home/be68/Hyphosphere/data/MiSeq/20181126/"
ir1 = screed.open(path + "86058_C4M53_Exp10_16Slib1_S1_I1_001.fastq.gz")
ir2 = screed.open(path + "86058_C4M53_Exp10_16Slib1_S1_I2_001.fastq.gz")
r1 = screed.open(path + "86058_C4M53_Exp10_16Slib1_S1_R1_001.fastq.gz")
r2 = screed.open(path + "86058_C4M53_Exp10_16Slib1_S1_R2_001.fastq.gz")

seq_cnt = Counter()
bc_cnt = Counter()
bad_bc_count = 0

for i, recs in enumerate(izip(ir1, ir2, r1, r2)):
    ir1 = recs[0]
    ir2 = recs[1]
    r1 = recs[2]
    r2 = recs[3]
        
    idx_key = ir1.sequence + ir2.sequence
    bc_cnt[idx_key] += 1
    
    try:
        sample_name = idx_d[idx_key]
    except KeyError:
        bad_bc_count += 1
        continue
    
    seq_cnt[sample_name] += 1
    
    fastq1 = "@%s\n%s\n+\n%s\n"%(r1.name, r1.sequence, r1.quality)
    read1_fhs[sample_name.encode()].write(fastq1)

    fastq2 = "@%s\n%s\n+\n%s\n"%(r2.name, r2.sequence, r2.quality)
    read2_fhs[sample_name].write(fastq2)
    
for i, j in zip(read1_fhs.itervalues(), read2_fhs.itervalues()):
    i.close(), j.close()

In [22]:
!ls -l /home/be68/Hyphosphere/data/MiSeq/20181126/

total 11911728
-rw-rw-r-- 1 be68 be68   10992479 Nov 29 05:02 100_5.5BP.R1.fq.gz
-rw-rw-r-- 1 be68 be68   12518886 Nov 29 05:02 100_5.5BP.R2.fq.gz
-rw-rw-r-- 1 be68 be68   12079093 Nov 29 05:02 101_5.5CS.R1.fq.gz
-rw-rw-r-- 1 be68 be68   14248378 Nov 29 05:02 101_5.5CS.R2.fq.gz
-rw-rw-r-- 1 be68 be68   11278978 Nov 29 05:02 10_2.1RH.R1.fq.gz
-rw-rw-r-- 1 be68 be68   12950166 Nov 29 05:02 10_2.1RH.R2.fq.gz
-rw-rw-r-- 1 be68 be68   13850714 Nov 29 05:02 102_5.5BS.R1.fq.gz
-rw-rw-r-- 1 be68 be68   16228008 Nov 29 05:02 102_5.5BS.R2.fq.gz
-rw-rw-r-- 1 be68 be68   16755039 Nov 29 05:02 103_6.5BP.R1.fq.gz
-rw-rw-r-- 1 be68 be68   20399688 Nov 29 05:02 103_6.5BP.R2.fq.gz
-rw-rw-r-- 1 be68 be68   14407336 Nov 29 05:02 104_6.5CS.R1.fq.gz
-rw-rw-r-- 1 be68 be68   16947580 Nov 29 05:02 104_6.5CS.R2.fq.gz
-rw-rw-r-- 1 be68 be68   15762196 Nov 29 05:02 105_6.5BS.R1.fq.gz
-rw-rw-r-- 1 be68 be68   17839893 Nov 29 05:02 105_6.5BS.R2.fq.gz
-rw-rw-r-- 1 be68 be68   12089148 Nov 29 05:02 1

In [23]:
print "We found %s barcodes that did not match the mapping file."%str(bad_bc_count)

We found 6601181 barcodes that did not match the mapping file.


In [25]:
pickle.dump(seq_cnt, open("/home/be68/Hyphosphere/data/MiSeq/20181126/seq_cnt.pickle", "w"))
pickle.dump(bc_cnt, open("/home/be68/Hyphosphere/data/MiSeq/20181126/bc_cnt.pickle", "w"))

In [27]:
seq_cnt = pickle.load(open("/home/be68/Hyphosphere/data/MiSeq/20181126/seq_cnt.pickle"))

# Check sequence counts per sample

In [28]:
df = pd.DataFrame.from_dict(dict(seq_cnt.most_common()), orient="index")
df.rename(columns={0 : "count"}, inplace=True)
df.sort_values(by="count", inplace=True, ascending=False)
df["color"] = "#F8766D"
df.reset_index(inplace=True)
df.rename(columns={"index" : "SampleID"}, inplace=True)
df["x"] = [i + 1 for i, s in enumerate(df["SampleID"])]
    
p = figure(width=800, height=400, y_axis_type = "log", 
           tools="", toolbar_location="left", 
           y_axis_label = "Seq count", x_axis_label = "Sample")

p.xaxis.axis_line_width = 3
p.yaxis.axis_line_width = 3
p.outline_line_color = None
p.grid.grid_line_color = None

source = ColumnDataSource(df)

invisible_circle = Circle(x='x', y='count', 
                          fill_color='color', 
                          fill_alpha=0.5, 
                          line_color="color", 
                          line_alpha = 0.5, size=8)

visible_circle = Circle(x='x', y='count', 
                        fill_color='color', 
                        fill_alpha=1.0, 
                        line_color="color")

cr = p.add_glyph(source, 
                 invisible_circle, 
                 selection_glyph=visible_circle, 
                 nonselection_glyph=invisible_circle)

l = p.line(x = df["x"], 
           y = df["count"], 
           line_width=3, 
           color='#F8766D')

code = "source.set('selected', cb_data['index']);"
callback = CustomJS(args={'source': source}, code=code)

tooltip = """
    <div>
        <span style="font-size: 17px; font-weight: bold;">@SampleID </span>
    </div>
    <div>
        <span style="font-size: 17px; font-weight: bold;">@count </span>
    </div>
"""

p.add_tools(HoverTool(tooltips=tooltip, callback=callback, renderers=[cr]),
            BoxZoomTool(dimensions=["width"]),
            ResetTool(),
            CrosshairTool(dimensions = ["height"]))

p.xaxis.major_label_text_color = "white"

sh = show(p),

NameError: name 'figure' is not defined

In [29]:
seq_cnt.most_common(192)

[('189_6.3CH', 141070),
 ('103_6.5BP', 130629),
 ('3_2.1CH', 124941),
 ('7_2.1CS', 124130),
 ('105_6.5BS', 118082),
 ('6_3.1CH', 117577),
 ('112_4.2BS', 116169),
 ('104_6.5CS', 110489),
 ('1_3.1RH', 108818),
 ('25_2.3CH', 108505),
 ('8_1.1BP', 107665),
 ('15_1.2BP', 107388),
 ('184_6.5RH', 106264),
 ('80_2.2RT', 104603),
 ('102_5.5BS', 104470),
 ('87_5.2CS', 101432),
 ('81_3.2RT', 100561),
 ('136_5.4RT', 100558),
 ('27_2.3RH', 99073),
 ('24_1.3BP', 98965),
 ('110_4.1RT', 98408),
 ('130_4.3RT', 96462),
 ('109_5.1RT', 95005),
 ('89_2.2BS', 92869),
 ('14_1.2CH', 92866),
 ('114_3.2CS', 92776),
 ('106_6.1BP', 92620),
 ('16_2.2RH', 92613),
 ('111_4.2BP', 91875),
 ('73_1.3RT', 91869),
 ('83_2.1RT', 90957),
 ('2_3.1BP', 90957),
 ('183_6.5CH', 90415),
 ('101_5.5CS', 90291),
 ('36_2.4RH', 89590),
 ('10_2.1RH', 89188),
 ('28_2.3BP', 88821),
 ('119_4.3BP', 88772),
 ('97_4.5BP', 88702),
 ('163_4.4CH', 88551),
 ('120_6.2CS', 88366),
 ('98_4.5CS', 88292),
 ('191_Blank', 87935),
 ('4_2.1BP', 87354),
 

In [34]:
sum(seq_cnt.values())

13821709

In [35]:
!zcat /home/be68/Hyphosphere/data/MiSeq/20181126/163_4.4CH.R1.fq.gz | head

@M02465:481:000000000-C4M53:1:2105:14074:1920 1:N:0:1
GACATAGGTGGCAAACGTTATCCGGATTTATTGGGCGTAAAGGATGCGTAGATGGCGGAATAAGTTATCAGTAGAAGCTAGGCTCAACTCAACGGAAGCTGATAATACTATTCTGCTAGAGGACAGGAGAGGTTGGCGGAACTCTATGCGTAGGGGTGAAATCCATTGATCTATAGAGGACCACCAAAAAAGGCGAAGGCAGCCAACTATTCTGCTCCTGACATTGAGGCATGAAAGCGTGGGGAGCAAAA
+
?AAAAFFFFFAF1F1FEGGCF3GFGA0EGGGHHHHHGGCGHHF0BFEEEGG/2DGCCC??FGHHHFBGD2FG2@BFDGFF1FCGF1BF21@1BEECEF/G11FB2FGB2BFGEGFF>>F1B00/0/F0C/CAFGHCCC///<11>=FDFFG.<.<.<C.0=0;/00=:GHHHHHHBCGFA.:.AEEGEA??C?E@--A?A?BBEFFEFFFFFFFFFFBBFFFFFFFBFFEFF-;99BFFBF@@<--AEBFF
@M02465:481:000000000-C4M53:1:2105:15227:1996 1:N:0:1
TACGTAGGGGGCAAGCGTTGTCCGGACTTACTGGGCGTAAAGCGCGCGCAGTCGGACGAGTATGTGCCGTGTGAAAGCGCCGAACTTCCCTCGGCGAGGCCATGGTAGACAGCTCGTCTTGAGCCTCGGAGAGGTCGGTGGAATTCCGGGTGGAGCGGTGAAATGCGTAGAGATCCGGAGGAACACCAGGGGCGAATGCGGCCGACTGGCCGAGTGCTGACGCTGAGGCGCGACGGCGTGGGGAGCAAACA
+
>ABAAFFBBBDBEGGGGGGGFDGGGGEGFHFHGFHGFFGGHHGGGGGG?EGEFEFCEG?EEH4BFEFGGAEGHGHHHHGGGGG/BFFGH0?F<D/->DC-<<CC/==GHH0CCCGG@AHGGHH0C