# Description
    Demultiplex ERA Rhizosphere samples: run1
    


# Setting variables

In [1]:
# Directory that holds the metadata table (and is created below if missing).
workDir = '/home/bryan/ERA/'
# Directory of this sequencing run: index reads and screed databases live here.
seqDir = '/home/bryan/ERA/data/MiSeq/20170417_run1/'

# Sample sheet, read relative to workDir.
metadataFile = 'ERA_MappingFile.txt'

# FASTQ file names inside seqDir: the two reads and their two index reads.
read1_file, read2_file = 'read1.fq', 'read2.fq'
index_read1_file, index_read2_file = 'index1.fq', 'index2.fq'

# screed database of the merged reads; opened as `amp_db` for demultiplexing.
read12_screed_file = 'pear_merged-2017-04-18.assembled.fastq_screed'




# Init

In [2]:
import os
import screed
import pandas as pd
from glob import glob
from collections import Counter
import matplotlib.pyplot as plt
import numpy as np
#from ggplot import *
import re
import scipy.stats as stats

In [3]:
# Create the working directory if it does not exist yet.
# os.makedirs also creates any missing parent directories, where os.mkdir
# would raise OSError.
if not os.path.isdir(workDir):
    os.makedirs(workDir)

In [4]:
%load_ext rpy2.ipython

In [5]:
%%R
library(ggplot2)
library(dplyr)
library(tidyr)
library(gdata)


Error in library(gdata) : there is no package called ‘gdata’


Attaching package: ‘dplyr’



    filter, lag



    intersect, setdiff, setequal, union





## Uncompressing with four files output

In [6]:
# Files in seqDir matching 'index?.fq', i.e. index reads that are already decompressed.
uncompFiles = glob(os.path.join(seqDir, 'index?.fq'))

# Decompress only when the two index files are not both present.
if len(uncompFiles) != 2:
    # pigz flags: -k keep the .gz originals, -d decompress, -p 24 use up to 24 processes.
    !cd $seqDir; \
        pigz -k -d -p 24 index?.fq.gz

In [7]:
# Generate screed database from index files

In [8]:
# Build a screed database from each index-read FASTQ file.
# NOTE: long processing time.
# The files are referenced by bare name, so switch into seqDir first; this
# changes the working directory for the rest of the kernel session.
os.chdir(seqDir)
screed.read_fastq_sequences(index_read1_file)
# Last expression of the cell, so its return value is what gets displayed.
screed.read_fastq_sequences(index_read2_file)

<ScreedDB, 'index2.fq_screed'>

## Checkpoint: define index screed files to skip above step

In [9]:
# Names of the index-read screed databases: the FASTQ file name with
# '_screed' appended. Defining them here lets the build step above be skipped.
index_read1_screed = '{0}_screed'.format(index_read1_file)
index_read2_screed = '{0}_screed'.format(index_read2_file)

# Demultiplex

In [10]:
# rev-comp functions

# Complement lookup for A/C/G/T/N in upper and lower case. Built once at
# definition time instead of on every call to complement().
_COMPLEMENT = {'A': 'T', 'C': 'G', 'G': 'C', 'T': 'A', 'N': 'N'}
_COMPLEMENT.update({base.lower(): comp.lower() for base, comp in _COMPLEMENT.items()})


def complement(seq):
    """Return the base-by-base complement of a DNA sequence.

    Case is preserved ('a' -> 't', 'A' -> 'T'); 'N'/'n' map to themselves.

    Parameters
    ----------
    seq : str
        Sequence made up of the characters A, C, G, T, N (either case).

    Returns
    -------
    str
        Complemented sequence, same length and orientation as `seq`.

    Raises
    ------
    KeyError
        If `seq` contains any other character.
    """
    return ''.join(_COMPLEMENT[base] for base in seq)


def revComp(seq):
    """Return the reverse complement of `seq` (see complement() for the alphabet)."""
    return complement(seq[::-1])

In [11]:
# loading/editing metadata

# Tab-separated sample sheet from workDir; its first column is renamed to 'Sample'.
df_meta = pd.read_table(os.path.join(workDir, metadataFile), sep="\t")
df_meta.columns = ['Sample'] + df_meta.columns[1:].tolist()

# Barcode key per sample: reverse complement of rev_barcode followed by
# fwd_barcode, lower-cased.
# (Alternative kept from the original notebook, not used here: rev_barcode +
#  fwd_barcode, both lower-cased, without reverse-complementing.)
df_meta['Barcode_FR'] = [
    (revComp(rev_bc) + fwd_bc).lower()
    for fwd_bc, rev_bc in zip(df_meta['fwd_barcode'], df_meta['rev_barcode'])
]

## format of barcode=>sample dict: [revBarcode=read1 + fwdBarcode=read2] : sampleID
# If two rows share a barcode pair, the later row wins.
map_d = dict(zip(df_meta['Barcode_FR'], df_meta['Sample']))


In [12]:
# Display the metadata table, including the derived Barcode_FR column.
df_meta

Unnamed: 0,Sample,TubeID,ExtractionPlate,PooledDNAPlate,Sample Well ID,PrimerPlate,Primer Number,Primer Well ID,Unnamed: 8,primerFwdLoc,primerRevLoc,fwd_barcode,rev_barcode,PlotID,TimePoint,Barcode_FR
0,ERA-T1_1-2b_170,125,ERA1,A,A1,1,1,A1,,1-A,1-1,ATCGTACG,AACTCTCG,1-2b_170,T1,cgagagttatcgtacg
1,ERA-T1_1-3c_170,137,ERA1,A,B1,1,2,B1,,1-B,1-1,ACTATCTG,AACTCTCG,1-3c_170,T1,cgagagttactatctg
2,ERA-T3_1-2b_0,530,ERA1,A,C1,1,3,C1,,1-C,1-1,TAGCGAGT,AACTCTCG,1-2b_0,T3,cgagagtttagcgagt
3,ERA-T2_1-2b_85,361,ERA1,A,D1,1,4,D1,,1-D,1-1,CTGCGTGT,AACTCTCG,1-2b_85,T2,cgagagttctgcgtgt
4,ERA-T3_1-2d_170,602,ERA1,A,E1,1,5,E1,,1-E,1-1,TCATCGAG,AACTCTCG,1-2d_170,T3,cgagagtttcatcgag
5,ERA-T1_1-4b_85,143,ERA1,A,F1,1,6,F1,,1-F,1-1,CGTGAGTG,AACTCTCG,1-4b_85,T1,cgagagttcgtgagtg
6,ERA-T1_1-4b_170,142,ERA1,A,G1,1,7,G1,,1-G,1-1,GGATATCT,AACTCTCG,1-4b_170,T1,cgagagttggatatct
7,ERA-T1_1-2c_0,128,ERA1,A,H1,1,8,H1,,1-H,1-1,GACACCGT,AACTCTCG,1-2c_0,T1,cgagagttgacaccgt
8,ERA-T1_1-4a_85,139,ERA1,A,A2,1,9,A2,,1-A,1-2,ATCGTACG,ACTATGTC,1-4a_85,T1,gacatagtatcgtacg
9,ERA-T3_1-4b_0,578,ERA1,A,B2,1,10,B2,,1-B,1-2,ACTATCTG,ACTATGTC,1-4b_0,T3,gacatagtactatctg


In [13]:
# first few entries of the resulting barcode_FR => sample dict
# Slicing the item list avoids rebuilding map_d.keys() for every element and
# does not fail when the dict holds fewer than n_print entries.
n_print = 5
list(map_d.items())[:n_print]

[('tgagtacgactatctg', 'ERA-T1_1-2d_85'),
 ('gatctacgcgtgagtg', 'ERA-T1_4-3d_85'),
 ('tatagcgagacaccgt', 'NegControl_A'),
 ('gtaacgagcgtgagtg', 'ERA-T0_3-4c_0'),
 ('tatagcgatacgagac', 'ERA-T2_4-5a_85')]

In [14]:
# loading screed databases
# The database names are relative, so they are opened from inside seqDir.
os.chdir(seqDir)

# index read 1, index read 2, merged amplicon reads -- opened in that order
ir1db, ir2db, amp_db = map(
    screed.ScreedDB,
    (index_read1_screed, index_read2_screed, read12_screed_file),
)


In [15]:
# demultiplexing: naming reads by sample
#
# For every merged read in amp_db, the two index reads with the same read name
# are concatenated (index read 1 + index read 2) and looked up, lower-cased,
# in map_d (barcode pair => sample). Assigned reads are written as FASTQ
# records named "<sample>_<n>", keeping the original read name in the header;
# reads whose barcode pair is not in map_d are only counted.

# Written to seqDir: the checkpoint cell that follows, and the counting cells
# after it, read the demultiplexed file from seqDir (this cell used to write
# it to workDir, so a fresh top-to-bottom run could not find it).
outFile = os.path.join(seqDir,
                       re.sub('fastq_screed', 'dmult.fastq', read12_screed_file))

counter = 0       # reads assigned so far; also the numeric suffix of the next read
unassigned = 0    # reads whose barcode pair is not in map_d
cnt = Counter()   # assigned reads per sample

with open(outFile, "w") as out:
    for rec in amp_db.itervalues():
        read_name = rec["name"]
        index_concat = ir1db[read_name]["sequence"] + ir2db[read_name]["sequence"]

        # map_d keys are lower case, so normalise before the lookup. Only the
        # lookup itself sits in the try block, so unrelated KeyErrors are not
        # mistaken for unassigned reads.
        try:
            sample = map_d[index_concat.lower()]
        except KeyError:
            unassigned += 1
            continue

        new_name = sample + "_" + str(counter)
        counter += 1
        cnt[sample] += 1
        out.write("@%s orig_name=%s\n%s\n+\n%s\n"
                  % (new_name, read_name, rec["sequence"], rec["quality"]))

print("Unable to assign %s reads to samples" % unassigned)

Unable to assign 3140123 reads to samples


## Checkpoint: define outfile for next command without running above.

In [16]:
# Path of the demultiplexed FASTQ, derived from the run configuration rather
# than hard-coded. With the current settings this evaluates to
# /home/bryan/ERA/data/MiSeq/20170417_run1/pear_merged-2017-04-18.assembled.dmult.fastq
outFile = os.path.join(seqDir,
                       re.sub('fastq_screed', 'dmult.fastq', read12_screed_file))

In [17]:
# number of sequences
n_lines = !wc -l $outFile
n_lines = int(re.sub(' .+', '', n_lines[0]))
print 'Number of sequences: {}'.format(n_lines/4)

Number of sequences: 14036217


# Stats on sequences

## Number of sequences per sample

In [18]:
# counting sequences for each sample
# Header lines written by the demultiplexing step look like
#   @<sample>_<n> orig_name=<original read name>
# so removing the "_<n> orig_name..." tail (and the leading '@') leaves the sample.
re_seqName = re.compile(r'_\d+ orig_name.+')

seq_cnt = dict()
with open(outFile, 'r') as fn:
    # Iterate lazily: readlines() would pull the whole FASTQ into memory.
    for line in fn:
        # subn() strips the tail and reports whether the pattern matched,
        # so each line is scanned only once.
        stripped, n_hits = re_seqName.subn('', line)
        if not n_hits:
            continue  # not a renamed header line
        sampleName = stripped.rstrip().lstrip('@')
        seq_cnt[sampleName] = seq_cnt.get(sampleName, 0) + 1

In [19]:
# converting to dataframe: one row per sample -> (sample name, read count)
sample_count_pairs = list(seq_cnt.items())
df_seq_cnt = pd.DataFrame(sample_count_pairs, columns=['Sample', 'seq_count'])
df_seq_cnt

Unnamed: 0,Sample,seq_count
0,ERA-T1_3-3a_170,34912
1,ERA-T3_2-5c_0,50256
2,ERA-T3_3-5c_0,50903
3,ERA-T3_4-3d_170,99796
4,ERA-T3_2-3d_0,44601
5,ERA-T1_3-4c_0,29330
6,ERA-T2_2-1c_85,68659
7,ERA-T3_1-4b_0,48070
8,PostiveControl_C,45
9,ERA-T3_1-3c_0,51283


In [20]:
# Samples ordered by read count, lowest first (controls are expected at the top).
# DataFrame.sort() is deprecated and removed in later pandas releases;
# sort_values() is its replacement and returns the same ascending order.
df_seq_cnt.sort_values('seq_count')

  if __name__ == '__main__':


Unnamed: 0,Sample,seq_count
177,IndexQC_Rev,1
8,PostiveControl_C,45
185,PosControl_B,80
91,NegControl_C,97
181,PosControl_A,246
233,NegControl_A,283
193,NegControl_Plate,373
249,NegControl_B,1439
32,ERA-T1_2-3b_170,17321
59,ERA-T1_3-4c_170,21345
