# OTU binning
Following updated buckley lab walkthrough from github

In [1]:
workDir = '/home/bryan/ERA/data/MiSeq/20170417_run1/OTU_binning/' 
seqDir = '/home/bryan/ERA/data/MiSeq/20170417_run1/QC/' 
databaseDir = '/home/bryan/RhizCG/data/databases/'

seqFile = '/home/bryan/ERA/data/MiSeq/20170417_run1/QC/finalQC.fasta'
nprocs = 24

In [2]:
import os
import re
import glob
import pandas as pd
from qiime.assign_taxonomy import UclustConsensusTaxonAssigner
from IPython.display import Image
from cogent.app.usearch import clusters_from_blast_uc_file
from cogent.parse.fasta import MinimalFastaParser as parse
import sys

In [3]:
%load_ext rpy2.ipython

In [4]:
%%R
library(ggplot2)
library(dplyr)
library(tidyr)
library(gridExtra)

Attaching package: ‘dplyr’



    filter, lag



    intersect, setdiff, setequal, union


Attaching package: ‘gridExtra’



    combine




In [5]:
if not os.path.isdir(workDir):
    os.makedirs(workDir)

# Concatenate section 
refer to github if binning OTUs from multiple runs

In [6]:
%%bash -s "$workDir" "$seqFile"

cd $1

mothur "#unique.seqs(fasta=$2)" | head -n 50

[H[2J





mothur v.1.39.5
Last updated: 3/20/2017

by
Patrick D. Schloss

Department of Microbiology & Immunology
University of Michigan
http://www.mothur.org

When using, please cite:
Schloss, P.D., et al., Introducing mothur: Open-source, platform-independent, community-supported software for describing and comparing microbial communities. Appl Environ Microbiol, 2009. 75(23):7537-41.

Distributed under the GNU General Public License

Type 'help()' for information on the commands that are available

For questions and analysis support, please visit our forum at https://www.mothur.org/forum

Type 'quit()' to exit program



mothur > unique.seqs(fasta=/home/bryan/ERA/data/MiSeq/20170417_run1/QC/finalQC.fasta)
1000	4
2000	10
3000	10
4000	10
5000	11
6000	11
7000	11
8000	11
9000	11
10000	14
11000	14
12000	14
13000	14
14000	14
15000	17
16000	17
17000	17
18000	17
19000	20
20000	20


In [7]:
%%bash -s "$workDir"

cd $1

ln -f -s ../QC/finalQC.unique.fasta .

printf "Number of sequences in final QC unique fasta: "
grep -c ">" finalQC.unique.fasta

head -n 4 finalQC.unique.fasta
tail -n 4 finalQC.unique.fasta

Number of sequences in final QC unique fasta: 5840960
>ERA-T2_1-2a_85_0
TACGTAAGGGCCGAGCGTTGTCCGGAGTTACTGGGCGTAAAGCGCGCGCAGGCGGCTCGCTTTGCCCGGCGTGAAAGCCCCCGGCTCAACCGGGGAGGGTCGTCGGGGACGGGCGAGCTTGAGGCCGGCAGGGGCAGGTGGAATTCCCGGTGTAGTGGTGAAATGCGTAGAGATCGGGAGGAACACCCGTGGCGAAGGCGGCCTGCTGGGCCGGACCTGACGCTGAGGCGCGAAGGCGTGGGGAGCGAACGGG
>ERA-T1_2-2d_85_1
TACAGAGGGTGCAAGCGTTGTTCGGAATCATTGGGCGTAAAGGGCGTGTAGGCGGTCTGCTAAGTCATGTGTGAAATCCCTCGGCTCAACCGGGGAACGACGCATGAAACTGACAAGCTAGAGTACCAAAGAGGGGGGTGGAATTCCCGGTGTAGCGGTGAAATGCGTAGATATCGGGAGGAACACCGGTGGCGAAGGCGGCCCCCTGGTTGGATACTGACGCTGAGACGCGAAAGCGTGGGGAGCAAACAGG
>ERA-T1_2-2b_0_14035440
TACGGAGGGCGCAAGCGTTGTCCGGAATTATTGGGCGTAAAGAGCTCGTAGGCGGTCTGTCGCGTCGGATGTGAAAGCCCGGGGCTCAACCCCGGACTTGCAGTGGGTACGGGCAGACTAGAGTGTGGGAGGGGAGACTGGAATTCCTGGTGTAGCGGTGAAATGCGCAGATATCAGGAGGAACACCGATGGCGAAGGCAGGTCTCTGGGCCACTACTGACGCTGAGAAGCGAAAGCATGGGGAGCGAACAGG
>ERA-T2_4-2a_85_14028569
TACGGAGGGGGCGAGCGTTGTTCGGAATTACTGGGCGTAAAGCGCACGTAGGCGGCGCCGCAAGTCAGGGGTGAAATCCCGGGGCTCAACCCCGGAACT

# Format for Usearch

In [8]:
counts = {}
inFile = os.path.join(seqDir, 'finalQC.names')
with open(inFile) as iFH:
    for line in iFH:
        seedID, seqIDs = line.split("\t")
        count = len(seqIDs.split(","))
        counts[seedID] = count 

In [9]:
outFile = os.path.join(workDir, 'finalQC.unique.usearch_names.fasta')
with open(outFile, 'w') as oFH:
    inFile = os.path.join(workDir, 'finalQC.unique.fasta')
    for n, s in parse(open(inFile)):
        if counts[n] > 1:
            oFH.write(">%s;size=%s;\n%s\n"%(n,counts[n],s))
        else:
            continue

In [10]:
!cd $workDir; \
head -n 6 finalQC.unique.usearch_names.fasta

>ERA-T2_1-2a_85_0;size=3;
TACGTAAGGGCCGAGCGTTGTCCGGAGTTACTGGGCGTAAAGCGCGCGCAGGCGGCTCGCTTTGCCCGGCGTGAAAGCCCCCGGCTCAACCGGGGAGGGTCGTCGGGGACGGGCGAGCTTGAGGCCGGCAGGGGCAGGTGGAATTCCCGGTGTAGTGGTGAAATGCGTAGAGATCGGGAGGAACACCCGTGGCGAAGGCGGCCTGCTGGGCCGGACCTGACGCTGAGGCGCGAAGGCGTGGGGAGCGAACGGG
>ERA-T1_2-2d_85_1;size=90;
TACAGAGGGTGCAAGCGTTGTTCGGAATCATTGGGCGTAAAGGGCGTGTAGGCGGTCTGCTAAGTCATGTGTGAAATCCCTCGGCTCAACCGGGGAACGACGCATGAAACTGACAAGCTAGAGTACCAAAGAGGGGGGTGGAATTCCCGGTGTAGCGGTGAAATGCGTAGATATCGGGAGGAACACCGGTGGCGAAGGCGGCCCCCTGGTTGGATACTGACGCTGAGACGCGAAAGCGTGGGGAGCAAACAGG
>ERA-T3_2-2c_170_3;size=1466;
TACGTAGGGGTCGAGCGTTGTCCGGAGTTACTGGGCGTAAAGCGTGCGCAGGCGGCTCATTGCGCCCGGCGTGAAAGCCCCCGGCTCAACCGGGGAGGGTCGTCGGGGACGGGTGAGCTTGAGGGTATCAGGGGCTGGTGGAATTCCCGGTGTAGTGGTGAAATGCGTAGAGATCGGGAGGAACACCCGTGGCGAAGGCGGCCAGCTGGGATACACCTGACGCTGAGGCACGAAGGCGTGGGGAGCGAACGGG


# Usearch Pipeline

* sort sequences by size

In [11]:
!cd $workDir; \
usearch \
-sortbysize finalQC.unique.usearch_names.fasta \
-fastaout finalQC_uniques_sorted.fasta \
-minsize 2

usearch v9.2.64_i86linux32, 4.0Gb RAM (132Gb total), 40 cores
(C) Copyright 2013-16 Robert C. Edgar, all rights reserved.
http://drive5.com/usearch

License: seb369@cornell.edu

00:02 331Mb   100.0% Reading finalQC.unique.usearch_names.fasta
00:02 297Mb  Getting sizes                                     
00:02 305Mb  Sorting 926204 sequences
00:04 308Mb   100.0% Writing output


In [12]:
!cd $workDir; \
head finalQC_uniques_sorted.fasta; \
tail finalQC_uniques_sorted.fasta

>ERA-T3_1-2c_170_65;size=161902;
GACAGAGGATGCAAGCGTTATCCGGAATGATTGGGCGTAAAGCGTCTGTAGGTGGCTTTTCAAGTCCGCCGTCAAATCCC
AGGGCTCAACCCTGGACAGGCGGTGGAAACTACCAAGCTGGAGTACGGTAGGGGCAGAGGGAATTTCCGGTGGAGCGGTG
AAATGCATTGAGATCGGAAAGAACACCAACGGCGAAAGCACTCTGCTGGGCCGACACTGACACTGAGAGACGAAAGCTAG
GGGAGCAAATGGG
>ERA-T2_3-1b_85_27;size=87257;
TACCAGCACCCCGAGTGGTCGGGACGATTATTGGGCCTAAAGCATCCGTAGCCGGTCTTGCAAGTCTTCCGTTAAATCCA
CCTGCTTAACAGATGGGCTGCGGAAGATACTACAAGACTAGGAGGCGGGAGAGGCAAGCGGTACTCAGTGGGTAGGGGTA
AAATCCTCTGATCCATTGAAGACCACCAGTGGCGAAGGCGGCTTGCCAGAACGCGCTCGACGGTGAGGGATGAAAGCTGG
GGGAGCAAACCGG
>ERA-T2_4-3d_85_5360919;size=2;
GACGAACCGTGCGAACGTTGTTCGGAATCACTGGGCTTAAAGGGCGCGTAGGCGGGTTTTCAAGTCTGTGGTGAAATCCT
CCAGCTTAACTGGAGAAGTGCCGTGGATACTGGAGACCTCGAGGAGGGTAGGGGCATCTGGAACAGCCGGTGGATCGGTG
AAATGCGTTGATATCGGCTGGAACTCCGATGGCGAAGGCAAGGTGCTGGACCCTATCTGACGCTGAGGCGCGAAAGCCAG
GGGAGCGAACGGG
>ERA-T3_2-2d_85_5360912;size=2;
TACGAAGGGGGCTAGCGTTGTTCGGAATTACTGGGCGTAAAGCGTGCGTAGGTGGTTCGTTAAGTCTGATGTGAAAGCCC
TGG

* Cluster OTUs using sorted sequence file

In [13]:
!cd $workDir; \
usearch \
-cluster_otus finalQC_uniques_sorted.fasta \
-otus otus.fasta

usearch v9.2.64_i86linux32, 4.0Gb RAM (132Gb total), 40 cores
(C) Copyright 2013-16 Robert C. Edgar, all rights reserved.
http://drive5.com/usearch

License: seb369@cornell.edu

**OUTPUT MUTED**8.3% 9768 OTUs, 32715 chimeras

## rename OTUs with simple name

In [14]:
%%bash -s "$workDir"
cd $1

bioawk -c fastx '{print ">" "OTU" "." NR "\n" $seq}' otus.fasta > otusn.fasta

In [15]:
!cd $workDir; head -n 4 otusn.fasta

>OTU.1
GACAGAGGATGCAAGCGTTATCCGGAATGATTGGGCGTAAAGCGTCTGTAGGTGGCTTTTCAAGTCCGCCGTCAAATCCCAGGGCTCAACCCTGGACAGGCGGTGGAAACTACCAAGCTGGAGTACGGTAGGGGCAGAGGGAATTTCCGGTGGAGCGGTGAAATGCATTGAGATCGGAAAGAACACCAACGGCGAAAGCACTCTGCTGGGCCGACACTGACACTGAGAGACGAAAGCTAGGGGAGCAAATGGG
>OTU.2
TACCAGCACCCCGAGTGGTCGGGACGATTATTGGGCCTAAAGCATCCGTAGCCGGTCTTGCAAGTCTTCCGTTAAATCCACCTGCTTAACAGATGGGCTGCGGAAGATACTACAAGACTAGGAGGCGGGAGAGGCAAGCGGTACTCAGTGGGTAGGGGTAAAATCCTCTGATCCATTGAAGACCACCAGTGGCGAAGGCGGCTTGCCAGAACGCGCTCGACGGTGAGGGATGAAAGCTGGGGGAGCAAACCGG


## assign taxonomy

In [16]:
%%bash -s "$workDir" "$databaseDir" "$nprocs"

cd $1

assign_taxonomy.py \
    -r $2'97_Silva_111_rep_set_no_ambig.fasta' \
    -t $2'Silva_111_taxa_map_full.txt' \
    -i otusn.fasta \
    -o otusn_tax

## remove undesirables

In [17]:
%%bash -s "$workDir" "$databaseDir"

cd $1

egrep "Chloroplast|Eukaryota|Archaea|Unassigned|mitochondria" \
otusn_tax/otusn_tax_assignments.txt |\
awk '{print $1}' > to_remove_tax.accnos

In [18]:
!cd $workDir; \
mothur "#remove.seqs(fasta=otusn.fasta, accnos=to_remove_tax.accnos)" | head -n 50

[H[2J





mothur v.1.39.5
Last updated: 3/20/2017

by
Patrick D. Schloss

Department of Microbiology & Immunology
University of Michigan
http://www.mothur.org

When using, please cite:
Schloss, P.D., et al., Introducing mothur: Open-source, platform-independent, community-supported software for describing and comparing microbial communities. Appl Environ Microbiol, 2009. 75(23):7537-41.

Distributed under the GNU General Public License

Type 'help()' for information on the commands that are available

For questions and analysis support, please visit our forum at https://www.mothur.org/forum

Type 'quit()' to exit program



mothur > remove.seqs(fasta=otusn.fasta, accnos=to_remove_tax.accnos)
Removed 1895 sequences from your fasta file.

Output File Names: 
otusn.pick.fasta


mothur > quit()


In [19]:
!printf "Pre-filter: number of sequences: "
!cd $workDir; grep -c ">" otusn.fasta
!printf "Post-filter: number of sequences: "
!cd $workDir; grep -c ">" otusn.pick.fasta

Pre-filter: number of sequences: 15814
Post-filter: number of sequences: 13919


# Mapping reads

* reformat for usearch

In [20]:
%%bash -s "$seqDir"
cd $1

perl -pe 's/^>(.+)(_[^_]+)\n$/>$1$2\_$.;barcodelabel=$1\n/' finalQC.fasta > finalQC_usearchfmt.fasta

In [21]:
!cd $seqDir; head -n 6 finalQC_usearchfmt.fasta

>ERA-T2_1-2a_85_0_1;barcodelabel=ERA-T2_1-2a_85
TACGTAAGGGCCGAGCGTTGTCCGGAGTTACTGGGCGTAAAGCGCGCGCAGGCGGCTCGCTTTGCCCGGCGTGAAAGCCCCCGGCTCAACCGGGGAGGGTCGTCGGGGACGGGCGAGCTTGAGGCCGGCAGGGGCAGGTGGAATTCCCGGTGTAGTGGTGAAATGCGTAGAGATCGGGAGGAACACCCGTGGCGAAGGCGGCCTGCTGGGCCGGACCTGACGCTGAGGCGCGAAGGCGTGGGGAGCGAACGGG
>ERA-T2_3-3a_85_4356060_3;barcodelabel=ERA-T2_3-3a_85
TACGTAAGGGCCGAGCGTTGTCCGGAGTTACTGGGCGTAAAGCGCGCGCAGGCGGCTCGCTTTGCCCGGCGTGAAAGCCCCCGGCTCAACCGGGGAGGGTCGTCGGGGACGGGCGAGCTTGAGGCCGGCAGGGGCAGGTGGAATTCCCGGTGTAGTGGTGAAATGCGTAGAGATCGGGAGGAACACCCGTGGCGAAGGCGGCCTGCTGGGCCGGACCTGACGCTGAGGCGCGAAGGCGTGGGGAGCGAACGGG
>ERA-T3_3-1c_0_9817383_5;barcodelabel=ERA-T3_3-1c_0
TACGTAAGGGCCGAGCGTTGTCCGGAGTTACTGGGCGTAAAGCGCGCGCAGGCGGCTCGCTTTGCCCGGCGTGAAAGCCCCCGGCTCAACCGGGGAGGGTCGTCGGGGACGGGCGAGCTTGAGGCCGGCAGGGGCAGGTGGAATTCCCGGTGTAGTGGTGAAATGCGTAGAGATCGGGAGGAACACCCGTGGCGAAGGCGGCCTGCTGGGCCGGACCTGACGCTGAGGCGCGAAGGCGTGGGGAGCGAACGGG


## Split into smaller files

In [None]:
!cd $seqDir; \
du -h finalQC_usearchfmt.fasta

4.1G	finalQC_usearchfmt.fasta


In [None]:
#spliting file
!cd $seqDir; \
pyfasta split -n 5 finalQC_usearchfmt.fasta

In [None]:
g = os.path.join(seqDir, 'finalQC_usearchfmt.*.fasta')
fileList = glob.glob(g)
fileList  

['/home/bryan/ERA/data/MiSeq/20170417_run1/QC/finalQC_usearchfmt.4.fasta',
 '/home/bryan/ERA/data/MiSeq/20170417_run1/QC/finalQC_usearchfmt.3.fasta',
 '/home/bryan/ERA/data/MiSeq/20170417_run1/QC/finalQC_usearchfmt.1.fasta',
 '/home/bryan/ERA/data/MiSeq/20170417_run1/QC/finalQC_usearchfmt.2.fasta',
 '/home/bryan/ERA/data/MiSeq/20170417_run1/QC/finalQC_usearchfmt.0.fasta']

In [None]:
# running usearch on each split file
for f in fileList:
    sys.stderr.write('Processing {}\n'.format(f))

    ff,_ = os.path.splitext(f)
    _,i = os.path.splitext(ff)
    uc = 'readmap{}.uc'.format(i.lstrip('.')) 

    !cd $workDir; \
        usearch \
        -usearch_global $f \
        -db otusn.pick.fasta \
        -strand plus -id 0.97 \
        -uc $uc \
        -threads $nprocs

usearch v9.2.64_i86linux32, 4.0Gb RAM (132Gb total), 40 cores
(C) Copyright 2013-16 Robert C. Edgar, all rights reserved.
http://drive5.com/usearch

License: seb369@cornell.edu

00:00 44Mb    100.0% Reading otusn.pick.fasta
00:00 10Mb    100.0% Masking (fastnucleo)    
00:00 11Mb      0.1% Word stats          

Processing /home/bryan/ERA/data/MiSeq/20170417_run1/QC/finalQC_usearchfmt.4.fasta


00:00 11Mb    100.0% Word stats
00:00 11Mb    100.0% Alloc rows
00:00 24Mb    100.0% Build index
00:29 304Mb   100.0% Searching finalQC_usearchfmt.4.fasta, 66.7% matched
usearch v9.2.64_i86linux32, 4.0Gb RAM (132Gb total), 40 cores
(C) Copyright 2013-16 Robert C. Edgar, all rights reserved.
http://drive5.com/usearch

License: seb369@cornell.edu

00:00 44Mb    100.0% Reading otusn.pick.fasta
00:00 10Mb    100.0% Masking (fastnucleo)    
00:01 11Mb     30.9% Word stats          

Processing /home/bryan/ERA/data/MiSeq/20170417_run1/QC/finalQC_usearchfmt.3.fasta


00:01 11Mb    100.0% Word stats
00:01 11Mb    100.0% Alloc rows
00:01 24Mb    100.0% Build index
00:29 305Mb   100.0% Searching finalQC_usearchfmt.3.fasta, 66.7% matched
usearch v9.2.64_i86linux32, 4.0Gb RAM (132Gb total), 40 cores
(C) Copyright 2013-16 Robert C. Edgar, all rights reserved.
http://drive5.com/usearch

License: seb369@cornell.edu

00:00 44Mb    100.0% Reading otusn.pick.fasta
00:01 10Mb    100.0% Masking (fastnucleo)    
00:01 11Mb      0.1% Word stats          

Processing /home/bryan/ERA/data/MiSeq/20170417_run1/QC/finalQC_usearchfmt.1.fasta


00:01 11Mb    100.0% Word stats
00:01 11Mb    100.0% Alloc rows
00:01 24Mb    100.0% Build index
00:30 304Mb   100.0% Searching finalQC_usearchfmt.1.fasta, 66.6% matched
usearch v9.2.64_i86linux32, 4.0Gb RAM (132Gb total), 40 cores
(C) Copyright 2013-16 Robert C. Edgar, all rights reserved.
http://drive5.com/usearch

License: seb369@cornell.edu

00:00 44Mb    100.0% Reading otusn.pick.fasta
00:00 10Mb    100.0% Masking (fastnucleo)    
00:00 11Mb      0.1% Word stats          

Processing /home/bryan/ERA/data/MiSeq/20170417_run1/QC/finalQC_usearchfmt.2.fasta


00:00 11Mb    100.0% Word stats
00:00 11Mb    100.0% Alloc rows
00:00 24Mb    100.0% Build index
**OUTPUT MUTED**9.9% Searching finalQC_usearchfmt.2.fasta, 66.9% matched

* after creating readmapped files in parrallel, concat them back together

In [None]:
!cd $workDir; \
cat readmap[0-9].uc > readmap_all.uc

* convert from usearch to OTU table

In [None]:
!cd $workDir; \
python /opt/edgar_python_scripts/uc2otutab.py readmap_all.uc > otu_table.txt

**OUTPUT MUTED**76.0%   

* convert from OTU table to biom format

In [None]:
%%bash -s "$workDir"

cd $1

if [ -f otu_table.biom ]; then
rm otu_table.biom
fi 

biom convert -i otu_table.txt -o otu_table.biom --to-hdf5 --table-type "OTU table"

* create biom table summary

In [None]:
%%bash -s "$workDir"

cd $1

if [ -f otu_table_summary.txt ]; then
rm otu_table_summary.txt
fi 

biom summarize-table -i otu_table.biom -o otu_table_summary.txt

In [None]:
!cd $workDir; cat otu_table_summary.txt

Num samples: 252
Num observations: 13919
Total count: 9289527
Table density (fraction of non-zero values): 0.228

Counts/sample summary:
 Min: 1.0
 Max: 118274.0
 Median: 35433.500
 Mean: 36863.202
 Std. dev.: 16583.991
 Sample Metadata Categories: None provided
 Observation Metadata Categories: None provided

Counts/sample detail:
IndexQC_Rev: 1.0
PostiveControl_C: 25.0
PosControl_B: 29.0
NegControl_C: 38.0
PosControl_A: 73.0
NegControl_A: 150.0
NegControl_Plate: 264.0
NegControl_B: 1271.0
ERA-T1_2-3b_170: 11799.0
ERA-T1_3-4c_170: 13557.0
ERA-T1_3-5c_85: 15984.0
ERA-T2_4-4d_85: 16398.0
ERA-T2_4-3b_85: 16842.0
ERA-T1_1-2b_85: 16989.0
ERA-T3_3-1a_0: 17047.0
ERA-T2_3-1c_85: 17260.0
ERA-T3_4-4a_0: 17824.0
ERA-T3_1-2b_170: 18065.0
ERA-T3_3-1d_85: 18334.0
ERA-T3_3-3a_170: 18572.0
ERA-T3_3-1d_170: 18681.0
ERA-T1_3-3a_0: 19020.0
ERA-T3_3-2c_0: 19466.0
ERA-T1_4-1a_85: 19483.0
ERA-T2_2-3b_85: 19780.0
ERA-T1_3-4c_0: 20259.0
ERA-T2_1-3a_85: 20436.0
ERA-T1

* add taxonomy to biom table

In [None]:
!cd $workDir;\
biom add-metadata -i otu_table.biom \
    -o otu_table_wtax.biom \
    --observation-metadata-fp otusn_tax/otusn_tax_assignments.txt \
    --sc-separated taxonomy \
    --float-fields consensus \
    --int-fields numhits \
    --observation-header OTUID,taxonomy,consensus,numhits