In [None]:
#### Start in KI_Compartment root folder
# You must have SymITS2 and all its dependencies installed and working from your home folder (i.e. at ~/SymITS2)
# You must have a working version of QIIME, with UCLUST and USEARCH61 installed.
# You must have the nucleotide NCBI blast database locally on your machine, and a working version of blast
    # You might want to put this on a hard drive, as it is currently ~94GB of data (as of August 1, 2017)
    # Download Blastdb from ftp://ftp.ncbi.nlm.nih.gov/blast/documents/blastdb.html, dataset: nt.*tar.gz.
    # Then make sure to unzip all folders and place in one folder called "blastdb"
    # Mine is at "/media/danielle/CoralDrive/blastdb"
    # For getting blast installed on Linux Ubuntu, this website was helpful: http://www.blopig.com/blog/2014/04/quick-standalone-blast-setup-for-ubuntu-linux/
# You must have an up-to-date version of R; I used R 3.4.1 (2017-06-30, "Single Candle")
    # Required R packages are specified in SymITS2

# Prepare the raw sequencing data.
# The preparation script carries out these steps:
#   - unzips all original files
#   - makes the configs for Bokulich qc, then runs Bokulich qc
#   - makes the configs for Illumina paired end merging
#   - merges the Illumina pairs (includes a Q30 check)
#   - filters the merged reads
prep_script=data/Bioinf/extract_unzip_prep_rawdata.sh
chmod u+x "$prep_script"  # Give the owner execute permission so the script can be launched
"$prep_script"

In [None]:
# Quality control and trimming with SymITS2 (by Ross Cunning, https://github.com/jrcunning/SymITS2).
# qc_trim_reads.sh carries out these steps:
#   - adds QIIME labels
#   - identifies chimeric sequences, then filters them out
#   - runs cutadapt 4 times
#   - removes the intermediate files left by the cutadapt process
"$HOME/SymITS2/qc_trim_reads.sh"

In [1]:
# Report how many sequences remain after quality filtering
trimmed_fasta=data/fasta/combined_seqs_trimmed.fasta
count_seqs.py -i "$trimmed_fasta"


1992785  : data/fasta/combined_seqs_trimmed.fasta (Sequence lengths (mean +/- std): 299.5748 +/- 4.5993)
1992785  : Total


In [None]:
# Per-sample OTU clustering with SymITS2 (by Ross Cunning, https://github.com/jrcunning/SymITS2).
# Every sample is clustered on its own, at 97% similarity.
# Positional arguments: 1) combined, trimmed sequences (fasta file); 2) output directory.
# Produces data/Bioinf/clust/all_rep_set_rep_set.fasta, which later steps consume.
trimmed_fasta=data/fasta/combined_seqs_trimmed.fasta
clust_dir=data/Bioinf/clust
"$HOME/SymITS2/otus_97_bysample.sh" "$trimmed_fasta" "$clust_dir"

In [None]:
# Sequence identification by global alignment, using SymITS2
# (by Ross Cunning, https://github.com/jrcunning/SymITS2).
# Positional arguments after --args:
#   1) query sequences - the rep set fasta file
#   2) reference database sequences in .fasta format
# Produces data/Bioinf/clust/all_rep_set_rep_set_nw_tophits.tsv, which later steps consume.
# The R script itself is fed to R on standard input.
R --vanilla --args \
    data/Bioinf/clust/all_rep_set_rep_set.fasta \
    data/ITS2db_trimmed_derep.fasta \
    < ~/SymITS2/run_nw.R

In [None]:
# Assemble the phyloseq object with SymITS2
# (by Ross Cunning, https://github.com/jrcunning/SymITS2).
# Positional arguments after --args:
#   1) nw_tophits - taxonomic assignment output from run_nw.R
#   2) sample metadata in tsv format
#   3) OTU table
#   4) duplicate reference taxa names
#   5) output filename
# Produces analyses/KI_Compartment.RData.
# The R script itself is fed to R on standard input.
R --vanilla --args \
    data/Bioinf/clust/all_rep_set_rep_set_nw_tophits.tsv \
    data/Coralphoto__Metadata/KI_Compartment_metadata.tsv \
    data/Bioinf/clust/97_otus_bysample.tsv \
    data/ITS2db_trimmed_notuniques_otus.txt \
    analyses/KI_Compartment.RData \
    < ~/SymITS2/build_phyloseq.R

In [None]:
# From SymITS2 by Ross Cunning https://github.com/jrcunning/SymITS2
# Filter out sequences which do not blast to Symbiodinium
# The folder holding the local NCBI nt blast database is machine-specific. It defaults to the
# original author's location; set BLASTDB_DIR before running this cell to point somewhere else.
BLASTDB_DIR="${BLASTDB_DIR:-/media/danielle/CoralDrive/blastdb}"
R --vanilla < ~/SymITS2/filter_notsym.R --args analyses/KI_Compartment.RData data/Bioinf/clust/all_rep_set_rep_set.fasta analyses/KI_Compartment.RData "$BLASTDB_DIR"
# Arguments as passed here (roles presumed from the values - confirm against filter_notsym.R):
#   1) input phyloseq .RData, 2) rep set fasta file, 3) output .RData, 4) folder containing the blast database
# The first and third arguments are the same path, so analyses/KI_Compartment.RData is replaced by its filtered version
# Output file is KI_Compartment.RData for downstream analysis
# (next steps = data/Bioinf/tree/build_phy_tree.sh, then analyses/KI_Compartment_filter_samples.R)

In [None]:
# Construct the phylogenetic tree that gets added to the phyloseq object
tree_script=data/Bioinf/tree/build_phy_tree.sh
chmod u+x "$tree_script"  # Give the owner execute permission so the script can be launched
"$tree_script"            # Run the tree-building script

In [None]:
# Format and filter the phyloseq RData file.
#   Reads:    analyses/KI_Compartment.RData
#   Requires: build_phy_tree must already have been run for this to work properly
#   Writes:   data/KI_seqs_f_coral_grouped.RData
filter_script=analyses/KI_Compartment_filter_samples.R
R --vanilla < "$filter_script"

In [None]:
# Rarefaction
# Re-runs the phyloseq build / filter / tree / format steps on an OTU table rarefied to 1000 seqs.

# The folder holding the local NCBI nt blast database is machine-specific. It defaults to the
# original author's location; set BLASTDB_DIR before running this cell to point somewhere else.
BLASTDB_DIR="${BLASTDB_DIR:-/media/danielle/CoralDrive/blastdb}"
OTU_BIOM=data/Bioinf/clust/97_otus_bysample.biom  # OTU table (.biom) that gets rarefied
RAREFY_DIR=data/Bioinf/clust/rarefy               # Folder receiving every rarefaction output

# Multiple rarefactions to look at different sequence rarefaction depths
# 1000 to 10000 in steps of 1000
multiple_rarefactions.py -i "$OTU_BIOM" -o "$RAREFY_DIR/mult" -m 1000 -x 10000 -s 1000
# 200 to 1000 in steps of 100
# NOTE(review): both runs include depth 1000 and write to the same folder, so this run
# presumably overwrites the depth-1000 tables made by the run above - confirm this is intended
multiple_rarefactions.py -i "$OTU_BIOM" -o "$RAREFY_DIR/mult" -m 200 -x 1000 -s 100

# Single rarefaction to 1000 seqs
single_rarefaction.py -i "$OTU_BIOM" -o "$RAREFY_DIR/rarefaction_1000.biom" --depth 1000
# Convert .biom to tsv
biom convert -i "$RAREFY_DIR/rarefaction_1000.biom" -o "$RAREFY_DIR/rarefaction_1000.tsv" --to-tsv
# Build a phyloseq object from the rarefied OTU table
R --vanilla < ~/SymITS2/build_phyloseq.R --args data/Bioinf/clust/all_rep_set_rep_set_nw_tophits.tsv data/Coralphoto__Metadata/KI_Compartment_metadata.tsv "$RAREFY_DIR/rarefaction_1000.tsv" data/ITS2db_trimmed_notuniques_otus.txt analyses/KI_Compartment_rarefy.RData
# Arguments are 1) nw_tophits - taxonomic assignment output from run_nw.R, 2) sample metadata in tsv format, 3) OTU table, 4) duplicate reference taxa names, 5) output filename
# Creates analyses/KI_Compartment_rarefy.RData

# Filter out sequences which do not blast to Symbiodinium
# The first and third arguments are the same path, so analyses/KI_Compartment_rarefy.RData is replaced by its filtered version
R --vanilla < ~/SymITS2/filter_notsym.R --args analyses/KI_Compartment_rarefy.RData data/Bioinf/clust/all_rep_set_rep_set.fasta analyses/KI_Compartment_rarefy.RData "$BLASTDB_DIR"

# Build the phylogenetic tree to add to phyloseq object
chmod u+x data/Bioinf/tree/build_phy_tree_rare.sh # Make sure script can run
data/Bioinf/tree/build_phy_tree_rare.sh # Run build_phy_tree_rare script

# Format and filter the RData file
R --vanilla < analyses/KI_Compartment_filter_samples_rare.R