# Set up notebook environment
## NOTE: Use a QIIME2 kernel

In [None]:
import pandas as pd
import qiime2


# 1. Generate taxon profiles for short-read data
## NOTE: Feature-table of counts of Web of Life gOTUs was obtained from QIITA

## Filter controls from feature-table

In [None]:
%%bash

# Keep only samples whose empo_1 metadata value is not "Control", then
# summarize the resulting short-read feature-table.
table_in='woltka_wol_biom.qza'
table_out='woltka_wol_biom_noControls.qza'
summary_out='woltka_wol_biom_noControls.qzv'

qiime feature-table filter-samples \
  --i-table "${table_in}" \
  --m-metadata-file metadata_samples.txt \
  --p-where 'empo_1 != "Control"' \
  --o-filtered-table "${table_out}"

qiime feature-table summarize \
  --i-table "${table_out}" \
  --o-visualization "${summary_out}"


## Normalize sampling effort
### NOTE: Change the value on line 5 to the desired minimum frequency per sample

In [None]:
%%bash
# Drop samples below the minimum total frequency, then summarize the result.
qiime feature-table filter-samples \
  --i-table woltka_wol_biom_noControls.qza \
  --p-min-frequency 60000 \
  --o-filtered-table woltka_wol_biom_noControls_normalized.qza

# No trailing backslash after the last argument of either command: a dangling
# continuation at the end of the script can be passed to qiime as a stray "\".
qiime feature-table summarize \
  --i-table woltka_wol_biom_noControls_normalized.qza \
  --o-visualization woltka_wol_biom_noControls_normalized.qzv


## Estimate alpha-diversity

In [None]:
!qiime diversity alpha \
  --i-table woltka_wol_biom_noControls_normalized.qza \
  --p-metric 'observed_features' \
  --o-alpha-diversity woltka_wol_biom_noControls_normalized_alpha_richness.qza


## Estimate beta-diversity

In [None]:
%%bash

# Run DEICODE RPCA on the normalized short-read table, then render the
# resulting biplot with Emperor (--p-number-of-features 10).
table='woltka_wol_biom_noControls_normalized.qza'
biplot='woltka_wol_biom_noControls_normalized_rpca_biplot.qza'
distances='woltka_wol_biom_noControls_normalized_rpca_dist.qza'
biplot_viz='woltka_wol_biom_noControls_normalized_rpca_biplot.qzv'

qiime deicode rpca \
  --i-table "${table}" \
  --p-min-feature-count 0 \
  --p-min-sample-count 0 \
  --o-biplot "${biplot}" \
  --o-distance-matrix "${distances}"

qiime emperor biplot \
  --i-biplot "${biplot}" \
  --m-sample-metadata-file metadata_samples.txt \
  --m-feature-metadata-file wol_taxonomy.qza \
  --p-number-of-features 10 \
  --o-visualization "${biplot_viz}"


# 2. Generate taxon profiles for long-read data

## Generate taxonomic profile using the Web of Life database

Concatenate FASTQ files for each sample

In [None]:
%%bash

# Concatenate all FASTQ chunks found in each barcode's directory into a single
# gzipped FASTQ per barcode. Barcode names are read from `barcode_list`.
barcode_list='/path/to/barcode_list.txt'
input_path='/path/to/fastq/pass/'
output_path='/path/to/fastq_concatenated/'
input_fastq_suffix='/*.fastq.gz'
output_fastq_suffix='_all.fastq.gz'

for i in $(cat < "$barcode_list");
do
  # $input_fastq_suffix is deliberately left unquoted: the '*' it contains must
  # undergo pathname expansion, which does not happen inside double quotes.
  cat "$input_path$i"$input_fastq_suffix > "$output_path$i$output_fastq_suffix"
done


Profile taxonomy

In [None]:
%%bash

# Classify each sample's concatenated reads with centrifuge, writing one
# classification file and one report file per sample.
samples_file='/path/to/barcode_list.txt'
reads_dir='/path/to/fastq_concatenated/'
reads_suffix='_all.fastq.gz'
index_basename='/projects/wol/release/databases/centrifuge/WoLr1'
profile_dir='/path/to/centrifuge_profile_long_read/'
class_suffix='_classification.txt'
report_suffix='_report.txt'

for sample in $(< "${samples_file}")
do
  centrifuge \
    -U "${reads_dir}${sample}${reads_suffix}" \
    -q \
    --phred33 \
    --threads 1 \
    -x "${index_basename}" \
    -S "${profile_dir}${sample}${class_suffix}" \
    --report-file "${profile_dir}${sample}${report_suffix}"
done


## Generate feature-table from taxon profile
### NOTE: The files 'ogu_from_maps.py' and 'nucl2g.txt' can be obtained from https://biocore.github.io/wol/protocols/community_ecology

Generate table

In [None]:
%%bash

# Build a feature-table from the centrifuge profiles with ogu_from_maps.py.
# Bash assignments must not have spaces around '='; with spaces the shell tries
# to run a command named `input_path` and the variables stay empty.
input_path='/path/to/centrifuge_profile_long_read/'
output_path='/path/to/feature_table_long_read/'

# NOTE(review): -e .map.xz is the extension expected for the input files; the
# centrifuge cell writes *_classification.txt, so confirm the files in
# $input_path actually carry this extension.
ogu_from_maps.py "$input_path" "$output_path" -m centrifuge -e .map.xz -t nucl2g.txt


Convert TSV to BIOM

In [None]:
%%bash

# Convert the normalized TSV feature-table to an HDF5 BIOM file.
# Assignments are written without spaces around '=' so bash actually sets them.
input_path='/path/to/feature_table_long_read/'
output_path='/path/to/feature_table_long_read/'

biom convert -i "$input_path"norm.tsv -o "$output_path"centrifuge_wol_norm.biom --table-type="OTU table" --to-hdf5


Filter low-abundance features

In [None]:
%%bash

# Run filter_otus_per_sample.py on the BIOM table with a threshold of 0.0001.
# Assignments are written without spaces around '=' so bash actually sets them.
input_path='/path/to/feature_table_long_read/'
output_path='/path/to/feature_table_long_read/'

filter_otus_per_sample.py "$input_path"centrifuge_wol_norm.biom 0.0001 "$output_path"centrifuge_wol_norm_clean.biom


Import BIOM into QIIME2

In [None]:
%%bash

# Import the cleaned BIOM table as a FeatureTable[Frequency] artifact and
# summarize it.
biom_file='centrifuge_wol_norm_clean.biom'
artifact='centrifuge_wol_norm_clean.qza'
summary='centrifuge_wol_norm_clean.qzv'

qiime tools import \
  --type 'FeatureTable[Frequency]' \
  --input-format BIOMV210Format \
  --input-path "${biom_file}" \
  --output-path "${artifact}"

qiime feature-table summarize \
  --i-table "${artifact}" \
  --o-visualization "${summary}"


## Filter controls from feature-table

In [None]:
%%bash

# Keep only samples whose sample_type metadata value is not
# "negative control", then summarize the filtered long-read table.
table_in='centrifuge_wol_norm_clean.qza'
table_out='centrifuge_wol_norm_clean_noControls.qza'
summary_out='centrifuge_wol_norm_clean_noControls.qzv'

qiime feature-table filter-samples \
  --i-table "${table_in}" \
  --m-metadata-file metadata_samples.txt \
  --p-where 'sample_type != "negative control"' \
  --o-filtered-table "${table_out}"

qiime feature-table summarize \
  --i-table "${table_out}" \
  --o-visualization "${summary_out}"


## Normalize sampling effort
### NOTE: Change the value on line 5 to the desired minimum frequency per sample

In [None]:
%%bash
# Drop samples below the minimum total frequency, then summarize the result.
qiime feature-table filter-samples \
  --i-table centrifuge_wol_norm_clean_noControls.qza \
  --p-min-frequency 60000 \
  --o-filtered-table centrifuge_wol_norm_clean_noControls_normalized.qza

# No trailing backslash after the last argument of either command: a dangling
# continuation at the end of the script can be passed to qiime as a stray "\".
qiime feature-table summarize \
  --i-table centrifuge_wol_norm_clean_noControls_normalized.qza \
  --o-visualization centrifuge_wol_norm_clean_noControls_normalized.qzv


## Estimate alpha-diversity

In [None]:
!qiime diversity alpha \
  --i-table centrifuge_wol_norm_clean_noControls_normalized.qza \
  --p-metric 'observed_features' \
  --o-alpha-diversity centrifuge_wol_norm_clean_noControls_normalized_alpha_richness.qza


## Estimate beta-diversity

In [None]:
%%bash

# Run DEICODE RPCA on the normalized long-read table, then render the
# resulting biplot with Emperor (--p-number-of-features 10).
table='centrifuge_wol_norm_clean_noControls_normalized.qza'
biplot='centrifuge_wol_norm_clean_noControls_normalized_rpca_biplot.qza'
distances='centrifuge_wol_norm_clean_noControls_normalized_rpca_dist.qza'
biplot_viz='centrifuge_wol_norm_clean_noControls_normalized_rpca_biplot.qzv'

qiime deicode rpca \
  --i-table "${table}" \
  --p-min-feature-count 0 \
  --p-min-sample-count 0 \
  --o-biplot "${biplot}" \
  --o-distance-matrix "${distances}"

qiime emperor biplot \
  --i-biplot "${biplot}" \
  --m-sample-metadata-file metadata_samples.txt \
  --m-feature-metadata-file wol_taxonomy.qza \
  --p-number-of-features 10 \
  --o-visualization "${biplot_viz}"


# 3. Compare taxon profiles between short-read and long-read data

## Mantel test

In [None]:
!qiime diversity mantel \
  --i-dm1 woltka_wol_biom_noControls_normalized_rpca_dist.qza \
  --i-dm2 centrifuge_wol_norm_clean_noControls_normalized_rpca_dist.qza \
  --p-method 'spearman' \
  --p-label1 'short-read - RPCA' \
  --p-label2 'long-read - RPCA' \
  --p-intersect-ids \
  --o-visualization wol_short_vs_long_read_normalized_rpca_dist_mantel_spearman.qzv


# 4. Generate assemblies for short-read and long-read data

### Short-read data
#### NOTE: SPAdes v3.15.1

In [None]:
%%bash

# Assemble each sample's trimmed paired-end reads with metaSPAdes, writing
# one output directory per sample.
module load spades_3.15.1

samples_file='/path/to/sample_list.txt'
reads_dir='/path/to/raw_per_sample_fastq/'
r1_suffix='_L001_R1_001.trimmed.fastq.gz'
r2_suffix='_L001_R2_001.trimmed.fastq.gz'
assembly_dir='/path/to/assembly_short_read/'

for sample in $(< "${samples_file}")
do
  metaspades.py \
    -1 "${reads_dir}${sample}${r1_suffix}" \
    -2 "${reads_dir}${sample}${r2_suffix}" \
    --threads 1 \
    --memory 1 \
    -o "${assembly_dir}${sample}"
done


### Long-read data
#### NOTE: Flye v2.8.3-b1695

In [None]:
%%bash

# Assemble each sample's concatenated nanopore reads with Flye in --meta mode,
# writing one output directory per sample.

# `conda activate` relies on shell functions that a non-interactive %%bash
# subprocess does not load on its own; source conda's hook first so the
# activation actually takes effect.
source "$(conda info --base)/etc/profile.d/conda.sh"
conda activate flye

sample_list='/path/to/sample_list.txt'
input_path='/path/to/fastq_concatenated/'
input_suffix='_all.fastq.gz'
output_path='/path/to/assembly_long_read/'

for i in $(cat < "$sample_list");
do
  flye \
    --nano-raw "$input_path""$i""$input_suffix" \
    --out-dir "$output_path""$i"/ \
    --keep-haplotypes \
    --threads 1 \
    --meta
done


# 5. Compare assemblies between short-read and long-read data


## Quast

### Copy assemblies into a single directory

In [None]:
%%bash

# Short-read assemblies
# Copy each sample's scaffolds.fasta into the shared assemblies directory as
# <sample>_short_read.fasta.
# (%%bash must be the very first line of the cell, so this comment lives below
# the magic rather than above it.)

sample_list='/path/to/sample_list.txt'
input_path='/path/to/short_read_assemblies/'
input_suffix='/scaffolds.fasta'
output_path='/path/to/all_assemblies/'
short_read_suffix='_short_read.fasta'

for i in $(cat < "$sample_list");
do
  cp "$input_path""$i""$input_suffix" "$output_path""$i""$short_read_suffix"
done


In [None]:
%%bash

# Long-read assemblies
# Copy each sample's assembly.fasta into the shared assemblies directory as
# <sample>_long_read.fasta.
# (%%bash must be the very first line of the cell, so this comment lives below
# the magic rather than above it.)

sample_list='/path/to/sample_list.txt'
input_path='/path/to/long_read_assemblies/'
input_suffix='/assembly.fasta'
output_path='/path/to/all_assemblies/'
# Named long_read_suffix to match its use in the loop; it was previously
# defined as short_read_suffix, leaving the destination suffix empty.
long_read_suffix='_long_read.fasta'

for i in $(cat < "$sample_list");
do
  cp "$input_path""$i""$input_suffix" "$output_path""$i""$long_read_suffix"
done


### Run Quast

In [None]:
%%bash

# Run QUAST on every assembly named in `sample_list`, writing one report
# directory per assembly.

# `conda activate` relies on shell functions that a non-interactive %%bash
# subprocess does not load on its own; source conda's hook first.
source "$(conda info --base)/etc/profile.d/conda.sh"
conda activate quast

sample_list='/path/to/sample_list_all_assemblies.txt'
input_path='/path/to/all_assemblies/'
output_path='/path/to/quast_reports/'

for i in $(cat < "$sample_list");
do
  quast \
    --output-dir "$output_path""$i" \
    --est-ref-size 5000000 \
    --threads 1 \
    "$input_path""$i"
done


## CheckM

In [None]:
%%bash

# Run the CheckM taxonomy workflow (domain Bacteria) over every *.fasta file
# in the shared assemblies directory.

# `conda activate` relies on shell functions that a non-interactive %%bash
# subprocess does not load on its own; source conda's hook first.
source "$(conda info --base)/etc/profile.d/conda.sh"
conda activate checkm

input_path='/path/to/all_assemblies/'
output_path='/path/to/checkm_reports/'

checkm \
  taxonomy_wf domain Bacteria \
  "$input_path" \
  -x 'fasta' \
  --threads 1 \
  "$output_path"


## antiSMASH
### NOTE: antiSMASH 5

In [None]:
%%bash

# Run antiSMASH on each assembly listed in the sample list, writing one
# output directory per entry.
source activate antismash

samples_file='/path/to/sample_list.txt'
assemblies_dir='/path/to/all_assemblies/'
results_dir='/path/to/antismash_output/'

for sample in $(< "${samples_file}")
do
  antismash \
    --cpus 1 \
    --taxon bacteria \
    --smcog-trees \
    --cb-general \
    --cb-subclusters \
    --cb-knownclusters \
    --pfam2go \
    --output-dir "${results_dir}${sample}/" \
    --genefinding-tool prodigal-m \
    "${assemblies_dir}${sample}.fasta"
done


## Move antiSMASH results from each sample into a single folder

In [None]:
%%bash

# Clusterblast
# Copy each sample's clusterblastoutput.txt into a single directory, prefixing
# the file name with the sample name.
# (%%bash must be the very first line of the cell, so this comment lives below
# the magic rather than above it.)

sample_list='/path/to/sample_list.txt'
input_path='/path/to/antismash_output/'
output_path='/path/to/antismash_output_clusterblast/'

for i in $(cat < "$sample_list");
do
  cp "$input_path""$i"/clusterblastoutput.txt "$output_path""$i"_clusterblastoutput.txt
done


## Merge results from each platform into a single feature-table

In [None]:
%%bash

# Build one table per platform (short-read first, then long-read) from the
# collected clusterblast outputs; files are selected by the extension passed
# to -e.
clusterblast_dir='/path/to/antismash_output_clusterblast/'

for platform in short_read long_read
do
  python ogu_from_maps.py \
    "${clusterblast_dir}" \
    "antismash_clusters_${platform}" \
    -e "${platform}_clusterblastoutput.txt"
done


## Convert tables to BIOM format

In [None]:
%%bash

# Convert each platform's normalized TSV table (short-read first, then
# long-read) to an HDF5 BIOM file.
for platform in short_read long_read
do
  biom convert \
    -i "antismash_clusters_${platform}.norm.tsv" \
    -o "antismash_clusters_${platform}_norm_biom.biom" \
    --table-type="OTU table" \
    --to-hdf5
done


## Import into QIIME2

In [None]:
%%bash

# Import each platform's BIOM table (short-read first, then long-read) as a
# FeatureTable[Frequency] artifact.
for platform in short_read long_read
do
  qiime tools import \
    --input-path "antismash_clusters_${platform}_norm_biom.biom" \
    --type 'FeatureTable[Frequency]' \
    --input-format BIOMV210Format \
    --output-path "antismash_clusters_${platform}_norm_biom.qza"
done


## Normalize sampling depth

In [None]:
%%bash

# Drop samples below the minimum total frequency from each platform's
# antiSMASH cluster table.
# NOTE: X is a placeholder; replace it with the desired minimum frequency per
# sample before running.

# Short-read
qiime feature-table filter-samples \
  --i-table antismash_clusters_short_read_norm_biom.qza \
  --p-min-frequency X \
  --o-filtered-table antismash_clusters_short_read_norm_biom_normalized.qza

# Long-read
# The input is the long-read table; it previously pointed at the short-read
# table, so the "long-read" output was a copy of the short-read result.
qiime feature-table filter-samples \
  --i-table antismash_clusters_long_read_norm_biom.qza \
  --p-min-frequency X \
  --o-filtered-table antismash_clusters_long_read_norm_biom_normalized.qza


## Beta-diversity (RPCA)

In [None]:
%%bash

# Run DEICODE RPCA on each platform's normalized antiSMASH cluster table
# (short-read first, then long-read), producing a biplot and a distance matrix.
for platform in short_read long_read
do
  stem="antismash_clusters_${platform}_norm_biom_normalized"

  qiime deicode rpca \
    --i-table "${stem}.qza" \
    --p-min-feature-count 0 \
    --p-min-sample-count 0 \
    --o-biplot "${stem}_rpca_biplot.qza" \
    --o-distance-matrix "${stem}_rpca_dist.qza"
done
  

## Mantel test

In [None]:
!qiime diversity mantel \
  --i-dm1 antismash_clusters_short_read_norm_biom_normalized_rpca_dist.qza \
  --i-dm2 antismash_clusters_long_read_norm_biom_normalized_rpca_dist.qza \
  --p-method 'spearman' \
  --p-label1 'short-read - RPCA' \
  --p-label2 'long-read - RPCA' \
  --p-intersect-ids \
  --o-visualization antismash_clusters_short_vs_long_read_normalized_rpca_dist_mantel_spearman.qzv


# 6. Polish assemblies from long-read data with short-read data

## 6.a. Map short-reads against long-read assemblies

### Create bowtie2 databases from long-read assemblies

In [None]:
%%bash

# Build one bowtie2 index per sample from its long-read assembly. The index
# basename is the sample name, placed directly under `output_path`.
# (The cell needs %%bash: the notebook kernel is Python, not a shell.)
module load bowtie2_bowtie2-2.3.2

sample_list='/path/to/sample_list.txt'
assembly_path='/path/to/long_read_assemblies/'
# Leading slash so the path resolves to <assembly_path><sample>/assembly.fasta,
# matching the layout used when the long-read assemblies are copied for QUAST.
assembly_suffix='/assembly.fasta'
output_path='/path/to/bowtie2_index/'

for i in $(cat < "$sample_list");
do
    # The undefined $assembly_prefix (which expanded to nothing) was dropped;
    # the resulting index basename is unchanged.
    bowtie2-build \
      "$assembly_path""$i""$assembly_suffix" \
      "$output_path""$i" \
      -f
done


### Map short-reads

In [None]:
%%bash

# on barnacle...
# Align each sample's trimmed paired-end short reads against that sample's
# bowtie2 index, writing <sample>.sam and a per-sample stderr log.
# (The cell needs %%bash: the notebook kernel is Python, not a shell.)

module load bowtie2_bowtie2-2.3.2

sample_list='/path/to/sample_list.txt'
# Absolute path, matching the directory the indexes are built into.
index_path='/path/to/bowtie2_index/'
fastq_path='/path/to/short_read_fastq/'
input_fastq_r1_suffix='_L001_R1_001.trimmed.fastq.gz'
input_fastq_r2_suffix='_L001_R2_001.trimmed.fastq.gz'
output_path='/path/to/short_read_mappings/'
log_path='/path/to/bowtie2_logs/'

for i in $(cat < "$sample_list");
do
    # The undefined $aligned_sam (which expanded to nothing) was dropped; the
    # output name is still <sample>.sam. The stderr redirect now follows all
    # options instead of sitting in the middle of them.
    bowtie2 \
      -x "$index_path""$i" \
      -p 1 \
      --very-sensitive \
      -1 "$fastq_path""$i""$input_fastq_r1_suffix" \
      -2 "$fastq_path""$i""$input_fastq_r2_suffix" \
      -S "$output_path""$i".sam \
      2> "$log_path"bowtie2_log_"$i".txt
done


## 6.b. Sort and index sam files for input to Pilon

In [None]:
%%bash

# For each sample: convert the SAM alignment to BAM, sort it, and index the
# sorted BAM. stderr of every step goes to its own per-sample log file.
# (The cell needs %%bash: the notebook kernel is Python, not a shell.)
module load samtools_1.12

sample_list='/path/to/sample_list.txt'
file_path='/path/to/short_read_mappings/'
log_path='/path/to/samtools_logs/'

for i in $(cat < "$sample_list");
do
    # SAM -> unsorted BAM
    samtools view \
      -bS "$file_path""$i".sam \
      -o "$file_path""$i"_unsorted.bam \
      2> "$log_path"samtools_log_view_"$i".txt

    # Sort; -T sets the prefix used for temporary files
    samtools sort \
      -T "$file_path""$i" \
      -@ 1 \
      -o "$file_path""$i"_sorted.bam \
      "$file_path""$i"_unsorted.bam \
      2> "$log_path"samtools_log_sort_"$i".txt

    # Index the sorted BAM, writing the index alongside it
    samtools index \
      -@ 1 \
      "$file_path""$i"_sorted.bam \
      "$file_path""$i"_sorted.bam.bai \
      2> "$log_path"samtools_log_index_"$i".txt
done


## 6.c. Install and run pilon

In [None]:
%%bash

# One-time setup: create the `pilon` conda environment and install Pilon from
# bioconda into it.
# -y answers the confirmation prompts, which cannot be answered from a
# notebook cell. -n targets the environment by name, so no `conda activate`
# (unavailable in a non-interactive shell without conda's hook) is needed.
conda create -y -n pilon python=3.6
conda install -y -n pilon -c bioconda pilon


In [None]:
%%bash

# Polish each long-read assembly with Pilon using the sorted short-read BAM.
# Note: Change the -Xmx value at the end of the pilon command to the total
# amount of memory available across all threads.
# (The cell needs %%bash: the notebook kernel is Python, not a shell.)

# `conda activate` relies on shell functions that a non-interactive %%bash
# subprocess does not load on its own; source conda's hook first.
source "$(conda info --base)/etc/profile.d/conda.sh"
conda activate pilon

sample_list='/path/to/sample_list.txt'
assembly_path='/path/to/long_read_assemblies/'
# Previously referenced without being defined (so it expanded to nothing);
# defined explicitly as empty to keep the same genome path.
assembly_prefix=''
# NOTE(review): --genome must be the same FASTA the bowtie2 index was built
# from; confirm this suffix matches the assembly file layout used there.
assembly_suffix='.fasta'
bam_path='/path/to/short_read_mappings/'
output_path='/path/to/polished_assemblies/'
polished_assembly_prefix='polished_'

for i in $(cat < "$sample_list");
do
    pilon \
      --genome "$assembly_path""$assembly_prefix""$i""$assembly_suffix" \
      --bam "$bam_path""$i"_sorted.bam \
      --output "$polished_assembly_prefix""$i" \
      --outdir "$output_path" \
      --changes \
      -Xmx128G
done
