In [1]:
# Locate the git repository root, switch into its microbial/ working
# directory, and list what is there.
NOTEBOOK_DIR=$(git rev-parse --show-toplevel)
ROOT_DIR="${NOTEBOOK_DIR}/microbial"
# Quote the path so directories containing spaces work, and only list when
# the cd succeeded so a failed cd cannot silently show the wrong directory.
cd -- "${ROOT_DIR}" && ls

all_kat_hist                            [0m[01;34mimages[0m
all_kat_hist.dist_analysis.json         kat.hist
[01;35mall_kat_hist.png[0m                        kat.hist.dist_analysis.json
[01;34mbigsi[0m                                   [01;35mkat.hist.png[0m
[01;34mbigsi.bak[0m                               microbial-assembly.ipynb
[01;34mdata[0m                                    microbial-bigsi.ipynb
ERR1144974                              microbial-genomes.txt
ERR1144974.dist_analysis.json           microbial-process-data.ipynb
ERR1144974_kat_hist                     microbial-process-data.ipynb.bak
ERR1144974_kat_hist.dist_analysis.json  [01;35mo.png[0m
[01;35mERR1144974_kat_hist.png[0m                 sha256
[01;35mERR1144974.png[0m                          total-bp-reads.txt
freq_k31.hist                           total-kmers.txt
freq_k7.hist


# Step 1: Assemble genomes

Let's try assembling the genomes using [skesa](https://github.com/ncbi/SKESA).

In [3]:
# Number of cores handed to skesa (via --cores) in the assembly loop below.
threads=48

In [5]:
# Assemble every paired-end sample found in data/subsample with skesa.
# For each sample <name> the following files are written under assembly/:
#   <name>.fasta    - contigs written by skesa (--contigs_out)
#   <name>.log.out  - stdout of the conda run command
#   <name>.log.err  - stderr of the conda run command
#   <name>.time     - /usr/bin/time -v report
# Reads ${threads}, which must already be set in the session.

# -p: do not fail when the directory is left over from a previous run.
mkdir -p assembly

conda run --name skesa skesa --version

data_dir=data/subsample
for file in "${data_dir}"/*_1.fastq.gz; do
    # An unmatched glob is left as the literal pattern; skip it so no bogus
    # "*" sample is assembled and no 'assembly/*.log.*' files are created.
    [[ -e "${file}" ]] || continue

    name=$(basename "${file}" _1.fastq.gz)

    input1="${data_dir}/${name}_1.fastq.gz"
    input2="${data_dir}/${name}_2.fastq.gz"

    output="assembly/${name}.fasta"
    log="assembly/${name}.log"

    echo "${name}"

    conda run --name skesa /usr/bin/time -v -o "assembly/${name}.time" \
        skesa --cores "${threads}" --fastq "${input1},${input2}" --contigs_out "${output}" \
        1> "${log}.out" 2> "${log}.err"
done
echo "Done"

mkdir: cannot create directory ‘assembly’: File exists
SKESA v.2.3.0
skesa --version 

ERR1144974 ... ERR1144975 ... ERR1144976 ... ERR1144977 ... ERR1144978 ... ERR3655992 ... ERR3655994 ... ERR3655996 ... ERR3655998 ... ERR3656002 ... ERR3656004 ... ERR3656010 ... ERR3656012 ... ERR3656013 ... ERR3656015 ... ERR3656018 ... ERR3656019 ... SRR10298903 ... SRR10298904 ... SRR10298905 ... SRR10298906 ... SRR10298907 ... SRR10512964 ... SRR10512965 ... SRR10512968 ... SRR10513325 ... SRR10513326 ... SRR10513328 ... SRR10513363 ... SRR10513672 ... SRR10519468 ... SRR10519469 ... SRR10519616 ... SRR10519617 ... SRR10519619 ... SRR10519620 ... SRR10519637 ... SRR10521982 ... SRR10521983 ... SRR10521984 ... SRR10527348 ... SRR10527349 ... SRR10527351 ... SRR10527352 ... SRR10527353 ... SRR8088181 ... SRR8088182 ... SRR8088183 ... SRR8088184 ... SRR8088185 ... done


In [6]:
# Preview the first ten assembled FASTA files (long listing, human-readable sizes)...
ls -l -h assembly/*.fasta | head -n 10
# ...then report how many assembled FASTA files there are in total.
ls -1 assembly/*.fasta | wc -l

-rw-r--r-- 1 apetkau grp_apetkau 4.2M Dec  4 17:47 assembly/ERR1144974.fasta
-rw-r--r-- 1 apetkau grp_apetkau 4.2M Dec  4 17:48 assembly/ERR1144975.fasta
-rw-r--r-- 1 apetkau grp_apetkau 4.1M Dec  4 17:48 assembly/ERR1144976.fasta
-rw-r--r-- 1 apetkau grp_apetkau 4.1M Dec  4 17:49 assembly/ERR1144977.fasta
-rw-r--r-- 1 apetkau grp_apetkau 4.1M Dec  4 17:50 assembly/ERR1144978.fasta
-rw-r--r-- 1 apetkau grp_apetkau 6.7M Dec  4 17:52 assembly/ERR3655992.fasta
-rw-r--r-- 1 apetkau grp_apetkau 6.8M Dec  4 17:54 assembly/ERR3655994.fasta
-rw-r--r-- 1 apetkau grp_apetkau 6.2M Dec  4 17:56 assembly/ERR3655996.fasta
-rw-r--r-- 1 apetkau grp_apetkau 6.3M Dec  4 17:58 assembly/ERR3655998.fasta
-rw-r--r-- 1 apetkau grp_apetkau 6.4M Dec  4 18:00 assembly/ERR3656002.fasta
50


Awesome. We've got assembled genomes.

# Step 2: Evaluate quality

Let's evaluate the quality using [quast](http://bioinf.spbau.ru/quast).

In [16]:
# Run quast from its conda environment over all assembled FASTA files,
# with assembly/quast given as the -o argument.
conda run --name quast quast -o assembly/quast assembly/*.fasta

/home/CSCScience.ca/apetkau/miniconda3/envs/quast/bin/quast -o assembly/quast assembly/ERR1144974.fasta assembly/ERR1144975.fasta assembly/ERR1144976.fasta assembly/ERR1144977.fasta assembly/ERR1144978.fasta assembly/ERR3655992.fasta assembly/ERR3655994.fasta assembly/ERR3655996.fasta assembly/ERR3655998.fasta assembly/ERR3656002.fasta assembly/ERR3656004.fasta assembly/ERR3656010.fasta assembly/ERR3656012.fasta assembly/ERR3656013.fasta assembly/ERR3656015.fasta assembly/ERR3656018.fasta assembly/ERR3656019.fasta assembly/SRR10298903.fasta assembly/SRR10298904.fasta assembly/SRR10298905.fasta assembly/SRR10298906.fasta assembly/SRR10298907.fasta assembly/SRR10512964.fasta assembly/SRR10512965.fasta assembly/SRR10512968.fasta assembly/SRR10513325.fasta assembly/SRR10513326.fasta assembly/SRR10513328.fasta assembly/SRR10513363.fasta assembly/SRR10513672.fasta assembly/SRR10519468.fasta assembly/SRR10519469.fasta assembly/SRR10519616.fasta assembly/SRR10519617.fasta assembly/SRR10519619.

In [17]:
# List the files quast produced.
ls assembly/quast

[0m[01;34mbasic_stats[0m     quast.log    report.tex  transposed_report.tex
icarus.html     report.html  report.tsv  transposed_report.tsv
[01;34micarus_viewers[0m  report.pdf   report.txt  transposed_report.txt


# Step 3: Search for AMR genes

Let's use [staramr](https://github.com/phac-nml/staramr) to search for antimicrobial resistance genes in all assemblies.

In [7]:
# Print the staramr version, then run `staramr search` over every assembled
# FASTA file with assembly/staramr as the output directory.
conda run --name staramr staramr --version
conda run --name staramr staramr search -o assembly/staramr --exclude-negatives assembly/*.fasta

staramr 0.7.0
2019-12-04 18:47:33 INFO: No --pointfinder-organism specified. Will not search the PointFinder databases
2019-12-04 18:47:33 INFO: No --plasmidfinder-database-type specified. Will search the entire PlasmidFinder database
2019-12-04 18:47:33 INFO: --output-dir set. All files will be output to [assembly/staramr]
2019-12-04 18:47:33 INFO: Will exclude ResFinder/PointFinder genes listed in [/home/CSCScience.ca/apetkau/miniconda3/envs/staramr/lib/python3.7/site-packages/staramr/databases/exclude/data/genes_to_exclude.tsv]. Use --no-exclude-genes to disable
2019-12-04 18:47:33 INFO: Making BLAST databases for input files
2019-12-04 18:47:34 INFO: Scheduling blasts and MLST for ERR1144974.fasta
2019-12-04 18:47:34 INFO: Scheduling blasts and MLST for ERR1144975.fasta
2019-12-04 18:47:34 INFO: Scheduling blasts and MLST for ERR1144976.fasta
2019-12-04 18:47:34 INFO: Scheduling blasts and MLST for ERR1144977.fasta
2019-12-04 18:47:34 INFO: Scheduling blasts and MLST for ERR1144978

Let's take a look at the results.

In [10]:
# Display the tab-separated plasmidfinder.tsv results as an aligned table.
column -s$'\t' -t assembly/staramr/plasmidfinder.tsv

Isolate ID   Plasmid           %Identity  %Overlap  HSP Length/Total Length  Contig                 Start  End    Accession
ERR3655994   Col440II          100.00     100.00    282/282                  Contig_2_23.4654_Circ  3395   3676   CP023921.1
ERR3655996   IncQ1             100.00     64.20     511/796                  Contig_71_5.84164      511    1      M28829.1
ERR3655998   Col440II          100.00     100.00    282/282                  Contig_5_18.1039       4555   4836   CP023921.1
ERR3656002   IncI              100.00     97.16     137/141                  Contig_69_7.05375      235    371    AP011954
ERR3656018   ColRNAI           99.23      100.00    130/130                  Contig_3_159.311_Circ  1104   975    DQ298019
ERR3656018   IncFIA(HI1)       98.45      99.74     387/388                  Contig_40_61.9395      5334   4948   AF250878
ERR3656018   IncFIB(K)         100.00     100.00    560/560                  Contig_50_53.1909      8451   7892   JN233704
ERR3656018 

This gives us a list of all plasmids (specifically plasmid incompatibility factors) that were found using BLAST. We can make use of these to test out BIGSI/HowDeSBT.

For example, let's take `Col440II`:

In [11]:
# Header row followed by every line mentioning Col440II, aligned into columns.
{
    head -n 1 assembly/staramr/plasmidfinder.tsv
    grep 'Col440II' assembly/staramr/plasmidfinder.tsv
} | column -s$'\t' -t

Isolate ID  Plasmid   %Identity  %Overlap  HSP Length/Total Length  Contig                 Start  End   Accession
ERR3655994  Col440II  100.00     100.00    282/282                  Contig_2_23.4654_Circ  3395   3676  CP023921.1
ERR3655998  Col440II  100.00     100.00    282/282                  Contig_5_18.1039       4555   4836  CP023921.1


There are only two samples where this is found. So, if we take the specific sequence found:

In [15]:
# Print the Col440II hit for ERR3655994: the matching FASTA header line plus
# the 5 lines that follow it (the sequence).
grep -A5 'Col440II' assembly/staramr/hits/plasmidfinder_ERR3655994.fasta

>[01;31m[KCol440II[m[K_1__CP023921.1 isolate: ERR3655994, contig: Contig_2_23.4654_Circ, contig_start: 3395, contig_end: 3676, database_gene_start: 1, database_gene_end: 282, hsp/length: 282/282, pid: 100.00%, plength: 100.00%
GTCGATTGCCATCAGTGCGGCCACAATCTGCACCCGGTCATGACCGGCACCACCGGCATT
CACTTTCCGGGCGATCTGGTTCAGGTTGTTCCCCATACCGGCAAGCTGGCGCAGCAGCGC
CGGCGATATCGACGGCAGTCTGCCGGCACGCGCCGGCTTCTCATCGAGGCAGGTCTGACG
CATCCACGCCGCCAGCTGCTTGCCGTCGCACCGCTCGAGCAGCCGCCGGTGTTCGTCTTC
CGTCACCCACATCGTGAGCATCTTGTTGCGTTTGTCTGCCAG


we can use this sequence in the BIGSI/HowDeSBT searches to test if they only find it in the two above genomes.