# Assembly

**Megahit**
https://www.metagenomics.wiki/tools/assembly/megahit
- de novo assembly (w/o reference genome)
- aligns/assembles short reads together to reconstruct one 'metagenome'
- assembled contigs are stored in fasta file

### Installation

In [None]:
# Load the cluster's miniconda module; the conda commands in the following cells rely on it
module load miniconda/22.11.1-1

In [None]:
# Create a new conda environment named "assembly" (one-time setup)
conda create -n assembly
# Environment directory: /home/brooke_sienkiewicz_student_uml_edu/.conda/envs/assembly

In [None]:
# Optional: list all of your conda environments with
#   conda info --env
# Activate the "assembly" environment so later installs/commands run inside it
conda activate assembly

In [None]:
# Installation - run only once, right after creating the "assembly" env.
# megahit is the assembler used in the batch scripts below; quast is installed
# for the metaquast runs in the Quality Check section.
conda install -c bioconda megahit
# python=2.7 pins the environment's Python version alongside quast.
# NOTE(review): confirm that the installed quast version still requires Python 2.7.
conda install -c bioconda quast python=2.7

### MCAV

#### MCAV - healthy, 2019

In [None]:
# Using trimmed, qc seqs from redo_auto_detect_01312024 folder
# 1)remove host from sample reads
# 2)concatenate all f and r seqs into single file (1 for f, 1 for r)
# 3) ASSEMBLE reads into contigs (contiguous sequences: overlapping reads are joined into longer, gap-free stretches, so larger portions of each genome - if not the whole genome - end up together in one sequence)
# 4)remove ITS2 seqs from assembled contigs (& remove adapters) ... should try to perform on raw reads so we end up with just one final contig file 

# can definitely combine these into 1 or 2 batch scripts

In [None]:
#!/bin/bash
#SBATCH -c 24  # Number of Cores per Task
#SBATCH --mem=180G  # Requested Memory
#SBATCH -p cpu-long  # Partition
#SBATCH -t 24:00:00  # Job time limit
#SBATCH --mail-type=ALL
#SBATCH -o /project/pi_sarah_gignouxwolfsohn_uml_edu/brooke/mcav/assembly/healthy_2019_mcav/slurm-removal-%j.out  # %j = job ID

# Step 1 of the assembly workflow: host-read removal (healthy 2019 MCAV samples).
# For every sample ID listed in SAMPLE_LIST, the trimmed read pairs are mapped
# against the host reference genome with bowtie2 and only the pairs in which
# NEITHER mate aligned are kept. The surviving pairs are written to
#   $WORKINGPATH/<sample>_host_removed_R1.fastq
#   $WORKINGPATH/<sample>_host_removed_R2.fastq
# which is the directory the concatenation/assembly script reads from.

module load miniconda/22.11.1-1
conda activate anvio-8

# Stop at the first failing command so a broken sample cannot silently yield
# truncated or missing output files. Enabled after the environment is active.
set -eo pipefail

# 1) remove host from sample reads
# Host seq removal - based on Thijs's script https://github.com/ThijsSt/SCTLD-metagenomes/blob/main/Quality_control_metagenomes.ipynb

# set parameters:
SAMPLENAME="healthy_2019_mcav"
GENOME="mcav"
READSPATH=/project/pi_sarah_gignouxwolfsohn_uml_edu/brooke/mcav/trimmed/redo_auto_detect_01312024
INDEX="${GENOME}_DB"
INPUTPATH="/project/pi_sarah_gignouxwolfsohn_uml_edu/Reference_genomes/Mcav_genome"
WORKINGPATH='/project/pi_sarah_gignouxwolfsohn_uml_edu/brooke/mcav/assembly/healthy_2019_mcav/host_removed'
# Text file with one sample ID per line (absolute path, so the job no longer
# depends on the directory it was submitted from).
SAMPLE_LIST=/project/pi_sarah_gignouxwolfsohn_uml_edu/brooke/mcav/assembly/healthy_2019_mcav/healthy_2019_MCAV_sampleids
# Use the cores SLURM actually allocated (-c above); fall back to 8 outside SLURM.
THREADS="${SLURM_CPUS_PER_TASK:-8}"

if [[ ! -r "$SAMPLE_LIST" ]]; then
  echo "Sample ID list not found or unreadable: $SAMPLE_LIST" >&2
  exit 1
fi

# The output directory must exist before bowtie2/samtools write into it.
mkdir -p "$WORKINGPATH"

echo "Removing host reads for $SAMPLENAME (index: $INPUTPATH/$INDEX, threads: $THREADS)"

# build a bowtie2 index from a known genome
# The index is shared between runs, so only build it when it is missing.
# Delete the existing *.bt2 / *.bt2l files to force a rebuild.
if [[ ! -e "$INPUTPATH/${INDEX}.rev.2.bt2" && ! -e "$INPUTPATH/${INDEX}.rev.2.bt2l" ]]; then
  bowtie2-build "$INPUTPATH/Mcavernosa_July2018.fasta" "$INPUTPATH/$INDEX"
fi

# loop through samples (the "|| [[ -n ... ]]" also handles a last line without newline)
while IFS= read -r SAMPLEID || [[ -n "$SAMPLEID" ]]; do
  [[ -n "$SAMPLEID" ]] || continue  # skip blank lines

  PREFIX="$WORKINGPATH/$SAMPLEID"

  # re-align reads back to the index
  bowtie2 -p "$THREADS" -x "$INPUTPATH/$INDEX" \
    -1 "$READSPATH/${SAMPLEID}_R1_001_val_1.fq" \
    -2 "$READSPATH/${SAMPLEID}_R2_001_val_2.fq" \
    -S "${PREFIX}_mapped_and_unmapped.sam"

  # convert sam file from bowtie to a bam file for processing
  samtools view -b -o "${PREFIX}_mapped_and_unmapped.bam" "${PREFIX}_mapped_and_unmapped.sam"

  # extract only the read pairs of which both mates do not match the host genome
  #   -f 12  : require SAM flag bits 4 (read unmapped) AND 8 (mate unmapped)
  #   -F 256 : exclude records with bit 256 set (secondary alignments)
  samtools view -b -f 12 -F 256 -o "${PREFIX}_bothReadsUnmapped.bam" "${PREFIX}_mapped_and_unmapped.bam"

  # sort by read name so both mates are adjacent, then extract them back as .fastq files
  samtools sort -n -m 5G -@ 2 "${PREFIX}_bothReadsUnmapped.bam" -o "${PREFIX}_bothReadsUnmapped_sorted.bam"
  # Outputs go to $WORKINGPATH (not the current directory) because the assembly
  # step expects them there. -0/-s discard unpaired/singleton reads; -n keeps the
  # read names unchanged (no /1 and /2 suffixes).
  samtools fastq -@ "$THREADS" "${PREFIX}_bothReadsUnmapped_sorted.bam" \
    -1 "${PREFIX}_host_removed_R1.fastq" \
    -2 "${PREFIX}_host_removed_R2.fastq" \
    -0 /dev/null -s /dev/null -n

done < "$SAMPLE_LIST"

# JOB-ID: 19358781
# bash script file name: brooke/mcav/assembly/healthy_2019_mcav/removal

In [None]:
#!/bin/bash
#SBATCH -c 24  # Number of Cores per Task
#SBATCH --mem=180G  # Requested Memory
#SBATCH -p cpu-long  # Partition
#SBATCH -t 56:00:00  # Job time limit
#SBATCH --mail-type=ALL
#SBATCH -o /project/pi_sarah_gignouxwolfsohn_uml_edu/brooke/mcav/assembly/healthy_2019_mcav/slurm-assembly-%j.out  # %j = job ID

# Steps 2 and 3 of the assembly workflow (healthy 2019 MCAV samples):
#   2) concatenate the host-removed reads of all samples into one R1 and one R2 file
#   3) co-assemble the concatenated reads into contigs with MEGAHIT

module load miniconda/22.11.1-1
conda activate assembly

# Stop at the first failing command (e.g. a failed concatenation) instead of
# assembling incomplete input. Enabled after the environment is active.
set -eo pipefail

# 2)concatenate all f and r seqs into single file (1 for f, 1 for r)

READSPATH='/project/pi_sarah_gignouxwolfsohn_uml_edu/brooke/mcav/assembly/healthy_2019_mcav/host_removed'
SAMPLENAME="healthy_2019_mcav"
OUTDIR=/project/pi_sarah_gignouxwolfsohn_uml_edu/brooke/mcav/assembly/healthy_2019_mcav
SAMPLE_LIST="$OUTDIR/healthy_2019_MCAV_sampleids"
R1_ALL="$OUTDIR/${SAMPLENAME}_reads_R1_ALL.fastq"
R2_ALL="$OUTDIR/${SAMPLENAME}_reads_R2_ALL.fastq"

if [[ ! -r "$SAMPLE_LIST" ]]; then
  echo "Sample ID list not found or unreadable: $SAMPLE_LIST" >&2
  exit 1
fi

# Start from empty files: the loop appends, so leftovers from an earlier run
# would otherwise be duplicated in the assembly input.
: > "$R1_ALL"
: > "$R2_ALL"

# Read the sample IDs from the file
while IFS= read -r SAMPLEID || [[ -n "$SAMPLEID" ]]; do
  [[ -n "$SAMPLEID" ]] || continue  # skip blank lines

  # Construct the file paths for forward and reverse reads
  FORWARD_READ="$READSPATH/${SAMPLEID}_host_removed_R1.fastq"
  REVERSE_READ="$READSPATH/${SAMPLEID}_host_removed_R2.fastq"

  # Only add a sample when BOTH mates exist; appending just one of them would
  # shift the pairing between the R1 and R2 files for every later read.
  if [[ -f "$FORWARD_READ" && -f "$REVERSE_READ" ]]; then
    cat "$FORWARD_READ" >> "$R1_ALL"
    cat "$REVERSE_READ" >> "$R2_ALL"
  else
    [[ -f "$FORWARD_READ" ]] || echo "Forward read file not found for sample $SAMPLEID" >&2
    [[ -f "$REVERSE_READ" ]] || echo "Reverse read file not found for sample $SAMPLEID" >&2
    echo "Skipping sample $SAMPLEID (incomplete read pair)" >&2
  fi
done < "$SAMPLE_LIST"

if [[ ! -s "$R1_ALL" || ! -s "$R2_ALL" ]]; then
  echo "No reads were concatenated; nothing to assemble" >&2
  exit 1
fi

# 3)ASSEMBLE reads into contigs (contiguous sequences - reads are joined based on read overlap, without gaps)
# The output directory is given as an absolute path because the ITS2-filtering
# step reads the contigs from $OUTDIR/megahit_host_removed.
# megahit has to create the output directory itself; it will fail if it already exists.
# To resume an interrupted run, add --continue to the command below.
megahit --presets meta-large \
  -1 "$R1_ALL" \
  -2 "$R2_ALL" \
  --keep-tmp-files \
  -o "$OUTDIR/megahit_host_removed" --out-prefix "$SAMPLENAME"

# try metavelvet next?

# JOB-ID: 19388626
# bash script file name: $OUTDIR/assembly

Total time elapsed: 46 hrs 
looks like 190 GB is enough... and 24 CPU - 90% efficiency on last part of run 

In [None]:
# TODO: rename the concatenated sequence files so renaming happens in one step, instead of renaming every individual file
# (qc step it adds "val_1/2" to each seq file)

In [None]:
#!/bin/bash
#SBATCH -c 24  # Number of Cores per Task
#SBATCH --mem=180G  # Requested Memory
#SBATCH -p cpu-long  # Partition
#SBATCH -t 24:00:00  # Job time limit
#SBATCH --mail-type=ALL
#SBATCH -o /project/pi_sarah_gignouxwolfsohn_uml_edu/brooke/mcav/assembly/healthy_2019_mcav/slurm-%j.out  # %j = job ID

# Step 4 of the assembly workflow (healthy 2019 MCAV samples):
# discard every assembled contig in which cutadapt finds one of the two ITS2
# primer sequences, and write the remaining contigs to
#   $OUTPUTDIR/${SAMPLENAME}_filtered.contigs.fasta

#load modules
module load miniconda/22.11.1-1
conda activate cutadaptenv

# Stop at the first failing command. Enabled after the environment is active.
set -eo pipefail

# 4)remove ITS2 seqs from assembled contigs (& remove adapters)

# Set your input and output files
SAMPLENAME="healthy_2019_mcav"
INPUTDIR="/project/pi_sarah_gignouxwolfsohn_uml_edu/brooke/mcav/assembly/healthy_2019_mcav/megahit_host_removed"
OUTPUTDIR="/project/pi_sarah_gignouxwolfsohn_uml_edu/brooke/mcav/assembly/healthy_2019_mcav"

#READSPATH='/project/pi_sarah_gignouxwolfsohn_uml_edu/brooke/mcav/trimmed/redo_auto_detect_01312024'

input_fasta="$INPUTDIR/${SAMPLENAME}.contigs.fa"
# The earlier -o argument was written as "{$SAMPLENAME}_filtered": with the "$"
# inside the braces the braces stay literal, giving "{healthy_2019_mcav}_filtered".
# The correct expansion syntax is ${SAMPLENAME}.
output_fasta="$OUTPUTDIR/${SAMPLENAME}_filtered.contigs.fasta"

# Set your primer sequences
forward_primer="GAATTGCAGAACTCCGTGAACC"
reverse_primer="CGGGTTCWCTTGTYTGACTTCATGC"

# Verify path and input - only needed for troubleshooting
echo "Working Directory: $(pwd)"
#ls -l "$INPUTDIR"

if [[ ! -s "$input_fasta" ]]; then
  echo "Input contigs not found or empty: $input_fasta" >&2
  exit 1
fi

# Run cutadapt
#   -g : forward primer searched as a 5' adapter
#   -a : reverse primer searched as a 3' adapter
#   --discard-trimmed : drop every contig in which either primer was found
# NOTE(review): the cutadapt summary recorded later in this notebook reports
# "Minimum overlap: 3", and most of its matches are only 3-5 bases long, so
# contigs may be discarded for chance matches. Consider a larger --overlap;
# confirm before changing results.
cutadapt \
  -g "$forward_primer" \
  -a "$reverse_primer" \
  --discard-trimmed \
  -o "$output_fasta" \
  "$input_fasta"

# check results dir to see if it was successful in creating output file
ls -l "$OUTPUTDIR"

# JOB-ID: 19420673
# bash script file name: brooke/mcav/assembly/healthy_2019_mcav/ITS2_trim


#final assembled contigs: healthy_2019_mcav_filtered.contigs.fasta in ~/brooke/mcav/assembly/healthy_2019_mcav

#### MCAV - diseased

In [None]:
#!/bin/bash
#SBATCH -c 24  # Number of Cores per Task
#SBATCH --mem=180G  # Requested Memory
#SBATCH -p cpu-long  # Partition
#SBATCH -t 24:00:00  # Job time limit
#SBATCH --mail-type=ALL
#SBATCH -o /project/pi_sarah_gignouxwolfsohn_uml_edu/brooke/mcav/diseased_mcav/assembly/slurm-removal-%j.out  # %j = job ID

# Step 1 of the assembly workflow: host-read removal (diseased MCAV samples).
# For every sample ID listed in SAMPLE_LIST, the trimmed read pairs are mapped
# against the host reference genome with bowtie2 and only the pairs in which
# NEITHER mate aligned are kept. The surviving pairs are written to
#   $WORKINGPATH/<sample>_host_removed_R1.fastq
#   $WORKINGPATH/<sample>_host_removed_R2.fastq
# which is the directory the concatenation/assembly script reads from.

module load miniconda/22.11.1-1
conda activate anvio-8

# Stop at the first failing command so a broken sample cannot silently yield
# truncated or missing output files. Enabled after the environment is active.
set -eo pipefail

# 1) remove host from sample reads
# Host seq removal - based on Thijs's script https://github.com/ThijsSt/SCTLD-metagenomes/blob/main/Quality_control_metagenomes.ipynb

# set parameters:
SAMPLENAME="diseased_mcav"
GENOME="mcav"
READSPATH=/project/pi_sarah_gignouxwolfsohn_uml_edu/brooke/mcav/trimmed/redo_auto_detect_01312024
INDEX="${GENOME}_DB"
INPUTPATH="/project/pi_sarah_gignouxwolfsohn_uml_edu/Reference_genomes/Mcav_genome"
WORKINGPATH='/project/pi_sarah_gignouxwolfsohn_uml_edu/brooke/mcav/diseased_mcav/assembly/host_removed'
# Text file with one sample ID per line. Relative path: submit the job from the
# directory that contains this file.
SAMPLE_LIST="diseased_MCAV"
# Use the cores SLURM actually allocated (-c above); fall back to 8 outside SLURM.
THREADS="${SLURM_CPUS_PER_TASK:-8}"

if [[ ! -r "$SAMPLE_LIST" ]]; then
  echo "Sample ID list not found or unreadable: $SAMPLE_LIST (cwd: $(pwd))" >&2
  exit 1
fi

# -p: also succeeds when the directory already exists (e.g. on a rerun)
mkdir -p "$WORKINGPATH"

echo "Removing host reads for $SAMPLENAME (index: $INPUTPATH/$INDEX, threads: $THREADS)"

# build a bowtie2 index from a known genome
# The index is shared between runs, so only build it when it is missing.
# Delete the existing *.bt2 / *.bt2l files to force a rebuild.
if [[ ! -e "$INPUTPATH/${INDEX}.rev.2.bt2" && ! -e "$INPUTPATH/${INDEX}.rev.2.bt2l" ]]; then
  bowtie2-build "$INPUTPATH/Mcavernosa_July2018.fasta" "$INPUTPATH/$INDEX"
fi

# loop through samples (the "|| [[ -n ... ]]" also handles a last line without newline)
while IFS= read -r SAMPLEID || [[ -n "$SAMPLEID" ]]; do
  [[ -n "$SAMPLEID" ]] || continue  # skip blank lines

  PREFIX="$WORKINGPATH/$SAMPLEID"

  # re-align reads back to the index
  bowtie2 -p "$THREADS" -x "$INPUTPATH/$INDEX" \
    -1 "$READSPATH/${SAMPLEID}_R1_001_val_1.fq" \
    -2 "$READSPATH/${SAMPLEID}_R2_001_val_2.fq" \
    -S "${PREFIX}_mapped_and_unmapped.sam"

  # convert sam file from bowtie to a bam file for processing
  samtools view -b -o "${PREFIX}_mapped_and_unmapped.bam" "${PREFIX}_mapped_and_unmapped.sam"

  # extract only the read pairs of which both mates do not match the host genome
  #   -f 12  : require SAM flag bits 4 (read unmapped) AND 8 (mate unmapped)
  #   -F 256 : exclude records with bit 256 set (secondary alignments)
  samtools view -b -f 12 -F 256 -o "${PREFIX}_bothReadsUnmapped.bam" "${PREFIX}_mapped_and_unmapped.bam"

  # sort by read name so both mates are adjacent, then extract them back as .fastq files
  samtools sort -n -m 5G -@ 2 "${PREFIX}_bothReadsUnmapped.bam" -o "${PREFIX}_bothReadsUnmapped_sorted.bam"
  # Outputs go to $WORKINGPATH (not the current directory) because the assembly
  # step expects them there. -0/-s discard unpaired/singleton reads; -n keeps the
  # read names unchanged (no /1 and /2 suffixes).
  samtools fastq -@ "$THREADS" "${PREFIX}_bothReadsUnmapped_sorted.bam" \
    -1 "${PREFIX}_host_removed_R1.fastq" \
    -2 "${PREFIX}_host_removed_R2.fastq" \
    -0 /dev/null -s /dev/null -n

done < "$SAMPLE_LIST"

# JOB-ID: 21639875
# bash script file name: brooke/mcav/diseased_mcav/assembly/host_removal

In [None]:
#!/bin/bash
#SBATCH -c 24  # Number of Cores per Task
#SBATCH --mem=180G  # Requested Memory
#SBATCH -p cpu-long  # Partition
#SBATCH -t 56:00:00  # Job time limit
#SBATCH --mail-type=ALL
#SBATCH -o /project/pi_sarah_gignouxwolfsohn_uml_edu/brooke/mcav/diseased_mcav/assembly/slurm-assembly-%j.out  # %j = job ID

# Steps 2-4 of the assembly workflow (diseased MCAV samples):
#   2) concatenate the host-removed reads of all samples into one R1 and one R2 file
#   3) co-assemble the concatenated reads into contigs with MEGAHIT
#   4) discard contigs in which cutadapt finds one of the ITS2 primer sequences

module load miniconda/22.11.1-1
conda activate assembly

# Stop at the first failing command so later steps never run on incomplete
# input. Enabled after the environment is active.
set -eo pipefail

# 2)concatenate all f and r seqs into single file (1 for f, 1 for r)

READSPATH='/project/pi_sarah_gignouxwolfsohn_uml_edu/brooke/mcav/diseased_mcav/assembly/host_removed'
SAMPLENAME="diseased_mcav"
OUTDIR='/project/pi_sarah_gignouxwolfsohn_uml_edu/brooke/mcav/diseased_mcav/assembly'
# Text file with one sample ID per line. Relative path: submit the job from the
# directory that contains this file.
SAMPLE_LIST="diseased_MCAV"
R1_ALL="$OUTDIR/${SAMPLENAME}_reads_R1_ALL.fastq"
R2_ALL="$OUTDIR/${SAMPLENAME}_reads_R2_ALL.fastq"

if [[ ! -r "$SAMPLE_LIST" ]]; then
  echo "Sample ID list not found or unreadable: $SAMPLE_LIST (cwd: $(pwd))" >&2
  exit 1
fi

# Start from empty files: the loop appends, so leftovers from an earlier run
# would otherwise be duplicated in the assembly input.
: > "$R1_ALL"
: > "$R2_ALL"

# Read the sample IDs from the file
while IFS= read -r SAMPLEID || [[ -n "$SAMPLEID" ]]; do
  [[ -n "$SAMPLEID" ]] || continue  # skip blank lines

  # Construct the file paths for forward and reverse reads
  FORWARD_READ="$READSPATH/${SAMPLEID}_host_removed_R1.fastq"
  REVERSE_READ="$READSPATH/${SAMPLEID}_host_removed_R2.fastq"

  # Only add a sample when BOTH mates exist; appending just one of them would
  # shift the pairing between the R1 and R2 files for every later read.
  if [[ -f "$FORWARD_READ" && -f "$REVERSE_READ" ]]; then
    cat "$FORWARD_READ" >> "$R1_ALL"
    cat "$REVERSE_READ" >> "$R2_ALL"
  else
    [[ -f "$FORWARD_READ" ]] || echo "Forward read file not found for sample $SAMPLEID" >&2
    [[ -f "$REVERSE_READ" ]] || echo "Reverse read file not found for sample $SAMPLEID" >&2
    echo "Skipping sample $SAMPLEID (incomplete read pair)" >&2
  fi
done < "$SAMPLE_LIST"

if [[ ! -s "$R1_ALL" || ! -s "$R2_ALL" ]]; then
  echo "No reads were concatenated; nothing to assemble" >&2
  exit 1
fi

# 3)ASSEMBLE reads into contigs (contiguous sequences - reads are joined based on read overlap, without gaps)
# megahit has to create the output directory itself; it will fail if it already exists.
# To resume an interrupted run, add --continue to the command below.
megahit --presets meta-large \
  -1 "$R1_ALL" \
  -2 "$R2_ALL" \
  --keep-tmp-files \
  -o "$OUTDIR/contigs" --out-prefix "$SAMPLENAME"


# 4)remove ITS2 seqs from assembled contigs (& remove adapters)
# "conda deactivate" takes no environment name; the previous
# "conda deactivate assembly" form is rejected by conda.
conda deactivate
conda activate cutadaptenv

# Set your input and output files
INPUTDIR="/project/pi_sarah_gignouxwolfsohn_uml_edu/brooke/mcav/diseased_mcav/assembly/contigs"
OUTPUTDIR="/project/pi_sarah_gignouxwolfsohn_uml_edu/brooke/mcav/diseased_mcav/assembly"

input_fasta="$INPUTDIR/${SAMPLENAME}.contigs.fa"
# The earlier -o argument was written as "{$SAMPLENAME}_filtered": with the "$"
# inside the braces the braces stay literal, giving "{diseased_mcav}_filtered".
# The correct expansion syntax is ${SAMPLENAME}.
output_fasta="$OUTPUTDIR/${SAMPLENAME}_filtered.contigs.fasta"

# Set your primer sequences
forward_primer="GAATTGCAGAACTCCGTGAACC"
reverse_primer="CGGGTTCWCTTGTYTGACTTCATGC"

# Verify path and input - only needed for troubleshooting
echo "Working Directory: $(pwd)"
#ls -l "$INPUTDIR"

if [[ ! -s "$input_fasta" ]]; then
  echo "Input contigs not found or empty: $input_fasta" >&2
  exit 1
fi

# Run cutadapt
#   -g : forward primer searched as a 5' adapter
#   -a : reverse primer searched as a 3' adapter
#   --discard-trimmed : drop every contig in which either primer was found
# NOTE(review): the cutadapt summary recorded below in this notebook reports
# "Minimum overlap: 3", and most of its matches are only 3-5 bases long, so
# contigs may be discarded for chance matches. Consider a larger --overlap;
# confirm before changing results.
cutadapt \
  -g "$forward_primer" \
  -a "$reverse_primer" \
  --discard-trimmed \
  -o "$output_fasta" \
  "$input_fasta"

# check results dir to see if it was successful in creating output file
ls -l "$OUTPUTDIR"


# JOB-ID: 21644151
# bash script file name: brooke/mcav/diseased_mcav/assembly/assembly


#cutadapt script failed so redoing in own bash script:
# JOB-ID: 21680100
# bash script file name: brooke/mcav/diseased_mcav/assembly/ITS2_filtering

#final assembled contigs: diseased_mcav_filtered.contigs.fasta in ~/brooke/mcav/diseased_mcav/assembly/diseased_mcav_filtered.contigs.fasta


=== Summary ===

Total reads processed:               2,122,290
Reads with adapters:                    63,973 (3.0%)

== Read fate breakdown ==
Reads discarded as trimmed:             63,973 (3.0%)
Reads written (passing filters):     2,058,317 (97.0%)

Total basepairs processed: 1,821,385,837 bp
Total written (filtered):  1,769,144,061 bp (97.1%)

=== Adapter 1 ===

Sequence: GAATTGCAGAACTCCGTGAACC; Type: regular 5'; Length: 22; Trimmed: 39801 times

Minimum overlap: 3
No. of allowed errors:
1-9 bp: 0; 10-19 bp: 1; 20-22 bp: 2

Overview of removed sequences
length	count	expect	max.err	error counts
3	27460	33160.8	0	27460
4	9697	8290.2	0	9697
5	1947	2072.5	0	1947
6	469	518.1	0	469
7	106	129.5	0	106
8	23	32.4	0	23
9	30	8.1	0	3 27
10	43	2.0	1	0 43
11	12	0.5	1	0 12
12	4	0.1	1	0 4
13	2	0.0	1	0 2
14	1	0.0	1	0 1
40	2	0.0	2	2
316	1	0.0	2	0 1
443	1	0.0	2	0 0 1
548	1	0.0	2	0 1
580	1	0.0	2	1
727	1	0.0	2	1


=== Adapter 2 ===

Sequence: CGGGTTCWCTTGTYTGACTTCATGC; Type: regular 3'; Length: 25; Trimmed: 24172 times

Minimum overlap: 3
No. of allowed errors:
1-9 bp: 0; 10-19 bp: 1; 20-25 bp: 2

Bases preceding removed adapters:
  A: 22.4%
  C: 23.5%
  G: 24.4%
  T: 29.7%
  none/other: 0.0%

Overview of removed sequences
length	count	expect	max.err	error counts
3	18071	33160.8	0	18071
4	4756	8290.2	0	4756
5	799	2072.5	0	799
6	296	518.1	0	296
7	64	129.5	0	64
8	30	32.4	0	30
9	34	8.1	0	12 22
10	73	2.0	1	2 71
11	25	0.5	1	0 25
12	10	0.1	1	0 10
13	2	0.0	1	0 2
14	2	0.0	1	0 2
15	1	0.0	1	0 1
19	2	0.0	1	0 0 2
74	1	0.0	2	0 1
76	1	0.0	2	1
163	1	0.0	2	1
204	1	0.0	2	1
311	1	0.0	2	0 0 1
337	1	0.0	2	0 0 1
1314	1	0.0	2	0 1

# Quality Check
**Metaquast**
- quality assessment of metagenome assemblies (the assembled contigs); no reference genome is included here

https://quast.sourceforge.net/docs/manual.html#sec1
https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3624806/
..how to interpret quality results?
- check how many large contigs you have (>1000 bp)
- did not map to reference genome.
- right now just helpful to see length and quality of contigs, maybe can reassess after mapping back to metagenome?
Cite metaquast: https://quast.sourceforge.net/publications.html

In [None]:
# Load miniconda and activate the "assembly" environment used for the metaquast runs below
module load miniconda/22.11.1-1
conda activate assembly

In [None]:
# Run metaquast on the filtered healthy-2019 contigs; results are written to ./quast_output.
# Both paths are relative, so run this from the directory that holds the contigs file.
metaquast healthy_2019_mcav_filtered.contigs.fasta -o quast_output
# TODO: find a way to streamline deleting excess files, because we only want the unaligned contig stats (no reference genomes)

In [None]:
# mcav, healthy, 2019 sample assembly (host and ITS2 removal)
# /project/pi_sarah_gignouxwolfsohn_uml_edu/brooke/mcav/healthy_2019_mcav/quast_output/not_aligned/report.txt

All statistics are based on contigs of size >= 500 bp, unless otherwise noted (e.g., "# contigs (>= 0 bp)" and "Total length (>= 0 bp)" include all contigs).

Assembly                    healthy_2019_mcav_filtered.contigs
# contigs (>= 1000 bp)      447572                            
# contigs (>= 5000 bp)      42741                             
# contigs (>= 10000 bp)     9795                              
# contigs (>= 25000 bp)     681                               
# contigs (>= 50000 bp)     46                                
Total length (>= 1000 bp)   1185480790                        
Total length (>= 5000 bp)   373241850                         
Total length (>= 10000 bp)  153130609                         
Total length (>= 25000 bp)  24831349                          
Total length (>= 50000 bp)  5071479                           
# contigs                   1039025                           
Largest contig              437957                            
Total length                1589447609                        
GC (%)                      42.72                             
N50                         2169                              
N75                         987                               
L50                         176118                            
L75                         454223                            
# N's per 100 kbp           0.00      
 

In [None]:
#!/bin/bash
#SBATCH -c 24  # Number of Cores per Task
#SBATCH --mem=180G  # Requested Memory
#SBATCH -p cpu-long  # Partition
#SBATCH -t 56:00:00  # Job time limit
#SBATCH --mail-type=ALL
#SBATCH -o /project/pi_sarah_gignouxwolfsohn_uml_edu/brooke/mcav/diseased_mcav/assembly/slurm-metaquast-%j.out  # %j = job ID

#### MCAV - diseased
# Runs metaquast on the filtered diseased-MCAV contigs and writes the report to
# $ASSEMBLY_DIR/quast_output.
module load miniconda/22.11.1-1
conda activate assembly

# Absolute paths, so the job does not depend on the directory it is submitted from.
ASSEMBLY_DIR=/project/pi_sarah_gignouxwolfsohn_uml_edu/brooke/mcav/diseased_mcav/assembly

# Use the cores SLURM allocated (-c above); fall back to the previous value of
# 12 threads when run outside SLURM.
metaquast "$ASSEMBLY_DIR/diseased_mcav_filtered.contigs.fasta" \
  -t "${SLURM_CPUS_PER_TASK:-12}" \
  -o "$ASSEMBLY_DIR/quast_output"

#JOB ID: 21682141

In [None]:
# Remove the QUAST by-products that are not needed; only the plain contig
# statistics reports are kept.
QUAST_DIR=/project/pi_sarah_gignouxwolfsohn_uml_edu/brooke/mcav/diseased_mcav/assembly/quast_output

# The rm commands use relative paths, so they must only run when the cd
# succeeded; otherwise they would delete from whatever directory is current.
if cd "$QUAST_DIR"; then
  rm -r -- krona_charts quast_downloaded_references runs_per_reference icarus_viewers summary/TEX summary/TSV combined_reference/icarus_viewers combined_reference/aligned_stats
  rm -- combined_reference/*.tex combined_reference/*.tsv
  rm -- not_aligned/*.tex not_aligned/*.tsv
else
  echo "Cannot cd to $QUAST_DIR - nothing was deleted" >&2
fi

In [None]:
# mcav diseased sample assembly (host and ITS2 removal)
# /project/pi_sarah_gignouxwolfsohn_uml_edu/brooke/mcav/diseased_mcav/assembly/quast_output/not_aligned/report.txt

All statistics are based on contigs of size >= 500 bp, unless otherwise noted (e.g., "# contigs (>= 0 bp)" and "Total length (>= 0 bp)" include all contigs).

Assembly                    diseased_mcav_filtered.contigs
# contigs (>= 1000 bp)      386921                        
# contigs (>= 5000 bp)      43604                         
# contigs (>= 10000 bp)     9279                          
# contigs (>= 25000 bp)     557                           
# contigs (>= 50000 bp)     10                            
Total length (>= 1000 bp)   1089301896                    
Total length (>= 5000 bp)   369050266                     
Total length (>= 10000 bp)  139814218                     
Total length (>= 25000 bp)  17768633                      
Total length (>= 50000 bp)  597200                        
# contigs                   787579                        
Largest contig              76539                         
Total length                1363359090                    
GC (%)                      42.38                         
N50                         2620                          
N75                         1197                          
L50                         132007                        
L75                         325806                        
# N's per 100 kbp           0.00         

#smaller sample size - looks comparable to healthy mcav 