# Assembly

**Megahit**
https://www.metagenomics.wiki/tools/assembly/megahit
- de novo assembly (w/o reference genome)
- aligns/assembles short reads together to reconstruct one 'metagenome'
- assembled contigs are stored in fasta file

### Installation

In [None]:
# Load the cluster's Miniconda module (version 22.11.1-1); the conda commands in the following cells are run after this.
module load miniconda/22.11.1-1

In [None]:
# One-time setup: create a conda environment named "assembly".
conda create -n assembly
# Environment location noted by the author:
#dir=/home/brooke_sienkiewicz_student_uml_edu/.conda/envs/assembly

In [None]:
# Optional check: uncomment to list all of your conda environments.
#conda info --env
# Switch the current shell into the "assembly" environment.
conda activate assembly

In [None]:
# Installation - only needed once, right after creating the "assembly" environment.
# Installs MEGAHIT from the bioconda channel.
conda install -c bioconda megahit
# Installs QUAST from bioconda and requests Python 2.7 in the same environment.
conda install -c bioconda quast python=2.7

### MCAV

#### MCAV - healthy, 2019

In [None]:
# Using trimmed, qc seqs from redo_auto_detect_01312024 folder

In [None]:
#!/bin/bash
#SBATCH -c 24  # Number of Cores per Task
#SBATCH --mem=180G  # Requested Memory
#SBATCH -p cpu-long  # Partition
#SBATCH -t 56:00:00  # Job time limit
#SBATCH --mail-type=ALL
#SBATCH -o /project/pi_sarah_gignouxwolfsohn_uml_edu/brooke/mcav/assembly/healthy_2019_mcav/slurm-assembly-%j.out  # %j = job ID

# Co-assembly of the healthy 2019 MCAV samples with MEGAHIT.
#   1. Concatenate the trimmed forward/reverse reads of every sample listed in
#      $OUTDIR/healthy_2019_MCAV_sampleids (one sample ID per line) into a
#      single R1 file and a single R2 file.
#   2. Assemble the pooled reads with the meta-large preset.
# Output goes to $OUTDIR/megahit; the next step of this workflow reads
# $OUTDIR/megahit/$SAMPLENAME.contigs.fa from there.

module load miniconda/22.11.1-1
conda activate assembly

READSPATH=/project/pi_sarah_gignouxwolfsohn_uml_edu/brooke/mcav/trimmed/redo_auto_detect_01312024
SAMPLENAME="healthy_2019_mcav"
OUTDIR=/project/pi_sarah_gignouxwolfsohn_uml_edu/brooke/mcav/assembly/healthy_2019_mcav

SAMPLE_IDS="$OUTDIR/healthy_2019_MCAV_sampleids"
R1_ALL="$OUTDIR/${SAMPLENAME}_reads_R1_ALL.fastq"
R2_ALL="$OUTDIR/${SAMPLENAME}_reads_R2_ALL.fastq"

# Stop before touching anything if the sample list is missing.
if [[ ! -r "$SAMPLE_IDS" ]]; then
    echo "Sample ID file not found or unreadable: $SAMPLE_IDS" >&2
    exit 1
fi

# Start from empty files: the loop appends, so leftovers from an earlier
# submission would otherwise put every read into the assembly twice.
: > "$R1_ALL"
: > "$R2_ALL"

# Read the sample IDs from the file (the "|| [[ -n ... ]]" part keeps the last
# line even when the file has no trailing newline).
while IFS= read -r SAMPLEID || [[ -n "$SAMPLEID" ]]; do
    # Ignore blank lines in the ID list.
    [[ -n "$SAMPLEID" ]] || continue

    # Construct the file paths for forward and reverse reads
    FORWARD_READ="$READSPATH/${SAMPLEID}_R1_001_val_1.fq"
    REVERSE_READ="$READSPATH/${SAMPLEID}_R2_001_val_2.fq"

    # Both mates must be present; adding only one of them would leave the
    # pooled R1 and R2 files with different read counts/order.
    missing=0
    if [[ ! -e "$FORWARD_READ" ]]; then
        echo "Forward read file not found for sample $SAMPLEID" >&2
        missing=1
    fi
    if [[ ! -e "$REVERSE_READ" ]]; then
        echo "Reverse read file not found for sample $SAMPLEID" >&2
        missing=1
    fi
    if (( missing )); then
        continue
    fi

    cat "$FORWARD_READ" >> "$R1_ALL"
    cat "$REVERSE_READ" >> "$R2_ALL"
done < "$SAMPLE_IDS"

# -o is given as an absolute path so the results do not depend on the
# directory the job was submitted from.
# -t follows the SLURM allocation (falls back to 24, the value requested above).
# To resume an interrupted run, add --continue as an extra option while keeping
# the same -o directory (NOTE(review): confirm against the MEGAHIT usage text).
megahit --presets meta-large \
    -1 "$R1_ALL" \
    -2 "$R2_ALL" \
    -t "${SLURM_CPUS_PER_TASK:-24}" \
    --keep-tmp-files \
    -o "$OUTDIR/megahit" --out-prefix "$SAMPLENAME"

# try metavelvet next?

# JOB-ID: 19352098
# bash script file name: $OUTDIR/assembly

# Notes from that run:
#   - Output contigs were hard to locate because -o was the relative path
#     "megahit" (relative to the directory the job ran in).
#   - Total time elapsed: 46 hrs
#   - looks like 190 GB is enough... and 24 CPU - 90% efficiency on last part of run

In [None]:
# TODO: Rename the concatenated read files, so the renaming is done in one step instead of renaming every individual file
# (the QC step appends "val_1"/"val_2" to the name of each sequence file)

In [None]:
# Host seq removal - adapted from Thijs's script:
# https://github.com/ThijsSt/SCTLD-metagenomes/blob/main/Quality_control_metagenomes.ipynb
#
# Template: maps each sample's paired reads against a host genome index and
# keeps only the pairs in which neither mate aligned.

#set parameters (fill these in before running):
SAMPLENAME=
GENOME=
READSPATH=/path/to/reads/
WORKDIR=../data/working
INDEX="$GENOME"_DB
# Replace the placeholder names with the real sample IDs.
SAMPLES=(sample1 sample2 sample3 sample4)

#this is how you build a bowtie2 index from a known genome:
#split out: bowtie2-build path/to/input index-name
# The index is written into $WORKDIR because that is where bowtie2 -x looks for it below.
bowtie2-build "$WORKDIR/M.cavernosa.fasta" "$WORKDIR/$INDEX"

for f in "${SAMPLES[@]}"
do
    #this re-aligns your reads back to the index
    bowtie2 -p 8 -x "$WORKDIR/$INDEX" \
        -1 "$READSPATH"/"$f"_R1_001_val_1.fq \
        -2 "$READSPATH"/"$f"_R2_001_val_2.fq \
        -S "$WORKDIR/${f}_mapped_and_unmapped.sam"

    #this converts the sam file from bowtie to a bam file for processing
    samtools view -bS "$WORKDIR/${f}_mapped_and_unmapped.sam" > "$WORKDIR/${f}_mapped_and_unmapped.bam"

    #this extracts only the reads of which both do not match against the host genome
    samtools view -b -f 12 -F 256 "$WORKDIR/${f}_mapped_and_unmapped.bam" > "$WORKDIR/${f}_bothReadsUnmapped.bam"

    #this sorts the file so both mates are together and then extracts them back as .fastq files
    # (the fastq files are written to the current directory)
    samtools sort -n -m 5G -@ 2 "$WORKDIR/${f}_bothReadsUnmapped.bam" -o "$WORKDIR/${f}_bothReadsUnmapped_sorted.bam"
    samtools fastq -@ 8 "$WORKDIR/${f}_bothReadsUnmapped_sorted.bam" \
        -1 "$f"_host_removed_R1.fastq \
        -2 "$f"_host_removed_R2.fastq \
        -0 /dev/null -s /dev/null -n
done

In [None]:
#!/bin/bash
#SBATCH -c 24  # Number of Cores per Task
#SBATCH --mem=180G  # Requested Memory
#SBATCH -p cpu-long  # Partition
#SBATCH -t 24:00:00  # Job time limit
#SBATCH --mail-type=ALL
#SBATCH -o /project/pi_sarah_gignouxwolfsohn_uml_edu/brooke/bash_scripts/slurm-%j.out  # %j = job ID

# Runs cutadapt on the MEGAHIT contigs with the two primer sequences set below
# and --discard-trimmed, writing the remaining contigs to
# $OUTPUTDIR/${SAMPLENAME}_symb.contigs.fasta.

# Set your input and output files
SAMPLENAME="healthy_2019_mcav"
INPUTDIR="/project/pi_sarah_gignouxwolfsohn_uml_edu/brooke/mcav/assembly/healthy_2019_mcav/megahit"
OUTPUTDIR="/project/pi_sarah_gignouxwolfsohn_uml_edu/brooke/mcav/assembly/healthy_2019_mcav/symb_removed"

# Not referenced by this script; kept for parity with the other job scripts.
READSPATH=/project/pi_sarah_gignouxwolfsohn_uml_edu/brooke/mcav/trimmed/redo_auto_detect_01312024

# The braces are required: without them bash reads "$SAMPLENAME_symb" as one
# (unset) variable name and the output name collapses to ".contigs.fasta".
input_fasta="${SAMPLENAME}.contigs.fa"
output_fasta="${SAMPLENAME}_symb.contigs.fasta"

#load modules
module load miniconda/22.11.1-1
conda activate cutadaptenv

# Set your primer sequences
forward_primer="GAATTGCAGAACTCCGTGAACC"
reverse_primer="CGGGTTCWCTTGTYTGACTTCATGC"

# Verify path and input dir - only need for troubleshooting
echo "Working Directory: $(pwd)"
#ls -l "$INPUTDIR"

# Make sure the output directory exists before cutadapt writes into it.
mkdir -p "$OUTPUTDIR"

# Run cutadapt
cutadapt \
  -g "$forward_primer" \
  -a "$reverse_primer" \
  --discard-trimmed \
  -o "$OUTPUTDIR/$output_fasta" \
  "$INPUTDIR/$input_fasta"

#check results dir to see if it was successful in creating output file
ls -l "$OUTPUTDIR"

# JOB-ID:
# bash script file name: brooke/mcav/assembly/healthy_2019_mcav/ITS2_trim
