**More QC & Taxonomic Profiling**
(steps rearranged a bit for now)
- running all PSTR together 

- this follows part of 1Assembly steps and 4Taxonomy
    - host coral, human, and symbiont removal 
    - kraken and bracken read recruitment (taxonomic abundances)
    - will NOT be assembling for now (skipping megahit step)

**See notes on QC in CBC_metagenomics/0QC_trimming-112023 nb**

In [None]:
#!/bin/bash
#SBATCH -c 24  # Number of Cores per Task
#SBATCH --mem=180G  # Requested Memory
#SBATCH -p cpu-long  # Partition
#SBATCH -t 56:00:00  # Job time limit
#SBATCH --mail-type=ALL
#SBATCH -o /work/pi_sarah_gignouxwolfsohn_uml_edu/brooke/pstr/slurm-removaltax-pstr-%j.out  # %j = job ID

# SLURM batch script for the pstr samples: host removal, fastq_screen
# filtering, read concatenation, then kraken2/bracken taxonomic profiling.
# The #SBATCH lines above must stay directly under the shebang and before any
# command so the scheduler still reads them.

# Work from the pstr project directory; stop the job if it cannot be entered.
cd /project/pi_sarah_gignouxwolfsohn_uml_edu/brooke/pstr || exit
# Load conda and activate the environment that is active while step 1 runs
# its bowtie2 and samtools commands.
module load miniconda/22.11.1-1
conda activate anvio-8

# 1) Remove host (coral) reads from each sample.
#
# For every sample ID listed in $LISTPATH/$SAMPLELIST the trimmed paired-end
# reads are aligned to the host genome index with bowtie2, and only pairs in
# which BOTH mates failed to align are written back out as
#   $READSPATH/<SAMPLEID>_host_removed_R1.fastq
#   $READSPATH/<SAMPLEID>_host_removed_R2.fastq
# The job stops with exit status 1 at the first sample for which any stage of
# the pipeline fails.

#set general parameters (these are reused by the later steps of this script):
SAMPLENAME="pstr"
SAMPLELIST="filtered_sampleids"
RAWREADSPATH='/project/pi_sarah_gignouxwolfsohn_uml_edu/brooke/qc_112023'
READSPATH="/project/pi_sarah_gignouxwolfsohn_uml_edu/brooke/${SAMPLENAME}/assembly/host_removed"
mkdir -p "$READSPATH"
LISTPATH="/project/pi_sarah_gignouxwolfsohn_uml_edu/brooke/${SAMPLENAME}"
#set step parameters
GENOME="Ofav"
INPUTPATH="/project/pi_sarah_gignouxwolfsohn_uml_edu/Reference_genomes/${GENOME}_genome"
INDEX="${GENOME}_DB"

#build a bowtie2 index from a known genome
# **using ofav since pstr is more closely related to ofav than mcav - (was only built for histat so need to redo for bowtie)
#bowtie2-build $INPUTPATH/GCF_002042975.1_ofav_dov_v1_genomic.fna $INPUTPATH/"$INDEX"

#loop through samples
# `|| [[ -n ... ]]` keeps the final ID even if the list has no trailing newline.
while IFS= read -r SAMPLEID || [[ -n "$SAMPLEID" ]]; do
  # Skip blank lines so no files named "_R1_001_val_1.fq" are looked up.
  [[ -n "$SAMPLEID" ]] || continue
  prefix="$READSPATH/${SAMPLEID}"

  # The stages are chained with && so a failure anywhere is reported, instead
  # of only the exit status of the final samtools call being inspected:
  #   1. bowtie2        - align the reads back to the host index (SAM output)
  #   2. samtools view  - convert the SAM file to BAM for processing
  #   3. samtools view  - keep pairs where both mates are unmapped
  #                       (-f 12 = read unmapped + mate unmapped,
  #                        -F 256 = drop secondary alignments)
  #   4. samtools sort  - name-sort (-n) so both mates sit together
  #   5. samtools fastq - write the mates back out as R1/R2 fastq files;
  #                       other/singleton reads are discarded to /dev/null
  # (No comments inside the chain: a '#' there would end the command early.)
  if bowtie2 -p 8 -x "$INPUTPATH/$INDEX" \
      -1 "$RAWREADSPATH/${SAMPLEID}_R1_001_val_1.fq" \
      -2 "$RAWREADSPATH/${SAMPLEID}_R2_001_val_2.fq" \
      -S "${prefix}_mapped_and_unmapped.sam" \
    && samtools view -bS "${prefix}_mapped_and_unmapped.sam" \
      > "${prefix}_mapped_and_unmapped.bam" \
    && samtools view -b -f 12 -F 256 "${prefix}_mapped_and_unmapped.bam" \
      > "${prefix}_bothReadsUnmapped.bam" \
    && samtools sort -n -m 5G -@ 2 "${prefix}_bothReadsUnmapped.bam" \
      -o "${prefix}_bothReadsUnmapped_sorted.bam" \
    && samtools fastq -@ 8 "${prefix}_bothReadsUnmapped_sorted.bam" \
      -1 "${prefix}_host_removed_R1.fastq" \
      -2 "${prefix}_host_removed_R2.fastq" \
      -0 /dev/null -s /dev/null -n; then
    echo "host removal completed successfully for sample: $SAMPLEID"
  else
    echo "host removal encountered an error for sample: $SAMPLEID"
    exit 1
  fi
done < "$LISTPATH/${SAMPLELIST}"
conda deactivate
echo "Host removal: All samples processed successfully."

# 2) Remove symbiont and human seqs using fastq_screen.
#
# Runs fastq_screen on the host-removed R1/R2 files of every sample listed in
# $LISTPATH/$SAMPLELIST and writes its output to $OUTPUTDIR. The job stops
# with exit status 1 at the first sample for which fastq_screen fails.
module load bowtie/2.4.5
conda activate fastq_screen
FASTQSCREEN='/home/brooke_sienkiewicz_student_uml_edu/.conda/envs/fastq_screen/share/fastq-screen-0.15.3-0'
# Double quotes are required here: inside single quotes ${SAMPLENAME} is not
# expanded and a directory literally named '${SAMPLENAME}' would be created.
OUTPUTDIR="/project/pi_sarah_gignouxwolfsohn_uml_edu/brooke/${SAMPLENAME}/assembly/final_reads_filtered"

mkdir -p "$OUTPUTDIR"

# `|| [[ -n ... ]]` keeps the final ID even if the list has no trailing newline.
while IFS= read -r SAMPLEID || [[ -n "$SAMPLEID" ]]; do
  [[ -n "$SAMPLEID" ]] || continue  # ignore blank lines in the sample list
  # --nohits = output reads do not map to any genomes
  if "$FASTQSCREEN/fastq_screen" --nohits --aligner bowtie2 \
      --conf "$FASTQSCREEN/fastq_screen.conf" --outdir "$OUTPUTDIR" \
      "$READSPATH/${SAMPLEID}_host_removed_R1.fastq" \
      "$READSPATH/${SAMPLEID}_host_removed_R2.fastq"; then
    echo "fastq_screen completed successfully for sample: $SAMPLEID"
  else
    echo "fastq_screen encountered an error for sample: $SAMPLEID"
    exit 1
  fi
done < "$LISTPATH/${SAMPLELIST}"
conda deactivate
echo "Symbiont, host removal: All samples processed successfully."

# 3) Concatenate all forward and all reverse reads into single files
#    (one for R1, one for R2) under $OUTDIR.
#
# NOTE(review): this reads the *_host_removed_* files from $READSPATH (step 1
# output), not the fastq_screen output written to $OUTPUTDIR in step 2 -
# confirm that this is the intended input.
conda activate assembly
# Double quotes are required here: inside single quotes ${SAMPLENAME} is not
# expanded and the path would contain the literal text '${SAMPLENAME}'.
OUTDIR="/project/pi_sarah_gignouxwolfsohn_uml_edu/brooke/${SAMPLENAME}/assembly"
mkdir -p "$OUTDIR"

merged_r1="$OUTDIR/${SAMPLENAME}_reads_R1_ALL.fastq"
merged_r2="$OUTDIR/${SAMPLENAME}_reads_R2_ALL.fastq"
# Start from empty merged files: the loop appends with >>, so re-running the
# job would otherwise add every read a second time.
: > "$merged_r1"
: > "$merged_r2"

# Read the sample IDs from the file
# `|| [[ -n ... ]]` keeps the final ID even if the list has no trailing newline.
while IFS= read -r SAMPLEID || [[ -n "$SAMPLEID" ]]; do
  [[ -n "$SAMPLEID" ]] || continue  # ignore blank lines in the sample list
  # Construct the file paths for forward and reverse reads
  FORWARD_READ="$READSPATH/${SAMPLEID}_host_removed_R1.fastq"
  REVERSE_READ="$READSPATH/${SAMPLEID}_host_removed_R2.fastq"
  # Check if the files exist before concatenating; a missing file is only
  # reported, the loop carries on with the remaining samples.
  if [[ -e "$FORWARD_READ" ]]; then
    cat "$FORWARD_READ" >> "$merged_r1"
  else
    echo "Forward read file not found for sample $SAMPLEID"
  fi
  if [[ -e "$REVERSE_READ" ]]; then
    cat "$REVERSE_READ" >> "$merged_r2"
  else
    echo "Reverse read file not found for sample $SAMPLEID"
  fi
done < "$LISTPATH/${SAMPLELIST}"
conda deactivate

# kraken and bracken
# SKIPPING intermediate steps to go ahead and run kraken
#
# Classifies the host-removed paired reads of every sample listed in
# $LISTPATH/$SAMPLELIST with kraken2 and writes, per sample,
#   $OUTDIR/<SAMPLEID>.kreport2  (report)
#   $OUTDIR/<SAMPLEID>.kraken2   (kraken2 standard output)
# The job stops with exit status 1 at the first sample for which kraken2 fails.
conda activate kraken

# set variables (DBNAME and READ_LEN are also used by the bracken step below)
DBNAME='//project/pi_sarah_gignouxwolfsohn_uml_edu/Reference_genomes/ref_databases/standard'
# Double quotes are required here: inside single quotes ${SAMPLENAME} is not
# expanded and the path would contain the literal text '${SAMPLENAME}'.
OUTDIR="//project/pi_sarah_gignouxwolfsohn_uml_edu/brooke/${SAMPLENAME}/taxonomy/kraken_stddb"
THREADS=24
KMER_LEN=35 #(default)
READ_LEN=150

mkdir -p "$OUTDIR"

# classify each set of paired end reads against the database in $DBNAME
# (the path points at the "standard" reference database)
# `|| [[ -n ... ]]` keeps the final ID even if the list has no trailing newline.
while IFS= read -r SAMPLEID || [[ -n "$SAMPLEID" ]]; do
  [[ -n "$SAMPLEID" ]] || continue  # ignore blank lines in the sample list
  #'report-zero-counts' so that all reports have same taxa so they can be merged
  if kraken2 --db "$DBNAME" --threads "$THREADS" \
      --report "$OUTDIR/${SAMPLEID}.kreport2" --report-zero-counts \
      --paired \
      "$READSPATH/${SAMPLEID}_host_removed_R1.fastq" \
      "$READSPATH/${SAMPLEID}_host_removed_R2.fastq" \
      > "$OUTDIR/${SAMPLEID}.kraken2"; then
    echo "kraken2 completed successfully for sample: $SAMPLEID"
  else
    echo "kraken2 encountered an error for sample: $SAMPLEID"
    exit 1
  fi
done < "$LISTPATH/${SAMPLELIST}"
echo "Kraken2: All samples processed successfully."

# run bracken
#
# Runs bracken on the <sample>.kreport2 file of every sample listed in
# $LISTPATH/$SAMPLELIST and writes <sample>_<LEVEL>.bracken next to it.
# The job stops with exit status 1 at the first sample for which bracken fails.

# navigate to Bracken dir because can't figure out how to set up symlink
# (stop if the directory cannot be entered rather than running from elsewhere)
cd //project/pi_sarah_gignouxwolfsohn_uml_edu/brooke/Bracken || exit 1
# Generate bracken database - just do once
#bracken-build -d ${DBNAME} -t ${THREADS} -k ${KMER_LEN} -l ${READ_LEN}
# Already Done

# Abundance Estimation
# Read the reports from $OUTDIR, the directory the kraken2 step of this script
# writes its <sample>.kreport2 files to. This used to be hard-coded to
#   //project/pi_sarah_gignouxwolfsohn_uml_edu/brooke/mcav/healthy_2022_mcav/taxonomy/kraken_stddb
# which belongs to a different dataset and does not hold this run's reports.
FILEPATH="$OUTDIR"
LEVEL="S"

# `|| [[ -n ... ]]` keeps the final ID even if the list has no trailing newline.
while IFS= read -r line || [[ -n "$line" ]]; do
  [[ -n "$line" ]] || continue  # ignore blank lines in the sample list
  # Execute bracken command for each file
  if bracken -d "$DBNAME" -i "$FILEPATH/${line}.kreport2" \
      -o "$FILEPATH/${line%.kreport2}_${LEVEL}.bracken" \
      -r "$READ_LEN" -l "$LEVEL"; then
    echo "bracken completed successfully for sample: $line"
  else
    echo "bracken encountered an error for sample: $line"
    exit 1
  fi
done < "$LISTPATH/${SAMPLELIST}"
echo "Bracken: All samples processed successfully."
#${SAMPLE}.kreport - the kraken report generated for a given dataset
#{BRACKEN_OUTPUT_FILE}.bracken - the desired name of the output file to be generated by the code
#The following optional parameters may be specified:
    #${LEVEL} - Default = 'S'. This specifies that abundance estimation will calculate estimated reads for each species. Other possible options are K (kingdom level), P (phylum), C (class), O (order), F (family), and G (genus).
    #${THRESHOLD} - Default = 10. For species classification, any species with <= 10 (or otherwise specified) reads will not receive any additional reads from higher taxonomy levels when distributing reads for abundance estimation.
    #If another classification level is specified, thresholding will occur at that level.


# JOB-ID: 23190073, 23191990
# bash script file name: /work/pi_sarah_gignouxwolfsohn/brooke/${SAMPLENAME}/removaltax

In [None]:
# HAVE NOT RUN YET

# 4) ASSEMBLE reads into contigs (contiguous sequences - reads are joined
#    together based on read overlap, with no gaps).
#
# NOTE(review): this cell uses $OUTDIR and $SAMPLENAME without setting them.
# The merged *_reads_R{1,2}_ALL.fastq inputs are written to the assembly
# directory by the concatenation step - confirm $OUTDIR points there (and not
# at the kraken output directory) before running.

# The options live in an array so that one of them can be commented out.
# Previously '#--continue \' sat inside a backslash-continued command line;
# the '#' ended the megahit command there, so '-o ... --out-prefix ...' was
# run as a separate command and megahit never received its output settings.
megahit_args=(
  --presets meta-large
  -1 "$OUTDIR/${SAMPLENAME}_reads_R1_ALL.fastq"
  -2 "$OUTDIR/${SAMPLENAME}_reads_R2_ALL.fastq"
  --keep-tmp-files
  #--continue
  -o "$OUTDIR/contigs"
  --out-prefix "$SAMPLENAME"
)
#this one has to make the directory; will fail if it already exists
if megahit "${megahit_args[@]}"; then
  echo "megahit completed successfully for samples: $SAMPLENAME"
else
  echo "megahit encountered an error for samples: $SAMPLENAME"
  exit 1
fi
conda deactivate
echo "Megahit: Contig file created successfully."
#final assembled contigs: ${SAMPLENAME}/assembly/contigs


In [None]:
# this was done previously in 11/2023 with no host removal or anything..delete after redoing process

#!/bin/bash
#SBATCH -c 24  # Number of Cores per Task
#SBATCH --mem=180G  # Requested Memory
#SBATCH -p cpu-long  # Partition
#SBATCH -t 56:00:00  # Job time limit
#SBATCH --mail-type=ALL
#SBATCH -o /project/pi_sarah_gignouxwolfsohn_uml_edu/brooke/bash_scripts/slurm-%j.out  # %j = job ID

# Co-assembly of the trimmed CBC reads: merge every R1 file and every R2 file
# into one fastq each, then assemble the merged pair with megahit.

module load miniconda/22.11.1-1
conda activate assembly

# Inputs and naming
SAMPLENAME=CBC
READSPATH=/project/pi_sarah_gignouxwolfsohn_uml_edu/brooke/working/11272023/trimmed
OUTDIR="${SAMPLENAME}_assembly"

# Merged read files, written next to the trimmed inputs
merged_r1="${READSPATH}/${SAMPLENAME}_reads_R1_ALL.fastq"
merged_r2="${READSPATH}/${SAMPLENAME}_reads_R2_ALL.fastq"

# All forward reads -> one fastq file
cat "$READSPATH"/*_R1*.fq > "$merged_r1"

# All reverse reads -> one fastq file
cat "$READSPATH"/*_R2*.fq > "$merged_r2"

# Assemble; the output directory is named after the sample (<SAMPLENAME>_assembly)
megahit_args=(
  --presets meta-large
  -1 "$merged_r1"
  -2 "$merged_r2"
  --keep-tmp-files
  -o "/project/pi_sarah_gignouxwolfsohn_uml_edu/brooke/working/11272023/${OUTDIR}"
  --out-prefix "$SAMPLENAME"
)
megahit "${megahit_args[@]}"
#makes dir or do you need to mkdir beforehand???
#file: /project/pi_sarah_gignouxwolfsohn_uml_edu/brooke/11_2023bash_scripts/assembly
#JOB ID: 15972157