Cutadapt 
- Uses the SYM_VAR primers to extract Symbiodinium (symbiont) sequences from the original raw sequences
- https://cutadapt.readthedocs.io/en/stable/guide.html#five-prime-adapters

Parameter Notes:
- Used assembled contig file as input file
- SYM_VAR primer seqs from (Hume et al 2018)
- The primers are a pair (forward and reverse). If paired-end reads were used as the input files instead, the reverse complements of both primers would also have to be included
- Linked primers are apparently only meant for specific scenarios such as barcoding or circularization
- Output is written as a FASTA file because 1) the input is in FASTA format (.fa) and 2) no quality trimming is done and the input carries no quality information

In [None]:
# Create a dedicated environment and install cutadapt into it in a single step.
# A bare `conda install` issued right after `conda create` would install into
# whichever environment is currently active, not into the new `cutadaptenv`.
# conda-forge is listed before bioconda, as the Bioconda channel setup recommends.
conda create -n cutadaptenv -c conda-forge -c bioconda cutadapt

In [None]:
#!/bin/bash
#SBATCH -c 24  # Number of Cores per Task
#SBATCH --mem=180G  # Requested Memory
#SBATCH -p cpu-long  # Partition
#SBATCH -t 24:00:00  # Job time limit
#SBATCH --mail-type=ALL
#SBATCH -o /project/pi_sarah_gignouxwolfsohn_uml_edu/brooke/bash_scripts/slurm-%j.out  # %j = job ID

# Purpose: run cutadapt on the assembled contigs and keep only the contigs in
# which one of the SYM_VAR ITS2 primers is found (--discard-untrimmed), keeping
# the primer sequence itself in the output (--action=retain).
# Input : $INPUTDIR/<SAMPLENAME>.contigs.fa
# Output: $OUTPUTDIR/<SAMPLENAME>_ITS2.contigs.fasta

# Set your input and output files
SAMPLENAME="mcav"
INPUTDIR="/project/pi_sarah_gignouxwolfsohn_uml_edu/brooke/mcav/assembly/mcav_assembly3"
OUTPUTDIR="/project/pi_sarah_gignouxwolfsohn_uml_edu/brooke/mcav/ITS2/redo_1_2024"

input_fasta="${SAMPLENAME}.contigs.fa"
# The braces are required: "$SAMPLENAME_ITS2..." would be parsed as a variable
# named SAMPLENAME_ITS2 (unset, so empty), which silently drops the sample name.
output_fasta="${SAMPLENAME}_ITS2.contigs.fasta"

#load modules
module load miniconda/22.11.1-1
conda activate cutadaptenv

# Strict mode is switched on only after the environment setup so that -u/-e
# apply to this script's own commands and not to the module/conda shell code.
set -euo pipefail

# Fail early with a clear message if the environment or the input is missing.
command -v cutadapt >/dev/null || { echo "ERROR: cutadapt not found on PATH" >&2; exit 1; }
[[ -r "$INPUTDIR/$input_fasta" ]] || { echo "ERROR: cannot read $INPUTDIR/$input_fasta" >&2; exit 1; }
mkdir -p -- "$OUTPUTDIR"

# Set your primer sequences
forward_primer="GAATTGCAGAACTCCGTGAACC"
reverse_primer="CGGGTTCWCTTGTYTGACTTCATGC"
# NOTE(review): -a searches for this sequence exactly as written. Confirm
# whether the reverse primer should be given as its reverse complement and/or
# whether --revcomp is needed, since contig orientation is arbitrary.

# Minimum primer/contig overlap required to count as a match. cutadapt's default
# is 3 bp, at which almost every "hit" is a chance match (the earlier run's
# report shows 3-8 bp matches at or below the expected random count). Requiring
# a near full-length match keeps only contigs that really contain the primer.
min_overlap=20

# Use every core SLURM allocated; cutadapt runs on a single core unless told otherwise.
n_cores="${SLURM_CPUS_PER_TASK:-1}"

# Verify path and input dir - only need for troubleshooting
echo "Working Directory: $(pwd)"
#ls -l "$INPUTDIR"

# Run cutadapt
cutadapt \
  --cores "$n_cores" \
  -g "$forward_primer" \
  -a "$reverse_primer" \
  --overlap "$min_overlap" \
  --discard-untrimmed \
  --action=retain \
  -o "$OUTPUTDIR/$output_fasta" \
  "$INPUTDIR/$input_fasta"

ls -l "$OUTPUTDIR"
#check results dir to see if it was successful in creating output file

#script file name: ITS2_trimming
#JOB ID: 18137275


Loading miniconda version 22.11.1-1
Working Directory: //project/pi_sarah_gignouxwolfsohn_uml_edu/brooke
This is cutadapt 4.4 with Python 3.7.12
Command line parameters: -g GAATTGCAGAACTCCGTGAACC -a CGGGTTCWCTTGTYTGACTTCATGC --discard-untrimmed --action=retain -o /project/pi_sarah_gignouxwolfsohn_uml_edu/brooke/mcav/ITS2/redo_1_2024/mcav_ITS2.contigs.fasta /project/pi_sarah_gignouxwolfsohn_uml_edu/brooke/mcav/assembly/mcav_assembly3/mcav.contigs.fa
Processing single-end reads on 1 core ...
Finished in 6.469 s (7.703 µs/read; 7.79 M reads/minute).

=== Summary ===

Total reads processed:                 839,850
Reads with adapters:                    23,485 (2.8%)

== Read fate breakdown ==
Reads discarded as untrimmed:          816,365 (97.2%)
Reads written (passing filters):        23,485 (2.8%)

Total basepairs processed:   435,829,081 bp
Total written (filtered):     12,156,332 bp (2.8%)

=== Adapter 1 ===

Sequence: GAATTGCAGAACTCCGTGAACC; Type: regular 5'; Length: 22; Trimmed: 13528 times

Minimum overlap: 3
No. of allowed errors:
1-9 bp: 0; 10-19 bp: 1; 20-22 bp: 2

Overview of removed sequences
length	count	expect	max.err	error counts
3	9739	13122.7	0	9739
4	2849	3280.7	0	2849
5	694	820.2	0	694
6	146	205.0	0	146
7	62	51.3	0	62
8	9	12.8	0	9
9	3	3.2	0	0 3
10	14	0.8	1	0 14
11	5	0.2	1	0 5
12	3	0.1	1	0 3
13	1	0.0	1	0 1
24	1	0.0	2	1
119	1	0.0	2	1
2819	1	0.0	2	1


=== Adapter 2 ===

Sequence: CGGGTTCWCTTGTYTGACTTCATGC; Type: regular 3'; Length: 25; Trimmed: 9957 times

Minimum overlap: 3
No. of allowed errors:
1-9 bp: 0; 10-19 bp: 1; 20-25 bp: 2

Bases preceding removed adapters:
  A: 23.2%
  C: 23.8%
  G: 25.1%
  T: 27.9%
  none/other: 0.0%

Overview of removed sequences
length	count	expect	max.err	error counts
3	7883	13122.7	0	7883
4	1639	3280.7	0	1639
5	258	820.2	0	258
6	65	205.0	0	65
7	31	51.3	0	31
8	11	12.8	0	11
9	20	3.2	0	4 16
10	24	0.8	1	2 22
11	14	0.2	1	1 13
12	6	0.1	1	0 6
13	4	0.0	1	0 4
14	1	0.0	1	0 1
17	1	0.0	1	1
total 4113
-rw-rw-r-- 1 brooke_sienkiewicz_student_uml_edu pi_sarah_gignouxwolfsohn_uml_edu 13140771 Jan 31 16:49 mcav_ITS2.contigs.fasta

In [None]:
#!/bin/bash
#SBATCH -c 24  # Number of Cores per Task
#SBATCH --mem=50G  # Requested Memory
#SBATCH -p cpu  # Partition
#SBATCH -t 16:00:00  # Job time limit
#SBATCH --mail-type=ALL
#SBATCH -o /project/pi_sarah_gignouxwolfsohn_uml_edu/brooke/bash_scripts/slurm-%j.out  # %j = job ID

# Purpose: map each sample's trimmed paired-end reads to the ITS2 contigs and,
# per sample, produce
#   $CONTIGPATH/<id>.sam          bowtie2 alignment
#   $CONTIGPATH/index/<id>.bam    mapped reads only, processed by anvi-init-bam
#   $CONTIGPATH/seqs/<id>.fastq   mapped reads converted back to FASTQ
# Sample IDs are read from $IDPATH/$IDFILE (whitespace/newline separated).

module load miniconda/22.11.1-1
conda activate anvio-8

# Strict mode is switched on only after the environment setup so that -u/-e
# apply to this script's own commands and not to the module/conda shell code.
# With -e, a failed step stops the job instead of feeding bad files downstream.
set -euo pipefail

#set variables
SAMPLENAME=mcav
# NOTE(review): the leading "//" is kept exactly as in the original run.
READSPATH="//project/pi_sarah_gignouxwolfsohn_uml_edu/brooke/mcav/trimmed/redo_auto_detect_01312024"
CONTIGPATH="/project/pi_sarah_gignouxwolfsohn_uml_edu/brooke/mcav/ITS2/redo_1_2024"
CONTIGFILE="${SAMPLENAME}_ITS2.contigs.fasta"
IDPATH="/project/pi_sarah_gignouxwolfsohn_uml_edu/brooke/mcav/trimmed/01312024_redo"
IDFILE="mcav_sampleids.txt"
#No FIXEDCON - Skipping defline fix and contig bp filtering step

index_prefix="$CONTIGPATH/${SAMPLENAME}_ITS2_contigs"
bam_dir="$CONTIGPATH/index"
fastq_dir="$CONTIGPATH/seqs"
# Use the cores SLURM allocated; fall back to the previous fixed value otherwise.
threads="${SLURM_CPUS_PER_TASK:-11}"

# Fail early on missing inputs rather than part-way through the sample loop.
[[ -r "$CONTIGPATH/$CONTIGFILE" ]] || { echo "ERROR: cannot read $CONTIGPATH/$CONTIGFILE" >&2; exit 1; }
[[ -r "$IDPATH/$IDFILE" ]] || { echo "ERROR: cannot read $IDPATH/$IDFILE" >&2; exit 1; }

# The output sub-directories are not created by the tools themselves.
mkdir -p -- "$bam_dir" "$fastq_dir"

#build an index of your contigs, which only needs to happen once
bowtie2-build "$CONTIGPATH/$CONTIGFILE" "$index_prefix"

# Read the sample IDs into an array, one whitespace-separated token per entry.
# (LINES is deliberately not used as a variable name: the shell reserves it for
# the terminal height.) tr also turns Windows carriage returns into separators.
mapfile -t sample_ids < <(tr -s '[:space:]' '\n' < "$IDPATH/$IDFILE")
if (( ${#sample_ids[@]} == 0 )); then
  echo "ERROR: no sample IDs found in $IDPATH/$IDFILE" >&2
  exit 1
fi

for f in "${sample_ids[@]}"; do
  # tr can leave one empty token if the file starts with whitespace.
  if [[ -z "$f" ]]; then
    continue
  fi

  reads_1="$READSPATH/${f}_R1_001_val_1.fq"
  reads_2="$READSPATH/${f}_R2_001_val_2.fq"
  sam_file="$CONTIGPATH/${f}.sam"
  raw_bam="$CONTIGPATH/${f}-RAW.bam"
  final_bam="$bam_dir/${f}.bam"

  if [[ ! -r "$reads_1" || ! -r "$reads_2" ]]; then
    echo "ERROR: missing read files for sample '$f' in $READSPATH" >&2
    exit 1
  fi

  #align reads to your contigs and collect in a .sam file
  bowtie2 --threads "$threads" -x "$index_prefix" -1 "$reads_1" -2 "$reads_2" -S "$sam_file"

  #convert sam files to a bam files (-F 4 drops unmapped reads)
  samtools view -F 4 -b -S "$sam_file" -o "$raw_bam"

  #index and sort bam files using anvio
  anvi-init-bam "$raw_bam" -o "$final_bam"

  #remove raw bam files to cleanup
  rm -- "$raw_bam"

  #convert bam files to fastq format for symbio import
  samtools bam2fq "$final_bam" > "$fastq_dir/${f}.fastq"
done
#generates BAM files from each sample sequence, aligns, indexes...need output bam for downstream analysis

#bash script: ITS2_mapping
#JOB ID: 18332770