Cutadapt 
- Using SYM_VAR primers to extract Symbiodinium (symbiont) sequences from the original raw sequences
- https://cutadapt.readthedocs.io/en/stable/guide.html#five-prime-adapters

Parameter Notes:
- Used assembled contig file as input file
- SYM_VAR primer seqs from (Hume et al 2018)
- Paired primers: if paired-end reads were used as input files, the reverse complements of both primers would also need to be included
- Linked primers are apparently only for specific scenarios such as barcoding or circularization, etc.
- Output is written as a FASTA file because 1) the input is in FASTA format (.fa) and 2) no quality trimming is done and there is no quality information to keep

In [None]:
# Create a dedicated environment, then install cutadapt INTO that environment.
# `conda install` without -n installs into whichever environment is currently
# active, so the target environment is named explicitly.
# conda-forge is listed alongside bioconda because Bioconda packages resolve
# their dependencies from it.
# NOTE(review): the job log in this notebook reports "cutadapt 2.6 with
# Python 3.7.16"; confirm the version that gets installed here.
conda create -n cutadaptenv
conda install -n cutadaptenv -c conda-forge -c bioconda cutadapt

In [None]:
#!/bin/bash
#SBATCH -c 24  # Number of Cores per Task
#SBATCH --mem=180G  # Requested Memory
#SBATCH -p cpu-long  # Partition
#SBATCH -t 24:00:00  # Job time limit
#SBATCH --mail-type=ALL
#SBATCH -o /project/pi_sarah_gignouxwolfsohn_uml_edu/brooke/bash_scripts/slurm-%j.out  # %j = job ID

# Purpose: for every sample ID listed in the sample-ID file, run cutadapt with
# the SYM_VAR primers on that sample's paired read files (R1/R2) and write one
# pair of primer-filtered FASTQ files per sample into OUTPUTDIR.
#
# Exit status: 0 if every sample was processed, 1 if setup failed or any
# sample was skipped or failed.
#
# NOTE(review): the mapping step elsewhere in this notebook expects a single
# contig FASTA named mcav_ITS2.contigs.fasta in this same output directory.
# The per-sample read outputs written here do not produce that file; confirm
# whether the contig-based extraction is still required.

# Set your input and output locations
SAMPLENAME="mcav"
INPUTDIR="/project/pi_sarah_gignouxwolfsohn_uml_edu/brooke/mcav/trimmed/redo_01222023"
# Absolute path, so the result does not depend on the current directory.
OUTPUTDIR="/project/pi_sarah_gignouxwolfsohn_uml_edu/brooke/mcav/ITS2/redo_1_2024"

# Braces are required when a variable name is followed by "_" or a letter:
# "$SAMPLENAME_symb" is read as the (unset) variable SAMPLENAME_symb, which is
# why the earlier output-name variable expanded to nothing.
# NOTE(review): the sample list is assumed to live in INPUTDIR, which is where
# the original cutadapt loop read it from (after cd) — confirm.
SAMPLE_LIST="$INPUTDIR/${SAMPLENAME}_sampleids.txt"

#load modules
module load miniconda/22.11.1-1
conda activate cutadaptenv || { echo "ERROR: could not activate cutadaptenv" >&2; exit 1; }

# Set your primer sequences
forward_primer="GAATTGCAGAACTCCGTGAACC"
reverse_primer="CGGGTTCWCTTGTYTGACTTCATGC"

# Verify path and input dir - only need for troubleshooting
echo "Working Directory: $(pwd)"
ls -l "$INPUTDIR"

if [[ ! -r "$SAMPLE_LIST" ]]; then
    echo "ERROR: sample list not readable: $SAMPLE_LIST" >&2
    exit 1
fi
mkdir -p "$OUTPUTDIR" || { echo "ERROR: cannot create $OUTPUTDIR" >&2; exit 1; }

# Run cutadapt once per sample
failed=0
while IFS= read -r id || [[ -n "$id" ]]; do
    id=${id%$'\r'}              # tolerate Windows line endings in the list
    [[ -z "$id" ]] && continue  # skip blank lines

    # ${id} must be braced: "$id_R1_001_val_1.fq" would look up a variable
    # called id_R1_001_val_1 instead of appending the suffix to the sample id.
    # NOTE(review): the mapping step names reads "<id>_MCAV_R1_001_val_1.fq";
    # confirm whether the IDs in the list already include "_MCAV".
    f_seq="$INPUTDIR/${id}_R1_001_val_1.fq"
    r_seq="$INPUTDIR/${id}_R2_001_val_2.fq"
    echo "$id"
    echo "$f_seq"
    echo "$r_seq"

    if [[ ! -f "$f_seq" || ! -f "$r_seq" ]]; then
        echo "WARNING: missing read file(s) for sample '$id'; skipping" >&2
        failed=1
        continue
    fi

    # Two input files require a second output (-p); each sample gets its own
    # output files so later samples do not overwrite earlier ones.
    # -g/-a apply to the first read (R1) only; -G/-A would be needed to search
    # R2 as well — NOTE(review): confirm which is wanted.
    # NOTE(review): --action=retain does not appear to be available in the
    # cutadapt 2.6 release shown in the job log; confirm the env version.
    # stdin is redirected so cutadapt can never consume the sample list.
    if ! cutadapt \
          --cores "${SLURM_CPUS_PER_TASK:-1}" \
          -g "$forward_primer" \
          -a "$reverse_primer" \
          --discard-untrimmed \
          --action=retain \
          -o "$OUTPUTDIR/${id}_ITS2_R1.fq" \
          -p "$OUTPUTDIR/${id}_ITS2_R2.fq" \
          "$f_seq" "$r_seq" < /dev/null; then
        echo "WARNING: cutadapt failed for sample '$id'" >&2
        failed=1
    fi
done < "$SAMPLE_LIST"

ls -l "$OUTPUTDIR"
#check results dir to see if it was successful in creating output files

#JOB ID: 17954883

exit "$failed"


Loading miniconda version 22.11.1-1
Working Directory: //project/pi_sarah_gignouxwolfsohn_uml_edu/brooke
total 344753
-rw-rw-r--  1 brooke_sienkiewicz_student_uml_edu pi_sarah_gignouxwolfsohn_uml_edu       350 Aug 24 00:57 checkpoints.txt
-rw-rw-r--  1 brooke_sienkiewicz_student_uml_edu pi_sarah_gignouxwolfsohn_uml_edu         0 Aug 24 00:57 done
drwxrwsr-x  2 brooke_sienkiewicz_student_uml_edu pi_sarah_gignouxwolfsohn_uml_edu       110 Aug 24 00:57 intermediate_contigs
-rw-rw-r--  1 brooke_sienkiewicz_student_uml_edu pi_sarah_gignouxwolfsohn_uml_edu 471039912 Aug 24 00:57 mcav.contigs.fa
-rw-rw-r--  1 brooke_sienkiewicz_student_uml_edu pi_sarah_gignouxwolfsohn_uml_edu  57767732 Nov  3 15:16 mcav.contigs-fixed.fa
-rw-rw-r--  1 brooke_sienkiewicz_student_uml_edu pi_sarah_gignouxwolfsohn_uml_edu    207063 Aug 24 00:57 mcav.log
-rw-rw-r--  1 brooke_sienkiewicz_student_uml_edu pi_sarah_gignouxwolfsohn_uml_edu      1066 Aug 23 22:03 options.json
drwxr-sr-x  6 brooke_sienkiewicz_student_uml_edu pi_sarah_gignouxwolfsohn_uml_edu        17 Oct 31 18:06 quast_output
drwxrwsr-x 13 brooke_sienkiewicz_student_uml_edu pi_sarah_gignouxwolfsohn_uml_edu        16 Aug 24 00:54 tmp
This is cutadapt 2.6 with Python 3.7.16
Command line parameters: -g GAATTGCAGAACTCCGTGAACC -a CGGGTTCWCTTGTYTGACTTCATGC --discard-untrimmed -o working/trimmed_ITS2/mcav_ITS2.contigs.fasta results/mcav_assembly3/mcav.contigs.fa
Processing reads on 1 core in single-end mode ...
Finished in 36.17 s (43 us/read; 1.39 M reads/minute).

=== Summary ===

Total reads processed:                 839,850
Reads with adapters:                    23,485 (2.8%)
Reads written (passing filters):        23,485 (2.8%)

Total basepairs processed:   435,829,081 bp
Total written (filtered):     12,077,612 bp (2.8%)

=== Adapter 1 ===

Sequence: GAATTGCAGAACTCCGTGAACC; Type: regular 5'; Length: 22; Trimmed: 13528 times.

No. of allowed errors:
0-9 bp: 0; 10-19 bp: 1; 20-22 bp: 2

Overview of removed sequences
length	count	expect	max.err	error counts
3	9739	13122.7	0	9739
4	2849	3280.7	0	2849
5	694	820.2	0	694
6	146	205.0	0	146
7	62	51.3	0	62
8	9	12.8	0	9
9	4	3.2	0	0 4
10	13	0.8	1	0 13
11	5	0.2	1	0 5
12	3	0.1	1	0 3
13	1	0.0	1	0 1
24	1	0.0	2	1
119	1	0.0	2	1
2819	1	0.0	2	1


=== Adapter 2 ===

Sequence: CGGGTTCWCTTGTYTGACTTCATGC; Type: regular 3'; Length: 25; Trimmed: 9957 times.

No. of allowed errors:
0-9 bp: 0; 10-19 bp: 1; 20-25 bp: 2

Bases preceding removed adapters:
  A: 23.2%
  C: 23.8%
  G: 25.1%
  T: 27.9%
  none/other: 0.0%

Overview of removed sequences
length	count	expect	max.err	error counts
3	7883	13122.7	0	7883
4	1639	3280.7	0	1639
5	258	820.2	0	258
6	65	205.0	0	65
7	31	51.3	0	31
8	11	12.8	0	11
9	20	3.2	0	4 16
10	24	0.8	1	2 22
11	14	0.2	1	1 13
12	6	0.1	1	0 6
13	4	0.0	1	0 4
14	1	0.0	1	0 1
17	1	0.0	1	1
total 6843
-rw-rw-r-- 1 brooke_sienkiewicz_student_uml_edu pi_sarah_gignouxwolfsohn_uml_edu 13062051 Nov 15 16:37 mcav_ITS2.contigs.fasta

In [None]:
#!/bin/bash
#SBATCH -c 24  # Number of Cores per Task
#SBATCH --mem=50G  # Requested Memory
#SBATCH -p cpu  # Partition
#SBATCH -t 20:00:00  # Job time limit
#SBATCH --mail-type=ALL
#SBATCH -o slurm-%j.out  # %j = job ID

# Purpose: build a bowtie2 index from the ITS2 contig FASTA, then for each
# sample: align its paired reads to that index, keep only mapped reads as BAM,
# sort/index the BAM with anvi-init-bam, and export the reads as FASTQ.
# Working and result files are written relative to the submission directory
# (./working/ITS2 and ./results/ITS2).

module load miniconda/22.11.1-1
conda activate anvio-7.1 || { echo "ERROR: could not activate anvio-7.1" >&2; exit 1; }

#set variables
SAMPLENAME=mcav
READSPATH='/project/pi_sarah_gignouxwolfsohn_uml_edu/brooke/mcav/trimmed/redo_01222023'
CONTIGPATH='//project/pi_sarah_gignouxwolfsohn_uml_edu/brooke/mcav/ITS2/redo_1_2024'
CONTIGFILE='mcav_ITS2.contigs.fasta'

# One index prefix shared by bowtie2-build and bowtie2 -x. Previously the index
# was built under $CONTIGPATH but looked up under ./working/ITS2, so the
# aligner could not find the index that had just been built.
INDEXPREFIX="$CONTIGPATH/${SAMPLENAME}_ITS2_contigs"

# Use the cores SLURM allocated; fall back to the earlier fixed value of 11
# when the variable is not set (e.g. when run outside SLURM).
THREADS="${SLURM_CPUS_PER_TASK:-11}"

WORKDIR=./working/ITS2
INDEXDIR=./results/ITS2/index
SEQSDIR=./results/ITS2/seqs
mkdir -p "$WORKDIR" "$INDEXDIR" "$SEQSDIR" || { echo "ERROR: cannot create output dirs" >&2; exit 1; }

#No FIXEDCON - Skipping defline fix and contig bp filtering step 

#build an index of your contigs, which only needs to happen once
bowtie2-build "$CONTIGPATH/$CONTIGFILE" "$INDEXPREFIX" \
    || { echo "ERROR: bowtie2-build failed for $CONTIGPATH/$CONTIGFILE" >&2; exit 1; }

samples=(
    T1_12_2022 T1_13_2022 T1_16_2019 T1_20_2019 T1_24_2019 T1_40_2022
    T1_57_2022 T1_70_2022 T2_10_2022 T2_16_2019 T3_13_2022 T3_14_2019
    T3_15_2019 T3_19_2022 T3_1_2019 T3_40_2022 T3_48_2022 T3_49_2022
    T3_51_2022 T3_60_2022 T3_8_2019 T3_9_2019
)

failed=0
#align reads to your contigs and collect in a .sam file
for f in "${samples[@]}"
do
    # Each step depends on the previous one, so a failure skips the rest of
    # this sample instead of running later tools on missing or partial files.
    if ! bowtie2 --threads "$THREADS" -x "$INDEXPREFIX" \
            -1 "$READSPATH/${f}_MCAV_R1_001_val_1.fq" \
            -2 "$READSPATH/${f}_MCAV_R2_001_val_2.fq" \
            -S "$WORKDIR/$f.sam"; then
        echo "WARNING: bowtie2 failed for sample '$f'" >&2
        failed=1
        continue
    fi

    #convert sam files to bam files (-F 4 drops unmapped reads)
    if ! samtools view -F 4 -b -S "$WORKDIR/$f.sam" -o "$WORKDIR/$f-RAW.bam"; then
        echo "WARNING: samtools view failed for sample '$f'" >&2
        failed=1
        continue
    fi

    #index and sort bam files using anvio
    if ! anvi-init-bam "$WORKDIR/$f-RAW.bam" -o "$INDEXDIR/$f.bam"; then
        echo "WARNING: anvi-init-bam failed for sample '$f'" >&2
        failed=1
        continue
    fi

    #remove raw bam files to cleanup 
    rm -- "$WORKDIR/$f-RAW.bam"

    #convert bam files to fastq format for symbio import
    if ! samtools bam2fq "$INDEXDIR/$f.bam" > "$SEQSDIR/$f.fastq"; then
        echo "WARNING: samtools bam2fq failed for sample '$f'" >&2
        failed=1
    fi

done
#generates BAM files from each sample sequence, aligns, indexes...need output bam for downstream analysis

#bash script: ITS2_mapping
#JOB ID: 14027561

exit "$failed"