# Mapping 
- maps the reads onto the MAG (metagenome-assembled genome) (contigs.fa)
- allows you to quantify genes/taxa in each sample by matching the sequences to the MAG

## Anvio

https://anvio.org/
- used for further analysis 
- Here, we use it for mapping, ...

In [None]:
# Create the conda environment that the mapping scripts activate (one-time setup).
#   -y           answer conda's confirmation prompt automatically, so the command
#                does not stall when run from a non-interactive notebook cell
#   python=3.10  the Python version the anvi'o v8 installation guide asks for
# This only creates the environment; anvi'o itself still has to be installed
# into it by following the installation instructions at https://anvio.org/
conda create -y -n anvio-8 python=3.10
# Location of an older environment (anvio-7.1), kept for reference:
#dir=/home/brooke_sienkiewicz_student_uml_edu/.conda/envs/anvio-7.1

Anvio
https://merenlab.org/tutorials/assembly-based-metagenomics/
- reformats the FASTA file and keeps only contigs of at least 1000 bp (shorter contigs are removed)
- creates index of contigs ('reference genome') in order to map reads against it 

In [None]:
#!/bin/bash
#SBATCH -c 24  # Number of Cores per Task
#SBATCH --mem=180G  # Requested Memory
#SBATCH -p cpu-preempt  # Partition
#SBATCH -t 20:00:00  # Job time limit
#SBATCH --mail-type=ALL
#SBATCH -o /project/pi_sarah_gignouxwolfsohn_uml_edu/brooke/mcav/diseased_mcav/mapping/slurm-mapping-%j.out  # %j = job ID

# Purpose:
#   1. Reformat the assembled contigs (simplify deflines, drop contigs < 1000 bp).
#   2. Build a bowtie2 index from the reformatted contigs.
#   3. For every sample ID listed in $FILEPATH/diseased_MCAV, align the
#      host-removed paired reads to that index and produce a sorted, indexed
#      BAM file in index/ for downstream anvi'o analysis.
# deflines = sequence definition lines; each one comes directly before its
# associated sequence in a fasta file.

module load miniconda/22.11.1-1
conda activate anvio-8

# Strict mode is switched on only after the environment is active, because
# activation scripts may reference unset variables (assumption - harmless if not).
set -euo pipefail

# Print an error to stderr and stop the job.
die() {
    printf 'ERROR: %s\n' "$*" >&2
    exit 1
}

SAMPLENAME="diseased_mcav"
READSPATH='/project/pi_sarah_gignouxwolfsohn_uml_edu/brooke/mcav/diseased_mcav/assembly/host_removed'
CONTIGPATH='/project/pi_sarah_gignouxwolfsohn_uml_edu/brooke/mcav/diseased_mcav/assembly'
CONTIGFILE="${SAMPLENAME}_filtered.contigs.fasta"
newCONTIGPATH='/project/pi_sarah_gignouxwolfsohn_uml_edu/brooke/mcav/diseased_mcav/mapping'
FILEPATH='/project/pi_sarah_gignouxwolfsohn_uml_edu/brooke/mcav/diseased_mcav'
FIXEDCON="${SAMPLENAME}_filtered.contigs-fixed.fsa"
SAMPLELIST="${FILEPATH}/diseased_MCAV"   # one sample ID per line
# Use every core SLURM granted (-c above); fall back to 11 outside of SLURM.
THREADS="${SLURM_CPUS_PER_TASK:-11}"

[[ -f "${CONTIGPATH}/${CONTIGFILE}" ]] || die "contigs file not found: ${CONTIGPATH}/${CONTIGFILE}"
[[ -r "${SAMPLELIST}" ]] || die "sample list not readable: ${SAMPLELIST}"

# Everything below uses paths relative to the mapping directory.
cd "${newCONTIGPATH}" || die "cannot cd to ${newCONTIGPATH}"
# bowtie2 / samtools write into index/ and do not create it themselves.
mkdir -p index

# Fix deflines for later and filter on size.
# -l 1000 keeps contigs of at least 1000 bp ... need to play around with this cutoff.
# The rename report is written next to the reformatted contigs so its location
# does not depend on the directory the job was submitted from.
anvi-script-reformat-fasta "${CONTIGPATH}/${CONTIGFILE}" \
    -o "${newCONTIGPATH}/${FIXEDCON}" \
    -l 1000 \
    --simplify-names \
    --report-file "${newCONTIGPATH}/contig-rename-report.txt"

# Build the index of the contigs; this only needs to happen once.
# NOTE: an earlier attempt failed when the contigs file was given with a leading
# path and/or a .fa extension (cause not pinned down), so the build is run from
# inside the mapping directory on the .fsa file.
bowtie2-build "${FIXEDCON}" "${SAMPLENAME}_contigs"
ls #debugging

#######################################
# Align one sample's host-removed read pair to the contig index and turn the
# result into a sorted + indexed BAM.
# Globals:   READSPATH, SAMPLENAME, THREADS (read)
# Arguments: $1 - sample ID (prefix of the *_host_removed_R{1,2}.fastq files)
# Outputs:   index/<ID>.sam and index/<ID>.bam (anvi-init-bam also reports a .bam.bai)
# Returns:   0 on success, 1 as soon as any step fails
#######################################
map_sample() {
    local sample=$1
    local r1="${READSPATH}/${sample}_host_removed_R1.fastq"
    local r2="${READSPATH}/${sample}_host_removed_R2.fastq"
    local sam="index/${sample}.sam"
    local raw_bam="index/${sample}-RAW.bam"

    if [[ ! -f "${r1}" || ! -f "${r2}" ]]; then
        printf 'missing read files for sample %s\n' "${sample}" >&2
        return 1
    fi

    # Align reads to the contigs and collect the alignments in a .sam file.
    # These reads have the host removed but NOT symbionts (which were removed
    # from the contigs), so a somewhat low alignment rate is expected.
    # -x must point at the index prefix, not at the FIXEDCON fasta file.
    bowtie2 --threads "${THREADS}" -x "${SAMPLENAME}_contigs" \
        -1 "${r1}" -2 "${r2}" -S "${sam}" || return 1

    # Convert SAM to BAM, dropping unmapped reads (-F 4). The result is neither
    # sorted nor indexed yet.
    samtools view -F 4 -b -S "${sam}" -o "${raw_bam}" || return 1

    # Sort and index the BAM with the anvi'o helper.
    anvi-init-bam "${raw_bam}" -o "index/${sample}.bam" || return 1

    # The raw BAM is only an intermediate; the .sam file is left in place.
    rm -- "${raw_bam}"
}

failed=0
# The list is read on fd 3 so that no tool inside the loop can consume it via
# stdin; the `|| [[ -n ... ]]` also handles a final line without a newline.
while IFS= read -r -u 3 SAMPLEID || [[ -n "${SAMPLEID}" ]]; do
    SAMPLEID=${SAMPLEID%$'\r'}            # tolerate Windows line endings
    [[ -n "${SAMPLEID}" ]] || continue    # skip blank lines
    if ! map_sample "${SAMPLEID}"; then
        # Keep going with the remaining samples, but remember the failure.
        printf 'WARNING: mapping failed for sample %s\n' "${SAMPLEID}" >&2
        failed=1
    fi
done 3< "${SAMPLELIST}"

# Non-zero exit if any sample failed, so the failure is visible in SLURM.
exit "${failed}"

#bash script: ~/mcav/diseased_mcav/mapping
#JOB ID: 21753567
#Time elapsed:

### Diseased mcav 

In [None]:
#!/bin/bash
#SBATCH -c 24  # Number of Cores per Task
#SBATCH --mem=180G  # Requested Memory
#SBATCH -p cpu  # Partition
#SBATCH -t 20:00:00  # Job time limit
#SBATCH --mail-type=ALL
#SBATCH -o /project/pi_sarah_gignouxwolfsohn_uml_edu/brooke/mcav/diseased_mcav/mapping/slurm-mapping-%j.out  # %j = job ID

# Purpose:
#   1. Build a bowtie2 index from the already-reformatted contigs in the
#      mapping directory (the reformat step itself is disabled below).
#   2. For every sample ID listed in $FILEPATH/diseased_MCAV, align the
#      host-removed paired reads to that index and produce a sorted, indexed
#      BAM file in index/ for downstream anvi'o analysis.
# deflines = sequence definition lines; each one comes directly before its
# associated sequence in a fasta file.

module load miniconda/22.11.1-1
conda activate anvio-8

# Strict mode is switched on only after the environment is active, because
# activation scripts may reference unset variables (assumption - harmless if not).
set -euo pipefail

# Print an error to stderr and stop the job.
die() {
    printf 'ERROR: %s\n' "$*" >&2
    exit 1
}

SAMPLENAME="diseased_mcav"
READSPATH='/project/pi_sarah_gignouxwolfsohn_uml_edu/brooke/mcav/diseased_mcav/assembly/host_removed'
CONTIGPATH='/project/pi_sarah_gignouxwolfsohn_uml_edu/brooke/mcav/diseased_mcav/assembly'
CONTIGFILE="${SAMPLENAME}_filtered.contigs.fasta"
newCONTIGPATH='/project/pi_sarah_gignouxwolfsohn_uml_edu/brooke/mcav/diseased_mcav/mapping'
FILEPATH='/project/pi_sarah_gignouxwolfsohn_uml_edu/brooke/mcav/diseased_mcav'
SAMPLELIST="${FILEPATH}/diseased_MCAV"   # one sample ID per line
# Use every core SLURM granted (-c above); fall back to 11 outside of SLURM.
THREADS="${SLURM_CPUS_PER_TASK:-11}"

# Fixes deflines for later and filters on size (-l 1000 keeps contigs >= 1000 bp;
# need to play around with this cutoff). Left disabled: this script expects the
# reformatted contigs to already exist in the mapping directory.
#anvi-script-reformat-fasta $CONTIGPATH/$CONTIGFILE -o $newCONTIGPATH/"${SAMPLENAME}_filtered.contigs-fixed.fsa" -l 1000 --simplify-names --report-file contig-rename-report-txt

[[ -r "${SAMPLELIST}" ]] || die "sample list not readable: ${SAMPLELIST}"

# Everything below uses paths relative to the mapping directory.
cd "${newCONTIGPATH}" || die "cannot cd to ${newCONTIGPATH}"
# -p: do not error out when index/ is left over from a previous run.
mkdir -p index

# The reformat command above writes a .fsa file, while this script historically
# looked for .fa. Use the .fa file when it is present and otherwise fall back to
# the .fsa file, so the index build does not fail on the extension mismatch.
FIXEDCON="${SAMPLENAME}_filtered.contigs-fixed.fa"
if [[ ! -f "${FIXEDCON}" && -f "${FIXEDCON%.fa}.fsa" ]]; then
    FIXEDCON="${FIXEDCON%.fa}.fsa"
fi
[[ -f "${FIXEDCON}" ]] || die "reformatted contigs not found in ${newCONTIGPATH} (.fa or .fsa)"

# Build the index of the contigs; this only needs to happen once.
# NOTE: an earlier attempt failed when the contigs file was given with a leading
# path and/or a .fa extension (cause not pinned down), so the build is run from
# inside the mapping directory.
bowtie2-build "${FIXEDCON}" "${SAMPLENAME}_contigs"
ls #debugging

#######################################
# Align one sample's host-removed read pair to the contig index and turn the
# result into a sorted + indexed BAM.
# Globals:   READSPATH, SAMPLENAME, THREADS (read)
# Arguments: $1 - sample ID (prefix of the *_host_removed_R{1,2}.fastq files)
# Outputs:   index/<ID>.sam and index/<ID>.bam (anvi-init-bam also reports a .bam.bai)
# Returns:   0 on success, 1 as soon as any step fails
#######################################
map_sample() {
    local sample=$1
    local r1="${READSPATH}/${sample}_host_removed_R1.fastq"
    local r2="${READSPATH}/${sample}_host_removed_R2.fastq"
    local sam="index/${sample}.sam"
    local raw_bam="index/${sample}-RAW.bam"

    if [[ ! -f "${r1}" || ! -f "${r2}" ]]; then
        printf 'missing read files for sample %s\n' "${sample}" >&2
        return 1
    fi

    # Align reads to the contigs and collect the alignments in a .sam file.
    # These reads have the host removed but NOT symbionts (which were removed
    # from the contigs), so a somewhat low alignment rate is expected.
    # -x must point at the index prefix, not at the FIXEDCON fasta file.
    bowtie2 --threads "${THREADS}" -x "${SAMPLENAME}_contigs" \
        -1 "${r1}" -2 "${r2}" -S "${sam}" || return 1

    # Convert SAM to BAM, dropping unmapped reads (-F 4). The result is neither
    # sorted nor indexed yet.
    samtools view -F 4 -b -S "${sam}" -o "${raw_bam}" || return 1

    # Sort and index the BAM with the anvi'o helper.
    anvi-init-bam "${raw_bam}" -o "index/${sample}.bam" || return 1

    # The raw BAM is only an intermediate; the .sam file is left in place.
    rm -- "${raw_bam}"
}

failed=0
# The list is read on fd 3 so that no tool inside the loop can consume it via
# stdin; the `|| [[ -n ... ]]` also handles a final line without a newline.
while IFS= read -r -u 3 SAMPLEID || [[ -n "${SAMPLEID}" ]]; do
    SAMPLEID=${SAMPLEID%$'\r'}            # tolerate Windows line endings
    [[ -n "${SAMPLEID}" ]] || continue    # skip blank lines
    if ! map_sample "${SAMPLEID}"; then
        # Keep going with the remaining samples, but remember the failure.
        printf 'WARNING: mapping failed for sample %s\n' "${SAMPLEID}" >&2
        failed=1
    fi
done 3< "${SAMPLELIST}"

# Non-zero exit if any sample failed, so the failure is visible in SLURM.
exit "${failed}"

#bash script: ~/mcav/diseased_mcav/mapping
#JOB ID: 21753440, 21753947
#Time elapsed:

In [None]:
WHAT WAS THERE
===============================================
Total num contigs ............................: 2,058,317
Total num nucleotides ........................: 1,769,144,061

WHAT WAS ASKED
===============================================
Simplify deflines? ...........................: Yes
Add prefix to sequence names? ................: No
Minimum length of contigs to keep ............: 1,000
Max % gaps allowed ...........................: 100.00%
Max num gaps allowed .........................: 1,000,000
Exclude specific sequences? ..................: No
Keep specific sequences? .....................: No
Enforce sequence type? .......................: No

WHAT HAPPENED
===============================================
Contigs removed ..............................: 1,671,251 (81.20% of all)
Nucleotides removed ..........................: 679,593,620 (38.41% of all)
Nucleotides modified .........................: 0 (0.00000% of all)
Deflines simplified ..........................: True

In [None]:
1837012 reads; of these:
  1837012 (100.00%) were paired; of these:
    1005586 (54.74%) aligned concordantly 0 times
    688840 (37.50%) aligned concordantly exactly 1 time
    142586 (7.76%) aligned concordantly >1 times
    ----
    1005586 pairs aligned concordantly 0 times; of these:
      113638 (11.30%) aligned discordantly 1 time
    ----
    891948 pairs aligned 0 times concordantly or discordantly; of these:
      1783896 mates make up the pairs; of these:
        1564287 (87.69%) aligned 0 times
        156888 (8.79%) aligned exactly 1 time
        62721 (3.52%) aligned >1 times
57.42% overall alignment rate
samtools: /home/brooke_sienkiewicz_student_uml_edu/.conda/envs/anvio-8/bin/../lib/libtinfow.so.6: no version information available (required by samtools)
samtools: /home/brooke_sienkiewicz_student_uml_edu/.conda/envs/anvio-8/bin/../lib/libncursesw.so.6: no version information available (required by samtools)
samtools: /home/brooke_sienkiewicz_student_uml_edu/.conda/envs/anvio-8/bin/../lib/libncursesw.so.6: no version information available (required by samtools)
Sorted BAM File ..............................: /project/pi_sarah_gignouxwolfsohn_uml_edu/brooke/mcav/diseased_mcav/mapping/index/052022_BEL_CBC_T1_70_MCAV.bam
BAM File Index ...............................: /project/pi_sarah_gignouxwolfsohn_uml_edu/brooke/mcav/diseased_mcav/mapping/index/052022_BEL_CBC_T1_70_MCAV.bam.bai
2419949 reads; of these:
  2419949 (100.00%) were paired; of these:
    1100682 (45.48%) aligned concordantly 0 times
    1065716 (44.04%) aligned concordantly exactly 1 time
    253551 (10.48%) aligned concordantly >1 times
    ----
    1100682 pairs aligned concordantly 0 times; of these:
      101806 (9.25%) aligned discordantly 1 time
    ----
    998876 pairs aligned 0 times concordantly or discordantly; of these:
      1997752 mates make up the pairs; of these:
        1711757 (85.68%) aligned 0 times
        196600 (9.84%) aligned exactly 1 time
        89395 (4.47%) aligned >1 times
64.63% overall alignment rate
samtools: /home/brooke_sienkiewicz_student_uml_edu/.conda/envs/anvio-8/bin/../lib/libtinfow.so.6: no version information available (required by samtools)
samtools: /home/brooke_sienkiewicz_student_uml_edu/.conda/envs/anvio-8/bin/../lib/libncursesw.so.6: no version information available (required by samtools)
samtools: /home/brooke_sienkiewicz_student_uml_edu/.conda/envs/anvio-8/bin/../lib/libncursesw.so.6: no version information available (required by samtools)
Sorted BAM File ..............................: /project/pi_sarah_gignouxwolfsohn_uml_edu/brooke/mcav/diseased_mcav/mapping/index/052022_BEL_CBC_T3_49_MCAV.bam
BAM File Index ...............................: /project/pi_sarah_gignouxwolfsohn_uml_edu/brooke/mcav/diseased_mcav/mapping/index/052022_BEL_CBC_T3_49_MCAV.bam.bai
2223749 reads; of these:
  2223749 (100.00%) were paired; of these:
    1170497 (52.64%) aligned concordantly 0 times
    869677 (39.11%) aligned concordantly exactly 1 time
    183575 (8.26%) aligned concordantly >1 times
    ----
    1170497 pairs aligned concordantly 0 times; of these:
      268715 (22.96%) aligned discordantly 1 time
    ----
    901782 pairs aligned 0 times concordantly or discordantly; of these:
      1803564 mates make up the pairs; of these:
        1454932 (80.67%) aligned 0 times
        212458 (11.78%) aligned exactly 1 time
        136174 (7.55%) aligned >1 times
67.29% overall alignment rate
samtools: /home/brooke_sienkiewicz_student_uml_edu/.conda/envs/anvio-8/bin/../lib/libtinfow.so.6: no version information available (required by samtools)
samtools: /home/brooke_sienkiewicz_student_uml_edu/.conda/envs/anvio-8/bin/../lib/libncursesw.so.6: no version information available (required by samtools)
samtools: /home/brooke_sienkiewicz_student_uml_edu/.conda/envs/anvio-8/bin/../lib/libncursesw.so.6: no version information available (required by samtools)
Sorted BAM File ..............................: /project/pi_sarah_gignouxwolfsohn_uml_edu/brooke/mcav/diseased_mcav/mapping/index/052022_BEL_CBC_T3_51_MCAV.bam
BAM File Index ...............................: /project/pi_sarah_gignouxwolfsohn_uml_edu/brooke/mcav/diseased_mcav/mapping/index/052022_BEL_CBC_T3_51_MCAV.bam.bai
75625794 reads; of these:
  75625794 (100.00%) were paired; of these:
    21352018 (28.23%) aligned concordantly 0 times
    30095005 (39.79%) aligned concordantly exactly 1 time
    24178771 (31.97%) aligned concordantly >1 times
    ----
    21352018 pairs aligned concordantly 0 times; of these:
      3825538 (17.92%) aligned discordantly 1 time
    ----
    17526480 pairs aligned 0 times concordantly or discordantly; of these:
      35052960 mates make up the pairs; of these:
        20762952 (59.23%) aligned 0 times
        5197142 (14.83%) aligned exactly 1 time
        9092866 (25.94%) aligned >1 times
86.27% overall alignment rate
samtools: /home/brooke_sienkiewicz_student_uml_edu/.conda/envs/anvio-8/bin/../lib/libtinfow.so.6: no version information available (required by samtools)
samtools: /home/brooke_sienkiewicz_student_uml_edu/.conda/envs/anvio-8/bin/../lib/libncursesw.so.6: no version information available (required by samtools)
samtools: /home/brooke_sienkiewicz_student_uml_edu/.conda/envs/anvio-8/bin/../lib/libncursesw.so.6: no version information available (required by samtools)
Sorted BAM File ..............................: /project/pi_sarah_gignouxwolfsohn_uml_edu/brooke/mcav/diseased_mcav/mapping/index/052022_BEL_CBC_T3_60_MCAV.bam
BAM File Index ...............................: /project/pi_sarah_gignouxwolfsohn_uml_edu/brooke/mcav/diseased_mcav/mapping/index/052022_BEL_CBC_T3_60_MCAV.bam.bai
19946893 reads; of these:
  19946893 (100.00%) were paired; of these:
    7558356 (37.89%) aligned concordantly 0 times
    9600843 (48.13%) aligned concordantly exactly 1 time
    2787694 (13.98%) aligned concordantly >1 times
    ----
    7558356 pairs aligned concordantly 0 times; of these:
      1962157 (25.96%) aligned discordantly 1 time
    ----
    5596199 pairs aligned 0 times concordantly or discordantly; of these:
      11192398 mates make up the pairs; of these:
        7787485 (69.58%) aligned 0 times
        1804624 (16.12%) aligned exactly 1 time
        1600289 (14.30%) aligned >1 times
80.48% overall alignment rate
samtools: /home/brooke_sienkiewicz_student_uml_edu/.conda/envs/anvio-8/bin/../lib/libtinfow.so.6: no version information available (required by samtools)
samtools: /home/brooke_sienkiewicz_student_uml_edu/.conda/envs/anvio-8/bin/../lib/libncursesw.so.6: no version information available (required by samtools)
samtools: /home/brooke_sienkiewicz_student_uml_edu/.conda/envs/anvio-8/bin/../lib/libncursesw.so.6: no version information available (required by samtools)
Sorted BAM File ..............................: /project/pi_sarah_gignouxwolfsohn_uml_edu/brooke/mcav/diseased_mcav/mapping/index/052022_BEL_CBC_T1_13_MCAV.bam
BAM File Index ...............................: /project/pi_sarah_gignouxwolfsohn_uml_edu/brooke/mcav/diseased_mcav/mapping/index/052022_BEL_CBC_T1_13_MCAV.bam.bai