### QC/trimming mcav 04142023


In [None]:
#INSTALLATION
module load miniconda/22.11.1-1
conda create -n qc
conda activate qc
conda install -c bioconda trim-galore

In [None]:
# edit past seq names to remove what the sequencer added
# in 04142023 dir: mcav 

#ex of filenames to edit:
# set2T1_12_2022_MCAV_S73_R1_001.fastq.gz

# working in dir: '/project/pi_sarah_gignouxwolfsohn_uml_edu/Raw_sequences/SCTLD_raw/04142023'
#copy mcav samples without '.md5' endings
ls *OFAV*.fastq.gz -1 | xargs -I {} cp {} /project/pi_sarah_gignouxwolfsohn_uml_edu/Raw_sequences/SCTLD_raw/renamed/11272023/

#move subset to new dir 
for file in *OFAV*.fastq.gz; do
    new_name=$(echo "$file" | sed 's/set2//;s/_S[0-9]*//')
    mv "$file" "$new_name"
done

FILEPATH='/project/pi_sarah_gignouxwolfsohn_uml_edu/brooke/mcav/raw'
OUTPUT_RESULTS='/project/pi_sarah_gignouxwolfsohn_uml_edu/brooke/mcav/trimmed/redo_01222023'

# create a file that gives all the unique filenames in the directory with the data, without the f/r specifiers
ls $FILEPATH -1 | sed 's/_R.*_001.fastq.gz//' | uniq | cat > $OUTPUT_RESULTS/'mcav_sampleids.txt'

#ensure all samples got copied over 
ls $FILEPATH -1 | wc -l # should be double:
wc -l mcav_sampleids.txt 

In [None]:
#!/bin/bash
#SBATCH -c 24  # Number of Cores per Task
#SBATCH --mem=50G  # Requested Memory
#SBATCH -p cpu  # Partition
#SBATCH -t 24:00:00  # Job time limit
#SBATCH --mail-type=ALL
#SBATCH -o /project/pi_sarah_gignouxwolfsohn_uml_edu/brooke/bash_scripts/slurm-mcav_qc-%j.out  # %j = job ID

# Run qc with trim galore and fastqc
module load miniconda/22.11.1-1
conda activate qc

# Define the paths and variables
FILEPATH='/project/pi_sarah_gignouxwolfsohn_uml_edu/Raw_sequences/SCTLD_raw/renamed/mcav'
OUTPUT_RESULTS='/project/pi_sarah_gignouxwolfsohn_uml_edu/brooke/mcav/trimmed/redo_auto_detect_01312024'
NSLOTS=4  

#create filename if not already created
#ls $FILEPATH -1 | sed 's/_R.*_001.fastq.gz//' | uniq | cat > $OUTPUT_RESULTS/'mcav_sampleids.txt'

SAMPLE_NAMES_FILE="/project/pi_sarah_gignouxwolfsohn_uml_edu/brooke/mcav/trimmed/01312024_redo/mcav_sampleids.txt"

# Check if the file exists
if [ ! -e "$SAMPLE_NAMES_FILE" ]; then
    echo "Error: $SAMPLE_NAMES_FILE does not exist."
    exit 1
fi

# Read each line from the file and perform actions
while IFS= read -r sample_id; do
    # Form the full file names
    input_r1="$FILEPATH/${sample_id}_R1_001.fastq.gz"
    input_r2="$FILEPATH/${sample_id}_R2_001.fastq.gz"
    
    # Ensure the input files exist before running the tools
    if [ ! -e "$input_r1" ] || [ ! -e "$input_r2" ]; then
        echo "Error: Input files do not exist for sample $sample_id"
        continue
    fi

    # Run trim_galore
    trim_galore -j "$NSLOTS" -q 20 --phred33 --length 20 --paired $input_r1 $input_r2 --fastqc -o $OUTPUT_RESULTS --dont_gzip


done < "$SAMPLE_NAMES_FILE"

# bash script file name: mcav_qc
# JOB-ID: 18196048
#trimmed read seqs in folder: /project/pi_sarah_gignouxwolfsohn_uml_edu/brooke/mcav/trimmed/redo_auto_detect_01312024

### Fastqc info 

https://hbctraining.github.io/Intro-to-rnaseq-hpc-salmon/lessons/qc_fastqc_assessment.html

https://www.bioinformatics.babraham.ac.uk/projects/fastqc/

### Multiqc 
- https://github.com/MultiQC/MultiQC
- summarizes fastqc reports to one html file
- doesn't perform analysis just summarizes already existing reports

In [None]:
# Installation
module load miniconda/22.11.1-1
conda create --name multiqc python=3.11
conda activate multiqc
conda install multiqc -y 

In [None]:
module load miniconda/22.11.1-1
conda activate multiqc 

In [None]:
# in brooke dir
cd mcav/trimmed/01312024_redo
multiqc . 

Tried illumina, nextera, and auto detect for trimming adapters with trim galore and the least amount of adapter contamination came from auto detect and illumina (results were the same), so using auto detect

In [None]:
# calculate read depth?

In [7]:
cd //project/pi_sarah_gignouxwolfsohn_uml_edu/Raw_sequences/SCTLD_raw/renamed/mcav/

/project/pi_sarah_gignouxwolfsohn_uml_edu/Raw_sequences/SCTLD_raw/renamed/mcav


In [33]:
import os
import pandas as pd

In [10]:
for FILE in *; do echo $(zcat $FILE|wc -l)/4|bc; done > reads

/bin/bash: -c: line 0: syntax error near unexpected token `reads'
/bin/bash: -c: line 0: `for FILE in *; do echo $(zcat $FILE|wc -l)/4|bc; done > !cat reads'


In [21]:
!ls *.gz > samples

In [25]:
!paste samples reads > sample_reads

In [29]:
!head -n -1 sample_reads > sample_reads

In [34]:
cd //project/pi_sarah_gignouxwolfsohn_uml_edu/Raw_sequences/SCTLD_raw/renamed/11272023/

/project/pi_sarah_gignouxwolfsohn_uml_edu/Raw_sequences/SCTLD_raw/renamed/11272023


In [None]:
for FILE in *; do echo $(zcat $FILE|wc -l)/4|bc; done > reads
ls *.gz > samples
paste samples reads > sample_reads

In [35]:
cd //project/pi_sarah_gignouxwolfsohn_uml_edu/Raw_sequences/SCTLD_raw/renamed/pilot_copy/

[0m[01;34m11272023[0m/  [01;34mmcav[0m/  [01;34mpilot_copy[0m/


In [None]:
for FILE in *; do echo $(cat $FILE|wc -l)/4|bc; done > reads
ls *.gz > samples
paste samples reads > sample_reads