### QC/trimming pilot_past


In [None]:
# INSTALLATION (one-time setup)
# Creates the "qc" conda environment that the QC/trimming job script activates
# and installs Trim Galore into it.
module load miniconda/22.11.1-1
conda create -n qc
conda activate qc
# Bioconda packages resolve their dependencies from conda-forge, so both
# channels are given; the first -c listed takes priority.
conda install -c conda-forge -c bioconda trim-galore

In [None]:
# make sample list
# Writes one line per sample (the FASTQ file name without its _R1/_R2 read
# specifier) to pp_sampleids.txt, which the QC job script reads.

FILEPATH='/project/pi_sarah_gignouxwolfsohn_uml_edu/Raw_sequences/SCTLD_raw/renamed/pilot_copy'
OUTPUT_RESULTS='/project/pi_sarah_gignouxwolfsohn_uml_edu/brooke/pilot_past'

#######################################
# Print the unique sample IDs found in a directory, one per line, sorted.
# Only files named <id>_R1_001.fastq or <id>_R2_001.fastq are considered, and
# only that exact suffix is stripped, so IDs that themselves contain "_R"
# are kept intact and unrelated files are ignored.
# Arguments: $1 - directory holding the raw FASTQ files
# Outputs:   sample IDs on stdout (nothing if no file matches)
#######################################
list_sample_ids() {
    local dir=$1 path name
    for path in "$dir"/*_R[12]_001.fastq; do
        # an unmatched glob stays literal; skip it instead of emitting junk
        [[ -e "$path" ]] || continue
        name=${path##*/}
        printf '%s\n' "${name%_R[12]_001.fastq}"
    done | sort -u
}

if [[ -d "$FILEPATH" && -d "$OUTPUT_RESULTS" ]]; then
    # create a file that gives all the unique filenames in the directory with the data, without the f/r specifiers
    list_sample_ids "$FILEPATH" > "$OUTPUT_RESULTS/pp_sampleids.txt"

    # ensure all samples got copied over
    find "$FILEPATH" -maxdepth 1 -name '*_R[12]_001.fastq' | wc -l # should be double:
    wc -l "$OUTPUT_RESULTS/pp_sampleids.txt"
else
    printf 'Error: %s or %s is not a directory.\n' "$FILEPATH" "$OUTPUT_RESULTS" >&2
fi

In [None]:
#!/bin/bash
#SBATCH -c 24  # Number of Cores per Task
#SBATCH --mem=50G  # Requested Memory
#SBATCH -p cpu  # Partition
#SBATCH -t 24:00:00  # Job time limit
#SBATCH --mail-type=ALL
#SBATCH -o /project/pi_sarah_gignouxwolfsohn_uml_edu/brooke/pilot_past/slurm-qc-%j.out  # %j = job ID

# Run qc with trim galore and fastqc
# For every sample ID listed in pp_sampleids.txt, quality/adapter-trims the
# paired reads with Trim Galore (which also runs FastQC on the trimmed reads),
# then summarizes the reports in trimmed_redo with MultiQC.
# Exits non-zero if the sample list is missing, the output directory cannot be
# created or entered, or any sample could not be processed.
module load miniconda/22.11.1-1
conda activate qc

# Define the paths and variables
FILEPATH='/project/pi_sarah_gignouxwolfsohn_uml_edu/Raw_sequences/SCTLD_raw/renamed/pilot_copy'
OUTPUT_RESULTS='/project/pi_sarah_gignouxwolfsohn_uml_edu/brooke/pilot_past'
# NOTE(review): the Trim Galore user guide reports diminishing returns above
# 4 cores and that each core brings extra helper processes, so 16 may
# oversubscribe the 24 cores requested above -- confirm before the next run.
NSLOTS=16
TRIMMED_DIR="${OUTPUT_RESULTS}/trimmed_redo"

#create filename if not already created
#ls $FILEPATH -1 | sed 's/_R.*_001.fastq.gz//' | uniq | cat > $OUTPUT_RESULTS/'mcav_sampleids.txt'

SAMPLE_NAMES_FILE="${OUTPUT_RESULTS}/pp_sampleids.txt"
# -p: do not fail when the directory is left over from an earlier run
mkdir -p "$TRIMMED_DIR" || exit 1

# Check if the file exists
if [[ ! -e "$SAMPLE_NAMES_FILE" ]]; then
    echo "Error: $SAMPLE_NAMES_FILE does not exist." >&2
    exit 1
fi

# Number of samples that were skipped or failed; decides the job's exit status
failed=0

# Read each line from the file and perform actions
# (the `|| [[ -n ... ]]` keeps a last line that has no trailing newline)
while IFS= read -r sample_id || [[ -n "$sample_id" ]]; do
    # Ignore blank lines in the sample list
    [[ -n "$sample_id" ]] || continue

    # Form the full file names
    input_r1="$FILEPATH/${sample_id}_R1_001.fastq"
    input_r2="$FILEPATH/${sample_id}_R2_001.fastq"

    # Ensure the input files exist before running the tools
    if [[ ! -e "$input_r1" || ! -e "$input_r2" ]]; then
        echo "Error: Input files do not exist for sample $sample_id" >&2
        failed=$((failed + 1))
        continue
    fi

    # Run trim_galore
    # stdin comes from /dev/null so the tool can never consume the sample list
    # that the surrounding loop is reading.
    if ! trim_galore -j "$NSLOTS" -q 20 --phred33 --length 20 --paired \
            --fastqc -o "$TRIMMED_DIR" --dont_gzip \
            "$input_r1" "$input_r2" < /dev/null; then
        echo "Error: trim_galore failed for sample $sample_id" >&2
        failed=$((failed + 1))
    fi
done < "$SAMPLE_NAMES_FILE"

# run multiqc to view qc results
conda deactivate
conda activate multiqc

# Stop if the directory cannot be entered, so multiqc never scans the wrong place
cd "$TRIMMED_DIR" || exit 1
multiqc .

# Report problems through the exit status so the SLURM mail shows a failure
if (( failed > 0 )); then
    echo "Error: $failed sample(s) could not be processed" >&2
    exit 1
fi

# bash script file name: qc
# JOB-ID: 22262655
#trimmed read seqs in folder: brooke/pilot_past/trimmed_redo

### FastQC info

https://hbctraining.github.io/Intro-to-rnaseq-hpc-salmon/lessons/qc_fastqc_assessment.html

https://www.bioinformatics.babraham.ac.uk/projects/fastqc/

### MultiQC
- https://github.com/MultiQC/MultiQC
- Summarizes FastQC reports into a single HTML file
- Doesn't perform any analysis itself; it only summarizes already existing reports

file:///Users/brookesienkiewicz/Downloads/multiqc_report%20(6).html

062019-T3-21-mmea & 062019-t3-4-past
- Reads are too long: 300 bp instead of 150 bp. These samples were likely among the first sets and were sequenced differently, so they are EXCLUDED from further analysis.

062019-T3-6-MCAV
- Non-normally distributed GC content