### QC/trimming 11/27/23 sequenced samples
- PSTR, OFAV, OANN

In [None]:
# INSTALLATION (one-time setup of the `qc` conda environment)
module load miniconda/22.11.1-1
conda create -n qc
conda activate qc
# Bioconda packages are built against conda-forge, so conda-forge is listed
# first; without it the solver may fail or pick mismatched dependencies.
conda install -c conda-forge -c bioconda trim-galore
# The QC scripts also call `fastqc` directly; it is expected to arrive as a
# trim-galore dependency -- confirm with `command -v fastqc` after installing.

In [None]:
#!/bin/bash
#SBATCH -c 24  # Number of Cores per Task
#SBATCH --mem=50G  # Requested Memory
#SBATCH -p cpu  # Partition
#SBATCH -t 24:00:00  # Job time limit
#SBATCH --mail-type=ALL
#SBATCH -o /project/pi_sarah_gignouxwolfsohn_uml_edu/brooke/11_2023bash_scripts/slurm-%j.out  # %j = job ID

# Quality/adapter-trims every paired-end sample found in FILEPATH with
# trim_galore and writes the (uncompressed) trimmed reads to OUTPUT_RESULTS.
# A list of the sample ids that were found is saved next to the results.

module load miniconda/22.11.1-1
conda activate qc

# Define the paths and variables
FILEPATH='/project/pi_sarah_gignouxwolfsohn_uml_edu/Raw_sequences/SCTLD_raw/11272023'
# Relative path: resolved against the job's working directory.
OUTPUT_RESULTS='11272023/trimmed/redo_01222023'
NSLOTS=4
# Suffixes the sequencer appends to each sample id.
R1_SUFFIX='_R1_001.fastq.gz'
R2_SUFFIX='_R2_001.fastq.gz'

# -p creates missing parent directories and does not fail when the job is re-run.
if ! mkdir -p "$OUTPUT_RESULTS"; then
    echo "Error: could not create output directory $OUTPUT_RESULTS" >&2
    exit 1
fi

# Derived from OUTPUT_RESULTS so the file written below is the same file the
# loop reads back, whatever directory the job was submitted from.
SAMPLE_NAMES_FILE="$OUTPUT_RESULTS/11272023_sampleids.txt"

# Create txt file with sampleids.
# Only R1 files are globbed, so each sample is listed once and side-files
# (e.g. *.md5) are ignored; the full suffix including ".gz" is stripped so the
# id can be re-joined with R1_SUFFIX / R2_SUFFIX below.
shopt -s nullglob
r1_files=("$FILEPATH"/*"$R1_SUFFIX")
shopt -u nullglob

if (( ${#r1_files[@]} == 0 )); then
    echo "Error: no *$R1_SUFFIX files found in $FILEPATH" >&2
    exit 1
fi

for r1_path in "${r1_files[@]}"; do
    r1_name=${r1_path##*/}                      # drop the directory part
    printf '%s\n' "${r1_name%"$R1_SUFFIX"}"     # drop the sequencer suffix
done > "$SAMPLE_NAMES_FILE"

# Check if the file exists
if [[ ! -e "$SAMPLE_NAMES_FILE" ]]; then
    echo "Error: $SAMPLE_NAMES_FILE does not exist." >&2
    exit 1
fi

# Read each line from the file and perform actions
while IFS= read -r sample_id; do
    # Form the full file names
    input_r1="$FILEPATH/${sample_id}${R1_SUFFIX}"
    input_r2="$FILEPATH/${sample_id}${R2_SUFFIX}"

    # Ensure the input files exist before running the tools
    if [[ ! -e "$input_r1" || ! -e "$input_r2" ]]; then
        echo "Error: Input files do not exist for sample $sample_id" >&2
        continue
    fi

    # Run trim_galore; a failure is reported but does not stop the remaining samples
    if ! trim_galore -j "$NSLOTS" -q 20 --phred33 --illumina --length 20 --dont_gzip \
        -o "$OUTPUT_RESULTS" --paired "$input_r1" "$input_r2"; then
        echo "Error: trim_galore failed for sample $sample_id" >&2
    fi

done < "$SAMPLE_NAMES_FILE"

# JOB-ID: 
# earlier note: trimmed read seqs in folder: /project/pi_sarah_gignouxwolfsohn_uml_edu/brooke/working/11272023/trimmed
# NOTE(review): that folder differs from OUTPUT_RESULTS above -- confirm which location is current.

### Misc Samples
- HAVE NOT RUN YET 12/11/23
- PAST samples in the 04/14/23 folder (`04142023`)
    - (supposed to be alongside MCAV from mcav1 dir)
- also grabbing pilot seqs

In [None]:
# edit pilot seq names to remove what the sequencer added

#file names to edit (presumably one R1 and one R2 file per prefix):
#T3-21-Mmea_S96_L001
#T3-21-Mmea_S96_L001
#T3-4-Past_S113_L001
#T3-4-Past_S113_L001

# Print the cleaned version of one file name:
#   - removes the "_S<number>_L001" tag added by the sequencer
#   - replaces every "-" with "_"
# The tag pattern requires digits after "_S" so an "_S..." that is part of the
# sample name itself is left alone.
clean_pilot_name() {
    printf '%s\n' "$1" | sed -E 's/_S[0-9]+_L001//; s/-/_/g'
}

# working in dir: '/project/pi_sarah_gignouxwolfsohn_uml_edu/Raw_sequences/SCTLD_raw/pilot_copy' 
for file in *; do
    # only regular files; also covers the case where the directory is empty
    if [[ ! -f "$file" ]]; then
        continue
    fi

    new_name=$(clean_pilot_name "$file")

    # nothing to strip: skip instead of letting mv fail with "same file"
    if [[ "$new_name" == "$file" ]]; then
        continue
    fi

    # never overwrite a file that already carries the cleaned name
    if [[ -e "$new_name" ]]; then
        echo "Skipping $file: $new_name already exists" >&2
        continue
    fi

    mv -- "$file" "$new_name"
done


In [None]:
# edit past seq names to remove what the sequencer added
# in 04142023 dir

#ex of filenames to edit:
# set2T1_1_2019_PAST_S60_R1_001.fastq.gz
# set2T1_1_2019_PAST_S60_R2_001.fastq.gz

# Source and destination are spelled out so the raw files in 04142023 are never
# renamed or decompressed in place, whatever the current directory is.
PAST_SRC_DIR='/project/pi_sarah_gignouxwolfsohn_uml_edu/Raw_sequences/SCTLD_raw/04142023'
PAST_DEST_DIR='/project/pi_sarah_gignouxwolfsohn_uml_edu/Raw_sequences/SCTLD_raw/pilot_copy'

# Print the cleaned version of one file name:
#   - removes the "set2" prefix
#   - removes the "_S<number>" sample-sheet tag added by the sequencer
# Digits and the following "_" are required, so a bare "_S" inside the sample
# name is not touched.
clean_past_name() {
    printf '%s\n' "$1" | sed -E 's/set2//; s/_S[0-9]+_/_/'
}

#mcav samples have already been trimmed, and there are repeat samples with '.md5' endings
#so, copy all 'past' files with .fastq.gz extensions to new folder
# (the glob ends in .fastq.gz, which leaves the .md5 files behind)
if [[ ! -d "$PAST_SRC_DIR" || ! -d "$PAST_DEST_DIR" ]]; then
    echo "Error: $PAST_SRC_DIR or $PAST_DEST_DIR is not a directory" >&2
else
    for src in "$PAST_SRC_DIR"/*PAST*.fastq.gz; do
        # unmatched glob stays literal; skip it
        if [[ ! -e "$src" ]]; then
            continue
        fi

        new_name=$(clean_past_name "${src##*/}")
        dest="$PAST_DEST_DIR/$new_name"

        # do not clobber a previous copy (compressed or already gunzipped)
        if [[ -e "$dest" || -e "${dest%.gz}" ]]; then
            echo "Skipping $src: $new_name already present in $PAST_DEST_DIR" >&2
            continue
        fi

        # copy under the cleaned name, then decompress the copy only
        if cp -- "$src" "$dest"; then
            gunzip -- "$dest"
        else
            echo "Error: could not copy $src" >&2
        fi
    done
fi

In [None]:
#!/bin/bash
#SBATCH -c 24  # Number of Cores per Task
#SBATCH --mem=50G  # Requested Memory
#SBATCH -p cpu  # Partition
#SBATCH -t 24:00:00  # Job time limit
#SBATCH --mail-type=ALL
#SBATCH -o /project/pi_sarah_gignouxwolfsohn_uml_edu/brooke/bash_scripts/slurm-%j-pilot_past_qc.out  # %j = job ID

# For every sample id listed in SAMPLE_NAMES_FILE: trims the paired reads with
# trim_galore (plus FastQC on the trimmed output) and runs FastQC on the raw
# reads. All reports and trimmed reads are written to OUTPUT_RESULTS.

# PILOT & PAST SEQS combined into one dir now:

FILEPATH='/project/pi_sarah_gignouxwolfsohn_uml_edu/Raw_sequences/SCTLD_raw/pilot_copy'
OUTPUT_WORKING='/project/pi_sarah_gignouxwolfsohn_uml_edu/brooke/pilot_past'
OUTPUT_RESULTS='/project/pi_sarah_gignouxwolfsohn_uml_edu/brooke/pilot_past/trimmed'
NSLOTS=5

module load miniconda/22.11.1-1
conda activate qc 

# this line creates a file that gives all the filenames in the directory with the data, without the suffixes added by the sequencer
#ls $FILEPATH -1 | sed 's/_R.*_001.fastq//' | cat > $OUTPUT_WORKING/'pilot_past.txt'
# keep unique sampleids 
#uniq pilot_past/pilot_past.txt > pilot_past/pilot_sampleids.txt

SAMPLE_NAMES_FILE="/project/pi_sarah_gignouxwolfsohn_uml_edu/brooke/pilot_past/pilot_sampleids.txt"

# Check if the file exists
if [[ ! -e "$SAMPLE_NAMES_FILE" ]]; then
    echo "Error: $SAMPLE_NAMES_FILE does not exist." >&2
    exit 1
fi

# Make sure the report directory exists before any tool writes to it
if ! mkdir -p "$OUTPUT_RESULTS"; then
    echo "Error: could not create output directory $OUTPUT_RESULTS" >&2
    exit 1
fi

# Read each line from the file and perform actions
# (the `|| [[ -n ... ]]` part keeps a last line that has no trailing newline)
while IFS= read -r sample_id || [[ -n "$sample_id" ]]; do
    # A blank line is not a sample
    if [[ -z "$sample_id" ]]; then
        continue
    fi

    # Form the full file names (reads in pilot_copy are already gunzipped)
    input_r1="$FILEPATH/${sample_id}_R1_001.fastq"
    input_r2="$FILEPATH/${sample_id}_R2_001.fastq"

    # Ensure the input files exist before running the tools
    if [[ ! -e "$input_r1" || ! -e "$input_r2" ]]; then
        echo "Error: Input files do not exist for sample $sample_id" >&2
        continue
    fi

    # Run trim_galore; --fastqc makes it run FastQC on the TRIMMED reads once
    # trimming is complete. A failure is reported but does not stop the job.
    if ! trim_galore -j "$NSLOTS" -q 20 --phred33 --illumina --length 20 --dont_gzip \
        --fastqc -o "$OUTPUT_RESULTS" --paired "$input_r1" "$input_r2"; then
        echo "Error: trim_galore failed for sample $sample_id" >&2
    fi

    # Run fastqc on the RAW reads. This is not a repeat of --fastqc above:
    # together they give a before/after-trimming report for each sample.
    fastqc -o "$OUTPUT_RESULTS" -t "$NSLOTS" "$input_r1" "$input_r2"

done < "$SAMPLE_NAMES_FILE"

# JOB-ID: 17115682
#trimmed read seqs in folder: brooke/pilot_past/trimmed

### Fastqc info 

https://hbctraining.github.io/Intro-to-rnaseq-hpc-salmon/lessons/qc_fastqc_assessment.html

https://www.bioinformatics.babraham.ac.uk/projects/fastqc/

fastqc checklist for pilot_past seqs 
T1_1_2019_PAST  - R1 DONE
                - R2 DONE
T1_1_2022_PAST  - R1 DONE
                - R2 DONE
T1_3_MCAV       - R1 DOESN'T LOOK GOOD... RETRIM?
- possible adapter contamination; retry trimming with Trim Galore's Nextera adapter option (`--nextera`) instead of `--illumina`?
T1_52_2019_PAST
T1_8_2019_PAST
T2_11_2019_PAST
T2_21_2019_PAST
T2_5_MCAV
T3_16_MCAV
T3_21_Mmea
T3_25_2019_PAST
T3_31_2019_PAST
T3_4_Past
T3_6_MCAV
T3_7_2019_PAST

### Misc code

In [None]:
#find unique sample IDs in new raw seqs 11272023
ls | grep -i 'R1_001.fastq.gz' | grep -v '.md5' > //project/pi_sarah_gignouxwolfsohn_uml_edu/brooke/working/11272023/11-23-samples.txt
#ex:
ls $FILEPATH/*.fastq.gz -1 | sed 's/_R*_001.fastq.gz//' |cat > $OUTPUT_WORKING/sample_names.txt

cut -d_ -f 1-4 old_file
This simply means use _ as delimiter, and keep fields 1-4.
set2T3_7_2019_PAST_S61_R2_001.fastq.gz

ls $FILEPATH/*_PAST_*.fastq.gz -1 | sed 's/.*set2\(.*\)_PAST_.*_R.*_001\.fastq.gz/\1/' | sed 's/set2//' > $OUTPUT_WORKING/sample_names.txt

#specifically past samples in 041423 
ls $FILEPATH/*_PAST_*.fastq.gz -1 | sed 's/set2//' | cut -d_ -f 2-5,7 | cat > $OUTPUT_WORKING/sample_names.txt