## Read processing from SOX10 KD library

In [None]:
# Work from the project's counting directory: every relative path used in the
# cells below (CHEQseq_names.txt, 01.fastqc, 02.clean_reads, 10.bc_count, ...)
# is resolved against it.
cd /staging/leuven/stg_00002/lcb/lcb_projects/CSE/SOX10-KD_library/counting

### 01.fastqc

In [None]:
# Run FastQC on the raw R1 reads of every sample listed in CHEQseq_names.txt
# (one sample name per line). Reports are written to 01.fastqc/.
module load FastQC/0.11.8-Java-1.8.0_162
# FastQC does not create its output directory, so make sure it exists first.
mkdir -p 01.fastqc
# `read -r` keeps backslashes literal; the `|| [ -n ... ]` clause also handles a
# last line that is not newline-terminated.
while read -r line || [ -n "${line}" ]
do
    # Skip blank lines: an empty name would turn the glob below into "*R1*"
    # and match every sample of the run.
    [ -n "${line}" ] || continue
    # Glob for the demultiplexed R1 file(s): the sample name with its trailing
    # "CHEQ..." part stripped, followed by anything containing "R1".
    HiSeq4000_190222="/staging/leuven/stg_00002/lcb/ngs_runs/HiSeq4000_20190222/Demultiplexed/${line%CHEQ*}*R1*"
    # The variable is deliberately left unquoted so the shell expands the glob.
    fastqc -t 10 ${HiSeq4000_190222} -o 01.fastqc
done < CHEQseq_names.txt

### 02.clean_reads

In [None]:
# Trim the R1 reads of every sample in CHEQseq_names.txt with cutadapt and
# report the elapsed wall-clock time in seconds.
module use /data/leuven/software/biomed/skylake_centos7/2018a/modules/all/
module load SeqKit/0.10.2
module load cutadapt/1.18-foss-2018a-Python-3.7.0
# The trimmed reads are written here; create it so cutadapt's -o cannot fail
# on a missing directory.
mkdir -p 02.clean_reads
start=$(date +%s)
# `read -r` keeps backslashes literal; the `|| [ -n ... ]` clause also handles a
# last line that is not newline-terminated.
while read -r line || [ -n "${line}" ]
do
    # Skip blank lines: an empty name would turn the glob below into "*R1*"
    # and match every sample of the run.
    [ -n "${line}" ] || continue
    # Glob for the demultiplexed R1 file(s) of this sample.
    HiSeq4000_190222="/staging/leuven/stg_00002/lcb/ngs_runs/HiSeq4000_20190222/Demultiplexed/${line%CHEQ*}*R1*"
    # Do cutadapt trimming on R1 reads:
    #   -g A...B             linked adapter (5' part ... 3' part)
    #   -e 0.15              maximum allowed error rate when matching the adapter
    #   -l 17 / -m 17        shorten reads to 17 nt and drop anything shorter
    #   --discard-untrimmed  drop reads in which the adapter was not found
    # The input variable is deliberately left unquoted so the glob expands.
    cutadapt -g GAGCCGAGCAGGCGCGCCGATC...GGAC -e 0.15 -l 17 -m 17 -j 10 --discard-untrimmed \
             -o "02.clean_reads/${line}_R1.fastq.gz" \
             ${HiSeq4000_190222}
done < CHEQseq_names.txt
end=$(date +%s)
# Elapsed time in seconds.
echo $((end-start))

In [None]:
# Quality-filter the trimmed reads of every sample with fastp and print how
# many reads survive.
module load fastp/0.20.0-foss-2018a
# `read -r` keeps backslashes literal; the `|| [ -n ... ]` clause also handles a
# last line that is not newline-terminated.
while read -r line || [ -n "${line}" ]
do
    # Skip blank lines so no "_R1.fastq.gz" file with an empty sample name is processed.
    [ -n "${line}" ] || continue
    # Filter for Q30 reads:
    #   -l 17   minimum read length
    #   -e 30   minimum average read quality
    #   -h/-j   HTML/JSON reports are not wanted; send them to /dev/null.
    #           (/dev/null is a file, not a directory, so a path such as
    #           /dev/null/fastp.html is invalid.)
    fastp -i "02.clean_reads/${line}_R1.fastq.gz" -l 17 -e 30 -h /dev/null -j /dev/null -w 8 -o "02.clean_reads/${line}_q30.fastq.gz"
    # A FASTQ record spans 4 lines, hence line count / 4 = read count.
    echo "Number of reads after Q30 filtering for ${line}: " $(( $(zcat "02.clean_reads/${line}_q30.fastq.gz" | wc -l) / 4 ))
done < CHEQseq_names.txt

### 10.Make BC count matrix

In [None]:
# Extract DNA sequences from fastq: keep only the sequence line of each
# 4-line FASTQ record and write one sequence per line.
# The redirection below fails if the output directory is missing, so create it.
mkdir -p 03.sequencing_saturation
# `read -r` keeps backslashes literal; the `|| [ -n ... ]` clause also handles a
# last line that is not newline-terminated.
while read -r line || [ -n "${line}" ]
do
    # Skip blank lines in the sample list.
    [ -n "${line}" ] || continue
    # '2~4p' (GNU sed) prints line 2 and every 4th line after it.
    zcat "02.clean_reads/${line}_q30.fastq.gz" | sed -n '2~4p' > "03.sequencing_saturation/${line}_q30.txt"
done < CHEQseq_names.txt

In [None]:
# Generate the BC count matrix for each sample: one "<sequence><TAB><count>"
# line per distinct sequence found in 03.sequencing_saturation/<sample>_q30.txt.
# The output directory must exist: stderr of the parallel call is discarded
# below, so a failing redirection would otherwise go unnoticed.
mkdir -p 10.bc_count
# One job per sample name; {} is replaced by the sample name. `uniq -c` emits
# "<count> <sequence>", which awk reorders to "<sequence><TAB><count>".
# NOTE(review): 2>/dev/null hides all messages from parallel and its jobs,
# including real errors - confirm this is intended.
parallel \
"cat 03.sequencing_saturation/{}_q30.txt | sort | uniq -c | awk -F \" \" '{print \$2\"\t\"\$1}' \
   > 10.bc_count/{}_count.txt" ::: $(cat CHEQseq_names.txt) 2>/dev/null

# Count number of unique BC per sample (each line of a count file is one distinct sequence)
# `read -r` keeps backslashes literal; the `|| [ -n ... ]` clause also handles a
# last line that is not newline-terminated.
while read -r line || [ -n "${line}" ]
do
    # Skip blank lines in the sample list.
    [ -n "${line}" ] || continue
    echo "10.bc_count/${line}_count.txt" && cat "10.bc_count/${line}_count.txt" | wc -l
done < CHEQseq_names.txt

In [None]:
# Assign BC to enhancers
# For every sample, join its count file (10.bc_count/<sample>_count.txt,
# "<sequence><TAB><count>") with the two-column matrix in $EnhBC and write
# 10.bc_count/<sample>_count_final.txt.
#
# How the pipeline works (the whole command is one double-quoted string, so
# \$ and \" are escaped for the calling shell, while $EnhBC is expanded by it):
#   1. The matrix is read with its two columns swapped.
#   2. The first awk indexes the swapped matrix by its (swapped) 2nd column,
#      i.e. the matrix's original 1st column, and stores
#      "<matrix col 2><TAB><matrix col 1>" for each key.
#   3. For each count line it prints the stored pair followed by the count,
#      giving "<matrix col 2><TAB><matrix col 1><TAB><count>".
#      Sequences absent from the matrix yield an empty first field.
#   4. The last awk keeps only lines whose first field is truthy, dropping
#      the unmatched sequences (a first field equal to numeric 0 would also
#      be dropped).
# NOTE(review): presumably matrix column 1 is the barcode and column 2 the
# enhancer name - confirm against the matrix file.
# NOTE(review): <( ... ) is process substitution and needs a shell that
# supports it (e.g. bash) for the jobs started by parallel - confirm.
# NOTE(review): 2>/dev/null hides all messages from parallel and its jobs,
# including real errors.
EnhBC="/staging/leuven/stg_00002/lcb/lminn/Melanoma-species/CHEQ-seq_synthetic/BC-Analysis_MM087-Miseq-Novaseq/CheqSeq_OLS_Liesbeth_E-BC-couples_Miseq-Novaseq_unduplBCs.2col.matrix"
parallel \
"awk -F '\t' -v OFS='\t' 'FNR==NR{a[\$2]=\$1 FS \$2;next}{ print a[\$1],\$2}' \
   <( cat $EnhBC | awk '{print \$2\"\t\"\$1}') \
   <( cat 10.bc_count/{}_count.txt ) \
   | awk -F '\t' '{if (\$1) print \$0;}' \
   > 10.bc_count/{}_count_final.txt" ::: $(cat CHEQseq_names.txt) 2>/dev/null

In [None]:
# Print summary statistics for every sample's 10.bc_count/<sample>_count_final.txt
# (tab-separated; first field = enhancer, last field = read count, one line
# per barcode that was assigned to an enhancer).

# Print the median of a numerically sorted, one-value-per-line stream read
# from stdin. For an even number of values the lower of the two middle values
# is returned; nothing is printed for empty input.
# (Selecting line n/2 with sed is off by one for odd n and becomes the invalid
# address 0 when n is 0 or 1.)
lower_median() {
    awk '{ v[NR] = $0 } END { if (NR) print v[int((NR + 1) / 2)] }'
}

# `read -r` keeps backslashes literal; the `|| [ -n ... ]` clause also handles a
# last line that is not newline-terminated.
while read -r line || [ -n "${line}" ]
do
    # Skip blank lines in the sample list.
    [ -n "${line}" ] || continue
    file="10.bc_count/${line}_count_final.txt"

    # One line per barcode -> line count = number of assigned barcodes.
    nBC=$(cat "${file}" | wc -l)
    # Distinct values of the first field = enhancers seen at least once.
    nEnh=$(awk '{print $1}' "${file}" | sort -u | wc -l)
    # Barcodes per enhancer: count occurrences of each first field, then take the median.
    medianBC=$(awk '{print $1}' "${file}" | sort | uniq -c | awk '{print $1}' | sort -g | lower_median)
    # Reads per barcode: median of the last field.
    medianR=$(awk '{print $NF}' "${file}" | sort -g | lower_median)
    # Lines mentioning a negative-control enhancer.
    nNeg=$(grep -c 'Neg-Control' "${file}")

    echo "${line}"
    echo "Number of BC associated to an enhancer: " "${nBC}"
    echo "Enhancer coverage: " "${nEnh}"
    echo "Median number of BC per enhancer: " "${medianBC}"
    echo "Median number of read per BC: " "${medianR}"
    echo "Number of Neg_Ctrl BC: " "${nNeg}"
done < CHEQseq_names.txt