# Process cDNA and genomic DNA MPRA reads for microglia lentiMPRA

In [1]:
# Create the working directory (no error if it already exists) and make it the
# current directory; the relative paths used by the following cells are resolved from here.
# NOTE(review): {path-to-dir} looks like a template placeholder to be replaced by a real path.
mkdir -p {path-to-dir}
cd {path-to-dir}

In [None]:
# Create one output directory per pipeline stage (existing directories are left untouched).
for stage_dir in 00.FastQC 01.cleaned_reads 02.sequencing_saturation 10.bc_count
do
    mkdir -p "$stage_dir"
done

In [1]:
# Location of the raw FASTQ files.
# The value is concatenated directly with file patterns (e.g. ${path_raw}MGE*.fastq.gz),
# so it must end with a trailing "/".
# NOTE(review): {path-to-data} looks like a template placeholder to be replaced by a real path.
path_raw={path-to-data}

## 00.FastQC

In [None]:
# Quality report for every raw MGE*.fastq.gz file, using 10 threads;
# the reports are written to 00.FastQC.
module load FastQC/0.11.9-Java-11
fastqc --threads 10 --outdir 00.FastQC ${path_raw}MGE*.fastq.gz

## 01.Cleaned Reads

In [8]:
# Disabled (kept for reference): per-sample concatenation of raw FASTQ files.
# For each file matching MGE*1_R1_001.fastq.gz it lists every file sharing the prefix
# before "_L00", concatenates the 1st and 4th of them into <sample>_R1_001.fastq.gz and
# the 3rd and 6th into <sample>_R3_001.fastq.gz (<sample> = file name up to "_S<n>_").
# NOTE(review): presumably this merged two sequencing lanes (R1/R2/R3 files per lane);
# confirm the file ordering before re-enabling.
# for i in $(ls ${path_raw}MGE*1_R1_001.fastq.gz)
# do
# file=${i##*/}
# echo $file
# R1=$(echo ${i%_L00*}* | awk '{print $1, $4}')
# cat $R1 > 01.cleaned_reads/${file%_S*_*}_R1_001.fastq.gz
# R2=$(echo ${i%_L00*}* | awk '{print $3, $6}')
# cat $R2 > 01.cleaned_reads/${file%_S*_*}_R3_001.fastq.gz
# done

MGE__6369dc__PCR_2_cDNA_BV2_microglia_75_l_S13_L001_R1_001.fastq.gz
MGE__9b6969__PCR_2_cDNA_BV2_microglia_100_l_S12_L001_R1_001.fastq.gz
MGE__bb4620__PCR_2_cDNA_BV2_microglia_150_l_S11_L001_R1_001.fastq.gz


In [None]:
# List the contents of the raw-data directory to check which input files are available.
ls $path_raw

### 01.1.Merge reads

Read 1 and read 2 overlap, and each contains the full sequence of the MPRA barcode. We use fastp to merge R1 and R2 into a consensus sequence with higher quality scores than either individual read. Later on, we also process the unmerged read 2 sequences to recover some extra barcodes.

In [None]:
# Merge overlapping read pairs with fastp.
# For every *_R1_001.fastq.gz under ${path_raw}, the mate file name is obtained by
# replacing the trailing "R1_001.fastq.gz" with "R2_001.fastq.gz".
# Outputs in 01.cleaned_reads/ (<sample> = input name without "_R1_001.fastq.gz"):
#   <sample>_merged.fastq.gz        merged pairs (-m / --merged_out)
#   <sample>_R2_unmerged.fastq.gz   read 2 output (-O)
# The read 1 output (-o) is not used downstream and is deleted right away.
module load fastp/0.23.2-GCC-10.3.0
for r1 in "${path_raw}"*_R1_001.fastq.gz
do
    # A glob without any match stays literal; skip it rather than feeding it to fastp.
    [ -e "$r1" ] || continue
    file=${r1##*/}
    sample=${file%_R*_*}
    r2=${r1%R1_*}R2_001.fastq.gz
    if [ ! -e "$r2" ]
    then
        echo "Skipping ${file}: mate file ${r2} not found" >&2
        continue
    fi
    # NOTE(review): the HTML/JSON report paths point below /dev/null, presumably so that
    # no report is kept -- confirm this is intended.
    fastp -m -h /dev/null/fastp.html -j /dev/null/fastp.json -w 8 \
          --merged_out "01.cleaned_reads/${sample}_merged.fastq.gz" \
          -o "01.cleaned_reads/${sample}_R1_unmerged.fastq.gz" -O "01.cleaned_reads/${sample}_R2_unmerged.fastq.gz" \
          -i "$r1" -I "$r2"
    rm -f "01.cleaned_reads/${sample}_R1_unmerged.fastq.gz"
done

### 01.2.Cutadapt

Extract MPRA barcodes from merged and unmerged R2 reads

In [None]:
# Extract the 17-nt MPRA barcode located between the two flanking sequences.
# cutadapt options used below:
#   -g 5ADAPTER...3ADAPTER  linked adapters: keep what lies between the two flanks
#   -e                      maximum allowed error rate when matching the flanks
#   -l 17 / -m 17           shorten reads to 17 nt and drop reads shorter than 17 nt
#   --discard-untrimmed     drop reads in which the flanks were not found
#   -j 10                   number of CPU cores
# An input file is deleted only after its trimming succeeded, so a failed run
# (missing module, full disk, ...) cannot destroy the intermediate data.
module load cutadapt/4.2
module --ignore-cache load SeqKit/2.4.0

## On merged reads
for i in 01.cleaned_reads/*_merged.fastq.gz
do
    # A glob without any match stays literal; skip it.
    [ -e "$i" ] || continue
    if cutadapt -g TGCCTACGGACCGGCGCGCCGATC...TGTCTGCGAGGGCCAGC -e 0.10 -l 17 -m 17 -j 10 --discard-untrimmed \
                -o "${i%%.fastq.*}_trimmed.fastq.gz" \
                "$i"
    then
        rm "$i"
    else
        echo "cutadapt failed for ${i}; input file kept" >&2
    fi
done

## On R2 reads
# The flanks given here are the reverse complement of those used for the merged reads,
# so the extracted barcode is reverse-complemented (seqkit seq -r -p) to put it in the
# same orientation as the barcodes from the merged reads.
for i in 01.cleaned_reads/*R2_unmerged.fastq.gz
do
    [ -e "$i" ] || continue
    file=${i##*/}
    # pipefail is enabled inside a subshell only, so a cutadapt failure is not hidden by
    # seqkit's exit status and the option does not leak into the rest of the session.
    if ( set -o pipefail
         cutadapt -g CTGGCCCTCGCAGACA...GATCGGCGCGCCGGTCC -e 0.15 -l 17 -m 17 -j 10 --discard-untrimmed \
                  "$i" | \
         seqkit seq -r -p -o "01.cleaned_reads/${file%%.*}_trimmed.fastq.gz" )
    then
        rm "$i"
    else
        echo "cutadapt/seqkit failed for ${i}; input file kept" >&2
    fi
done

### 01.3.Q30 filtering

Filter out reads with an average Phred score below 30.

In [None]:
# Keep only reads whose average quality is at least 30 (fastp -e 30).
# Input : 01.cleaned_reads/<name>_trimmed.fastq.gz
# Output: 01.cleaned_reads/<name>_trimmed_q30.fastq.gz
# The input file is deleted only when fastp succeeded.
module load fastp/0.23.2-GCC-10.3.0
for i in 01.cleaned_reads/*_trimmed.fastq.gz
do
    # A glob without any match stays literal; skip it.
    [ -e "$i" ] || continue
    file=${i##*/}
    # Strip exactly the ".fastq.gz" extension so that a dot inside the sample name
    # does not truncate the output name.
    # NOTE(review): the HTML/JSON report paths point below /dev/null, presumably so that
    # no report is kept -- confirm this is intended.
    if fastp -e 30 -h /dev/null/fastp.html -j /dev/null/fastp.json -w 8 \
             -o "01.cleaned_reads/${file%.fastq.gz}_q30.fastq.gz" \
             -i "$i"
    then
        rm "$i"
    else
        echo "fastp failed for ${i}; input file kept" >&2
    fi
done

Combine BCs from merged and unmerged fastq

In [44]:
# Concatenate, per sample, the barcodes from the merged reads and from the unmerged
# read 2 into <sample>_combined.fastq.gz (concatenated gzip files remain a valid gzip stream).
# The two inputs are named explicitly instead of using a "<sample>*" glob, so files of
# another sample whose name merely starts with the same prefix are never included.
for merged in 01.cleaned_reads/*_merged_trimmed_q30.fastq.gz
do
    # A glob without any match stays literal; skip it.
    [ -e "$merged" ] || continue
    prefix=${merged%_merged_*}
    r2_unmerged=${prefix}_R2_unmerged_trimmed_q30.fastq.gz
    if [ -e "$r2_unmerged" ]
    then
        cat "$merged" "$r2_unmerged" > "${prefix}_combined.fastq.gz"
    else
        # No unmerged read 2 file for this sample: the combined file holds the merged reads only.
        cat "$merged" > "${prefix}_combined.fastq.gz"
    fi
done

### 01.4.Print stats

In [None]:
# Per-sample barcode statistics, appended as one tab-separated line to stats.txt:
#   sample, total reads, unique sequences, % unique, most frequent sequence,
#   its read count, % of reads carried by that sequence.
# A sample without any read is reported with "NA" placeholders instead of
# aborting on a division by zero.
module load mawk/1.3.4-20230525-GCCcore-10.3.0
# Start from an empty stats file.
> stats.txt
for i in 01.cleaned_reads/*_combined.fastq.gz
do
    # A glob without any match stays literal; skip it.
    [ -e "$i" ] || continue
    file=${i#*/}
    zcat "$i" | \
    mawk -v 'OFS=\t' -v filename="${file%_combined*}" '
    # Only the sequence line of each 4-line FASTQ record is counted.
    NR % 4 == 2 {
        counts[$1] += 1
        total_reads += 1
    }

    END {
        if (total_reads == 0) {
            print filename, 0, 0, "NA", "NA", 0, "NA"
            exit
        }
        # Find the most abundant sequence.
        for (read in counts) {
            if (counts[read] > max) {
                max = counts[read]
                max_readname = read
            }
        }
        unique_reads = length(counts)
        print filename, total_reads, unique_reads, (unique_reads * 100 / total_reads) "%", max_readname, max, (max * 100 / total_reads) "%"
    }
    ' \
    >> stats.txt
    echo "${file%_combined*}" "done"
done

In [None]:
# Show stats.txt as an aligned table preceded by a header row.
# The header is passed as an argument to "%s" so that its literal "%" characters
# are not interpreted as (invalid) printf conversions.
awk 'BEGIN {printf "%s\n", "Sample\t#_reads\t#_unique_reads\t%_unique_reads\tTop_sequence\t#_reads_top_sequence\t%_top_sequence"} {print}'  stats.txt | column -t

## 10. Make BC count matrix

Extract DNA sequences from fastq

In [50]:
# Write the sequence line (2nd line of every 4-line FASTQ record) of each combined
# FASTQ to 10.bc_count/<sample>_combined.txt, one barcode per line.
for fq in $(ls 01.cleaned_reads/*_combined.fastq.gz)
do
    base=${fq#*/}
    out=10.bc_count/${base%%.*}.txt
    gzip -dc $fq | awk 'NR % 4 == 2' > $out
done

Generate BC count matrix

In [None]:
# Count the occurrences of every barcode, one GNU parallel job per sample.
# Sample names are taken from the 10.bc_count/*_combined.txt file names by stripping the
# directory and the "_combined" suffix; each name replaces {} in the command.
# Per job: sort | uniq -c yields "<count> <barcode>", which awk rewrites as
# "<barcode><TAB><count>" into 10.bc_count/<sample>_count.txt.
# The \$ and \" escapes keep the awk program intact inside the double-quoted command string.
# 2>/dev/null discards everything the parallel invocation writes to stderr.
parallel \
"cat 10.bc_count/{}_combined.txt | sort | uniq -c | awk -F \" \" '{print \$2\"\t\"\$1}' \
   > 10.bc_count/{}_count.txt" ::: $(ls 10.bc_count/*_combined.txt | awk -F"/|_combined" '{print $2}') 2>/dev/null

# Count number of unique BC per sample
# (each line of a *_count.txt file is one distinct barcode, so the line count is the number of unique BCs)
for i in $(ls 10.bc_count/*_count.txt)
do
echo $i && cat $i | wc -l
done

Assign BC to enhancers

In [52]:
# Assign BC to enhancers
# One GNU parallel job per sample (names derived from the *_combined.txt files, as {}).
# The first awk reads two inputs:
#   1) the decompressed $EnhBC table: builds a lookup keyed by column 2 whose value is
#      "column1<TAB>column2" (presumably enhancer name and barcode -- TODO confirm);
#   2) 10.bc_count/<sample>_count.txt ("<barcode><TAB><count>"): prints the lookup value
#      for the barcode followed by its count.
# The second awk keeps only the lines whose first field is non-empty, i.e. barcodes that
# were found in the lookup.
# $EnhBC is not escaped, so it is expanded by the calling shell before parallel runs;
# \$1, \$2, \$0 are escaped so that they reach awk unchanged.
# NOTE(review): the <( ... ) process substitutions need the jobs to run under bash -- confirm.
# 2>/dev/null discards everything the parallel invocation writes to stderr.
EnhBC={path-to-enh-BC-assignment}
parallel \
"awk -F '\t' -v OFS='\t' 'FNR==NR{a[\$2]=\$1 FS \$2;next}{ print a[\$1],\$2}' \
   <( zcat $EnhBC ) \
   <( cat 10.bc_count/{}_count.txt ) \
   | awk -F '\t' '{if (\$1) print \$0;}' \
   > 10.bc_count/{}_count_final.txt" ::: $(ls 10.bc_count/*_combined.txt | awk -F"/|_combined" '{print $2}') 2>/dev/null

Print some stats

In [None]:
# Summary statistics for every <sample>_count_final.txt
# (columns as read below: first field = enhancer, last field = read count).
# Medians are taken at position (n + 1) / 2 of the sorted values: the true median for
# odd n and the lower of the two middle values for even n. The previous n / 2 index
# picked the element below the median for odd n and became 0 (rejected by sed) for n = 1.
for i in 10.bc_count/*_count_final.txt
do
    # A glob without any match stays literal; skip it.
    [ -e "$i" ] || continue
    file=${i#*/}
    # Number of assigned barcodes (lines) and of distinct enhancers (first field).
    n_bc=$(( $(wc -l < "$i") ))
    n_enh=$(( $(awk '{print $1}' "$i" | sort -u | wc -l) ))
    if [ "$n_bc" -gt 0 ]
    then
        middleBC=$(( (n_enh + 1) / 2 ))
        middleR=$(( (n_bc + 1) / 2 ))
        median_bc=$(awk '{print $1}' "$i" | sort | uniq -c | awk '{print $1}' | sort -g | sed -n "${middleBC}p")
        median_reads=$(awk '{print $NF}' "$i" | sort -g | sed -n "${middleR}p")
    else
        # Empty file: there is no value to take a median of.
        median_bc=NA
        median_reads=NA
    fi
    echo "${file}"
    echo "Number of BC associated to an enhancer: " $n_bc
    echo "Enhancer coverage: " $n_enh
    echo "Median number of BCs per enhancer: " $median_bc
    echo "Median number of reads per BC: " $median_reads
    # Lines containing "Shuffle" are counted as negative-control barcodes.
    echo "Number of Neg_Ctrl BC: " $(grep -c 'Shuffle' "$i")
done

## Sequencing saturation

In [None]:
# Sequencing saturation: for increasing subsample sizes, record how many distinct
# barcodes are observed.
# For each 10.bc_count/<sample>_combined.txt (one barcode per line):
#   - subsample sizes run from 0 to the number of reads, in steps of reads/50 for files
#     with more than 10 million reads and in steps of 200000 otherwise;
#   - one shell command per size is written to <tag>_parallel.txt and the commands are
#     run 16 at a time with GNU parallel; each appends "<size> <unique count>" to <tag>_sat.txt;
#   - <tag>_sat.txt is finally sorted numerically, because the jobs finish in any order.
# Both files are emptied first: they are only ever appended to, so re-running the cell
# would otherwise duplicate every job and every result row.
export LC_ALL=C
for file in 10.bc_count/*_combined.txt
do
    # A glob without any match stays literal; skip it.
    [ -e "$file" ] || continue
    file_name=$(basename "${file}")
    file_tag=${file_name%.txt}
    echo "${file_name}"
    jobs_file=02.sequencing_saturation/${file_tag}_parallel.txt
    sat_file=02.sequencing_saturation/${file_tag}_sat.txt
    : > "$jobs_file"
    : > "$sat_file"
    nreads=$(( $(wc -l < "$file") ))
    if test "$nreads" -gt 10000000
    then
        increment=$((nreads / 50))
    else
        increment=200000
    fi
    for i in $(seq 0 "$increment" "$nreads")
    do
        # Print number of total and unique reads for every subsample.
        # \$( ... ) is escaped so that the subsampling runs when the job is executed,
        # not while the job list is being written.
        echo "echo $i \$(shuf -n $i $file | sort -u | wc -l) >> $sat_file" \
        >> "$jobs_file"
    done
    parallel -j 16 < "$jobs_file"
    # Put values in numerical order
    sort -g "$sat_file" > "02.sequencing_saturation/${file_tag}_satsort.txt" \
        && mv "02.sequencing_saturation/${file_tag}_satsort.txt" "$sat_file"
done

In [None]:
# Plot the saturation curves on a grid of 3 columns.
# path_scripts is concatenated directly with the script name, so it must end with "/".
module load Python/3.7.4-GCCcore-6.4.0
path_scripts={path-to-script}
ncols=3
# Number of curve files, and the number of grid rows needed to hold them (rounded up).
nfiles=$(ls 02.sequencing_saturation/*_sat.txt | wc -l)
nrows=$(( (nfiles + ncols - 1) / ncols ))
${path_scripts}plot_saturation.py -ssd 02.sequencing_saturation -r _combined_sat.txt --ncol $ncols --nrow $nrows