Skip to content
Permalink
Browse files

Deduplication step added in preprocessing

  • Loading branch information...
rcalandrelli committed Jul 5, 2019
1 parent 121069a commit 3fe0c616f24e8ca2b7de79a81e0c3234a1814432
BIN +0 Bytes (100%) .DS_Store
Binary file not shown.
BIN +0 Bytes (100%) scripts/.DS_Store
Binary file not shown.
@@ -1,4 +1,4 @@
while getopts h:o:1:2:e:g:p:m: option
while getopts h:o:1:2:e:g:p:c: option
do
case "${option}"
in
@@ -9,7 +9,7 @@ o) outputPath=${OPTARG};; # The path where to save the output files.
e) restrictionEnzyme=${OPTARG};; # The restriction enzyme or enzymes passed between square brackets (example: [enzyme1,enzyme2]).
g) genomeIndex=${OPTARG};; # The Bowtie2 genome indexes of the reference genome (only filename without extension).
p) threads=${OPTARG};; # The number of parallel threads to use for alignment and pre-processing. The more the fastest the process.
m) max_lines=${OPTARG};; # The maximum number of lines per each temporary fastq file in order to avoid memory errors. Each temporary file is processed by a separate processor if multiple threads are used.
c) chunk_size=${OPTARG};; # The number of lines per each temporary fastq file in order to avoid memory errors and multiprocessing to speed up the process. Each temporary file is processed by a separate processor if multiple threads are used.
esac
done

@@ -27,9 +27,9 @@ echo -n "Calculating total lines of the fastq files ... "
fastq_lines=`wc -l $fastq1 | awk '{print $1}'`
echo "Done!"

if [ -z $max_lines ]
if [ -z $chunk_size ]
then
echo "max_lines not declared."
echo "chunk_size not declared."
python $hictoolPath"HiCtool_pre_truncation.py" -i [$fastq1,$fastq2] -e $restrictionEnzyme -p $threads

tot_reads_1=$(awk -F'\t' '{sum+=$1;} END{print sum;}' "${fastq1%%.*}_log.txt")
@@ -49,9 +49,9 @@ then
rm "${fastq1%%.*}_log.txt"
rm "${fastq2%%.*}_log.txt"

elif ! [ -z $max_lines ] && [ $max_lines -ge $fastq_lines ]
elif ! [ -z $chunk_size ] && [ $chunk_size -ge $fastq_lines ]
then
echo "max_lines not consider because greater that the total lines of the fastq file."
echo "chunk_size not consider because greater that the total lines of the fastq file."
python $hictoolPath"HiCtool_pre_truncation.py" -i [$fastq1,$fastq2] -e $restrictionEnzyme -p $threads

tot_reads_1=$(awk -F'\t' '{sum+=$1;} END{print sum;}' "${fastq1%%.*}_log.txt")
@@ -71,38 +71,38 @@ then
rm "${fastq1%%.*}_log.txt"
rm "${fastq2%%.*}_log.txt"

elif ! [ -z $max_lines ] && [ $max_lines -lt $fastq_lines ]
elif ! [ -z $chunk_size ] && [ $chunk_size -lt $fastq_lines ]
then
echo -n "Using max_lines to split the fastq files ... "
if (( $max_lines % 4 )) ; then
max_lines=`expr $max_lines - $(($max_lines % 4))`
echo -n "Using chunk_size to split the fastq files ... "
if (( $chunk_size % 4 )) ; then
chunk_size=`expr $chunk_size - $(($chunk_size % 4))`
fi
# Splitting the first fastq file
k=$max_lines
k=$chunk_size
count=1
while [ $k -lt $fastq_lines ]
do
start=`expr $k - $max_lines + 1`
start=`expr $k - $chunk_size + 1`
quit=`expr $k + 1`
sed -n "$start,"$k"p;"$quit"q" $fastq1 > "${fastq1%%.*}_temp_"$count".fastq"
count=`expr $count + 1`
k=`expr $k + $max_lines`
k=`expr $k + $chunk_size`
done
start=`expr $k - $max_lines + 1`
start=`expr $k - $chunk_size + 1`
sed -n "$start,"$fastq_lines"p" $fastq1 > "${fastq1%%.*}_temp_"$count".fastq"

# Splitting the second fastq file
k=$max_lines
k=$chunk_size
count=1
while [ $k -lt $fastq_lines ]
do
start=`expr $k - $max_lines + 1`
start=`expr $k - $chunk_size + 1`
quit=`expr $k + 1`
sed -n "$start,"$k"p;"$quit"q" $fastq2 > "${fastq2%%.*}_temp_"$count".fastq"
count=`expr $count + 1`
k=`expr $k + $max_lines`
k=`expr $k + $chunk_size`
done
start=`expr $k - $max_lines + 1`
start=`expr $k - $chunk_size + 1`
sed -n "$start,"$fastq_lines"p" $fastq2 > "${fastq2%%.*}_temp_"$count".fastq"
echo "Done!"

@@ -153,18 +153,26 @@ echo -n "Aligning "$fastq2_trunc" ... "
echo "Done!"

# extracting the headers and read filtering
echo -n "Filtering HiCfile1.sam ... "
echo "Filtering and deduplicating HiCfile1.sam ... "
samtools view -H HiCfile1.sam > header1.txt
samtools view -F 4 -q 30 HiCfile1.sam > HiCfile1_hq.sam
samtools view -u -h -F 4 -q 30 HiCfile1.sam | \
samtools sort -@ $threads -n - -o - | \
samtools fixmate -m - - | \
samtools sort -@ $threads - -o - | \
samtools markdup - HiCfile1_hq_nodup.bam
echo "Done!"
echo -n "Filtering HiCfile2.sam ... "
echo "Filtering and deduplicating HiCfile2.sam ... "
samtools view -H HiCfile2.sam > header2.txt
samtools view -F 4 -q 30 HiCfile2.sam > HiCfile2_hq.sam
samtools view -u -h -F 4 -q 30 HiCfile2.sam | \
samtools sort -@ $threads -n - -o - | \
samtools fixmate -m - - | \
samtools sort -@ $threads - -o - | \
samtools markdup - HiCfile2_hq_nodup.bam
echo "Done!"

echo -n "Building log files ... "
n1=`wc -l HiCfile1_hq.sam | awk '{print $1}'`
n2=`wc -l HiCfile2_hq.sam | awk '{print $1}'`
n1=`samtools view HiCfile1_hq_nodup.bam | wc -l | awk '{print $1}'`
n2=`samtools view HiCfile2_hq_nodup.bam | wc -l | awk '{print $1}'`

nt1=`wc -l HiCfile1.sam | awk '{print $1}'`
h1=`wc -l header1.txt | awk '{print $1}'`
@@ -177,82 +185,87 @@ ntot2=`expr $nt2 - $h2`
perc1=$(awk -v n1=$n1 -v ntot1=$ntot1 'BEGIN { print 100*n1/ntot1 }' | cut -c1-5)
perc2=$(awk -v n2=$n2 -v ntot2=$ntot2 'BEGIN { print 100*n2/ntot2 }' | cut -c1-5)

printf "\n----------\n"$ntot1" reads; of these:\n "$n1" ("$perc1"%%) aligned with MAPQ>=30" >> HiCfile1_log.txt
printf "\n----------\n"$ntot2" reads; of these:\n "$n2" ("$perc2"%%) aligned with MAPQ>=30" >> HiCfile2_log.txt
printf "\n----------\n"$ntot1" reads; of these:\n "$n1" ("$perc1"%%) aligned with MAPQ>=30 and are deduplicated" >> HiCfile1_log.txt
printf "\n----------\n"$ntot2" reads; of these:\n "$n2" ("$perc2"%%) aligned with MAPQ>=30 and are deduplicated" >> HiCfile2_log.txt
echo "Done!"

rm HiCfile1.sam
rm HiCfile2.sam

echo -n "Selecting paired reads ... "
awk '{print $1}' HiCfile1_hq.sam | sort > readnames1.txt
awk '{print $1}' HiCfile2_hq.sam | sort > readnames2.txt
samtools view HiCfile1_hq_nodup.bam > HiCfile1_hq_nodup.sam
samtools view HiCfile2_hq_nodup.bam > HiCfile2_hq_nodup.sam
rm HiCfile1_hq_nodup.bam
rm HiCfile2_hq_nodup.bam

echo "Selecting paired reads ... "
awk '{print $1}' HiCfile1_hq_nodup.sam | sort > readnames1.txt
awk '{print $1}' HiCfile2_hq_nodup.sam | sort > readnames2.txt
comm -12 readnames1.txt readnames2.txt > paired_reads.txt
echo "Done!"

if [ -z $max_lines ]
if [ -z $chunk_size ]
then
# Select reads of the first sam file that are paires with the second sam file
echo -n "Extracting paired reads from the first sam file and convert it to bam format ..."
grep -Fwf paired_reads.txt HiCfile1_hq.sam | \
grep -Fwf paired_reads.txt HiCfile1_hq_nodup.sam | \
cat header1.txt - | \
samtools view -b -@ $threads - > HiCfile_pair1.bam
rm HiCfile1_hq.sam
rm HiCfile1_hq_nodup.sam
echo "Done!"

# Select reads of the second sam file that are paired with the first sam file
echo -n "Extracting paired reads from the second sam file and convert it to bam format ..."
grep -Fwf paired_reads.txt HiCfile2_hq.sam | \
grep -Fwf paired_reads.txt HiCfile2_hq_nodup.sam | \
cat header2.txt - | \
samtools view -b -@ $threads - > HiCfile_pair2.bam
rm HiCfile2_hq.sam
rm HiCfile2_hq_nodup.sam
echo "Done!"

elif ! [ -z $max_lines ] && [ $max_lines -ge $fastq_lines ]
elif ! [ -z $chunk_size ] && [ $chunk_size -ge $fastq_lines ]
then
# Select reads of the first sam file that are paires with the second sam file
echo -n "Extracting paired reads from the first sam file and convert it to bam format ..."
grep -Fwf paired_reads.txt HiCfile1_hq.sam | \
grep -Fwf paired_reads.txt HiCfile1_hq_nodup.sam | \
cat header1.txt - | \
samtools view -b -@ $threads - > HiCfile_pair1.bam
rm HiCfile1_hq.sam
rm HiCfile1_hq_nodup.sam
echo "Done!"

# Select reads of the second sam file that are paired with the first sam file
echo -n "Extracting paired reads from the second sam file and convert it to bam format ..."
grep -Fwf paired_reads.txt HiCfile2_hq.sam | \
grep -Fwf paired_reads.txt HiCfile2_hq_nodup.sam | \
cat header2.txt - | \
samtools view -b -@ $threads - > HiCfile_pair2.bam
rm HiCfile2_hq.sam
rm HiCfile2_hq_nodup.sam
echo "Done!"

elif ! [ -z $max_lines ] && [ $max_lines -lt $fastq_lines ]
elif ! [ -z $chunk_size ] && [ $chunk_size -lt $fastq_lines ]
then

# Splitting the paired reads file
echo -n "Using max_lines to split the paired read IDs file ..."
echo -n "Using chunk_size to split the paired read IDs file ..."
paired_reads=paired_reads.txt
paired_reads_lines=`wc -l $paired_reads | awk '{print $1}'`
max_lines_paired_reads=`expr $max_lines / 5`
k=$max_lines_paired_reads
chunk_size_paired_reads=`expr $chunk_size / 5`
k=$chunk_size_paired_reads
count=1
while [ $k -lt $paired_reads_lines ]
do
start=`expr $k - $max_lines_paired_reads + 1`
start=`expr $k - $chunk_size_paired_reads + 1`
quit=`expr $k + 1`
sed -n "$start,"$k"p;"$quit"q" $paired_reads > "paired_reads_temp_"$count".txt"
count=`expr $count + 1`
k=`expr $k + $max_lines_paired_reads`
k=`expr $k + $chunk_size_paired_reads`
done
start=`expr $k - $max_lines_paired_reads + 1`
start=`expr $k - $chunk_size_paired_reads + 1`
sed -n "$start,"$paired_reads_lines"p" $paired_reads > "paired_reads_temp_"$count".txt"
echo "Done!"

# Search for paired reads from each temporary file
echo "Extracting paired reads into temporary sam files and merging ..."
for i in "paired_reads_temp"*; do
grep -Fwf $i HiCfile1_hq.sam > "HiCfile1_${i%%.*}.sam"
grep -Fwf $i HiCfile2_hq.sam > "HiCfile2_${i%%.*}.sam"
grep -Fwf $i HiCfile1_hq_nodup.sam > "HiCfile1_${i%%.*}.sam"
grep -Fwf $i HiCfile2_hq_nodup.sam > "HiCfile2_${i%%.*}.sam"
done

cat "HiCfile1_paired_reads_temp"* > HiCfile1_paired.sam
@@ -273,8 +286,8 @@ then
rm HiCfile2_paired.sam
echo "Done!"

rm HiCfile1_hq.sam
rm HiCfile2_hq.sam
rm HiCfile1_hq_nodup.sam
rm HiCfile2_hq_nodup.sam

fi

@@ -53,6 +53,7 @@ bowtie2-build hg38.fa index
1. Pre-truncation of the reads that contain potential ligation junctions to keep the longest piece without a junction sequence ([Ay et al., 2015](https://genomebiology.biomedcentral.com/articles/10.1186/s13059-015-0745-7)).
2. Independent mapping of the read pairs to the reference genome to avoid any proximity constraint.
3. Removing the unmapped reads and selecting reads that were uniquely mapped with a MAPQ >= 30, i.e. the estimated probability of mapping error is <= 0.1%.
4. Deduplicating aligned reads. (Note that PCR duplicates were previously removed only in the following section; this step has now also been added here so that deduplicated data can be extracted directly from the bam or bedpe files).
```unix
# Make the bash script executable
@@ -67,7 +68,7 @@ chmod u+x ./HiCtool-master/scripts/HiCtool_run_preprocessing.sh
-e MboI \
-g /path_to_the_genome_indexes/index \
-p 32 \
-m 50000000
-c 50000000
```
where:

@@ -78,7 +79,7 @@ where:
- ``-e``: the restriction enzyme or enzymes passed between square brackets (example: [MboI,Hinfl] for the cocktail of the Arima Kit).
- ``-g``: Bowtie2 genome indexes. Only the filename should be passed here without extension, in this case ``index``.
- ``-p``: the number of parallel threads (processors) to use for alignment and preprocessing. The more threads, the faster the process.
- ``-m``: if your data are very big, you may encounter a memory error when the fastq files are loaded for pre-truncated and downstream when the paired reads between the two mapped files are selected. Thus, you may use this parameter in order to split the two fastq files into several temporary files with ``-m`` lines each (this means all the lines, i.e. 4 lines per each read), that are pre-truncated separately. The temporary files will be processed with multiple threads if you set ``-p`` greater than 1. Therefore, setting ``-m`` may help to speed up the pre-truncation process. In addition, setting ``-m`` lets the program work with smaller temporary files at the pairing step as well to generate the output bam files.
- ``-c``: chunk size. If your data are very big, you may encounter a memory error when the fastq files are loaded for pre-truncation and, downstream, when the paired reads between the two mapped files are selected. Thus, you may use this parameter in order to split the two fastq files into several temporary files with ``-c`` lines each (this means all the lines, i.e. 4 lines per each read), that are pre-truncated separately. The temporary files will be processed with multiple threads if you set ``-p`` greater than 1. Therefore, setting ``-c`` may help to speed up the pre-truncation process. In addition, setting ``-c`` lets the program work with smaller temporary files at the pairing step as well to generate the output bam files.

The structure of the output directory is the following:
```unix
@@ -99,58 +100,55 @@ The structure of the output directory is the following:
```unix
SRR1658570_1.fastq
202095066 reads (length = 101 bp); of these:
29851195 (14.78%) contained a potential ligation junction and have been truncated.
29851195 (14.78%) contained a potential ligation junction and have been truncated.
SRR1658570_2.fastq
202095066 reads (length = 101 bp); of these:
28681691 (14.2%) contained a potential ligation junction and have been truncated.
28681691 (14.2%) contained a potential ligation junction and have been truncated.
```
- ``HiCfile_pair1.bam`` and ``HiCfile_pair2.bam`` that are the bam files of the pre-truncated first and second reads in the pairs respectively, generated after alignment and filtering.
- ``HiCfile1_log.txt`` and ``HiCfile2_log.txt`` are the log files with alignment and filtering statistics for the first and second reads in the pairs respectively.
```unix
HiCfile1_log.txt
202095066 reads; of these:
202095066 (100.00%) were unpaired; of these:
5770798 (2.86%) aligned 0 times
156759009 (77.57%) aligned exactly 1 time
39565259 (19.58%) aligned >1 times
202095066 (100.00%) were unpaired; of these:
5770798 (2.86%) aligned 0 times
156759009 (77.57%) aligned exactly 1 time
39565259 (19.58%) aligned >1 times
97.14% overall alignment rate
----------
202095066 reads; of these:
172973813 (85.59%) aligned with MAPQ>=30; of these:
143415284 (82.91%) were paired and saved into HiCfile_pair1.bam
172969992 (85.58%) aligned with MAPQ>=30 and are deduplicated; of these:
143408614 (82.90%) were paired and saved into HiCfile_pair1.bam
```
```unix
HiCfile2_log.txt
202095066 reads; of these:
202095066 (100.00%) were unpaired; of these:
13381441 (6.62%) aligned 0 times
149852422 (74.15%) aligned exactly 1 time
38861203 (19.23%) aligned >1 times
202095066 (100.00%) were unpaired; of these:
13381441 (6.62%) aligned 0 times
149852422 (74.15%) aligned exactly 1 time
38861203 (19.23%) aligned >1 times
93.38% overall alignment rate
----------
202095066 reads; of these:
161438783 (79.88%) aligned with MAPQ>=30; of these:
143415284 (88.83%) were paired and saved into HiCfile_pair2.bam
161435108 (79.88%) aligned with MAPQ>=30 and are deduplicated; of these:
143408614 (88.83%) were paired and saved into HiCfile_pair2.bam
```

### 1.1. Downloading the raw data from GEO

The source data in sra format are downloaded via GEO accession number using the command ``fastq-dump`` of [SRA Toolkit](http://www.ncbi.nlm.nih.gov/Traces/sra/sra.cgi?view=toolkit_doc&f=fastq-dump).

Before proceeding, you may need to [setup the output directory](https://github.com/ncbi/sra-tools/wiki/Toolkit-Configuration) where the sra files will be saved. After having installed SRA Toolkit, go to the path where the software has been installed, under the subfolder ``/bin``, and run the following command line:
```unix
./vdb-config -i
```
This will open an interface that will allow you to setup/change your output directory.
The source data in sra format are downloaded via GEO accession number using the command ``fastq-dump`` of [SRA Toolkit](http://www.ncbi.nlm.nih.gov/Traces/sra/sra.cgi?view=toolkit_doc&f=fastq-dump) and converted to fastq format.

To download the data related to a GEO accession number, go to the bottom of that page and click on the SRA number under the section “Relations”. After that, under the section “Runs” you will find the SRR files, then run the following:
```unix
fastq-dump SRRXXXXXXX --split-3
```
where ``SRRXXXXXXX`` has to be replaced with the specific number of the run you want to download (**[SRR1658570](https://www.ncbi.nlm.nih.gov/sra?term=SRX764936) in this documentation**).

To be more specific, this code will either download the SRA file under the output directory you have set with the interface above, but it will also convert the SRA file into fastq files and dump each read into separate files in the current working directory, to produce:
To be more specific, this command will not only download the SRA file under the output directory of the SRA Toolkit installation folder (see here to [set up a custom output directory](https://github.com/ncbi/sra-tools/wiki/Toolkit-Configuration)), but will also convert the SRA file into fastq files and dump each read into separate files in the current working directory (the files you will need), to produce:

- ``SRRXXXXXXX_1.fastq``
- ``SRRXXXXXXX_2.fastq``

0 comments on commit 3fe0c61

Please sign in to comment.
You can’t perform that action at this time.