# EMA Mapping 10xNEW samples *(n = 160)*

In [1]:
#%%bash
# Load the command-line tools for this session, then move to the FASTQ root
# of the 10xNEW haplotag batch.
module load \
  bioconda \
  samtools \
  sambamba \
  bwa \
  parallel \
  compression-tools/20220329
cd "$HOME/snap_hap/fastq/haplotag_10xNEW_2023/"


Lmod is automatically replacing "JupyterPython/2.0" with "bioconda/20220708".



In [2]:
# Show which modules (and versions) are loaded in this session.
module list


Currently Loaded Modules:
  1) git/2.35.1          4) samtools/1.18    7) parallel/20220222
  2) bioconda/20220708   5) sambamba/0.8.2   8) compression-tools/20220329
  3) htslib/1.18         6) bwa/20220112

 



### Make list of samples

In [21]:
#%%bash

cd ~/snap_hap/fastq/haplotag_10xNEW_2023/Plate9
# Count the entries in the Plate9 FASTQ directory.
ls | wc -l
# One-off command that builds the sample list: the sample ID is the part of
# each file name before the first "_"; uniq collapses the repeated IDs.
#ls | cut -f1 -d_ | uniq > samples_10xNEW_Plate9.txt
# Count the lines in the sample list.
wc -l samples_10xNEW_Plate9.txt

#NB: Manually remove "samples" from the .txt file.
# (Presumably the list file itself sits in the listed directory, so its own
#  name, samples_10xNEW_Plate9.txt, contributes a "samples" entry.)

129
65 samples_10xNEW_Plate9.txt


In [22]:
cd ~/snap_hap/fastq/haplotag_10xNEW_2023/Plate10
# Count the entries in the Plate10 FASTQ directory.
ls | wc -l
# One-off command that builds the sample list: the sample ID is the part of
# each file name before the first "_"; uniq collapses the repeated IDs.
#ls | cut -f1 -d_ | uniq > samples_10xNEW_Plate10.txt
# Count the lines in the sample list.
wc -l samples_10xNEW_Plate10.txt

#NB: Manually remove "samples" from the .txt file.
# (Presumably the list file itself sits in the listed directory, so its own
#  name, samples_10xNEW_Plate10.txt, contributes a "samples" entry.)

192
97 samples_10xNEW_Plate10.txt


# EMA test

### Set variables

In [24]:
# Project root; every other path in this notebook is derived from it.
WORKDIR='/nfs/scistore18/bartogrp/apal/snap_hap'

## User Inputs
samp_no=1 #line number of the sample to process in $samp_list
samp_list="${WORKDIR}/fastq/haplotag_10xNEW_2023/Plate10/samples_10xNEW_Plate10.txt"
fastq_dir="${WORKDIR}/fastq/haplotag_10xNEW_2023/Plate10" #absolute directory path to fastq files
batch=10xNEW-Plate10 #sequencing batch, e.g., TRIO/n96/2x/10x/60x/2xE/10xE/60xE
loc=Pla #sampling location
species=Am #sample species
# ref_gen_ver must be assigned BEFORE ref_genome: the shell expands it at
# assignment time, so the reverse order yields ".../ref_genome//Amajus_v3.5.fa"
# in a fresh session.
ref_gen_ver=v3.5
ref_genome="${WORKDIR}/ref_genome/${ref_gen_ver}/Amajus_v3.5.fa"

## Set sequence info
library="Snapdragon-Haplo_${batch}" #used in BAM header

In [25]:
## Set EMA base directories (create if necessary)
# ema_dir   : per-batch working area for intermediate EMA files
# ema_align : per-batch destination for the final BAM files
ema_dir="${WORKDIR}/bams/${ref_gen_ver}/ema_dir/${batch}"
ema_align="${WORKDIR}/bams/${ref_gen_ver}/ema_align/${batch}"
for out_dir in "$ema_dir" "$ema_align"; do
  [ -e "$out_dir" ] || mkdir -p "$out_dir"
done

# List the batches that already have an EMA working directory.
ls "${WORKDIR}/bams/${ref_gen_ver}/ema_dir/"

10x		2xE-N703e  2xE-N708e  2x-N703  2x-N708	n96_Ave
10xE		2xE-N704e  2xE-N709e  2x-N704  2x-N709	n96_misc
10xNEW-Plate10	2xE-N705e  2xE-N710e  2x-N705  2x-N710	n96_Pla
2xE-N701e	2xE-N706e  2x-N701    2x-N706  60x	TRIO
2xE-N702e	2xE-N707e  2x-N702    2x-N707  60xE


In [26]:
## read Sample ID from $samp_list
# The sample to process is whatever sits on line number $samp_no of the list.
samp_id=$(sed -n "${samp_no}p" < "${samp_list}")
printf '%s\n' "${samp_id}"

pa1499


In [27]:
## set input variables
# Find the sample's FASTQ files with a glob anchored on "<samp_id>_". The
# previous `ls | grep $samp_id` matched the ID anywhere in a file name, so an
# ID that is a substring of another ID could pick up the wrong file.
# The first match in sorted order is taken as R1, as before.
fastq_matches=("${fastq_dir}/${samp_id}"_*)
if [ ! -e "${fastq_matches[0]}" ]; then
  echo "ERROR: no FASTQ file found for sample '${samp_id}' in ${fastq_dir}" >&2
fi
read1=${fastq_matches[0]##*/} #fastq_R1 read (file name only, no directory)
read2=${read1/_R1/_R2} ##fastq_R2 read
prefix="${batch}-${samp_no}_${species}_${loc}_${samp_id}_${ref_gen_ver}" #BAM file prefix
# "\\t" leaves a literal backslash-t in the value (not a tab character).
read_group="@RG\\tID:${samp_id}\\tSM:${prefix}\\tLB:${library}" #BAM @RG tag

## Set mapping directory
# mkdir -p is a no-op when the directory already exists, so no existence test is needed.
mkdir -p "${ema_dir}/${prefix}" && cd "${ema_dir}/${prefix}"

In [28]:
## Create symbolic links to fastq reads
# $fastq_dir is defined without a trailing slash, so the "/" separator is
# needed here; without it the link target is ".../Plate10<file>" and dangles.
# -f replaces any existing link of the same name in the working directory.
ln -s -f "${fastq_dir}/${read1}" "${read1}"
ln -s -f "${fastq_dir}/${read2}" "${read2}"

In [34]:
## Check variables
# Print the resolved inputs and directories as a sanity check before any
# processing starts; the working directory shown is the current directory.
echo Sample list: $samp_list
echo Sample ID: $samp_id
echo Read1: $read1
echo Read2: $read2
echo Prefix: $prefix
echo EMA working directory: $(pwd)
echo final BAM directory: $ema_align

Sample list: /nfs/scistore18/bartogrp/apal/snap_hap/fastq/haplotag_10xNEW_2023/Plate10/samples_10xNEW_Plate10.txt
Sample ID: pa1499
Read1: pa1499_R1_001.cutadapt.fastq.gz
Read2: pa1499_R2_001.cutadapt.fastq.gz
Prefix: 10xNEW-Plate10-1_Am_Pla_pa1499_v3.5
EMA working directory: /nfs/scistore18/bartogrp/apal/snap_hap/bams/v3.5/ema_dir/10xNEW-Plate10/10xNEW-Plate10-1_Am_Pla_pa1499_v3.5
final BAM directory: /nfs/scistore18/bartogrp/apal/snap_hap/bams/v3.5/ema_align/10xNEW-Plate10


### EMA Preprocessing

In [36]:
## Translate haplotag barcodes to 16base barcodes
echo "Translating haplotag barcodes to 16 base barcodes"
r1_16bc="${read1/.fastq/.16BCgen.fastq}"
r2_16bc="${read2/.fastq/.16BCgen.fastq}"
bc_table="${samp_id}_HaploTag_to_16BaseBCs"

# R1: drop the "-N" + 3-character suffix that precedes the tab-separated RX:
# tag, run the reads through 16BaseBCGen and recompress with 16 bgzip threads.
zcat "${read1}" \
  | sed 's/-N...\tRX:/\tRX:/' \
  | 16BaseBCGen "${samp_id}" \
  | bgzip -@ 16 > "${r1_16bc}"

# R2 is not rewritten; it is only linked under the matching .16BCgen name.
ln -s -f "${fastq_dir}/${read2}" "${r2_16bc}"

# Keep column 2 of the translation table (presumably written by 16BaseBCGen),
# skipping its first line, as the barcode list passed to ema via -w.
cut -f 2 "${bc_table}" | tail -n+2 > "${bc_table}.ema"

Translating haplotag barcodes to 16 base barcodes
[16BaseBCGen Status] :: 16.4 k / 16.4 k reads processed / barcodes added
[16BaseBCGen Status] :: 32.8 k / 32.8 k reads processed / barcodes added
[16BaseBCGen Status] :: 49.2 k / 49.2 k reads processed / barcodes added
[16BaseBCGen Status] :: 65.5 k / 65.5 k reads processed / barcodes added
[16BaseBCGen Status] :: 81.9 k / 81.9 k reads processed / barcodes added
[16BaseBCGen Status] :: 98.3 k / 98.3 k reads processed / barcodes added
[16BaseBCGen Status] :: 114.7 k / 114.7 k reads processed / barcodes added
[16BaseBCGen Status] :: 131.1 k / 131.1 k reads processed / barcodes added
[16BaseBCGen Status] :: 147.5 k / 147.5 k reads processed / barcodes added
[16BaseBCGen Status] :: 163.8 k / 163.8 k reads processed / barcodes added
[16BaseBCGen Status] :: 180.2 k / 180.2 k reads processed / barcodes added
[16BaseBCGen Status] :: 196.6 k / 196.6 k reads processed / barcodes added
[16BaseBCGen Status] :: 213.0 k / 213.0 k reads processed / ba

In [37]:
## No. of barcodes
# Uses $samp_id instead of a hard-coded sample name so the cell stays correct
# when a different sample is selected.
wc -l "${samp_id}_HaploTag_to_16BaseBCs.ema"

380870 pa1499_HaploTag_to_16BaseBCs.ema


In [38]:
## EMA Count
echo "Running EMA count"
r1_16bc="${read1/.fastq/.16BCgen.fastq}"
r2_16bc="${read2/.fastq/.16BCgen.fastq}"
count_log="${samp_id}.16BCgen.log"

# Each FASTQ record (4 lines) is flattened onto one row and reduced to fields
# 1, 5, 6 and 7; the R1 and R2 rows are pasted side by side and the tabs are
# turned back into newlines, giving one interleaved stream for `ema count`.
# ema's stderr goes to the log file, which is printed afterwards.
paste \
  <(pigz -c -d "${r1_16bc}" | paste - - - - | awk '{print $1"\t"$5"\t"$6"\t"$7}') \
  <(pigz -c -d "${r2_16bc}" | paste - - - - | awk '{print $1"\t"$5"\t"$6"\t"$7}') \
  | tr "\t" "\n" \
  | ema count -w "${samp_id}_HaploTag_to_16BaseBCs.ema" -o "${samp_id}.16BCgen" 2> "${count_log}"
cat "${count_log}"

Running EMA count
:: Loading 10X took 0.1 s
:: Counting took 38.6 s
:: Reads with OK barcode: 12,800,423 out of 13,131,750
:: Ignored 0 reads
:: Dumped block 1
:: Printing took 0.1 s
:: Processed 13,131,750 reads (8,091 MB uncompressed) in 38 s


In [39]:
## EMA Preprocessing
echo "Running EMA preproc"
r1_16bc="${read1/.fastq/.16BCgen.fastq}"
r2_16bc="${read2/.fastq/.16BCgen.fastq}"

# Same interleaved R1/R2 stream as for `ema count`, fed to `ema preproc`
# together with the .ema-ncnt counts file; output goes to ${samp_id}_ema-bin
# (-n 500, -t 40). stdout and stderr are shown and also saved via tee.
paste \
  <(pigz -c -d "${r1_16bc}" | paste - - - - | awk '{print $1"\t"$5"\t"$6"\t"$7}') \
  <(pigz -c -d "${r2_16bc}" | paste - - - - | awk '{print $1"\t"$5"\t"$6"\t"$7}') \
  | tr "\t" "\n" \
  | ema preproc -w "${samp_id}_HaploTag_to_16BaseBCs.ema" -n 500 -t 40 -o "${samp_id}_ema-bin" "${samp_id}.16BCgen.ema-ncnt" 2>&1 \
  | tee "${samp_id}_preproc.log"

Running EMA preproc
:: Bucketing 1 inputs into 500 files with 40 threads
:: Loading known counts file pa1499.16BCgen.ema-ncnt ... 
:: Loading known counts ... done in 0.1 s
:: Loading full counts file pa1499.16BCgen.ema-fcnt ... 
::: Loading dump 1 of size 380,871 (23 MB) done in 0.027 s
  :: Thread 0 from 0 to 9,522 took 0.002 s
  :: Thread 1 from 9,522 to 19,044 took 0.002 s
  :: Thread 12 from 114,264 to 123,786 took 0.003 s
  :: Thread 2 from 19,044 to 28,566 took 0.004 s
  :: Thread 8 from 76,176 to 85,698 took 0.004 s
  :: Thread 7 from 66,654 to 76,176 took 0.005 s
  :: Thread 24 from 228,528 to 238,050 took 0.001 s
  :: Thread 14 from 133,308 to 142,830 took 0.001 s
  :: Thread 26 from 247,572 to 257,094 took 0.001 s
  :: Thread 32 from 304,704 to 314,226 took 0.008 s
  :: Thread 13 from 123,786 to 133,308 took 0.003 s
  :: Thread 11 from 104,742 to 114,264 took 0.002 s
  :: Thread 28 from 266,616 to 276,138 took 0.003 s
  :: Thread 10 from 95,220 to 104,742 took 0.003 s
  :: T

### EMA Alignment

*For this test, EMA alignment was run on only 10 bins, and BWA alignment on the reads with bad barcodes.*

In [40]:
## EMA Align (sequentially or parallel)
echo "Running EMA on all ema-bin files"
# Sequential alternative (kept for reference):
# for file in ${samp_id}_ema-bin/ema-bin-???; 
# do echo "Processing $file"; ema align -t 4 -d -r $ref_genome -R $read_group -p 10x -s $file | samtools sort -@ 4 -O bam -l 0 -m 4G -o ${file}.sorted.bam;
# done
#
# NOTE: the glob ema-bin-00? restricts this test run to bins 000-009; use
# ema-bin-??? to align every bin.
# -R is passed here as well so that EMA-aligned reads carry the same read group
# as the BWA-aligned reads below. $ref_genome and $read_group are expanded by
# this shell; the single quotes around them protect the literal "\t" sequences
# from the shell that GNU parallel starts for each job.
parallel --bar -j10 \
  "ema align -t 4 -d -r '${ref_genome}' -R '${read_group}' -p 10x -s {} | \
   samtools sort -@ 4 -O bam -l 0 -m 4G -o {}.sorted.bam - " ::: "${samp_id}"_ema-bin/ema-bin-00?

## BWA align for non-BC reads
echo "Running BWA for non-BC file and sort them"
# -p: the ema-nobc file holds interleaved read pairs.
bwa mem -p -t 40 -M -R "${read_group}" "${ref_genome}" "${samp_id}_ema-bin/ema-nobc" \
  | samtools sort -@ 4 -O bam -l 0 -m 4G -o "${samp_id}_ema-bin/ema-nobc.sorted.bam"

## Mark duplicates in BWA alignment 
echo "Marking duplicates in BWA alignment"
# Delete the unmarked BAM only if markdup succeeded, so a failed run does not
# destroy the only copy of the BWA alignment.
sambamba markdup -t 40 -p -l 0 "${samp_id}_ema-bin/ema-nobc.sorted.bam" "${samp_id}_ema-bin/ema-nobc-pMarkedup.sorted.bam" \
  && rm "${samp_id}_ema-bin/ema-nobc.sorted.bam"

Running EMA on all ema-bin files
[7m[0m0% 0:10=0s pa1499_ema-bin/ema-bin-009                                           [0mBWA initialization...
[M::bwa_idx_load_from_disk] read 0 ALT contigs
Processing reads...
[bam_sort_core] merging from 0 files and 4 in-memory blocks...
[7m10% 1:9=[0m12s pa1499_ema-bin/ema-bin-009                                          [0mBWA initialization...
[M::bwa_idx_load_from_disk] read 0 ALT contigs
Processing reads...
[bam_sort_core] merging from 0 files and 4 in-memory blocks...
[7m20% 2:8=11s pa14[0m99_ema-bin/ema-bin-009                                          [0mBWA initialization...
[M::bwa_idx_load_from_disk] read 0 ALT contigs
Processing reads...
[bam_sort_core] merging from 0 files and 4 in-memory blocks...
[7m30% 3:7=9s pa1499_ema-bi[0mn/ema-bin-009                                           [0mBWA initialization...
[M::bwa_idx_load_from_disk] read 0 ALT contigs
Processing reads...
[bam_sort_core] merging from 0 files and 4 in-memory b

### EMA Postprocessing

In [41]:
## Merging all bam files
# Merge every BAM in the bin directory (the sorted EMA bins plus the
# duplicate-marked BWA BAM) into one file named after $prefix.
echo "Merging all BAM files"
sambamba merge -t 40 -p $prefix.sorted.bam ${samp_id}_ema-bin/*.bam

## Change barcodes from base-pairs to AxxBxxCxxDxx
# What the awk program does:
#   * BEGIN: splits the list of 96 four-base codes and maps each code to its
#     1-based position, zero-padded to two digits (01..96).
#   * Lines containing "BX:Z:": takes the 23 characters starting at "BX:Z"
#     (5-character tag prefix + 16 barcode bases + 2 trailing characters),
#     looks up each 4-base segment, and rewrites the tag as
#     BX:Z:A<seg1>C<seg2>B<seg3>D<seg4> followed by the 2 trailing characters.
#     Note the A, C, B, D order of the segments.
#   * Lines without "BX:Z" are printed unchanged.
# The result is written back out as BAM.
echo "Reconverting 16 base barcodes to haplotag barcodes"
samtools view -h $prefix.sorted.bam |\
awk 'BEGIN {split("AAAT,AAAG,AAAC,AATA,AATT,AATG,AATC,AAGA,AAGT,AAGG,AAGC,AACA,AACT,AACG,AACC,ATAA,ATAT,ATAG,ATAC,ATTA,ATTT,ATTG,ATTC,ATGA,ATGT,ATGG,ATGC,ATCA,ATCT,ATCG,ATCC,AGAA,AGAT,AGAG,AGAC,AGTA,AGTT,AGTG,AGTC,AGGA,AGGT,AGGG,AGGC,AGCA,AGCT,AGCG,AGCC,ACAA,ACAT,ACAG,ACAC,ACTA,ACTT,ACTG,ACTC,ACGA,ACGT,ACGG,ACGC,ACCA,ACCT,ACCG,ACCC,TAAA,TAAT,TAAG,TAAC,TATA,TATT,TATG,TATC,TAGA,TAGT,TAGG,TAGC,TACA,TACT,TACG,TACC,TTAA,TTAT,TTAG,TTAC,TTTA,TTTT,TTTG,TTTC,TTGA,TTGT,TTGG,TTGC,TTCA,TTCT,TTCG,TTCC,TGAA",val,","); \
for(i=1;i<=96;i++){lookup[val[i]]=sprintf("%02d",i)}};/BX:Z:/ {match($0,"BX:Z");bx=substr($0,RSTART,23);out="BX:Z:A"lookup[substr(bx,6,4)]"C"lookup[substr(bx,10,4)]"B"lookup[substr(bx,14,4)]"D"lookup[substr(bx,18,4)]substr(bx,22,2);gsub(bx,out,$0);print $0}; !/BX:Z/' |\
samtools view -@ 4 - -O BAM -o $prefix.sorted.BXnum.bam

## Index BAM file
echo "Indexing BAM file"
samtools index -@ 4 $prefix.sorted.BXnum.bam

## Move final BAM and BAI file to output dir
# The trailing * also matches the .bai index created above.
echo "Moving BAM file (sorted, indexed) to final directory: $ema_align"
mv $prefix.sorted.BXnum.bam* $ema_align/

Merging all BAM files

sambamba 0.8.2
 by Artem Tarasov and Pjotr Prins (C) 2012-2021
    LDC 1.27.1 / DMD v2.097.2 / LLVM11.0.0 / bootstrap LDC - the LLVM D compiler (1.27.1)

Reconverting 16 base barcodes to haplotag barcodes
Indexing BAM file
Moving BAM file (sorted, indexed) to final directory: /nfs/scistore18/bartogrp/apal/snap_hap/bams/v3.5/ema_align/10xNEW-Plate10


In [44]:
# Inspect the contents of every entry under the batch working directory.
ls -hl $ema_dir/*

total 1.2G
-rw-r--r-- 1 apal bartogrp   96M Jul  3 12:57 10xNEW-Plate10-1_Am_Pla_pa1499_v3.5.sorted.bam
-rw-r--r-- 1 apal bartogrp 1023K Jul  3 12:57 10xNEW-Plate10-1_Am_Pla_pa1499_v3.5.sorted.bam.bai
-rw-r--r-- 1 apal bartogrp  8.8M Jul  3 12:53 pa1499.16BCgen.ema-fcnt
-rw-r--r-- 1 apal bartogrp  4.4M Jul  3 12:53 pa1499.16BCgen.ema-ncnt
-rw-r--r-- 1 apal bartogrp   227 Jul  3 12:53 pa1499.16BCgen.log
drwxr-xr-x 2 apal bartogrp   515 Jul  3 12:56 pa1499_ema-bin
-rw------- 1 apal bartogrp   11M Jul  3 12:51 pa1499_HaploTag_to_16BaseBCs
-rw-r--r-- 1 apal bartogrp  6.2M Jul  3 12:51 pa1499_HaploTag_to_16BaseBCs.ema
-rw-r--r-- 1 apal bartogrp  3.6K Jul  3 12:54 pa1499_preproc.log
-rw-r--r-- 1 apal bartogrp  1.2G Jul  3 12:51 pa1499_R1_001.cutadapt.16BCgen.fastq.gz
lrwxrwxrwx 1 apal bartogrp   105 Jul  3 12:45 pa1499_R1_001.cutadapt.fastq.gz -> /nfs/scistore18/bartogrp/apal/snap_hap/fastq/haplotag_10xNEW_2023/Plate10/pa1499_R1_001.cutadapt.fastq.gz
lrwxrwxrwx 1 apal bartogrp   105 Jul  3 1

In [45]:
# Confirm that the final BAM and its index arrived in the output directory.
ls -hl $ema_align

total 88M
-rw-r--r-- 1 apal bartogrp  96M Jul  3 12:57 10xNEW-Plate10-1_Am_Pla_pa1499_v3.5.sorted.BXnum.bam
-rw-r--r-- 1 apal bartogrp 624K Jul  3 12:57 10xNEW-Plate10-1_Am_Pla_pa1499_v3.5.sorted.BXnum.bam.bai


# Running EMA on the cluster

In [56]:
# Move to the jobs directory and list its existing per-batch entries.
cd /nfs/scistore18/bartogrp/apal/snap_hap/bams/v3.5/jobs
ls

10x		2xE-N702e  2xE-N708e  2x-N703  2x-N709	   n96_misc
10xE		2xE-N703e  2xE-N709e  2x-N704  2x-N710	   n96_Pla
10x_merged	2xE-N704e  2xE-N710e  2x-N705  60x	   n96_reheader
10xNEW-Plate10	2xE-N705e  2x_merged  2x-N706  60xE	   TRIO
10xNEW-Plate9	2xE-N706e  2x-N701    2x-N707  60x_merged  TRIO_reheader
2xE-N701e	2xE-N707e  2x-N702    2x-N708  n96_Ave


In [55]:
## Plate9
# The commands below are kept commented out; uncomment them to (re)submit.
# NOTE(review): the positional arguments appear to mirror the "User Inputs"
# variables of the EMA test section (sample list, FASTQ dir, batch, location,
# species, reference FASTA, reference version) - confirm against job-EMA.sbatch.

# if [ ! -d 10xNEW-Plate9 ]; then mkdir -p 10xNEW-Plate9 && cd $_; else cd 10xNEW-Plate9; fi

# sbatch --array=1-64 ~/snap_hap/_scripts/sbatch/readAlign/job-EMA.sbatch \
#     ~/snap_hap/fastq/haplotag_10xNEW_2023/Plate9/samples_10xNEW_Plate9.txt \
#     ~/snap_hap/fastq/haplotag_10xNEW_2023/Plate9 \
#     10xNEW-Plate9 \
#     Pla \
#     Am \
#     ~/snap_hap/ref_genome/v3.5/Amajus_v3.5.fa \
#     v3.5

In [1]:
## Plate10
# The commands below are kept commented out; uncomment them to (re)submit.
# NOTE(review): the positional arguments appear to mirror the "User Inputs"
# variables of the EMA test section (sample list, FASTQ dir, batch, location,
# species, reference FASTA, reference version) - confirm against job-EMA.sbatch.

# if [ ! -d 10xNEW-Plate10 ]; then mkdir -p 10xNEW-Plate10 && cd $_; else cd 10xNEW-Plate10; fi

# sbatch --array=1-96 ~/snap_hap/_scripts/sbatch/readAlign/job-EMA.sbatch \
#     ~/snap_hap/fastq/haplotag_10xNEW_2023/Plate10/samples_10xNEW_Plate10.txt \
#     ~/snap_hap/fastq/haplotag_10xNEW_2023/Plate10 \
#     10xNEW-Plate10 \
#     Pla \
#     Am \
#     ~/snap_hap/ref_genome/v3.5/Amajus_v3.5.fa \
#     v3.5

**NB:** In the *.sbatch* file, make sure the number of requested CPUs (`--cpus-per-task`) matches the number of parallel jobs (`parallel -j`).

Sometimes the SLURM job crashes because it runs out of memory: `#SBATCH --cpus-per-task=10 --mem-per-cpu=4G` is definitely too low, while `#SBATCH --cpus-per-task=2 --mem-per-cpu=32G` is inefficient. For the current code, it is probably best to use `#SBATCH --cpus-per-task=5 --mem-per-cpu=32G`.
When only changing the haplotag barcodes, use `#SBATCH --cpus-per-task=1 --mem-per-cpu=64G`.

**In future, compartmentalise the tasks into separate jobs (e.g., preprocessing, EMA mapping, postprocessing) so that each can be run with the computing resources it needs!**