# process overview
1. run bap
2. read bap output to generate a `__barcode_translate.tsv` file, containing the translation from bap barcode to original barcode
3. use ghuls' `add_new_bc_tag` program to add new, post-bap barcode (= "multiplet merged" barcode)  to the original bam files
4. re-generate the fragments.tsv files from this new bap, using the new "multiplet merged" barcodes.

# generate barcode_translate file

In [1]:
import os
import pandas as pd

In [9]:
samples = {
    'HYA__24010b__20210813_384_PBMC_11_S9': 'hydrop_384',
    'HYA__2beafa__20210813_384_PBMC_12_S10': 'hydrop_384',
    'HYA__3d6da9__20210813_384_PBMC_21_S11': 'hydrop_384',
    'HYA__5028cb__20210813_384_PBMC_22_S12': 'hydrop_384',
}

In [10]:
whitelist = {
        'biorad_1': '/staging/leuven/stg_00002/lcb/cflerin/analysis/pbmc_atac/analysis2/data_freeze_Jun2021/atac_preprocess/multiplet_tagged/unique_barcodes/Broad_1__unique_barcodes.txt',
        'biorad_2': '/staging/leuven/stg_00002/lcb/cflerin/analysis/pbmc_atac/analysis2/data_freeze_Jun2021/atac_preprocess/multiplet_tagged/unique_barcodes/Broad_2__unique_barcodes.txt',
    'standard': '/vsc-hard-mounts/leuven-data/software/biomed/skylake_centos7/2018a/software/CellRangerATAC/2.0.0/lib/python/atac/barcodes/737K-cratac-v1.txt.gz',
    'standard_revcomp': '/staging/leuven/stg_00002/lcb/cflerin/data/public/barcode_whitelists/737K-cratac-v1_revcomp.txt.gz',
    'multiome': '/vsc-hard-mounts/leuven-data/software/biomed/skylake_centos7/2018a/software/CellRangerATAC/2.0.0/lib/python/atac/barcodes/737K-arc-v1.txt.gz',
    'hydrop_384': '/lustre1/project/stg_00002/lcb/fderop/data/00000000_barcodes/20210729_384x384_atac/20210929_HYDROP-ATAC_384x384_RVCOMP.txt'
    }

In [12]:
!ls /lustre1/project/stg_00002/lcb/fderop/data/20210929_20210813_hydrop-atac_384_pbmc/fragments_bap/bap

HYA__24010b__20210813_384_PBMC_11_S9   HYA__3d6da9__20210813_384_PBMC_21_S11
HYA__2beafa__20210813_384_PBMC_12_S10  HYA__5028cb__20210813_384_PBMC_22_S12


In [15]:
# path of bap output
bct_path = '/lustre1/project/stg_00002/lcb/fderop/data/20210929_20210813_hydrop-atac_384_pbmc/fragments_bap/data/bap'

bam_suffix_new = '.bwa.out.possorted.mm.bam'

In [16]:
if not os.path.exists('barcode_translate'):
    os.makedirs('barcode_translate')

In [17]:
for k,v in samples.items():
    print(f"Starting sample {k}")
    #if(k=='Broad_1'):
    #    break
    wl = pd.read_csv(whitelist[v], header=None, names=['CB'])

    f_bct = os.path.join(bct_path, k, 'final', k + '.barcodeTranslate.tsv')
    bct = pd.read_csv(f_bct, sep='\t', header=None, names=['CB','DB'])
    #bct['CB0'] = bct['CB']

    # drop singlets:
    bct['mult_cnt'] = [ x.split("_")[-1] for x in bct['DB'] ]
    bct = bct[ bct['mult_cnt']!='N01' ]

    # create translation for multiplets
    bctu = bct['DB'].unique()
    tmp_bct = []
    for bc in bctu:
        ix = bct['DB']==bc
        db = '_'.join([ x for x in bct[ix]['CB'] ])
        tmp_bct.extend(
            [ [x,db] for x in bct[ix]['CB'] ]
            )
    tmp_bct2 = pd.DataFrame(tmp_bct, columns=['CB','DB'])

    # create translation for remaining whitelist singlets
    wl_sng = wl[~wl['CB'].isin(bct['CB'])].copy()
    wl_sng['DB'] = wl_sng['CB']

    new_bct = pd.concat([tmp_bct2, wl_sng], axis=0)

    assert len(new_bct['CB'].unique()) == wl.shape[0]

    print("... writing output")
    new_bct.to_csv('barcode_translate/' + k + '__barcode_translate.tsv',
            sep='\t', index=False, header=False)
    

Starting sample HYA__24010b__20210813_384_PBMC_11_S9
... writing output
Starting sample HYA__2beafa__20210813_384_PBMC_12_S10
... writing output
Starting sample HYA__3d6da9__20210813_384_PBMC_21_S11
... writing output
Starting sample HYA__5028cb__20210813_384_PBMC_22_S12
... writing output


# add new bc tag to bam file

In [19]:
import os

prog = '/staging/leuven/stg_00002/lcb/ghuls/software/single_cell_toolkit_rust/target/release/add_new_bc_tag'

samples = 'HYA__24010b__20210813_384_PBMC_11_S9   HYA__3d6da9__20210813_384_PBMC_21_S11 HYA__2beafa__20210813_384_PBMC_12_S10  HYA__5028cb__20210813_384_PBMC_22_S12'.split()

old_bc_tag = 'CB'
new_bc_tag = 'DB'

bam_paths = '/lustre1/project/stg_00002/lcb/fderop/data/20210929_20210813_hydrop-atac_384_pbmc/fragments_bap/data/bam'

bam_suffix = '.bwa.out.possorted.bam'

In [20]:
### BAP outputs:
#bct_path = '/staging/leuven/stg_00002/lcb/cflerin/analysis/pbmc_atac/analysis2/data_freeze_Jun2021/atac_preprocess/out/data/bap/'

### re-processed translations with full set of whitelist barcodes (so that we keep all CB tags from the original bam)

bam_suffix_new = '.bwa.out.possorted.mm.bam'
postbap_bam_dir = 'fragments_bap/bam_postbap'
if not os.path.exists(postbap_bam_dir):
    os.makedirs(postbap_bam_dir)

################################################################################

for s in samples:
    f_bam_in = os.path.join(bam_paths, s + bam_suffix)
    # bap bct format:
    #f_bct = os.path.join(bct_path, s, 'final', s + '.barcodeTranslate.tsv')
    f_bct = os.path.join(bct_path, s + '__barcode_translate.tsv')
    f_bam_out = os.path.join(postbap_bam_dir, s + bam_suffix_new)
    cmd = ' '.join([prog, f_bam_in, f_bam_out, f_bct, old_bc_tag, new_bc_tag])
    cmd2 = ' && samtools index ' + f_bam_out
    print(cmd + cmd2)

/staging/leuven/stg_00002/lcb/ghuls/software/single_cell_toolkit_rust/target/release/add_new_bc_tag /lustre1/project/stg_00002/lcb/fderop/data/20210929_20210813_hydrop-atac_384_pbmc/fragments_bap/data/bam/HYA__24010b__20210813_384_PBMC_11_S9.bwa.out.possorted.bam fragments_bap/bam_postbap/HYA__24010b__20210813_384_PBMC_11_S9.bwa.out.possorted.mm.bam /lustre1/project/stg_00002/lcb/fderop/data/20210929_20210813_hydrop-atac_384_pbmc/fragments_bap/data/bap/HYA__24010b__20210813_384_PBMC_11_S9__barcode_translate.tsv CB DB && samtools index fragments_bap/bam_postbap/HYA__24010b__20210813_384_PBMC_11_S9.bwa.out.possorted.mm.bam
/staging/leuven/stg_00002/lcb/ghuls/software/single_cell_toolkit_rust/target/release/add_new_bc_tag /lustre1/project/stg_00002/lcb/fderop/data/20210929_20210813_hydrop-atac_384_pbmc/fragments_bap/data/bam/HYA__3d6da9__20210813_384_PBMC_21_S11.bwa.out.possorted.bam fragments_bap/bam_postbap/HYA__3d6da9__20210813_384_PBMC_21_S11.bwa.out.possorted.mm.bam /lustre1/project/

### this output now goes into a file called `run_multiplet_merge.parallel`
make sure everything is present, especially the barcode translate files need to be in right place!

In [None]:
/staging/leuven/stg_00002/lcb/ghuls/software/single_cell_toolkit_rust/target/release/add_new_bc_tag /lustre1/project/stg_00002/lcb/fderop/data/20210929_20210813_hydrop-atac_384_pbmc/fragments_bap/data/bam/HYA__24010b__20210813_384_PBMC_11_S9.bwa.out.possorted.bam fragments_bap/bam_postbap/HYA__24010b__20210813_384_PBMC_11_S9.bwa.out.possorted.mm.bam /lustre1/project/stg_00002/lcb/fderop/data/20210929_20210813_hydrop-atac_384_pbmc/fragments_bap/data/bap/HYA__24010b__20210813_384_PBMC_11_S9__barcode_translate.tsv CB DB && samtools index fragments_bap/bam_postbap/HYA__24010b__20210813_384_PBMC_11_S9.bwa.out.possorted.mm.bam
/staging/leuven/stg_00002/lcb/ghuls/software/single_cell_toolkit_rust/target/release/add_new_bc_tag /lustre1/project/stg_00002/lcb/fderop/data/20210929_20210813_hydrop-atac_384_pbmc/fragments_bap/data/bam/HYA__3d6da9__20210813_384_PBMC_21_S11.bwa.out.possorted.bam fragments_bap/bam_postbap/HYA__3d6da9__20210813_384_PBMC_21_S11.bwa.out.possorted.mm.bam /lustre1/project/stg_00002/lcb/fderop/data/20210929_20210813_hydrop-atac_384_pbmc/fragments_bap/data/bap/HYA__3d6da9__20210813_384_PBMC_21_S11__barcode_translate.tsv CB DB && samtools index fragments_bap/bam_postbap/HYA__3d6da9__20210813_384_PBMC_21_S11.bwa.out.possorted.mm.bam
/staging/leuven/stg_00002/lcb/ghuls/software/single_cell_toolkit_rust/target/release/add_new_bc_tag /lustre1/project/stg_00002/lcb/fderop/data/20210929_20210813_hydrop-atac_384_pbmc/fragments_bap/data/bam/HYA__2beafa__20210813_384_PBMC_12_S10.bwa.out.possorted.bam fragments_bap/bam_postbap/HYA__2beafa__20210813_384_PBMC_12_S10.bwa.out.possorted.mm.bam /lustre1/project/stg_00002/lcb/fderop/data/20210929_20210813_hydrop-atac_384_pbmc/fragments_bap/data/bap/HYA__2beafa__20210813_384_PBMC_12_S10__barcode_translate.tsv CB DB && samtools index fragments_bap/bam_postbap/HYA__2beafa__20210813_384_PBMC_12_S10.bwa.out.possorted.mm.bam
/staging/leuven/stg_00002/lcb/ghuls/software/single_cell_toolkit_rust/target/release/add_new_bc_tag /lustre1/project/stg_00002/lcb/fderop/data/20210929_20210813_hydrop-atac_384_pbmc/fragments_bap/data/bam/HYA__5028cb__20210813_384_PBMC_22_S12.bwa.out.possorted.bam fragments_bap/bam_postbap/HYA__5028cb__20210813_384_PBMC_22_S12.bwa.out.possorted.mm.bam /lustre1/project/stg_00002/lcb/fderop/data/20210929_20210813_hydrop-atac_384_pbmc/fragments_bap/data/bap/HYA__5028cb__20210813_384_PBMC_22_S12__barcode_translate.tsv CB DB && samtools index fragments_bap/bam_postbap/HYA__5028cb__20210813_384_PBMC_22_S12.bwa.out.possorted.mm.bam


run in command line

In [22]:
!module load parallel
!module load SAMtools
!cat fragments_bap/run_multiplet_merge.parallel | parallel -j 4 --progress

/bin/bash: module: command not found
/bin/bash: parallel: command not found
cat: fragments_bap/run_multiplet_merge.parallel: No such file or directory


# now generate fragments from these new bams

In [27]:
import os


# make sure that scratch is relative here!
prog = 'singularity run -W $PWD -B /lustre1,/staging,${VSC_SCRATCH}/tmp:/tmp /staging/leuven/res_00001/software/vsn_containers/vibsinglecellnf-sinto-0.7.3.1.img bash -c '


samples = 'HYA__24010b__20210813_384_PBMC_11_S9   HYA__3d6da9__20210813_384_PBMC_21_S11 HYA__2beafa__20210813_384_PBMC_12_S10  HYA__5028cb__20210813_384_PBMC_22_S12'.split()

bc_tag = 'DB'

bam_paths = '/lustre1/project/stg_00002/lcb/fderop/data/20210929_20210813_hydrop-atac_384_pbmc/fragments_bap/bam_postbap'
bam_suffix = '.bwa.out.possorted.mm.bam'

bed_suffix = '.sinto.mm.fragments.bed'
frag_suffix = '.sinto.mm.fragments.tsv.gz'

newdir = 'fragments_bap/fragments_mm'
if not os.path.exists(newdir):
    os.makedirs(newdir)

################################################################################

In [28]:
for s in samples:
    f_bam_in = os.path.join(bam_paths, s + bam_suffix)
    f_bed_out = os.path.join(newdir, s + bed_suffix)
    f_frag_out = os.path.join(newdir, s + frag_suffix)
    cmd = (
          f"\"sinto fragments -b {f_bam_in} "
          f"-m 30 "
          f"--barcodetag {bc_tag} "
          f"--min_distance 10 "
          f"--max_distance 5000 "
          f"--chunksize 5000000 "
           "--use_chrom '^(chr|)([0-9]{1,2}|[XY]|[23][LR])$' "
          f"-p 20 "
          f"-f {f_bed_out} && "
          f"LC_ALL=C sort -k 1,1 -k 2,2n -k 3,3n "
          f"{f_bed_out} "
          f"| bgzip -c "
          f"> {f_frag_out}"
          )
    cmd2 = ' && tabix -p bed ' + f_frag_out + '"'
    print(prog + cmd + cmd2)

singularity run -W $PWD -B /lustre1,/staging,${VSC_SCRATCH}/tmp:/tmp /staging/leuven/res_00001/software/vsn_containers/vibsinglecellnf-sinto-0.7.3.1.img bash -c "sinto fragments -b /lustre1/project/stg_00002/lcb/fderop/data/20210929_20210813_hydrop-atac_384_pbmc/fragments_bap/bam_postbap/HYA__24010b__20210813_384_PBMC_11_S9.bwa.out.possorted.mm.bam -m 30 --barcodetag DB --min_distance 10 --max_distance 5000 --chunksize 5000000 --use_chrom '^(chr|)([0-9]{1,2}|[XY]|[23][LR])$' -p 20 -f fragments_bap/fragments_mm/HYA__24010b__20210813_384_PBMC_11_S9.sinto.mm.fragments.bed && LC_ALL=C sort -k 1,1 -k 2,2n -k 3,3n fragments_bap/fragments_mm/HYA__24010b__20210813_384_PBMC_11_S9.sinto.mm.fragments.bed | bgzip -c > fragments_bap/fragments_mm/HYA__24010b__20210813_384_PBMC_11_S9.sinto.mm.fragments.tsv.gz && tabix -p bed fragments_bap/fragments_mm/HYA__24010b__20210813_384_PBMC_11_S9.sinto.mm.fragments.tsv.gz"
singularity run -W $PWD -B /lustre1,/staging,${VSC_SCRATCH}/tmp:/tmp /staging/leuven/re

### again, put these commands into a text file and run this code in command line

In [26]:
!cat fragments_bap/run_fragments_gen.parallel | parallel -j 4 --progress

/bin/bash: parallel: command not found
cat: write error: Broken pipe


now, from here, we can proceed

# merge hydrop bams and fragment files

We need 1 file for sample 11+12, and 1 file for sample 21+22

In [None]:
!ls /lustre1/project/stg_00002/lcb/fderop/data/20210929_20210813_hydrop-atac_384_pbmc/fragments_bap