# Check the integrity of all BAM files generated by the pipeline

First, run a quick sanity check in bash:
```
samtools quickcheck *.bam && echo 'all ok' || echo 'fail!'
```

In [1]:
import pandas as pd
import glob
import os

In [2]:
# Collect the path of every BAM file under full_bams/ and display the list.
bam_pattern = 'full_bams/*.bam'
bam_list = glob.glob(bam_pattern)
bam_list

['full_bams/EPF_hydrop_2.FULL.possorted.dbfixed.bam',
 'full_bams/VIB_10xv1_1.FULL.possorted.dbfixed.bam',
 'full_bams/CNA_hydrop_3.FULL.possorted.dbfixed.bam',
 'full_bams/MDC_mtscatac_2.FULL.possorted.dbfixed.bam',
 'full_bams/CNA_mtscatac_2.FULL.possorted.dbfixed.bam',
 'full_bams/CNA_10xmultiome_1.FULL.possorted.dbfixed.bam',
 'full_bams/BRO_mtscatac_2.FULL.possorted.dbfixed.bam',
 'full_bams/CNA_10xv11_4.FULL.possorted.dbfixed.bam',
 'full_bams/CNA_10xv2_1.FULL.possorted.dbfixed.bam',
 'full_bams/CNA_ddseq_1.FULL.possorted.dbfixed.bam',
 'full_bams/VIB_10xmultiome_1.FULL.possorted.dbfixed.bam',
 'full_bams/HAR_ddseq_1.FULL.possorted.dbfixed.bam',
 'full_bams/TXG_10xv2_1.FULL.possorted.dbfixed.bam',
 'full_bams/SAN_10xmultiome_2.FULL.possorted.dbfixed.bam',
 'full_bams/VIB_10xmultiome_2.FULL.possorted.dbfixed.bam',
 'full_bams/CNA_10xmultiome_2.FULL.possorted.dbfixed.bam',
 'full_bams/VIB_10xv2_2.FULL.possorted.dbfixed.bam',
 'full_bams/VIB_mtscatac_2.FULL.possorted.dbfixed.bam',
 

In [3]:
# Write one `samtools flagstat` command per BAM into bamcheck.parallel, skipping
# BAMs that already have a usable report.
#
# A report only counts as usable when it is non-empty: the shell redirection in
# the generated command creates the report file as soon as the job starts, so a
# failed or interrupted job leaves a zero-byte file behind. Checking existence
# alone would never re-queue such a BAM.
# NOTE: reports of jobs that are still running are empty too, so do not re-run
# this cell while a previous batch is in progress.
bam_list = sorted(glob.glob('full_bams/*.bam'))  # sorted for a reproducible command order
with open('bamcheck.parallel', 'w') as f:
    for bam in bam_list:
        bamcheck_txt = f"{bam}.bamcheck.txt"
        report_is_usable = os.path.exists(bamcheck_txt) and os.path.getsize(bamcheck_txt) > 0
        if not report_is_usable:
            f.write(f"samtools flagstat -@ 4 {bam} > {bamcheck_txt}\n")
            print(f"printed {bamcheck_txt} command")
        else:
            print(f"{bamcheck_txt} exists!")

printed full_bams/EPF_hydrop_2.FULL.possorted.dbfixed.bam.bamcheck.txt command
printed full_bams/VIB_10xv1_1.FULL.possorted.dbfixed.bam.bamcheck.txt command
printed full_bams/CNA_hydrop_3.FULL.possorted.dbfixed.bam.bamcheck.txt command
printed full_bams/MDC_mtscatac_2.FULL.possorted.dbfixed.bam.bamcheck.txt command
printed full_bams/CNA_mtscatac_2.FULL.possorted.dbfixed.bam.bamcheck.txt command
printed full_bams/CNA_10xmultiome_1.FULL.possorted.dbfixed.bam.bamcheck.txt command
printed full_bams/BRO_mtscatac_2.FULL.possorted.dbfixed.bam.bamcheck.txt command
printed full_bams/CNA_10xv11_4.FULL.possorted.dbfixed.bam.bamcheck.txt command
printed full_bams/CNA_10xv2_1.FULL.possorted.dbfixed.bam.bamcheck.txt command
printed full_bams/CNA_ddseq_1.FULL.possorted.dbfixed.bam.bamcheck.txt command
printed full_bams/VIB_10xmultiome_1.FULL.possorted.dbfixed.bam.bamcheck.txt command
printed full_bams/HAR_ddseq_1.FULL.possorted.dbfixed.bam.bamcheck.txt command
printed full_bams/TXG_10xv2_1.FULL.posso

In [20]:
# Feed the commands listed in bamcheck.parallel to `parallel`, running at most
# 20 of them at a time (-j 20) and printing progress (--progress).
cat bamcheck.parallel | parallel -j 20 --progress


Computers / CPU cores / Max jobs to run
1:local / 128 / 20

Computer:jobs running/jobs completed/%of started jobs/Average seconds to complete
local:20/4/100%/39.2s 

In [17]:
# Load the tab-separated table (no header row) and index it by sample name,
# taken as the part of each file name in column 0 that precedes the first '__'.
readlengths_file = '/dodrio/scratch/projects/starting_2022_023/benchmark/scatac_benchmark/1_data_repository/R1_lengths.sorted.txt'
readlengths = pd.read_csv(readlengths_file, sep='\t', header=None)

sample_names = []
for fastq_path in readlengths[0]:
    fastq_name = fastq_path.rsplit('/', 1)[-1]  # drop any directory prefix
    sample_names.append(fastq_name.partition('__')[0])
readlengths.index = sample_names

readlengths

Unnamed: 0,0,1
BIO_ddseq_1,full_fastq/BIO_ddseq_1__R1.FULL.fastq.gz,2496461262
BIO_ddseq_2,full_fastq/BIO_ddseq_2__R1.FULL.fastq.gz,586078378
BIO_ddseq_3,full_fastq/BIO_ddseq_3__R1.FULL.fastq.gz,114310431
BIO_ddseq_4,full_fastq/BIO_ddseq_4__R1.FULL.fastq.gz,117075800
BRO_mtscatac_1,full_fastq/BRO_mtscatac_1__R1.FULL.fastq.gz,146150399
BRO_mtscatac_2,full_fastq/BRO_mtscatac_2__R1.FULL.fastq.gz,170806444
CNA_10xmultiome_1,full_fastq/CNA_10xmultiome_1__R1.FULL.fastq.gz,206430093
CNA_10xmultiome_2,full_fastq/CNA_10xmultiome_2__R1.FULL.fastq.gz,194200503
CNA_10xv11_1,full_fastq/CNA_10xv11_1__R1.FULL.fastq.gz,235409552
CNA_10xv11_2,full_fastq/CNA_10xv11_2__R1.FULL.fastq.gz,266533885


In [40]:
def flagstat_read1_count(flagstat_path):
    """Return the QC-passed ``read1`` count from a ``samtools flagstat`` report.

    The line is located by its ``read1`` label instead of by row number,
    because the row position of that line is not the same in every samtools
    release (newer releases print additional ``primary`` rows before it).

    Parameters
    ----------
    flagstat_path : str
        Path to a text file holding ``samtools flagstat`` output.

    Returns
    -------
    int
        The first number on the ``read1`` line.

    Raises
    ------
    ValueError
        If the report contains no ``read1`` line.
    """
    with open(flagstat_path) as handle:
        for line in handle:
            fields = line.split()
            # Report lines look like "<passed> + <failed> <label ...>".
            if len(fields) >= 4 and fields[3] == 'read1':
                return int(fields[0])
    raise ValueError(f"no 'read1' line found in {flagstat_path}")


# A sample is flagged when more than this percentage of the reads counted in
# `readlengths` is missing from the BAM's read1 count.
STRAGGLER_THRESHOLD_PERCENT = 0.5

stragglers_list = []
files = sorted(glob.glob('full_bams/*.bamcheck.txt'))
for file in files:
    # Zero-byte reports carry no counts yet, so there is nothing to compare.
    if os.path.getsize(file) == 0:
        continue

    # Sample name = file name up to the first '__' or '.', whichever comes
    # first, so both "<sample>__..." and "<sample>.FULL..." names resolve.
    sample = os.path.basename(file).split('__')[0].split('.')[0]
    if sample not in readlengths.index:
        print(f"{sample}: no FASTQ read count available, skipping")
        continue
    # Column 1 of `readlengths` is used as the FASTQ read count.
    fastq_reads = readlengths[1].loc[sample]

    bam_reads = flagstat_read1_count(file)
    print(f"{sample}: {bam_reads} in bam, {fastq_reads} in FASTQ")
    diff = fastq_reads - bam_reads
    diff_percent = round(diff / fastq_reads * 100, 2)
    print(f"\tdiffering {diff}, or {diff_percent}%")
    if diff_percent > STRAGGLER_THRESHOLD_PERCENT:
        stragglers_list.append(sample)
        print("\t\tStraggler detected!")

BIO_ddseq_2: 568867960 in bam, 586078378 in FASTQ
	differing 17210418, or 2.94%
		Straggler detected!
BIO_ddseq_3: 98061137 in bam, 114310431 in FASTQ
	differing 16249294, or 14.22%
		Straggler detected!
BIO_ddseq_4: 101397826 in bam, 117075800 in FASTQ
	differing 15677974, or 13.39%
		Straggler detected!
BRO_mtscatac_1: 145604100 in bam, 146150399 in FASTQ
	differing 546299, or 0.37%
BRO_mtscatac_2: 170196511 in bam, 170806444 in FASTQ
	differing 609933, or 0.36%
CNA_10xmultiome_1: 206048774 in bam, 206430093 in FASTQ
	differing 381319, or 0.18%
CNA_10xmultiome_2: 193909964 in bam, 194200503 in FASTQ
	differing 290539, or 0.15%
CNA_10xv11_1: 235210366 in bam, 235409552 in FASTQ
	differing 199186, or 0.08%
CNA_10xv11_2: 266303672 in bam, 266533885 in FASTQ
	differing 230213, or 0.09%
CNA_10xv11_3: 205605806 in bam, 205743191 in FASTQ
	differing 137385, or 0.07%
CNA_10xv11_4: 131562498 in bam, 131672325 in FASTQ
	differing 109827, or 0.08%
CNA_10xv11_5: 118879482 in bam, 118964825 in FA

In [19]:
# NOTE: this cell repeats the straggler scan from the cell above; running it
# rebuilds stragglers_list from scratch.
stragglers_list = []
files = sorted(glob.glob('full_bams/*.bamcheck.txt'))
for file in files:
    # Zero-byte reports carry no counts yet, so there is nothing to compare.
    if os.path.getsize(file) == 0:
        continue

    # Sample name = file name up to the first '__' or '.', whichever comes
    # first, so both "<sample>__..." and "<sample>.FULL..." names resolve.
    sample = os.path.basename(file).split('__')[0].split('.')[0]
    if sample not in readlengths.index:
        print(f"{sample}: no FASTQ read count available, skipping")
        continue
    # Column 1 of `readlengths` is used as the FASTQ read count.
    fastq_reads = readlengths[1].loc[sample]

    # Pick the flagstat line by its "read1" label rather than by row number:
    # the row position of that line is not the same in every samtools release.
    with open(file) as handle:
        read1_lines = [line for line in handle if line.split()[3:4] == ['read1']]
    if not read1_lines:
        print(f"{sample}: no read1 line in {file}, skipping")
        continue
    bam_reads = int(read1_lines[0].split()[0])

    print(f"{sample}: {bam_reads} in bam, {fastq_reads} in FASTQ")
    diff = fastq_reads - bam_reads
    diff_percent = round(diff / fastq_reads * 100, 2)
    print(f"\tdiffering {diff}, or {diff_percent}%")
    # More than 0.5% of the reads missing from the BAM marks a straggler.
    if diff_percent > 0.5:
        stragglers_list.append(sample)
        print("\t\tStraggler detected!")

BIO_ddseq_2: 568867960 in bam, 586078378 in FASTQ
	differing 17210418, or 2.94%
		Straggler detected!
BIO_ddseq_3: 98061137 in bam, 114310431 in FASTQ
	differing 16249294, or 14.22%
		Straggler detected!
BIO_ddseq_4: 101397826 in bam, 117075800 in FASTQ
	differing 15677974, or 13.39%
		Straggler detected!
BRO_mtscatac_1: 145604100 in bam, 146150399 in FASTQ
	differing 546299, or 0.37%
BRO_mtscatac_2: 170196511 in bam, 170806444 in FASTQ
	differing 609933, or 0.36%
CNA_10xmultiome_1: 103805718 in bam, 206430093 in FASTQ
	differing 102624375, or 49.71%
		Straggler detected!
CNA_10xmultiome_2: 193909964 in bam, 194200503 in FASTQ
	differing 290539, or 0.15%
CNA_10xv11_1: 235210366 in bam, 235409552 in FASTQ
	differing 199186, or 0.08%
CNA_10xv11_2: 266303672 in bam, 266533885 in FASTQ
	differing 230213, or 0.09%
CNA_10xv11_3: 205605806 in bam, 205743191 in FASTQ
	differing 137385, or 0.07%
CNA_10xv11_4: 131562498 in bam, 131672325 in FASTQ
	differing 109827, or 0.08%
CNA_10xv11_5: 1188794

In [32]:
# Display the samples currently held in stragglers_list.
stragglers_list

['BIO_ddseq_2',
 'BIO_ddseq_3',
 'BIO_ddseq_4',
 'CNA_10xmultiome_1',
 'CNA_ddseq_1',
 'CNA_ddseq_2',
 'CNA_hydrop_61',
 'CNA_hydrop_62',
 'CNA_mtscatac_2',
 'HAR_ddseq_1',
 'HAR_ddseq_2',
 'STA_10xv11_1',
 'STA_10xv11_2',
 'TXG_10xv11_1',
 'VIB_10xmultiome_1',
 'VIB_10xv11_1']

In [33]:
# For every sample in the read-count table, report whether its raw fragments
# file exists; for samples without one, also report whether the sample is
# listed in stragglers_list.
for sample in readlengths.index:
    fragments_found = os.path.exists(f"full_fragments/{sample}.fragments.raw.tsv.gz")
    if fragments_found:
        print(f"{sample} has complete fragments")
        continue

    print(f"\t{sample} does not have complete fragments")
    if sample not in stragglers_list:
        print(f"\t\t{sample} is not in stragglers_list!!")
    else:
        print(f"\t\t{sample} is in stragglers_list")





	BIO_ddseq_1 does not have complete fragments
		BIO_ddseq_1 is not in stragglers_list!!
BIO_ddseq_2 has complete fragments
BIO_ddseq_3 has complete fragments
BIO_ddseq_4 has complete fragments
	BRO_mtscatac_1 does not have complete fragments
		BRO_mtscatac_1 is not in stragglers_list!!
	BRO_mtscatac_2 does not have complete fragments
		BRO_mtscatac_2 is not in stragglers_list!!
CNA_10xmultiome_1 has complete fragments
	CNA_10xmultiome_2 does not have complete fragments
		CNA_10xmultiome_2 is not in stragglers_list!!
	CNA_10xv11_1 does not have complete fragments
		CNA_10xv11_1 is not in stragglers_list!!
	CNA_10xv11_2 does not have complete fragments
		CNA_10xv11_2 is not in stragglers_list!!
	CNA_10xv11_3 does not have complete fragments
		CNA_10xv11_3 is not in stragglers_list!!
	CNA_10xv11_4 does not have complete fragments
		CNA_10xv11_4 is not in stragglers_list!!
	CNA_10xv11_5 does not have complete fragments
		CNA_10xv11_5 is not in stragglers_list!!
	CNA_10xv2_1 does not have c

Then re-run the straggler samples (those in `stragglers_list`) through the mapping pipeline.