In [1]:
import pandas as pd
import glob
import os
from collections import OrderedDict

In [2]:
filenames = glob.glob('selected_barcodes/*')

In [3]:
filenames = sorted(glob.glob('selected_barcodes/*'))
samples = [item.replace("selected_barcodes/", "") for item in filenames]
samples = [item.replace(".cell_barcodes.txt", "") for item in samples]
bc_dict = {samples[i]: filenames[i] for i in range(len(samples))}
bc_dict

{'Broad_1': 'selected_barcodes/Broad_1.cell_barcodes.txt',
 'Broad_2': 'selected_barcodes/Broad_2.cell_barcodes.txt',
 'Broad_mito_1': 'selected_barcodes/Broad_mito_1.cell_barcodes.txt',
 'Broad_mito_2': 'selected_barcodes/Broad_mito_2.cell_barcodes.txt',
 'CNAG_1': 'selected_barcodes/CNAG_1.cell_barcodes.txt',
 'CNAG_2': 'selected_barcodes/CNAG_2.cell_barcodes.txt',
 'Sanger_1': 'selected_barcodes/Sanger_1.cell_barcodes.txt',
 'Sanger_2': 'selected_barcodes/Sanger_2.cell_barcodes.txt',
 'Stanford_1': 'selected_barcodes/Stanford_1.cell_barcodes.txt',
 'Stanford_2': 'selected_barcodes/Stanford_2.cell_barcodes.txt',
 'VIB_1': 'selected_barcodes/VIB_1.cell_barcodes.txt',
 'VIB_2': 'selected_barcodes/VIB_2.cell_barcodes.txt',
 'VIB_Hydrop_1': 'selected_barcodes/VIB_Hydrop_1.cell_barcodes.txt',
 'VIB_Hydrop_2': 'selected_barcodes/VIB_Hydrop_2.cell_barcodes.txt',
 's3atac': 'selected_barcodes/s3atac.cell_barcodes.txt'}

In [4]:
filenames = sorted(glob.glob('fragments_postbap/*tsv'))
samples = [item.replace("fragments_postbap/", "") for item in filenames]
samples = [item.replace(".sinto.mm.fragments.tsv", "") for item in samples]
frag_dict = {samples[i]: filenames[i] for i in range(len(samples))}
frag_dict

{'Broad_1': 'fragments_postbap/Broad_1.sinto.mm.fragments.tsv',
 'Broad_2': 'fragments_postbap/Broad_2.sinto.mm.fragments.tsv',
 'Broad_mito_1': 'fragments_postbap/Broad_mito_1.sinto.mm.fragments.tsv',
 'Broad_mito_2': 'fragments_postbap/Broad_mito_2.sinto.mm.fragments.tsv',
 'CNAG_1': 'fragments_postbap/CNAG_1.sinto.mm.fragments.tsv',
 'CNAG_2': 'fragments_postbap/CNAG_2.sinto.mm.fragments.tsv',
 'Sanger_1': 'fragments_postbap/Sanger_1.sinto.mm.fragments.tsv',
 'Sanger_2': 'fragments_postbap/Sanger_2.sinto.mm.fragments.tsv',
 'Stanford_1': 'fragments_postbap/Stanford_1.sinto.mm.fragments.tsv',
 'Stanford_2': 'fragments_postbap/Stanford_2.sinto.mm.fragments.tsv',
 'VIB_1': 'fragments_postbap/VIB_1.sinto.mm.fragments.tsv',
 'VIB_2': 'fragments_postbap/VIB_2.sinto.mm.fragments.tsv',
 'VIB_Hydrop_1': 'fragments_postbap/VIB_Hydrop_1.sinto.mm.fragments.tsv',
 'VIB_Hydrop_2': 'fragments_postbap/VIB_Hydrop_2.sinto.mm.fragments.tsv',
 's3atac': 'fragments_postbap/s3atac.sinto.mm.fragments.tsv'

In [5]:
for key in bc_dict.keys():
    print(key)
    df_frags = pd.read_csv(frag_dict[key], sep='\t', header=None, index_col=None)
    df_frags.columns = "chr start end bc count".split()
    df_bc = pd.read_csv(bc_dict[key], header=None, index_col=None)
    df_bc.columns = "bc".split()


    df_frags_incells = df_frags[df_frags['bc'].isin(df_bc['bc'])]
    frac_reads = df_frags_incells['count'].sum()/df_frags['count'].sum()
    frac_fragments = len(df_frags_incells)/len(df_frags)

    print(f'\ttotal reads: {df_frags["count"].sum()}')
    print(f'\tcells pre-scrublet/freemuxlet: {len(df_bc)}')
    print(f'\tmapped reads per cell: {df_frags["count"].sum()/len(df_bc)}')
    print(f'\tfraction mapped fragments in cells: {frac_reads}')
    print(f'\tfraction unique fragments in cells: {frac_fragments}')

Broad_1
	total reads: 153138564
	cells pre-scrublet/freemuxlet: 4147
	mapped reads per cell: 36927.55341210514
	fraction mapped fragments in cells: 0.6369902880896806
	fraction unique fragments in cells: 0.5943036671057925
Broad_2
	total reads: 147719731
	cells pre-scrublet/freemuxlet: 3987
	mapped reads per cell: 37050.34637572109
	fraction mapped fragments in cells: 0.6041049384255919
	fraction unique fragments in cells: 0.5682236376860502
Broad_mito_1
	total reads: 91805815
	cells pre-scrublet/freemuxlet: 3466
	mapped reads per cell: 26487.54039238315
	fraction mapped fragments in cells: 0.9354537618341496
	fraction unique fragments in cells: 0.9209491757864046
Broad_mito_2
	total reads: 87346572
	cells pre-scrublet/freemuxlet: 3284
	mapped reads per cell: 26597.616321559075
	fraction mapped fragments in cells: 0.9436500267005327
	fraction unique fragments in cells: 0.9303183603821784
CNAG_1
	total reads: 99246210
	cells pre-scrublet/freemuxlet: 2659
	mapped reads per cell: 37324.63

# read stast

In [16]:
filenames = sorted(glob.glob('mapping_stats/barcode/*'))
samples = [item.replace("mapping_stats/barcode/", "") for item in filenames]
samples = [item.split('__')[0] for item in samples]

In [17]:
list(set(samples))

['CNAG_1',
 'Broad_1',
 'CNAG_2',
 'VIB_1',
 'Broad_mito_2',
 'Sanger_1',
 'Stanford_1',
 'Broad_2',
 'Sanger_2',
 'Stanford_2',
 'VIB_2',
 'Broad_mito_1']

In [44]:
df.loc['nbr_reads:'].values[0]

36118170

In [None]:
nbr_reads_with_bc1_bc2_bc3_correct_or_correctable

In [65]:
df.loc['nbr_reads_with_bc1_bc2_bc3_correct_or_correctable'].values[0]

KeyError: 'nbr_reads_with_bc1_bc2_bc3_correct_or_correctable'

In [68]:
df.loc['total_bc_found'].values[0]

34152018

In [69]:
for sample in list(set(samples)):
    print(sample)
    filenames = sorted(glob.glob('mapping_stats/barcode/*'+sample+'*'))
    reads_sum = 0
    bc_sum = 0
    for file in filenames:
        bc_found = 0
        reads = 0
        
        df = pd.read_csv(file, sep='\t', header=None, index_col=0, error_bad_lines=False)
        reads = df.loc['nbr_reads:'].values[0]
        print(f"\t{reads}")
        reads_sum = reads_sum + reads
        if 'total_bc_found' in df.index:
            bc_found = df.loc['total_bc_found'].values[0]
        elif 'nbr_reads_with_bc1_bc2_bc3_correct_or_correctable' in df.index:
            bc_found = df.loc['nbr_reads_with_bc1_bc2_bc3_correct_or_correctable'].values[0]
        print(f'\t{bc_found}')

        bc_sum = bc_sum + bc_found

    print(f"\t\t{reads_sum}")
    print(f"\t\t{bc_sum}")
    print(f"\t\t{bc_sum/reads_sum}")

CNAG_1
	55452356
	54651797
	58220049
	56958151
		113672405
		111609948
		0.981856132981439
Broad_1
	48946871
	47351115
	64164346
	62215172
	64167120
	62222964
		177278337
		171789251
		0.9690369049434393
CNAG_2
	55777293
	54960213
	58569976
	57290841
		114347269
		112251054
		0.981667992438018
VIB_1
	2476583
	2396052
	2289599
	2196186
	1823459
	1776684
	1685129
	1628398
	2039571
	1982504
	1883816
	1815301
	2626907
	2552265
	2417022
	2329062
	13932183
	13554573
	13880753
	13560005
	10346947
	10099364
	10365145
	10159643
	11595476
	11318574
	11557249
	11327975
	14878224
	14510652
	14809286
	14503575
		118607349
		115710813
		0.975578781378884
Broad_mito_2
	34042487
	32510266
	35526147
	33719481
	35131252
	33069895
	34532992
	32684346
		139232878
		131983988
		0.9479369377109335
Sanger_1
	30746371
	30372647
	41563265
	41019908
	31683014
	31290084
	28491590
	28125587
		132484240
		130808226
		0.9873493330225542
Stanford_1
	2594855
	2538156
	2539290
	2480014
	2621771
	2564870
	2585450
	2522

b'Skipping line 3: expected 2 fields, saw 3\nSkipping line 4: expected 2 fields, saw 3\nSkipping line 5: expected 2 fields, saw 3\nSkipping line 6: expected 2 fields, saw 3\nSkipping line 7: expected 2 fields, saw 3\nSkipping line 8: expected 2 fields, saw 3\nSkipping line 9: expected 2 fields, saw 3\nSkipping line 10: expected 2 fields, saw 3\nSkipping line 11: expected 2 fields, saw 3\n'
b'Skipping line 3: expected 2 fields, saw 3\nSkipping line 4: expected 2 fields, saw 3\nSkipping line 5: expected 2 fields, saw 3\nSkipping line 6: expected 2 fields, saw 3\nSkipping line 7: expected 2 fields, saw 3\nSkipping line 8: expected 2 fields, saw 3\nSkipping line 9: expected 2 fields, saw 3\nSkipping line 10: expected 2 fields, saw 3\nSkipping line 11: expected 2 fields, saw 3\n'
b'Skipping line 3: expected 2 fields, saw 3\nSkipping line 4: expected 2 fields, saw 3\nSkipping line 5: expected 2 fields, saw 3\nSkipping line 6: expected 2 fields, saw 3\nSkipping line 7: expected 2 fields, saw 

In [58]:
df

Unnamed: 0_level_0,1
0,Unnamed: 1_level_1
nbr_reads:,48946871
nbr_reads_with_adapters_needed_to_find_bcs,47801962
nbr_reads_with_bc1_bc2_bc3_correct_or_correctable,47351115


In [53]:
for sample in list(set(samples)):
    print(sample)
    filenames = sorted(glob.glob('mapping_stats/mapping_stats/*'+sample+'*'))
    reads_sum = 0
    for file in filenames:
        df = pd.read_csv(file, sep='\t', header=0, index_col=0, error_bad_lines=False)
        reads = df.loc['Reads mapped with MAPQ>30:'].values[0]/2
        print(f"\t{reads}")

CNAG_1
	102255982.5
Broad_1
	155873103.0
CNAG_2
	102935945.5
VIB_1
	103969731.5
Broad_mito_2
	121610414.0
Sanger_1
	119849433.0
Stanford_1
	27450674.5
Broad_2
	150386249.5
Sanger_2
	145414983.5
Stanford_2
	50068685.0
VIB_2
	272973832.5
Broad_mito_1
	127477001.5


In [48]:
df

Unnamed: 0_level_0,1
0,Unnamed: 1_level_1
,CNAG_1
raw total sequences:,227152366
filtered sequences:,0
sequences:,227152366
is sorted:,1
1st fragments:,113576183
last fragments:,113576183
reads mapped:,225848248
reads mapped and paired:,224994436
reads unmapped:,1304118
