# Import fragments into printer object

In [1]:
import os
import json
import time

import anndata
import scprinter as scp

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Root folder of the unpacked paper archives; every path below that starts with
# {zip_folder} lives underneath it.
zip_folder = "../../../../../../data/hydrop/mcortex/paper_zips"

# cisTopic-derived AnnData files (barcodes and annotations), one per technology.
cistopic_fragments_hydrop_anndata_path = (
    f"{zip_folder}/10x_hydropv2_comparisons_data/printer/cistopic_fragments_mouse_hydropv2_anndata.h5ad"
)
cistopic_fragments_10x_anndata_path = (
    f"{zip_folder}/10x_hydropv2_comparisons_data/printer/cistopic_fragments_mouse_10x_anndata.h5ad"
)

# Name of the .obs column that holds the cell type annotation.
celltype_label = "cleaned_up_annot"

# Output locations of the scprinter objects written by the import cells.
printer_hydrop_object_output_path = (
    f"{zip_folder}/10x_hydropv2_comparisons_data/printer/printer_mouse_hydropv2.h5ad"
)
printer_10x_object_output_path = (
    f"{zip_folder}/10x_hydropv2_comparisons_data/printer/printer_mouse_10x.h5ad"
)

# Reference genome FASTA (Cell Ranger ARC mm10 2020-A reference).
fa_file_path = (
    "../../../../../../../../../res_00001/genomes/10xgenomics/CellRangerARC/refdata-cellranger-arc-mm10-2020-A-2.0.0/fasta/genome.fa"
)

# Precomputed Tn5 bias file: download from the scprinter Zenodo record or create
# one with scp.genome.predict_genome_tn5_bias.
precomputed_bias_path = "../../../../../../data/PRINT/biases/mm10Tn5Bias.h5"

In [4]:
# scprinter's mm10 genome object; it is passed as `genome=` to both
# scp.pp.import_fragments calls further down.
# NOTE(review): `precomputed_bias_path` from the configuration cell is not
# referenced by any visible cell, and the captured output of the HyDrop import
# shows mm10Tn5Bias.tar.gz being downloaded from Zenodo into the scprinter cache —
# confirm whether the local bias file was meant to be wired in here.
genome = scp.genome.mm10

## Hydropv2

In [5]:
# Fragment files themselves are on GEO (not Zenodo). Every file is expected at
# <fragments_hydrop_dir>/<sample name>.fragments.tsv.gz, so the dictionary is
# derived from the sample names instead of repeating the full path per entry.
rel_path = "../../../../../../.."
fragments_hydrop_dir = f"{rel_path}/hydrop_v2_paper/mcortex/cistopic/mcortex_all_otsu_notdownsampled/fragments"

# HyDrop v2 sample names. The order matters: later cells build the list of
# fragment paths and the per-sample barcode lists by iterating the dictionary.
hydrop_sample_names = [
    "HYA__76b206__20240430_Mouse_Cortex-a-ATAC",
    "HYA__ba8433__20231116_HyATAC_MouseCortex_B2",
    "HYA__177788__20240117_Mouse_Cortex-d-ATAC",
    "HYA__67df33__20240105_Mouse_Cortex-d-ATAC",
    "HYA__b606e2__20240117_Mouse_Cortex-a-ATAC",
    "HYA__e1bdab__20230829_mouse_cortex_2",
    "HYA__955707__20240304_Mouse_Cortex-b2-ATAC",
    "HYA__a52ea6__20231031_HyDropAtac_MouseCortex_C1",
    "HYA__7ad445__20231031_HyDropAtac_MouseCortex_A1",
    "HYA__853a40__20240304_Mouse_Cortex-b1-ATAC",
    "HYA__fcb55f__20231013_HyDropATAC_MouseCortex_B",
    "HYA__ac683b__20230829_mouse_cortex_1",
    "HYA__14fb45__20231013_HyDropATAC_MouseCortex_D",
    "HYA__30fc7f__20231031_HyDropAtac_MouseCortex_D1",
    "HYA__3eb2c7__20240221_Mouse_Cortex-b-ATAC",
    "HYA__831196__20240105_Mouse_Cortex-a-ATAC",
    "HYA__8d1d23__20231116_HyATAC_MouseCortex_C1",
    "HYA__cc24b7__20240304_Mouse_Cortex-c-ATAC",
    "HYA__a7a748__20231031_HyDropAtac_MouseCortex_C2",
    "HYA__167ec9__20240304_Mouse_Cortex-a2-ATAC",
    "HYA__247495__20231013_HyDropATAC_MouseCortex_C",
    "HYA__87f862__20240117_Mouse_Cortex-b-ATAC",
    "HYA__7e2d51__20231031_HyDropAtac_MouseCortex_B1",
    "HYA__00c24b__20240117_Mouse_Cortex-c-ATAC",
    "HYA__fd6250__20240105_Mouse_Cortex-b-ATAC",
    "HYA__c0d7e5__20240304_Mouse_Cortex-a1-ATAC",
    "HYA__3e5c73__20231031_HyDropAtac_MouseCortex_D2",
    "HYA__9667aa__20231116_HyATAC_MouseCortex_C2",
    "HYA__66fdea__20231116_HyATAC_MouseCortex_B1",
    "HYA__2eb25d__20240430_Mouse_Cortex-b-ATAC",
    "HYA__21de27__20240105_Mouse_Cortex-c-ATAC",
    "HYA__d4ff32__20231116_HyATAC_MouseCortex_A",
    "HYA__6b44d3__20240221_Mouse_Cortex-a-ATAC",
    "HYA__7a1cc3__20231013_HyDropATAC_MouseCortex_A",
    "HYA__0f6aa5__20231031_HyDropAtac_MouseCortex_B2",
]

# Sample name -> fragment file path, in the order listed above.
fragments_hydrop_dict = {
    sample_name: f"{fragments_hydrop_dir}/{sample_name}.fragments.tsv.gz"
    for sample_name in hydrop_sample_names
}

# Fail early, and list every missing file at once rather than only the first.
missing_hydrop_fragment_files = [
    fragment_path
    for fragment_path in fragments_hydrop_dict.values()
    if not os.path.exists(fragment_path)
]
if missing_hydrop_fragment_files:
    raise FileNotFoundError(
        f"{len(missing_hydrop_fragment_files)} fragment file(s) do not exist. Please check the path(s):\n"
        + "\n".join(missing_hydrop_fragment_files)
    )

In [6]:
# Get cell barcodes per sample from cistopic_anndata
cistopic_fragments_hydrop_anndata = anndata.read_h5ad(cistopic_fragments_hydrop_anndata_path)
print(
    f"No. of cells in cistopic anndata: {cistopic_fragments_hydrop_anndata.shape[0]}"
)
print(
    f"Unique celltypes in cistopic anndata: {list(cistopic_fragments_hydrop_anndata.obs[celltype_label].unique())}"
)
print(
    f"Total fragments in cistopic anndata: {cistopic_fragments_hydrop_anndata.obs['Total_nr_frag'].sum()}"
)

# Group barcodes by sample in a single pass over the obs index (entries look like
# "<barcode>___<sample>"). The dictionary is pre-seeded with the samples of
# fragments_hydrop_dict so that the resulting lists follow the same order as the
# fragment files, including samples that end up with no cells.
hydrop_barcodes_by_sample = {sample_name: [] for sample_name in fragments_hydrop_dict}
hydrop_samples_without_fragments = set()
for index in cistopic_fragments_hydrop_anndata.obs.index:
    barcode, sample = index.split("___")
    if sample in hydrop_barcodes_by_sample:
        hydrop_barcodes_by_sample[sample].append(barcode)
    else:
        # Cell belongs to a sample that has no fragment file configured.
        hydrop_samples_without_fragments.add(sample)

# One barcode list per fragment file, aligned with fragments_hydrop_dict.
unique_barcodes_list_of_lists = list(hydrop_barcodes_by_sample.values())

# Every cell of the AnnData must have been assigned to exactly one fragment file.
n_hydrop_barcodes_assigned = sum(len(fraglist) for fraglist in unique_barcodes_list_of_lists)
assert n_hydrop_barcodes_assigned == cistopic_fragments_hydrop_anndata.obs.shape[0], (
    f"Number of barcodes in cistopic fragments anndata: {cistopic_fragments_hydrop_anndata.obs.shape[0]} "
    f"does not match the number of barcodes in fragments_dict: {n_hydrop_barcodes_assigned} "
    f"(samples without a fragment file: {sorted(hydrop_samples_without_fragments)})"
)

No. of cells in cistopic anndata: 67080
Unique celltypes in cistopic anndata: ['L5_ET', 'L5_IT', 'L6_IT', 'Sncg_Vip', 'L6_CT', 'Sst', 'Oligo', 'OPC', 'Astro', 'L2_3_IT', 'L6b', 'Pvalb', 'Endo_VLMC', 'Lamp5', 'Micro_PVM', 'L5_6_NP']
Total fragments in cistopic anndata: 1635008233


In [7]:
# Import fragments into scprinter object
# Paths are taken in the iteration order of fragments_hydrop_dict, which is the
# order the per-sample barcode lists were built in, so entry i of both lists
# refers to the same sample.
fragments_paths = list(fragments_hydrop_dict.values())

# `unique_barcodes_list_of_lists` is reused by the 10x section of this notebook.
# Guard against stale state from out-of-order execution before starting the
# long-running import.
if len(unique_barcodes_list_of_lists) != len(fragments_paths):
    raise ValueError(
        f"Got {len(unique_barcodes_list_of_lists)} barcode lists for {len(fragments_paths)} HyDrop fragment files; "
        "re-run the HyDrop barcode cell before importing."
    )

printer = scp.pp.import_fragments(
    pathToFrags=fragments_paths,
    barcodes=unique_barcodes_list_of_lists,
    savename=printer_hydrop_object_output_path,
    genome=genome,
    # Thresholds of 0: presumably no extra filtering is wanted because the
    # barcodes were already selected from the cisTopic AnnData — TODO confirm.
    min_num_fragments=0,
    min_tsse=0,
    sorted_by_barcode=False,
    low_memory=False,
    auto_detect_shift=False,
)

# Rename cells to "<current obs name><sample>"; no separator is inserted here.
# NOTE(review): the cisTopic index uses "<barcode>___<sample>" — confirm the
# resulting names have the format downstream steps expect.
printer.insertion_file.obs_names = [
    f"{barcode}{sample}" for barcode, sample in zip(printer.obs_names, printer.obs["sample"])
]

# Short pause before closing — presumably lets pending writes settle; TODO confirm it is needed.
time.sleep(1)
printer.close()

100%|██████████| 35/35 [09:22<00:00, 16.08s/it]:19<00:00, 126.25it/s]


start transferring insertions


Downloading file 'mm10Tn5Bias.tar.gz' from 'https://zenodo.org/record/7121027/files/mm10Tn5Bias.tar.gz' to '/data/leuven/358/vsc35862/.cache/scprinter'.
100%|█████████████████████████████████████| 9.77G/9.77G [00:00<00:00, 8.43TB/s]
Untarring contents of '/data/leuven/358/vsc35862/.cache/scprinter/mm10Tn5Bias.tar.gz' to '/data/leuven/358/vsc35862/.cache/scprinter/mm10Tn5Bias.tar.gz.untar'


creating bias bigwig (runs for new bias h5 file)


Importing fragments: 100%|██████████| 35/35 [56:21<00:00, 96.60s/it] 
  xx + f"{sample}" for xx, sample in zip(printer.obs_names, printer.obs["sample"])


## 10x

In [25]:
# Fragment files themselves are on GEO (not Zenodo). Every file is expected at
# <fragments_10x_dir>/<sample name>.fragments.tsv.gz, so the dictionary is
# derived from the sample names instead of repeating the full path per entry.
rel_path = "../../../../../../.."
fragments_10x_dir = f"{rel_path}/hydrop_v2_paper/mcortex/cistopic/mcortex_all_otsu_notdownsampled/fragments"

# 10x sample names. The order matters: later cells build the list of fragment
# paths and the per-sample barcode lists by iterating the dictionary.
tenx_sample_names = [
    "atac_v1_adult_brain_fresh_5k",
    # NOTE(review): carries the HYA__ prefix used by the HyDrop samples but is
    # listed under 10x — confirm this assignment is intended.
    "HYA__e9082f__20240430_Mouse_Cortex-d-ATAC",
    "8k_mouse_cortex_ATACv2_nextgem_Chromium_Controller",
    "8k_mouse_cortex_ATACv1-1_nextgem_Chromium_X",
]

# Sample name -> fragment file path, in the order listed above.
fragments_10x_dict = {
    sample_name: f"{fragments_10x_dir}/{sample_name}.fragments.tsv.gz"
    for sample_name in tenx_sample_names
}

# Fail early, and list every missing file at once rather than only the first.
missing_10x_fragment_files = [
    fragment_path
    for fragment_path in fragments_10x_dict.values()
    if not os.path.exists(fragment_path)
]
if missing_10x_fragment_files:
    raise FileNotFoundError(
        f"{len(missing_10x_fragment_files)} fragment file(s) do not exist. Please check the path(s):\n"
        + "\n".join(missing_10x_fragment_files)
    )

In [26]:
# Get cell barcodes per sample from cistopic_anndata
cistopic_fragments_10x_anndata = anndata.read_h5ad(cistopic_fragments_10x_anndata_path)
print(
    f"No. of cells in cistopic anndata: {cistopic_fragments_10x_anndata.shape[0]}"
)
print(
    f"Unique celltypes in cistopic anndata: {list(cistopic_fragments_10x_anndata.obs[celltype_label].unique())}"
)
print(
    f"Total fragments in cistopic anndata: {cistopic_fragments_10x_anndata.obs['Total_nr_frag'].sum()}"
)

# Group barcodes by sample in a single pass over the obs index (entries look like
# "<barcode>___<sample>"). The dictionary is pre-seeded with the samples of
# fragments_10x_dict so that the resulting lists follow the same order as the
# fragment files, including samples that end up with no cells.
tenx_barcodes_by_sample = {sample_name: [] for sample_name in fragments_10x_dict}
tenx_samples_without_fragments = set()
for index in cistopic_fragments_10x_anndata.obs.index:
    barcode, sample = index.split("___")
    if sample in tenx_barcodes_by_sample:
        tenx_barcodes_by_sample[sample].append(barcode)
    else:
        # Cell belongs to a sample that has no fragment file configured.
        tenx_samples_without_fragments.add(sample)

# One barcode list per fragment file, aligned with fragments_10x_dict.
unique_barcodes_list_of_lists = list(tenx_barcodes_by_sample.values())

# Every cell of the AnnData must have been assigned to exactly one fragment file.
n_10x_barcodes_assigned = sum(len(fraglist) for fraglist in unique_barcodes_list_of_lists)
assert n_10x_barcodes_assigned == cistopic_fragments_10x_anndata.obs.shape[0], (
    f"Number of barcodes in cistopic fragments anndata: {cistopic_fragments_10x_anndata.obs.shape[0]} "
    f"does not match the number of barcodes in fragments_dict: {n_10x_barcodes_assigned} "
    f"(samples without a fragment file: {sorted(tenx_samples_without_fragments)})"
)

No. of cells in cistopic anndata: 19478
Unique celltypes in cistopic anndata: ['L6_CT', 'L2_3_IT', 'Micro_PVM', 'L5_ET', 'Astro', 'Oligo', 'L5_IT', 'Sncg_Vip', 'Pvalb', 'L6_IT', 'L6b', 'Lamp5', 'Sst', 'OPC', 'L5_6_NP', 'Endo_VLMC']
Total fragments in cistopic anndata: 788042726


In [27]:
# Import fragments into scprinter object
# Paths are taken in the iteration order of fragments_10x_dict, which is the
# order the per-sample barcode lists were built in, so entry i of both lists
# refers to the same sample.
fragments_paths = list(fragments_10x_dict.values())

# `unique_barcodes_list_of_lists` is reused by the HyDrop section of this
# notebook. Guard against stale state from out-of-order execution before
# starting the long-running import.
if len(unique_barcodes_list_of_lists) != len(fragments_paths):
    raise ValueError(
        f"Got {len(unique_barcodes_list_of_lists)} barcode lists for {len(fragments_paths)} 10x fragment files; "
        "re-run the 10x barcode cell before importing."
    )

printer = scp.pp.import_fragments(
    pathToFrags=fragments_paths,
    barcodes=unique_barcodes_list_of_lists,
    savename=printer_10x_object_output_path,
    genome=genome,
    # Thresholds of 0: presumably no extra filtering is wanted because the
    # barcodes were already selected from the cisTopic AnnData — TODO confirm.
    min_num_fragments=0,
    min_tsse=0,
    sorted_by_barcode=False,
    low_memory=False,
    auto_detect_shift=False,
)

# Rename cells to "<current obs name><sample>"; no separator is inserted here.
# NOTE(review): the cisTopic index uses "<barcode>___<sample>" — confirm the
# resulting names have the format downstream steps expect.
printer.insertion_file.obs_names = [
    f"{barcode}{sample}" for barcode, sample in zip(printer.obs_names, printer.obs["sample"])
]

# Short pause before closing — presumably lets pending writes settle; TODO confirm it is needed.
time.sleep(1)
printer.close()

100%|██████████| 4/4 [08:20<00:00, 125.25s/it]00<00:00, 517.22it/s]


start transferring insertions


Importing fragments: 100%|██████████| 4/4 [16:51<00:00, 252.78s/it]
  xx + f"{sample}" for xx, sample in zip(printer.obs_names, printer.obs["sample"])
