# Generate BarcodeGroups (barcodes per cell type grouping)

In [1]:
import os
import json
import time

import anndata
import scprinter as scp
import pandas as pd

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Root data folder, given as a relative path (resolved against the kernel's
# working directory). Every input and output path below is built from it.
zip_folder = "../../../../../../data/hydrop/mcortex/paper_zips"
# Column of `cistopic_anndata.obs` whose values are used as the cell type label.
celltype_label = "cleaned_up_annot"  # annotation column name

In [3]:
# Genome handed to `scp.load_printer` each time a printer object is opened below.
genome = scp.genome.mm10

## Hydrop

In [4]:
# Hydrop inputs: the AnnData read with `anndata.read_h5ad` (source of the cell
# type annotation) and the printer object opened with `scp.load_printer`.
cistopic_fragments_anndata_path = f"{zip_folder}/10x_hydropv2_comparisons_data/printer/cistopic_fragments_mouse_hydropv2_anndata.h5ad"
printer_object_path =  f"{zip_folder}/10x_hydropv2_comparisons_data/printer/printer_mouse_hydropv2.h5ad"
# Hydrop output: the obs_name / cell_type table written at the end of this section.
barcodegroups_out_path = f"{zip_folder}/10x_hydropv2_comparisons_data/printer/barcodegroups_mouse_hydropv2.csv"

In [5]:
# Open the printer object and read the cisTopic AnnData for this dataset
printer = scp.load_printer(printer_object_path, genome)
cistopic_anndata = anndata.read_h5ad(cistopic_fragments_anndata_path)

# Cut every printer obs_name at its LAST "part" marker:
#   "<barcode>part<N>" -> barcode "<barcode>", part "<N>"
# rpartition yields an empty separator when the marker is absent; in that
# case the whole name (returned as the tail) is the barcode and part is None.
name_pieces = [name.rpartition("part") for name in printer.obs_names]
printer_barcodes = [head if marker else tail for head, marker, tail in name_pieces]
printer_parts = [tail if marker else None for _, marker, tail in name_pieces]

# One row per printer cell: original obs_name plus its barcode / part split
printer_df = pd.DataFrame(
    {
        "obs_name": printer.obs_names,
        "barcode": printer_barcodes,
        "part": printer_parts,
    }
)

In [7]:
# Split each entry of `cistopic_anndata.obs.index` on "___": the first piece
# is taken as the barcode and the last piece as the sample name.
index_pieces = [idx.split("___") for idx in cistopic_anndata.obs.index]
cistopic_barcodes = [pieces[0] for pieces in index_pieces]
cistopic_samples = [pieces[-1] for pieces in index_pieces]  # sample name assumed to be the last piece

# One row per cisTopic cell, carrying the cell type from the annotation column
cistopic_df = pd.DataFrame(
    {
        "index": cistopic_anndata.obs.index,
        "barcode": cistopic_barcodes,
        "sample": cistopic_samples,
        "cell_type": cistopic_anndata.obs[celltype_label].values,
    }
)

In [8]:
# Preview the first three rows of the cisTopic barcode / sample / cell type table
cistopic_df.head(3)

Unnamed: 0,index,barcode,sample,cell_type
0,CTCGATATCTACCAAGCGAAGGATGAGGAG___HYA__ac683b__20230829_mouse_cortex_1,CTCGATATCTACCAAGCGAAGGATGAGGAG,HYA__ac683b__20230829_mouse_cortex_1,L5_ET
1,TTATTGGCACCTGTTGAGATGGACGAACGG___HYA__ac683b__20230829_mouse_cortex_1,TTATTGGCACCTGTTGAGATGGACGAACGG,HYA__ac683b__20230829_mouse_cortex_1,L5_IT
2,TCTGATACACGATAACCACATAGTGGTAAT___HYA__ac683b__20230829_mouse_cortex_1,TCTGATACACGATAACCACATAGTGGTAAT,HYA__ac683b__20230829_mouse_cortex_1,L6_IT


In [9]:
# Collect the set of barcodes seen in every printer part and in every cisTopic sample
barcodes_by_part = printer_df.groupby("part")["barcode"].apply(set).to_dict()
barcodes_by_sample = cistopic_df.groupby("sample")["barcode"].apply(set).to_dict()

# Assign to each part the sample it shares the most barcodes with.
part_to_sample = {}
for part, part_barcodes in barcodes_by_part.items():
    shared_counts = {
        sample: len(part_barcodes & sample_barcodes)
        for sample, sample_barcodes in barcodes_by_sample.items()
    }
    # max() keeps the first sample that reaches the highest count, so ties
    # resolve to the earliest sample in iteration order.
    best_sample = max(shared_counts, key=shared_counts.get, default=None)
    # A part that shares no barcode with any sample stays unmapped.
    if not best_sample or shared_counts[best_sample] == 0:
        print(f"No matching sample found for part {part}")
        continue
    part_to_sample[part] = best_sample

# Attach the matched sample to every printer cell and keep only the cells
# whose part received a sample (unmapped parts yield NaN and are dropped).
printer_df = printer_df.assign(
    sample=printer_df["part"].map(part_to_sample)
).dropna(subset=["sample"])

In [10]:
# Preview the first three printer cells together with their matched sample
printer_df.head(3)

Unnamed: 0,obs_name,barcode,part,sample
0,AACAACCATCCCACGCTATTCTTCACCTTCpart2,AACAACCATCCCACGCTATTCTTCACCTTC,2,HYA__177788__20240117_Mouse_Cortex-d-ATAC
1,AAGCAGATAGAAGGTGTGGTATCGACCTTGpart2,AAGCAGATAGAAGGTGTGGTATCGACCTTG,2,HYA__177788__20240117_Mouse_Cortex-d-ATAC
2,AAGCAGATAGAAGGTGTGGTGAACACTGGApart2,AAGCAGATAGAAGGTGTGGTGAACACTGGA,2,HYA__177788__20240117_Mouse_Cortex-d-ATAC


In [11]:
# Inner join on the (barcode, sample) pair: a printer cell is kept only when
# the same barcode exists in cisTopic under the sample matched to its part.
merged_df = printer_df.merge(cistopic_df, how="inner", on=["barcode", "sample"])
merged_df.head(3)

Unnamed: 0,obs_name,barcode,part,sample,index,cell_type
0,AACAACCATCCCACGCTATTCTTCACCTTCpart2,AACAACCATCCCACGCTATTCTTCACCTTC,2,HYA__177788__20240117_Mouse_Cortex-d-ATAC,AACAACCATCCCACGCTATTCTTCACCTTC___HYA__177788__20240117_Mouse_Cortex-d-ATAC,Micro_PVM
1,AAGCAGATAGAAGGTGTGGTATCGACCTTGpart2,AAGCAGATAGAAGGTGTGGTATCGACCTTG,2,HYA__177788__20240117_Mouse_Cortex-d-ATAC,AAGCAGATAGAAGGTGTGGTATCGACCTTG___HYA__177788__20240117_Mouse_Cortex-d-ATAC,OPC
2,AAGCAGATAGAAGGTGTGGTGAACACTGGApart2,AAGCAGATAGAAGGTGTGGTGAACACTGGA,2,HYA__177788__20240117_Mouse_Cortex-d-ATAC,AAGCAGATAGAAGGTGTGGTGAACACTGGA___HYA__177788__20240117_Mouse_Cortex-d-ATAC,Pvalb


In [12]:
# Write the obs_name / cell_type table. The file is tab-separated even though
# the output path ends in ".csv".
merged_df.to_csv(
    barcodegroups_out_path,
    sep="\t",
    index=False,
    columns=["obs_name", "cell_type"],
)
# NOTE(review): the reason for the one-second pause before closing the
# printer object is not visible in this notebook — confirm it is needed.
time.sleep(1)
printer.close()

## 10x

In [13]:
# 10x inputs: the AnnData read with `anndata.read_h5ad` (source of the cell
# type annotation) and the printer object opened with `scp.load_printer`.
# These assignments replace the Hydrop paths set earlier in the notebook.
cistopic_fragments_anndata_path = f"{zip_folder}/10x_hydropv2_comparisons_data/printer/cistopic_fragments_mouse_10x_anndata.h5ad"
printer_object_path =  f"{zip_folder}/10x_hydropv2_comparisons_data/printer/printer_mouse_10x.h5ad"
# 10x output: the obs_name / cell_type table written at the end of this section.
barcodegroups_out_path = f"{zip_folder}/10x_hydropv2_comparisons_data/printer/barcodegroups_mouse_10x.csv"

In [14]:
# Open the 10x printer object and read the matching cisTopic AnnData
printer = scp.load_printer(printer_object_path, genome)
cistopic_anndata = anndata.read_h5ad(cistopic_fragments_anndata_path)

# Cut every printer obs_name at its LAST "part" marker:
#   "<barcode>part<N>" -> barcode "<barcode>", part "<N>"
# rpartition yields an empty separator when the marker is absent; in that
# case the whole name (returned as the tail) is the barcode and part is None.
name_pieces = [name.rpartition("part") for name in printer.obs_names]
printer_barcodes = [head if marker else tail for head, marker, tail in name_pieces]
printer_parts = [tail if marker else None for _, marker, tail in name_pieces]

# One row per printer cell: original obs_name plus its barcode / part split
printer_df = pd.DataFrame(
    {
        "obs_name": printer.obs_names,
        "barcode": printer_barcodes,
        "part": printer_parts,
    }
)

In [15]:
# Split each entry of `cistopic_anndata.obs.index` on "___": the first piece
# is taken as the barcode and the last piece as the sample name.
index_pieces = [idx.split("___") for idx in cistopic_anndata.obs.index]
cistopic_barcodes = [pieces[0] for pieces in index_pieces]
cistopic_samples = [pieces[-1] for pieces in index_pieces]  # sample name assumed to be the last piece

# One row per cisTopic cell, carrying the cell type from the annotation column
cistopic_df = pd.DataFrame(
    {
        "index": cistopic_anndata.obs.index,
        "barcode": cistopic_barcodes,
        "sample": cistopic_samples,
        "cell_type": cistopic_anndata.obs[celltype_label].values,
    }
)

In [16]:
# Collect the set of barcodes seen in every printer part and in every cisTopic sample
barcodes_by_part = printer_df.groupby("part")["barcode"].apply(set).to_dict()
barcodes_by_sample = cistopic_df.groupby("sample")["barcode"].apply(set).to_dict()

# Assign to each part the sample it shares the most barcodes with.
part_to_sample = {}
for part, part_barcodes in barcodes_by_part.items():
    shared_counts = {
        sample: len(part_barcodes & sample_barcodes)
        for sample, sample_barcodes in barcodes_by_sample.items()
    }
    # max() keeps the first sample that reaches the highest count, so ties
    # resolve to the earliest sample in iteration order.
    best_sample = max(shared_counts, key=shared_counts.get, default=None)
    # A part that shares no barcode with any sample stays unmapped.
    if not best_sample or shared_counts[best_sample] == 0:
        print(f"No matching sample found for part {part}")
        continue
    part_to_sample[part] = best_sample

# Attach the matched sample to every printer cell and keep only the cells
# whose part received a sample (unmapped parts yield NaN and are dropped).
printer_df = printer_df.assign(
    sample=printer_df["part"].map(part_to_sample)
).dropna(subset=["sample"])

In [18]:
# Inner join on the (barcode, sample) pair: a printer cell is kept only when
# the same barcode exists in cisTopic under the sample matched to its part.
merged_df = printer_df.merge(cistopic_df, how="inner", on=["barcode", "sample"])

# Write the obs_name / cell_type table. The file is tab-separated even though
# the output path ends in ".csv".
merged_df.to_csv(
    barcodegroups_out_path,
    sep="\t",
    index=False,
    columns=["obs_name", "cell_type"],
)
# NOTE(review): the reason for the one-second pause before closing the
# printer object is not visible in this notebook — confirm it is needed.
time.sleep(1)
printer.close()