# Generate BarcodeGroups (barcodes per cell type grouping)

In [7]:
import os
import json
import time

import anndata
import scprinter as scp
import pandas as pd

In [2]:
# Root folder of the paper data; every input/output path below is built from it.
zip_folder = "../../../../../../data/hydrop/fly_embryo/paper_zips"
# Column of `cistopic_anndata.obs` that holds the cell-type annotation used for grouping.
celltype_label = "final_embryo_annot_atlas_05112024"

# Genome FASTA handed to scp.genome.Genome as `fa_file`.
fa_file_path = "../../../../../../../../../res_00001/genomes/drosophila_melanogaster/dm6_cellranger/indexes/bwa2/2.2.1/genome.fa"
# Bias file handed to scp.genome.Genome as `bias_file`.
precomputed_bias_path = "../../../../../../data/PRINT/biases/dm6Tn5Bias.tar.gz.untar/dm6Tn5Bias.h5" # download from scprinter Zenodo or create one with scp.genome.predict_genome_tn5_bias

In [4]:
# dm6 genome handle (FASTA + precomputed bias file); passed to scp.load_printer further down.
genome = scp.genome.Genome(name="dm6", fa_file=fa_file_path, bias_file=precomputed_bias_path)

## HyDrop

In [8]:
# AnnData read with anndata.read_h5ad; its obs index and `celltype_label` column are used below.
cistopic_fragments_anndata_path = f"{zip_folder}/10x_hydropv2_comparisons_data/printer/cistopic_fragments_fly_hydropv2_anndata.h5ad" # barcodes and annotations
# Printer object opened with scp.load_printer; its obs_names are the rows of the output table.
printer_object_path = f"{zip_folder}/10x_hydropv2_comparisons_data/printer/printer_fly_hydropv2.h5ad"
# Destination of the obs_name -> cell_type table. NOTE: written tab-separated despite the .csv suffix.
barcodegroups_out_path = f"{zip_folder}/10x_hydropv2_comparisons_data/printer/barcodegroups_fly_hydropv2.csv"

In [9]:
# Open the printer object and the annotated cisTopic AnnData for this dataset.
printer = scp.load_printer(printer_object_path, genome)
cistopic_anndata = anndata.read_h5ad(cistopic_fragments_anndata_path)

# Split every obs_name at its last "part" into (barcode, part).
# Names without "part" are kept whole as the barcode, with part set to None.
split_obs_names = [
    tuple(obs_name.rsplit("part", 1)) if "part" in obs_name else (obs_name, None)
    for obs_name in printer.obs_names
]
printer_barcodes = [pair[0] for pair in split_obs_names]
printer_parts = [pair[1] for pair in split_obs_names]

# One row per printer cell, in obs_names order.
printer_df = pd.DataFrame(
    {
        "obs_name": printer.obs_names,
        "barcode": printer_barcodes,
        "part": printer_parts,
    }
)

In [23]:
# Break each cisTopic obs index on "___": the first field is taken as the barcode,
# the last field as the sample name (assumes the sample name is the final field).
index_fields = [obs_id.split("___") for obs_id in cistopic_anndata.obs.index]
cistopic_barcodes = [fields[0] for fields in index_fields]
cistopic_samples = [fields[-1] for fields in index_fields]

# One row per annotated cell: original index, parsed barcode/sample, and cell-type label.
cistopic_df = pd.DataFrame(
    {
        "index": cistopic_anndata.obs.index,
        "barcode": cistopic_barcodes,
        "sample": cistopic_samples,
        "cell_type": cistopic_anndata.obs[celltype_label].values,
    }
)

In [25]:
# Preview the first rows to check the barcode / sample split of the cisTopic index.
cistopic_df.head(3)

Unnamed: 0,index,barcode,sample,cell_type
0,CTCGATATCTGCTCGTTCATGATCTAGGTA___FDM__0a2768__240229_HyDropATAC_Dros_DGRP_Embryo_16_20AEL_Dechor_sample_X3,CTCGATATCTGCTCGTTCATGATCTAGGTA,FDM__0a2768__240229_HyDropATAC_Dros_DGRP_Embryo_16_20AEL_Dechor_sample_X3,Neuronal
1,CTACGCTGATCTGATTATGTTAATCGCACA___FDM__0a2768__240229_HyDropATAC_Dros_DGRP_Embryo_16_20AEL_Dechor_sample_X3,CTACGCTGATCTGATTATGTTAATCGCACA,FDM__0a2768__240229_HyDropATAC_Dros_DGRP_Embryo_16_20AEL_Dechor_sample_X3,Neuronal
2,GTTACCGCTCGGACCAAGTGTGGCTTAACA___FDM__0a2768__240229_HyDropATAC_Dros_DGRP_Embryo_16_20AEL_Dechor_sample_X3,GTTACCGCTCGGACCAAGTGTGGCTTAACA,FDM__0a2768__240229_HyDropATAC_Dros_DGRP_Embryo_16_20AEL_Dechor_sample_X3,Tracheal_system


In [26]:
# Group barcodes by part number (printer side) and by sample name (cisTopic side).
# NOTE: groupby skips rows whose "part" is missing, so obs_names without a "part"
# suffix never get a sample and are removed by the dropna at the end of this cell.
barcodes_by_part = printer_df.groupby("part")["barcode"].apply(set).to_dict()
barcodes_by_sample = cistopic_df.groupby("sample")["barcode"].apply(set).to_dict()

# Make that otherwise silent loss visible.
n_without_part = int(printer_df["part"].isna().sum())
if n_without_part:
    print(f"{n_without_part} obs_names have no part suffix and will be dropped")

# For each part, pick the sample sharing the most barcodes with it.
# Ties go to the first sample in iteration order; zero overlap means no match.
part_to_sample = {}
for part, part_barcodes in barcodes_by_part.items():
    overlap_by_sample = {
        sample_name: len(part_barcodes & sample_barcodes)
        for sample_name, sample_barcodes in barcodes_by_sample.items()
    }
    best_sample = max(overlap_by_sample, key=overlap_by_sample.get, default=None)
    if best_sample and overlap_by_sample[best_sample] > 0:
        part_to_sample[part] = best_sample
    else:
        print(f"No matching sample found for part {part}")

# The greedy match does not enforce a one-to-one mapping; flag samples claimed by
# more than one part so an ambiguous assignment is not accepted unnoticed.
parts_by_sample = {}
for mapped_part, mapped_sample in part_to_sample.items():
    parts_by_sample.setdefault(mapped_sample, []).append(mapped_part)
for mapped_sample, mapped_parts in parts_by_sample.items():
    if len(mapped_parts) > 1:
        print(f"Warning: parts {mapped_parts} all map to sample {mapped_sample}")

# Attach the matched sample to every printer cell and keep only matched cells.
# assign() builds a new frame, so re-running this cell never writes into a frame
# that an earlier cell created or displayed.
printer_df = printer_df.assign(sample=printer_df["part"].map(part_to_sample)).dropna(
    subset=["sample"]
)

In [27]:
# Preview the printer cells together with the sample assigned to their part.
printer_df.head(3)

Unnamed: 0,obs_name,barcode,part,sample
0,AAGCAGATAGACGTTCGCACAGAGGAATCCpart2,AAGCAGATAGACGTTCGCACAGAGGAATCC,2,FDM__8521d7__240208_HyDropATAC_Dros_DGRP_Embryo_16_20AEL_sampleV4
1,AAGCAGATAGATCGCTGCTCGATCTAGGTApart2,AAGCAGATAGATCGCTGCTCGATCTAGGTA,2,FDM__8521d7__240208_HyDropATAC_Dros_DGRP_Embryo_16_20AEL_sampleV4
2,AAGCAGATAGCCGTACCTATTGAATCCAAGpart2,AAGCAGATAGCCGTACCTATTGAATCCAAG,2,FDM__8521d7__240208_HyDropATAC_Dros_DGRP_Embryo_16_20AEL_sampleV4


In [28]:
# Inner-join printer cells to the cisTopic annotations on the (barcode, sample) pair;
# rows without a partner on the other side are left out.
merged_df = printer_df.merge(cistopic_df, how="inner", on=["barcode", "sample"])
merged_df.head(3)

Unnamed: 0,obs_name,barcode,part,sample,index,cell_type
0,AAGCAGATAGACGTTCGCACAGAGGAATCCpart2,AAGCAGATAGACGTTCGCACAGAGGAATCC,2,FDM__8521d7__240208_HyDropATAC_Dros_DGRP_Embryo_16_20AEL_sampleV4,AAGCAGATAGACGTTCGCACAGAGGAATCC___FDM__8521d7__240208_HyDropATAC_Dros_DGRP_Embryo_16_20AEL_sampleV4,Neuronal
1,AAGCAGATAGATCGCTGCTCGATCTAGGTApart2,AAGCAGATAGATCGCTGCTCGATCTAGGTA,2,FDM__8521d7__240208_HyDropATAC_Dros_DGRP_Embryo_16_20AEL_sampleV4,AAGCAGATAGATCGCTGCTCGATCTAGGTA___FDM__8521d7__240208_HyDropATAC_Dros_DGRP_Embryo_16_20AEL_sampleV4,Neuronal
2,AAGCAGATAGCCGTACCTATTGAATCCAAGpart2,AAGCAGATAGCCGTACCTATTGAATCCAAG,2,FDM__8521d7__240208_HyDropATAC_Dros_DGRP_Embryo_16_20AEL_sampleV4,AAGCAGATAGCCGTACCTATTGAATCCAAG___FDM__8521d7__240208_HyDropATAC_Dros_DGRP_Embryo_16_20AEL_sampleV4,Epidermis


In [29]:
# Write the obs_name -> cell_type table (tab-separated, header row, no index column).
barcode_groups = merged_df[["obs_name", "cell_type"]]
barcode_groups.to_csv(barcodegroups_out_path, sep="\t", index=False)

# Pause one second, then close the printer object.
# NOTE(review): the reason for the pause is not visible here - confirm it is still needed.
time.sleep(1)
printer.close()

## 10x

In [30]:
# AnnData read with anndata.read_h5ad; its obs index and `celltype_label` column are used below.
cistopic_fragments_anndata_path = f"{zip_folder}/10x_hydropv2_comparisons_data/printer/cistopic_fragments_fly_10x_anndata.h5ad" # barcodes and annotations
# Printer object opened with scp.load_printer; its obs_names are the rows of the output table.
printer_object_path = f"{zip_folder}/10x_hydropv2_comparisons_data/printer/printer_fly_10x.h5ad"
# Destination of the obs_name -> cell_type table. NOTE: written tab-separated despite the .csv suffix.
barcodegroups_out_path = f"{zip_folder}/10x_hydropv2_comparisons_data/printer/barcodegroups_fly_10x.csv"

In [31]:
# Open the printer object and the annotated cisTopic AnnData for this dataset.
printer = scp.load_printer(printer_object_path, genome)
cistopic_anndata = anndata.read_h5ad(cistopic_fragments_anndata_path)

# Split every obs_name at its last "part" into (barcode, part).
# Names without "part" are kept whole as the barcode, with part set to None.
split_obs_names = [
    tuple(obs_name.rsplit("part", 1)) if "part" in obs_name else (obs_name, None)
    for obs_name in printer.obs_names
]
printer_barcodes = [pair[0] for pair in split_obs_names]
printer_parts = [pair[1] for pair in split_obs_names]

# One row per printer cell, in obs_names order.
printer_df = pd.DataFrame(
    {
        "obs_name": printer.obs_names,
        "barcode": printer_barcodes,
        "part": printer_parts,
    }
)

In [32]:
# Break each cisTopic obs index on "___": the first field is taken as the barcode,
# the last field as the sample name (assumes the sample name is the final field).
index_fields = [obs_id.split("___") for obs_id in cistopic_anndata.obs.index]
cistopic_barcodes = [fields[0] for fields in index_fields]
cistopic_samples = [fields[-1] for fields in index_fields]

# One row per annotated cell: original index, parsed barcode/sample, and cell-type label.
cistopic_df = pd.DataFrame(
    {
        "index": cistopic_anndata.obs.index,
        "barcode": cistopic_barcodes,
        "sample": cistopic_samples,
        "cell_type": cistopic_anndata.obs[celltype_label].values,
    }
)

In [36]:
# Group barcodes by part number (printer side) and by sample name (cisTopic side).
# NOTE: groupby skips rows whose "part" is missing, so obs_names without a "part"
# suffix never get a sample and are removed by the dropna at the end of this cell.
barcodes_by_part = printer_df.groupby("part")["barcode"].apply(set).to_dict()
barcodes_by_sample = cistopic_df.groupby("sample")["barcode"].apply(set).to_dict()

# Make that otherwise silent loss visible.
n_without_part = int(printer_df["part"].isna().sum())
if n_without_part:
    print(f"{n_without_part} obs_names have no part suffix and will be dropped")

# For each part, pick the sample sharing the most barcodes with it.
# Ties go to the first sample in iteration order; zero overlap means no match.
part_to_sample = {}
for part, part_barcodes in barcodes_by_part.items():
    overlap_by_sample = {
        sample_name: len(part_barcodes & sample_barcodes)
        for sample_name, sample_barcodes in barcodes_by_sample.items()
    }
    best_sample = max(overlap_by_sample, key=overlap_by_sample.get, default=None)
    if best_sample and overlap_by_sample[best_sample] > 0:
        part_to_sample[part] = best_sample
    else:
        print(f"No matching sample found for part {part}")

# The greedy match does not enforce a one-to-one mapping; flag samples claimed by
# more than one part so an ambiguous assignment is not accepted unnoticed.
parts_by_sample = {}
for mapped_part, mapped_sample in part_to_sample.items():
    parts_by_sample.setdefault(mapped_sample, []).append(mapped_part)
for mapped_sample, mapped_parts in parts_by_sample.items():
    if len(mapped_parts) > 1:
        print(f"Warning: parts {mapped_parts} all map to sample {mapped_sample}")

# Attach the matched sample to every printer cell and keep only matched cells.
# assign() builds a new frame, so re-running this cell never writes into a frame
# that an earlier cell created or displayed.
printer_df = printer_df.assign(sample=printer_df["part"].map(part_to_sample)).dropna(
    subset=["sample"]
)

In [37]:
# Inner-join printer cells to the cisTopic annotations on the (barcode, sample) pair;
# rows without a partner on the other side are left out.
merged_df = printer_df.merge(cistopic_df, how="inner", on=["barcode", "sample"])

# Write the obs_name -> cell_type table (tab-separated, header row, no index column).
barcode_groups = merged_df[["obs_name", "cell_type"]]
barcode_groups.to_csv(barcodegroups_out_path, sep="\t", index=False)

# Pause one second, then close the printer object.
# NOTE(review): the reason for the pause is not visible here - confirm it is still needed.
time.sleep(1)
printer.close()