In [69]:
import pandas as pd

benchmark_path = "/cellar/users/aklie/data/datasets/tf_perturb_seq/scratch/2025_12_14/benchmark_guide_metadata_v3.tsv"
production_path = "/cellar/users/aklie/data/datasets/tf_perturb_seq/scratch/2025_12_14/production_guide_metadata_v3.tsv"

benchmark_df = pd.read_csv(benchmark_path, sep="\t")
production_df = pd.read_csv(production_path, sep="\t")

In [70]:
label_order = [
    "positive_control",
    "non_targeting",
    "negative_control",
    "tf_targeting"
]

benchmark_counts = (
    benchmark_df["label"]
    .value_counts()
    .reindex(label_order, fill_value=0)
)

production_counts = (
    production_df["label"]
    .value_counts()
    .reindex(label_order, fill_value=0)
)


In [71]:
label_count_df = pd.DataFrame({
    "benchmark": benchmark_counts,
    "production": production_counts
})

label_count_df

Unnamed: 0,benchmark,production
positive_control,8,19
non_targeting,30,600
negative_control,54,619
tf_targeting,324,12934


In [72]:
cols_to_match = [
    "guide_id",
    "spacer",
    "targeting",
    "type",
    "guide_chr",
    "guide_start",
    "guide_end",
    "strand",
    "pam",
    "genomic_element",
    "intended_target_name",
    "intended_target_chr",
    "intended_target_start",
    "intended_target_end",
    "putative_target_genes",
    "reporter",
    "imperfect",
    "gene_name",
    "label",
]

In [73]:
benchmark_pc = benchmark_df.query("label == 'positive_control'").copy()
production_pc = production_df.query("label == 'positive_control'").copy()


In [74]:
benchmark_pc = benchmark_pc[cols_to_match]
production_pc = production_pc[cols_to_match]

In [75]:
benchmark_pc = benchmark_pc.fillna("NA").astype(str)
production_pc = production_pc.fillna("NA").astype(str)

In [76]:
merged = benchmark_pc.merge(
    production_pc.drop_duplicates(),
    on=cols_to_match,
    how="left",
    indicator=True,
)

missing = merged.query("_merge == 'left_only'")
missing

Unnamed: 0,guide_id,spacer,targeting,type,guide_chr,guide_start,guide_end,strand,pam,genomic_element,intended_target_name,intended_target_chr,intended_target_start,intended_target_end,putative_target_genes,reporter,imperfect,gene_name,label,_merge
5,NGFRAP1#chrX:103376261-103376279(-),GTTGGAGTTTGCCCTCCTC,True,positive control,chrX,103376261.0,103376279.0,-,NGG,promoter,ENSG00000166681,chrX,103376258.0,103376279.0,,,,NGFRAP1,positive_control,left_only


In [77]:
print(f"Benchmark positive controls: {len(benchmark_pc)}")
print(f"Production positive controls: {len(production_pc)}")

Benchmark positive controls: 8
Production positive controls: 19


In [78]:
dups = production_pc[production_pc.duplicated(subset=cols_to_match, keep=False)]
print(f"Duplicate production positive control rows: {len(dups)}")

Duplicate production positive control rows: 0


In [79]:
benchmark_nt = benchmark_df.query("label == 'non_targeting'").copy()
production_nt = production_df.query("label == 'non_targeting'").copy()

In [80]:
benchmark_nt = benchmark_nt[cols_to_match]
production_nt = production_nt[cols_to_match]

In [81]:
benchmark_nt = benchmark_nt.fillna("NA").astype(str)
production_nt = production_nt.fillna("NA").astype(str)

In [82]:
merged = benchmark_nt.merge(
    production_nt.drop_duplicates(),
    on=cols_to_match,
    how="left",
    indicator=True,
)

missing = merged.query("_merge == 'left_only'")

In [83]:
missing

Unnamed: 0,guide_id,spacer,targeting,type,guide_chr,guide_start,guide_end,strand,pam,genomic_element,intended_target_name,intended_target_chr,intended_target_start,intended_target_end,putative_target_genes,reporter,imperfect,gene_name,label,_merge


In [84]:
print(f"Benchmark non_targeting guides: {len(benchmark_nt)}")
print(f"Production non_targeting guides: {len(production_nt)}")

if missing.empty:
    print("✅ All benchmark non_targeting guides are present in production (exact match).")
else:
    print(f"❌ {len(missing)} benchmark non_targeting guides are NOT present in production.")


Benchmark non_targeting guides: 30
Production non_targeting guides: 600
✅ All benchmark non_targeting guides are present in production (exact match).


In [85]:
benchmark_neg = benchmark_df.query("label == 'negative_control'").copy()
production_neg = production_df.query("label == 'negative_control'").copy()


In [87]:
bench_genes = set(
    benchmark_neg["gene_name"]
    .dropna()
    .astype(str)
)

prod_genes = set(
    production_neg["gene_name"]
    .dropna()
    .astype(str)
)

In [88]:
common_genes = bench_genes & prod_genes
bench_only_genes = bench_genes - prod_genes
prod_only_genes = prod_genes - bench_genes

In [89]:
common_genes

set()

In [None]:
print("Benchmark negative_control gene_name count:", len(bench_genes))
print("Production negative_control gene_name count:", len(prod_genes))
print("Shared gene_name count:", len(common_genes))
print("Benchmark-only gene_name count:", len(bench_only_genes))
print("Production-only gene_name count:", len(prod_only_genes))

Benchmark negative_control gene_name count: 9
Production negative_control gene_name count: 100
Shared gene_name count: 0
Benchmark-only gene_name count: 9
Production-only gene_name count: 100


In [91]:
benchmark_tf = benchmark_df.query("label == 'tf_targeting'").copy()
production_tf = production_df.query("label == 'tf_targeting'").copy()


In [92]:
bench_tfs = set(
    benchmark_tf["gene_name"]
    .dropna()
    .astype(str)
)

prod_tfs = set(
    production_tf["gene_name"]
    .dropna()
    .astype(str)
)

In [93]:
common_tfs = bench_tfs & prod_tfs
bench_only_tfs = bench_tfs - prod_tfs
prod_only_tfs = prod_tfs - bench_tfs

In [94]:
print("Benchmark tf_targeting gene_name count:", len(bench_tfs))
print("Production tf_targeting gene_name count:", len(prod_tfs))
print("Shared TF gene_name count:", len(common_tfs))
print("Benchmark-only TF gene_name count:", len(bench_only_tfs))
print("Production-only TF gene_name count:", len(prod_only_tfs))

Benchmark tf_targeting gene_name count: 46
Production tf_targeting gene_name count: 1951
Shared TF gene_name count: 40
Benchmark-only TF gene_name count: 6
Production-only TF gene_name count: 1911


In [97]:
bench_only_tfs

{'BMAL1', 'EED', 'JARID2', 'SMARCD1', 'SMARCD2', 'SMARCD3'}

In [95]:
common_tfs

{'AFF4',
 'ARID1A',
 'ARID1B',
 'ARNT',
 'BATF',
 'CLOCK',
 'CREBBP',
 'CTNNB1',
 'EP300',
 'ESRRB',
 'EZH2',
 'HDAC1',
 'HIF1A',
 'HMGA2',
 'KLF4',
 'KLF6',
 'LEF1',
 'MYC',
 'NANOG',
 'NFIB',
 'PAX5',
 'POU5F1',
 'REST',
 'RUNX1',
 'SALL4',
 'SMAD3',
 'SMAD4',
 'SMARCA2',
 'SMARCA4',
 'SMARCB1',
 'SMARCC1',
 'SMARCC2',
 'SOX2',
 'SUZ12',
 'TBX3',
 'TCF12',
 'TCF7',
 'TCF7L1',
 'TCF7L2',
 'TFCP2L1'}

In [98]:
benchmark_tf = benchmark_tf[benchmark_tf["gene_name"].isin(common_tfs)]
production_tf = production_tf[production_tf["gene_name"].isin(common_tfs)]

In [99]:
bench_guides_by_gene = (
    benchmark_tf
    .groupby("gene_name")["guide_id"]
    .apply(lambda x: set(x.astype(str)))
)

prod_guides_by_gene = (
    production_tf
    .groupby("gene_name")["guide_id"]
    .apply(lambda x: set(x.astype(str)))
)

In [100]:
rows = []

for gene in sorted(common_tfs):
    bench_guides = bench_guides_by_gene.get(gene, set())
    prod_guides = prod_guides_by_gene.get(gene, set())

    rows.append({
        "gene_name": gene,
        "benchmark_guides": len(bench_guides),
        "production_guides": len(prod_guides),
        "shared_guides": len(bench_guides & prod_guides),
        "benchmark_only_guides": len(bench_guides - prod_guides),
        "production_only_guides": len(prod_guides - bench_guides),
    })

tf_guide_overlap_df = pd.DataFrame(rows)

In [101]:
tf_guide_overlap_df.sort_values(
    ["benchmark_only_guides", "production_only_guides"],
    ascending=False
)

Unnamed: 0,gene_name,benchmark_guides,production_guides,shared_guides,benchmark_only_guides,production_only_guides
0,AFF4,6,6,6,0,0
1,ARID1A,6,6,6,0,0
2,ARID1B,12,12,12,0,0
3,ARNT,6,6,6,0,0
4,BATF,6,6,6,0,0
5,CLOCK,6,6,6,0,0
6,CREBBP,6,6,6,0,0
7,CTNNB1,6,6,6,0,0
8,EP300,6,6,6,0,0
9,ESRRB,6,6,6,0,0


In [102]:
# Genes where benchmark is a strict subset
tf_guide_overlap_df.query("benchmark_only_guides == 0")

Unnamed: 0,gene_name,benchmark_guides,production_guides,shared_guides,benchmark_only_guides,production_only_guides
0,AFF4,6,6,6,0,0
1,ARID1A,6,6,6,0,0
2,ARID1B,12,12,12,0,0
3,ARNT,6,6,6,0,0
4,BATF,6,6,6,0,0
5,CLOCK,6,6,6,0,0
6,CREBBP,6,6,6,0,0
7,CTNNB1,6,6,6,0,0
8,EP300,6,6,6,0,0
9,ESRRB,6,6,6,0,0


In [103]:
# Genes with discrepancies
tf_guide_overlap_df.query("benchmark_only_guides > 0")

Unnamed: 0,gene_name,benchmark_guides,production_guides,shared_guides,benchmark_only_guides,production_only_guides


In [105]:
cols_to_check = [
    "spacer",
    "targeting",
    "type",
    "guide_chr",
    "guide_start",
    "guide_end",
    "strand",
    "pam",
    "genomic_element",
    "intended_target_name",
    "intended_target_chr",
    "intended_target_start",
    "intended_target_end",
    "putative_target_genes",
    "reporter",
    "imperfect",
    "gene_name",
    "label",
]


In [108]:
benchmark_tf = benchmark_df.query("label == 'tf_targeting'").copy()
production_tf = production_df.query("label == 'tf_targeting'").copy()

shared_ids = (
    set(benchmark_tf["guide_id"].astype(str))
    & set(production_tf["guide_id"].astype(str))
)

benchmark_tf = benchmark_tf[benchmark_tf["guide_id"].astype(str).isin(shared_ids)]
production_tf = production_tf[production_tf["guide_id"].astype(str).isin(shared_ids)]

In [109]:
benchmark_tf = benchmark_tf[["guide_id"] + cols_to_check]
production_tf = production_tf[["guide_id"] + cols_to_check]

benchmark_tf = benchmark_tf.fillna("NA").astype(str)
production_tf = production_tf.fillna("NA").astype(str)

In [110]:
merged = benchmark_tf.merge(
    production_tf,
    on="guide_id",
    how="inner",
    suffixes=("_benchmark", "_production"),
)

In [111]:
mismatch_mask = False

for col in cols_to_check:
    mismatch_mask |= (
        merged[f"{col}_benchmark"] != merged[f"{col}_production"]
    )

mismatches = merged[mismatch_mask]

In [112]:
print("Shared tf_targeting guide_ids:", len(shared_ids))
print("Guides with metadata mismatches:", mismatches["guide_id"].nunique())

Shared tf_targeting guide_ids: 282
Guides with metadata mismatches: 18


In [113]:
mismatches

Unnamed: 0,guide_id,spacer_benchmark,targeting_benchmark,type_benchmark,guide_chr_benchmark,guide_start_benchmark,guide_end_benchmark,strand_benchmark,pam_benchmark,genomic_element_benchmark,...,genomic_element_production,intended_target_name_production,intended_target_chr_production,intended_target_start_production,intended_target_end_production,putative_target_genes_production,reporter_production,imperfect_production,gene_name_production,label_production
204,SMARCB1#chr22:23786987-23787005(+),GCGGCCTGGTCGTCGTCTG,True,targeting,chr22,23786987.0,23787005.0,+,NGG,promoter,...,promoter,ENSG00000275837,chr22,23786987.0,23787393.0,,,,SMARCB1,tf_targeting
205,SMARCB1#chr22:23787375-23787393(+),GCGCGCGCTCGGGGCTGTG,True,targeting,chr22,23787375.0,23787393.0,+,NGG,promoter,...,promoter,ENSG00000275837,chr22,23786987.0,23787393.0,,,,SMARCB1,tf_targeting
206,SMARCB1#chr22:23786990-23787008(+),GCCTGGTCGTCGTCTGCGG,True,targeting,chr22,23786990.0,23787008.0,+,NGG,promoter,...,promoter,ENSG00000275837,chr22,23786987.0,23787393.0,,,,SMARCB1,tf_targeting
207,SMARCB1#chr22:23786994-23787012(-),GCCGCCGCAGACGACGACC,True,targeting,chr22,23786994.0,23787012.0,-,NGG,promoter,...,promoter,ENSG00000275837,chr22,23786987.0,23787393.0,,,,SMARCB1,tf_targeting
208,SMARCB1#chr22:23787045-23787063(-),TGCGGACCGGGCCGGGTAC,True,targeting,chr22,23787045.0,23787063.0,-,NGG,promoter,...,promoter,ENSG00000275837,chr22,23786987.0,23787393.0,,,,SMARCB1,tf_targeting
209,SMARCB1#chr22:23787031-23787049(+),CGGCTGAGGCGCCAGTACC,True,targeting,chr22,23787031.0,23787049.0,+,NGG,promoter,...,promoter,ENSG00000275837,chr22,23786987.0,23787393.0,,,,SMARCB1,tf_targeting
210,SMARCB1#chr22:23791807-23791825(-),CTAGTCGCCTCCAGAGTGA,True,targeting,chr22,23791807.0,23791825.0,-,NGG,promoter,...,promoter,ENSG00000275837,chr22,23791770.0,23791831.0,,,,SMARCB1,tf_targeting
211,SMARCB1#chr22:23791813-23791831(+),TGGAGGCGACTAGCCACTG,True,targeting,chr22,23791813.0,23791831.0,+,NGG,promoter,...,promoter,ENSG00000275837,chr22,23791770.0,23791831.0,,,,SMARCB1,tf_targeting
212,SMARCB1#chr22:23791794-23791812(+),CAAGAGATACCCCTCACTC,True,targeting,chr22,23791794.0,23791812.0,+,NGG,promoter,...,promoter,ENSG00000275837,chr22,23791770.0,23791831.0,,,,SMARCB1,tf_targeting
213,SMARCB1#chr22:23791773-23791791(-),CAGAGAACCTCGGAACATA,True,targeting,chr22,23791773.0,23791791.0,-,NGG,promoter,...,promoter,ENSG00000275837,chr22,23791770.0,23791831.0,,,,SMARCB1,tf_targeting


In [114]:
def diff_columns(row):
    diffs = []
    for col in cols_to_check:
        if row[f"{col}_benchmark"] != row[f"{col}_production"]:
            diffs.append(col)
    return diffs

mismatches["mismatching_columns"] = mismatches.apply(diff_columns, axis=1)

mismatches[["guide_id", "mismatching_columns"]]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


Unnamed: 0,guide_id,mismatching_columns
204,SMARCB1#chr22:23786987-23787005(+),[intended_target_name]
205,SMARCB1#chr22:23787375-23787393(+),[intended_target_name]
206,SMARCB1#chr22:23786990-23787008(+),[intended_target_name]
207,SMARCB1#chr22:23786994-23787012(-),[intended_target_name]
208,SMARCB1#chr22:23787045-23787063(-),[intended_target_name]
209,SMARCB1#chr22:23787031-23787049(+),[intended_target_name]
210,SMARCB1#chr22:23791807-23791825(-),[intended_target_name]
211,SMARCB1#chr22:23791813-23791831(+),[intended_target_name]
212,SMARCB1#chr22:23791794-23791812(+),[intended_target_name]
213,SMARCB1#chr22:23791773-23791791(-),[intended_target_name]
