# Set-up

In [153]:
import os
import pandas as pd
import numpy as np
from mygene import MyGeneInfo
import re
import requests
import pyranges as pr

In [154]:
# Example — change this to the full URL of the GTF file
url = "https://api.data.igvf.org/reference-files/IGVFFI9573KOZR/@@download/IGVFFI9573KOZR.gtf.gz"
out_path = "/cellar/users/aklie/data/datasets/tf_perturb_seq/ref/IGVFFI9573KOZR.gtf.gz"

# Stream download to avoid memory issues, only if doesn't already exist
if not os.path.exists(out_path):
    print("Downloading", url)
    with requests.get(url, stream=True) as r:
        r.raise_for_status()
        with open(out_path, "wb") as f:
            for chunk in r.iter_content(chunk_size=8192):
                if chunk:
                    f.write(chunk)
elif os.path.getsize(out_path) > 0:
    print("File already exists and is non-empty, skipping download.")
else:
    print("File exists but is empty, re-downloading.")
    with requests.get(url, stream=True) as r:
        r.raise_for_status()
        with open(out_path, "wb") as f:
            for chunk in r.iter_content(chunk_size=8192):
                if chunk:
                    f.write(chunk)

print("Downloaded to", out_path)

File already exists and is non-empty, skipping download.
Downloaded to /cellar/users/aklie/data/datasets/tf_perturb_seq/ref/IGVFFI9573KOZR.gtf.gz


In [None]:
# Filter to just gene-level annotations
gtf_pr = pr.read_gtf(out_path)
genes_pr = gtf_pr[(gtf_pr.Feature == "gene") & (gtf_pr.gene_type == "protein_coding")]
genes_pr

  return {k: v for k, v in df.groupby(grpby_key)}


Unnamed: 0,Chromosome,Source,Feature,Start,End,Score,Strand,Frame,gene_id,gene_type,...,transcript_support_level,havana_transcript,exon_number,exon_id,hgnc_id,havana_gene,ont,protein_id,ccdsid,artif_dupl
0,chr1,HAVANA,gene,11868,14409,.,+,.,ENSG00000290825.1,lncRNA,...,,,,,,,,,,
1,chr1,HAVANA,gene,12009,13670,.,+,.,ENSG00000223972.6,transcribed_unprocessed_pseudogene,...,,,,,HGNC:37102,OTTHUMG00000000961.2,,,,
2,chr1,HAVANA,gene,29553,31109,.,+,.,ENSG00000243485.5,lncRNA,...,,,,,HGNC:52482,OTTHUMG00000000959.2,,,,
3,chr1,ENSEMBL,gene,30365,30503,.,+,.,ENSG00000284332.1,miRNA,...,,,,,HGNC:35294,,,,,
4,chr1,HAVANA,gene,52472,53312,.,+,.,ENSG00000268020.3,unprocessed_pseudogene,...,,,,,HGNC:14822,OTTHUMG00000185779.1,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
67566,chrY,HAVANA,gene,57015104,57016096,.,-,.,ENSG00000237801.6_PAR_Y,processed_pseudogene,...,,,,,HGNC:460,OTTHUMG00000022673.2,,,,
67567,chrY,HAVANA,gene,57165511,57165845,.,-,.,ENSG00000228410.6_PAR_Y,processed_pseudogene,...,,,,,HGNC:38160,OTTHUMG00000040493.1,,,,
67568,chrY,HAVANA,gene,57171889,57172769,.,-,.,ENSG00000223484.7_PAR_Y,processed_pseudogene,...,,,,,HGNC:23270,OTTHUMG00000022681.1,,,,
67569,chrY,HAVANA,gene,57201142,57203357,.,-,.,ENSG00000185203.12_PAR_Y,lncRNA,...,,,,,HGNC:38513,OTTHUMG00000022676.3,,,,


In [53]:
# --- Define promoter coordinates (2kb upstream of TSS) ---
genes_df = genes_pr.df.copy()

# Compute start and end carefully depending on strand
promoters_df = pd.DataFrame({
    "Chromosome": genes_df["Chromosome"],
    "Strand": genes_df["Strand"],
    "gene_id": genes_df["gene_id"],
    "gene_name": genes_df["gene_name"],
})

# Compute Start and End for promoters
promoters_df["Start"] = promoters_df.apply(
    lambda x: max(0, genes_df.loc[x.name, "Start"] - 2000)
    if x.Strand == "+"
    else genes_df.loc[x.name, "End"],
    axis=1,
)
promoters_df["End"] = promoters_df.apply(
    lambda x: genes_df.loc[x.name, "Start"]
    if x.Strand == "+"
    else genes_df.loc[x.name, "End"] + 2000,
    axis=1,
)

# Convert to integers
promoters_df[["Start", "End"]] = promoters_df[["Start", "End"]].astype(int)

# Check that it looks correct
promoters_pr = pr.PyRanges(promoters_df)
promoters_pr

  return {k: v for k, v in df.groupby(grpby_key)}


Unnamed: 0,Chromosome,Strand,gene_id,gene_name,Start,End
0,chr1,+,ENSG00000186092.7,OR4F5,63418,65418
1,chr1,+,ENSG00000187634.13,SAMD11,921922,923922
2,chr1,+,ENSG00000187961.15,KLHL17,958583,960583
3,chr1,+,ENSG00000187583.11,PLEKHN1,964481,966481
4,chr1,+,ENSG00000187608.10,ISG15,999137,1001137
...,...,...,...,...,...,...
22176,chrY,-,ENSG00000169800.14,RBMY1F,22182982,22184982
22177,chrY,-,ENSG00000188120.16,DAZ1,23199010,23201010
22178,chrY,-,ENSG00000172352.5,CDY1B,24048019,24050019
22179,chrY,-,ENSG00000187191.16,DAZ3,24813492,24815492


# Load reference metadata

## Reference TF guides

In [303]:
path_ref_tf_metadata = "/cellar/users/aklie/data/datasets/tf_perturb_seq/ref/target_genes.tsv"
spacer_cols = ['Set A, Photospacer 1', 'Set A, Photospacer 2', 'Set B, Photospacer 3', 'Set B, Photospacer 4', 'Set C, Photospacer 5', 'Set C, Photospacer 6']
ref_tf = pd.read_csv(path_ref_tf_metadata, sep="\t")
ref_tf["Gene"] = ref_tf["Gene"].ffill()
for col in spacer_cols:
    ref_tf[col] = ref_tf[col].str.upper()

# Define mapping between sets, promoter columns, and photospacer columns
set_map = {
    "A": ("Set A, promoter ID", ["Set A, Photospacer 1", "Set A, Photospacer 2"]),
    "B": ("Set B, promoter ID", ["Set B, Photospacer 3", "Set B, Photospacer 4"]),
    "C": ("Set C, promoter ID", ["Set C, Photospacer 5", "Set C, Photospacer 6"]),
}

# Collect rows
rows = []
for _, row in ref_tf.iterrows():
    gene = row["Gene"]
    for set_name, (prom_col, spacer_cols) in set_map.items():
        promoter_id = row[prom_col]
        for spacer_col in spacer_cols:
            spacer_seq = row[spacer_col]
            if pd.notna(spacer_seq):
                # Extract photospacer number from column name
                match = re.search(r"Photospacer (\d+)", spacer_col)
                spacer_num = int(match.group(1)) if match else None
                rows.append({
                    "Gene": gene,
                    "promoter_ID": promoter_id,
                    "Set": set_name,
                    "Photospacer": spacer_seq,
                    "Photospacer_number": spacer_num,
                })

# Create the long df
ref_tf_long = pd.DataFrame(rows)[["Gene", "promoter_ID", "Set", "Photospacer", "Photospacer_number"]]

# Make a unique guide ID based on promoter_ID#Set_{Set}#Photospacer_{Photospacer_number}
ref_tf_long["guide_id"] = ref_tf_long.apply(lambda x: f"{x['promoter_ID']}#Set_{x['Set']}#Photospacer_{x['Photospacer_number']}", axis=1)
ref_tf_long.head()

Unnamed: 0,Gene,promoter_ID,Set,Photospacer,Photospacer_number,guide_id
0,AATF,AATF_-_35306286.23-P1P2,A,GAGTGGCCGGTCCAGAGCTG,1,AATF_-_35306286.23-P1P2#Set_A#Photospacer_1
1,AATF,AATF_-_35306286.23-P1P2,A,GGGATCAAGGCGAGAGGATC,2,AATF_-_35306286.23-P1P2#Set_A#Photospacer_2
2,AATF,AATF_-_35306351.23-P1P2,B,GAAGGCGAGAGGATCCGGCA,3,AATF_-_35306351.23-P1P2#Set_B#Photospacer_3
3,AATF,AATF_-_35306351.23-P1P2,B,GGGAATCGGATCAAGGCGAG,4,AATF_-_35306351.23-P1P2#Set_B#Photospacer_4
4,AATF,AATF_-_35306333.23-P1P2,C,GGAGTCGGGGAATCGGATCA,5,AATF_-_35306333.23-P1P2#Set_C#Photospacer_5


## Reference positive controls

In [304]:
def clean_gene_and_make_guide(x):
    x = x.strip()
    
    # --- Remove parentheses in gene names like "CD29 (ITGB1)" ---
    base = re.sub(r"\s*\([^)]*\)", "", x).strip()
    
    # --- Handle sgRNA naming ---
    if "sgRNA" in x:
        base_gene = x.split("sgRNA")[0].strip()
        suffix = x.split("sgRNA")[-1].strip().lower()
        if suffix == "main":
            guide_id = f"{base_gene}#A"
        elif suffix in ["a", "b", "c"]:
            guide_id = f"{base_gene}#{suffix.upper()}"
        else:
            guide_id = f"{base_gene}#A"
        return pd.Series([base_gene, guide_id])
    
    # --- Handle "strong"/"weak" ---
    if re.search(r"\bstrong\b|\bweak\b", x, re.IGNORECASE):
        base_gene = re.sub(r"\s*(strong|weak)\s*", "", base, flags=re.IGNORECASE)
        strength = "strong" if "strong" in x.lower() else "weak"
        guide_id = f"{base_gene}#{strength}"
        return pd.Series([base_gene, guide_id])
    
    # --- Default case: use cleaned base name ---
    return pd.Series([base, f"{base}#A"])

# Load reference 
path_ref_pc_metadata = "/cellar/users/aklie/data/datasets/tf_perturb_seq/ref/positive_controls.tsv"
ref_pcs = pd.read_csv(path_ref_pc_metadata, sep="\t")
ref_pcs = ref_pcs.rename(columns={"Photospacer (represent 10 times)": "Photospacer"})
ref_pcs[["Gene", "guide_id"]] = ref_pcs["Gene"].apply(clean_gene_and_make_guide)
ref_pcs.head()

Unnamed: 0,Gene,Photospacer,Reference,guide_id
0,CD81,GGAGAGCGAGCGCGCAACGG,"Horlbeck et al. 2016 ""Compact and highly activ...",CD81#strong
1,CD81,GGAGAGCCAGCGCGCAACGG,"Jost et al. 2020 ""Titrating gene expression us...",CD81#weak
2,CD151,GCCGGACTCGGACGCGTGGT,"Horlbeck et al. 2016 ""Compact and highly activ...",CD151#strong
3,CD151,GCCGCTCGGCCGAGCTGTCG,"Horlbeck et al. 2016 ""Compact and highly activ...",CD151#weak
4,CD55,GCTGCGACTCGGCGGAGTCC,"Horlbeck et al. 2016 ""Compact and highly activ...",CD55#strong


## Reference non-targeting controls

In [305]:
path_ref_nt_metadata = "/cellar/users/aklie/data/datasets/tf_perturb_seq/ref/non_targeting.tsv"
ref_nt = pd.read_csv(path_ref_nt_metadata, sep="\t")
ref_nt = ref_nt.rename(columns={"Unnamed: 0": "guide_id", "Photospacer (same for all 3 sets)": "Photospacer"})
ref_nt["Gene"] = "non-targeting"
ref_nt.head()

Unnamed: 0,guide_id,Photospacer,Gene
0,non-targeting_00642,GGAGTTAAGGCCTCGTCTAG,non-targeting
1,non-targeting_00718,GTCCCAGGCTCTCCACTATG,non-targeting
2,non-targeting_03631,GGACGCGTCTGCAAGAACGT,non-targeting
3,non-targeting_03705,GGGCATGGACCCGCGGCACG,non-targeting
4,non-targeting_01469,GCGTCCGAGGTACTGAATAA,non-targeting


## Reference negative controls (targeting)

In [306]:
# ---------- 1. Base reference ----------
path_ref_nc_metadata = "/cellar/users/aklie/data/datasets/tf_perturb_seq/ref/negative_controls.tsv"
ref_nc = pd.read_csv(path_ref_nc_metadata, sep="\t")

# Standardize case
spacer_cols = [c for c in ref_nc.columns if c.startswith("Photospacer")]
for col in spacer_cols:
    ref_nc[col] = ref_nc[col].str.upper()

# Melt to long format
ref_nc_long = ref_nc.melt(
    id_vars=["Gene"],
    value_vars=spacer_cols,
    var_name="Photospacer_label",
    value_name="Photospacer"
)
ref_nc_long["Photospacer_number"] = ref_nc_long["Photospacer_label"].str.extract(r"(\d+)").astype(int)
ref_nc_long["guide_id"] = ref_nc_long.apply(
    lambda x: f"{x['Gene']}#Set_A#Photospacer_{x['Photospacer_number']}", axis=1
)
ref_nc_long = ref_nc_long[["Gene", "Photospacer", "Photospacer_number", "guide_id"]]

# ---------- 2. Single-protospacer negatives ----------
path_simple_nc = "/cellar/users/aklie/data/datasets/tf_perturb_seq/scratch/2025_10_05/2025_10_05_negative_controls.tsv"
ref_nc_single = pd.read_csv(path_simple_nc, sep="\t")

# Normalize column names and create unified structure
ref_nc_single = (
    ref_nc_single.rename(columns={"gene": "Gene", "protospacer_A": "Photospacer"})
    .assign(Photospacer_number=1)
)
ref_nc_single["Photospacer"] = ref_nc_single["Photospacer"].str.upper()
ref_nc_single["guide_id"] = ref_nc_single.apply(
    lambda x: f"{x['Gene']}#Set_A#Photospacer_{x['Photospacer_number']}", axis=1
)
ref_nc_single = ref_nc_single[["Gene", "Photospacer", "Photospacer_number", "guide_id"]]

# ---------- 3. Multi-set negatives ----------
path_more_nc = "/cellar/users/aklie/data/datasets/tf_perturb_seq/scratch/2025_10_05/2025_10_05_more_negative_controls.tsv"
ref_nc_multi = pd.read_csv(path_more_nc, sep="\t")

# Identify photospacer columns automatically
photospacer_cols = [c for c in ref_nc_multi.columns if "Photospacer" in c]

# Melt to long format
ref_nc_multi_long = ref_nc_multi.melt(
    id_vars=["Label"], value_vars=photospacer_cols,
    var_name="Photospacer_label", value_name="Photospacer"
)
ref_nc_multi_long["Gene"] = ref_nc_multi_long["Label"]
ref_nc_multi_long["Photospacer"] = ref_nc_multi_long["Photospacer"].str.upper()

# Extract number and set info
ref_nc_multi_long["Photospacer_number"] = ref_nc_multi_long["Photospacer_label"].str.extract(r"(\d+)").astype(int)
ref_nc_multi_long["Set"] = ref_nc_multi_long["Photospacer_label"].str.extract(r"(Set [A-Z])")

# Create guide_id harmonized with the same pattern
ref_nc_multi_long["guide_id"] = ref_nc_multi_long.apply(
    lambda x: f"{x['Gene']}#{x['Set'].replace(' ', '_')}#Photospacer_{x['Photospacer_number']}", axis=1
)

ref_nc_multi_long = ref_nc_multi_long[["Gene", "Photospacer", "Photospacer_number", "guide_id"]]

# ---------- 4. Concatenate & finalize ----------
ref_nc_all = pd.concat([ref_nc_long, ref_nc_single, ref_nc_multi_long], ignore_index=True)
ref_nc_all = ref_nc_all.drop_duplicates().sort_values(["Gene", "Photospacer_number"]).reset_index(drop=True)

ref_nc_all.head()

Unnamed: 0,Gene,Photospacer,Photospacer_number,guide_id
0,OR10A2,GTTCCTGTAGCTATAAGTGT,1,OR10A2#Set_A#Photospacer_1
1,OR10A3,GTTCCTCCTGAACCTATCTG,1,OR10A3#Set_A#Photospacer_1
2,OR10A4,GGCCTGAAGCTCAGTGGACA,1,OR10A4#Set_A#Photospacer_1
3,OR10A5,GCTGGAGAGAGATTCTTGAT,1,OR10A5#Set_A#Photospacer_1
4,OR10A6,GGGTCACCAGATAAATAACC,1,OR10A6#Set_A#Photospacer_1


# Load guide metadata

In [445]:
path_guide_metadata = "/cellar/users/aklie/data/datasets/tf_perturb_seq/datasets/Hon_WTC11-benchmark_TF-Perturb-seq/downloads/IGVFFI5765HMZH.tsv"
guide_metadata = pd.read_csv(path_guide_metadata)
guide_metadata["Photospacer"] = "G" + guide_metadata["spacer"].str.upper()
print(f"{len(guide_metadata)} total guides")
guide_metadata.head()

415 total guides


Unnamed: 0,guide_id,spacer,targeting,type,guide_chr,guide_start,guide_end,strand,PAM,intended_target_name,intended_target_chr,intended_target_start,intended_target_end,Photospacer
0,chr5:132963540-132963558(+),GGGCGGGTTAACGAAGACC,True,targeting,chr5,132963540,132963540,+,NGG,ENSG00000072364,chr5,132963540,132963638,GGGGCGGGTTAACGAAGACC
1,chr5:132963579-132963597(-),TCGCCGCCGCCAGCGGACG,True,targeting,chr5,132963579,132963579,-,NGG,ENSG00000072364,chr5,132963540,132963638,GTCGCCGCCGCCAGCGGACG
2,chr5:132963567-132963585(+),GGATCCCCGCCCCGTCCGC,True,targeting,chr5,132963567,132963567,+,NGG,ENSG00000072364,chr5,132963540,132963638,GGGATCCCCGCCCCGTCCGC
3,chr5:132963620-132963638(+),AGGGCTGTGACTGACGCAG,True,targeting,chr5,132963620,132963620,+,NGG,ENSG00000072364,chr5,132963540,132963638,GAGGGCTGTGACTGACGCAG
4,chr5:132963570-132963588(+),TCCCCGCCCCGTCCGCTGG,True,targeting,chr5,132963570,132963570,+,NGG,ENSG00000072364,chr5,132963540,132963638,GTCCCCGCCCCGTCCGCTGG


## Fix missing target names

In [446]:
len(guide_metadata[guide_metadata["intended_target_name"].isna()])

42

In [447]:
# Convert guides to PyRanges
guides_pr = pr.PyRanges(
    df=guide_metadata.rename(columns={
        'intended_target_chr': 'Chromosome',
        'intended_target_start': 'Start',
        'intended_target_end': 'End',
        'guide_id': 'guide_id'
    })
)

# Find overlaps with gene regions
overlaps = guides_pr.join(genes_pr, suffix="_gene")

# Convert to DataFrame
overlaps_df = overlaps.df

# Compute overlap length
overlaps_df["overlap_length"] = (
    overlaps_df[["End", "End_gene"]].min(axis=1)
    - overlaps_df[["Start", "Start_gene"]].max(axis=1)
)

# Keep only the gene with the maximum overlap per guide_id
best_hits = (
    overlaps_df.sort_values("overlap_length", ascending=False)
    .groupby("guide_id", as_index=False)
    .first()
)

# Keep relevant columns
best_hits = best_hits[["guide_id", "gene_id", "gene_name", "overlap_length"]]

# Update gene_name and gene_id with _gtf suffix
best_hits.rename(columns={
    "gene_name": "gene_name_gtf",
    "gene_id": "gene_id_gtf"
}, inplace=True)

# Duplicate guide_ids?
best_hits[best_hits.duplicated(subset=["guide_id"], keep=False)].sort_values("guide_id")

  return {k: v for k, v in df.groupby(grpby_key)}
  empty_removed = df.groupby(["Chromosome", "Strand"])
join: Strand data from other will be added as strand data to self.
If this is undesired use the flag apply_strand_suffix=False.


Unnamed: 0,guide_id,gene_id_gtf,gene_name_gtf,overlap_length


In [448]:
# Merge with original df
guide_metadata = guide_metadata.merge(best_hits, on="guide_id", how="left")

In [449]:
missing_targets = guide_metadata[guide_metadata["intended_target_name"].isna()]
missing_targets["gene_name_gtf"].value_counts()

gene_name_gtf
POU5F1     12
SMARCB1    12
OR2A25      6
OR2F1       6
TCF7L1      6
Name: count, dtype: int64

In [450]:
# intended_target_name is gene_id for NaN enties
guide_metadata.loc[guide_metadata["intended_target_name"].isna(), "intended_target_name"] = guide_metadata.loc[guide_metadata["intended_target_name"].isna(), "gene_id_gtf"]

In [451]:
len(guide_metadata[guide_metadata["intended_target_name"].isna()])

0

## Add gene name

In [452]:
# Initialize MyGeneInfo
mg = MyGeneInfo()

In [453]:
# Map intended_target_name (Ensembl IDs) to gene names
out = mg.querymany(guide_metadata['intended_target_name'].tolist(), scopes='ensembl.gene', fields='symbol', species='human')
map_df = pd.DataFrame(out)[['query', 'symbol']].rename(columns={'query': 'intended_target_name', 'symbol': 'gene_name_mygene'})
guide_metadata = pd.concat([guide_metadata, map_df["gene_name_mygene"]], axis=1)
guide_metadata.head()

Input sequence provided is already in string format. No operation performed
Input sequence provided is already in string format. No operation performed
50 input query terms found dup hits:	[('ENSG00000072364', 6), ('ENSG00000117713', 6), ('ENSG00000049618', 12), ('ENSG00000143437', 6), ('
80 input query terms found no hit:	['ENSSDUG00000003529', 'ENSSDUG00000003529', 'ENSSDUG00000003529', 'ENSSDUG00000003529', 'ENSSDUG000


Unnamed: 0,guide_id,spacer,targeting,type,guide_chr,guide_start,guide_end,strand,PAM,intended_target_name,intended_target_chr,intended_target_start,intended_target_end,Photospacer,gene_id_gtf,gene_name_gtf,overlap_length,gene_name_mygene
0,chr5:132963540-132963558(+),GGGCGGGTTAACGAAGACC,True,targeting,chr5,132963540,132963540,+,NGG,ENSG00000072364,chr5,132963540,132963638,GGGGCGGGTTAACGAAGACC,ENSG00000072364.13,AFF4,94.0,AFF4
1,chr5:132963579-132963597(-),TCGCCGCCGCCAGCGGACG,True,targeting,chr5,132963579,132963579,-,NGG,ENSG00000072364,chr5,132963540,132963638,GTCGCCGCCGCCAGCGGACG,ENSG00000072364.13,AFF4,94.0,AFF4
2,chr5:132963567-132963585(+),GGATCCCCGCCCCGTCCGC,True,targeting,chr5,132963567,132963567,+,NGG,ENSG00000072364,chr5,132963540,132963638,GGGATCCCCGCCCCGTCCGC,ENSG00000072364.13,AFF4,94.0,AFF4
3,chr5:132963620-132963638(+),AGGGCTGTGACTGACGCAG,True,targeting,chr5,132963620,132963620,+,NGG,ENSG00000072364,chr5,132963540,132963638,GAGGGCTGTGACTGACGCAG,ENSG00000072364.13,AFF4,94.0,AFF4
4,chr5:132963570-132963588(+),TCCCCGCCCCGTCCGCTGG,True,targeting,chr5,132963570,132963570,+,NGG,ENSG00000072364,chr5,132963540,132963638,GTCCCCGCCCCGTCCGCTGG,ENSG00000072364.13,AFF4,94.0,AFF4


In [454]:
# How many mismatches between gtf and mygene?
mismatched_gene_name = guide_metadata[(guide_metadata["gene_name_gtf"] != guide_metadata["gene_name_mygene"]) & 
                                     guide_metadata["gene_name_gtf"].notna() & 
                                     guide_metadata["gene_name_mygene"].notna()]
mismatched_gene_name[["guide_id", "intended_target_name", "gene_name_gtf", "gene_name_mygene"]]

Unnamed: 0,guide_id,intended_target_name,gene_name_gtf,gene_name_mygene
258,chr18:51030267-51030285(+),ENSG00000141646,ENSG00000267699,SMAD4
261,chr18:51030262-51030280(+),ENSG00000141646,ENSG00000267699,SMAD4


In [455]:
# Make new gene_name column
# My gene first
# If mygene is NaN, use gtf
guide_metadata["gene_name"] = guide_metadata.apply(
    lambda x: x["gene_name_mygene"] if pd.notna(x["gene_name_mygene"]) else (x["gene_name_gtf"] if pd.notna(x["gene_name_gtf"]) else x["intended_target_name"]),
    axis=1
)

In [456]:
guide_metadata["intended_target_name"]

0      ENSG00000072364
1      ENSG00000072364
2      ENSG00000072364
3      ENSG00000072364
4      ENSG00000072364
            ...       
410      non-targeting
411      non-targeting
412      non-targeting
413      non-targeting
414      non-targeting
Name: intended_target_name, Length: 415, dtype: object

# Positive control metadata

In [457]:
# Grab the positive controls
pc_metadata = guide_metadata[guide_metadata["type"] == "positive control"]
print(f"{len(pc_metadata)} positive control guides")
pc_metadata

7 positive control guides


Unnamed: 0,guide_id,spacer,targeting,type,guide_chr,guide_start,guide_end,strand,PAM,intended_target_name,intended_target_chr,intended_target_start,intended_target_end,Photospacer,gene_id_gtf,gene_name_gtf,overlap_length,gene_name_mygene,gene_name
378,CD81,GAGAGCCAGCGCGCAACGG,False,positive control,chrC,0,0,+,NGG,ENSG00000110651,chrPC,0,0,GGAGAGCCAGCGCGCAACGG,,,,CD81,CD81
379,CD151,CCGGACTCGGACGCGTGGT,False,positive control,chrC,0,0,+,NGG,ENSG00000177697,chrPC,0,0,GCCGGACTCGGACGCGTGGT,,,,CD151,CD151
380,CD151,CCGCTCGGCCGAGCTGTCG,False,positive control,chrC,0,0,+,NGG,ENSG00000177697,chrPC,0,0,GCCGCTCGGCCGAGCTGTCG,,,,CD151,CD151
381,CD55,CTGCGACTCGGCGGAGTCC,False,positive control,chrC,0,0,+,NGG,ENSG00000196352,chrPC,0,0,GCTGCGACTCGGCGGAGTCC,,,,CD55,CD55
382,NGFRAP1,GTTGGAGTTTGCCCTCCTC,False,positive control,chrC,0,0,+,NGG,ENSBTSG00005026991,chrPC,0,0,GGTTGGAGTTTGCCCTCCTC,,,,,ENSBTSG00005026991
383,NGFRAP1,AGGACCGAGAAGAGTGACA,False,positive control,chrC,0,0,+,NGG,ENSBTSG00005026991,chrPC,0,0,GAGGACCGAGAAGAGTGACA,,,,,ENSBTSG00005026991
384,TFRC,CTCAGAGCGTCGGGATATC,False,positive control,chrC,0,0,+,NGG,ENSG00000072274,chrPC,0,0,GCTCAGAGCGTCGGGATATC,,,,TFRC,TFRC


In [458]:
pc_metadata[pc_metadata.columns[:8]]

Unnamed: 0,guide_id,spacer,targeting,type,guide_chr,guide_start,guide_end,strand
378,CD81,GAGAGCCAGCGCGCAACGG,False,positive control,chrC,0,0,+
379,CD151,CCGGACTCGGACGCGTGGT,False,positive control,chrC,0,0,+
380,CD151,CCGCTCGGCCGAGCTGTCG,False,positive control,chrC,0,0,+
381,CD55,CTGCGACTCGGCGGAGTCC,False,positive control,chrC,0,0,+
382,NGFRAP1,GTTGGAGTTTGCCCTCCTC,False,positive control,chrC,0,0,+
383,NGFRAP1,AGGACCGAGAAGAGTGACA,False,positive control,chrC,0,0,+
384,TFRC,CTCAGAGCGTCGGGATATC,False,positive control,chrC,0,0,+


In [459]:
pc_metadata["in_ref"] = pc_metadata["Photospacer"].isin(ref_pcs["Photospacer"])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  pc_metadata["in_ref"] = pc_metadata["Photospacer"].isin(ref_pcs["Photospacer"])


In [460]:
pc_metadata[~pc_metadata["in_ref"]]

Unnamed: 0,guide_id,spacer,targeting,type,guide_chr,guide_start,guide_end,strand,PAM,intended_target_name,intended_target_chr,intended_target_start,intended_target_end,Photospacer,gene_id_gtf,gene_name_gtf,overlap_length,gene_name_mygene,gene_name,in_ref
382,NGFRAP1,GTTGGAGTTTGCCCTCCTC,False,positive control,chrC,0,0,+,NGG,ENSBTSG00005026991,chrPC,0,0,GGTTGGAGTTTGCCCTCCTC,,,,,ENSBTSG00005026991,False


In [461]:
# Add reference guide_id
pc_metadata = pc_metadata.merge(ref_pcs[["Photospacer", "guide_id"]], on="Photospacer", how="left", suffixes=("", "_ref"))
pc_metadata

Unnamed: 0,guide_id,spacer,targeting,type,guide_chr,guide_start,guide_end,strand,PAM,intended_target_name,...,intended_target_start,intended_target_end,Photospacer,gene_id_gtf,gene_name_gtf,overlap_length,gene_name_mygene,gene_name,in_ref,guide_id_ref
0,CD81,GAGAGCCAGCGCGCAACGG,False,positive control,chrC,0,0,+,NGG,ENSG00000110651,...,0,0,GGAGAGCCAGCGCGCAACGG,,,,CD81,CD81,True,CD81#weak
1,CD151,CCGGACTCGGACGCGTGGT,False,positive control,chrC,0,0,+,NGG,ENSG00000177697,...,0,0,GCCGGACTCGGACGCGTGGT,,,,CD151,CD151,True,CD151#strong
2,CD151,CCGCTCGGCCGAGCTGTCG,False,positive control,chrC,0,0,+,NGG,ENSG00000177697,...,0,0,GCCGCTCGGCCGAGCTGTCG,,,,CD151,CD151,True,CD151#weak
3,CD55,CTGCGACTCGGCGGAGTCC,False,positive control,chrC,0,0,+,NGG,ENSG00000196352,...,0,0,GCTGCGACTCGGCGGAGTCC,,,,CD55,CD55,True,CD55#strong
4,NGFRAP1,GTTGGAGTTTGCCCTCCTC,False,positive control,chrC,0,0,+,NGG,ENSBTSG00005026991,...,0,0,GGTTGGAGTTTGCCCTCCTC,,,,,ENSBTSG00005026991,False,
5,NGFRAP1,AGGACCGAGAAGAGTGACA,False,positive control,chrC,0,0,+,NGG,ENSBTSG00005026991,...,0,0,GAGGACCGAGAAGAGTGACA,,,,,ENSBTSG00005026991,True,NGFRAP1#A
6,TFRC,CTCAGAGCGTCGGGATATC,False,positive control,chrC,0,0,+,NGG,ENSG00000072274,...,0,0,GCTCAGAGCGTCGGGATATC,,,,TFRC,TFRC,True,TFRC#A


In [462]:
# Make new_guide_id column
# for NGFRAP1	GTTGGAGTTTGCCCTCCTC make this NGFRAP1#B
pc_metadata["new_guide_id"] = pc_metadata.apply(
    lambda x: x["guide_id_ref"] if pd.notna(x["guide_id_ref"]) else f"{x['guide_id']}#B", axis=1
)

# Write out as fasta file
path_out_pc_fasta = "/cellar/users/aklie/data/datasets/tf_perturb_seq/scratch/2025_10_05/positive_control_guides.fasta"
with open(path_out_pc_fasta, "w") as f:
    for _, row in pc_metadata.iterrows():
        if row["new_guide_id"] == "CD81#weak":
            f.write(f">{row['new_guide_id']}\n{ref_pcs[ref_pcs['guide_id'] == 'CD81#strong']['Photospacer'].values[0][1:]}NGG\n")
        elif row["new_guide_id"] == "CD151#weak":
            f.write(f">{row['new_guide_id']}\n{ref_pcs[ref_pcs['guide_id'] == 'CD151#strong']['Photospacer'].values[0][1:]}NGG\n")
        else:
            f.write(f">{row['new_guide_id']}\n{row['spacer']}NGG\n")
print(f"Wrote positive control guides to {path_out_pc_fasta}")

Wrote positive control guides to /cellar/users/aklie/data/datasets/tf_perturb_seq/scratch/2025_10_05/positive_control_guides.fasta


In [463]:
blat_res = pd.read_csv("/cellar/users/aklie/data/datasets/tf_perturb_seq/scratch/2025_10_05/2025_10_05_BLAT_res.tsv", sep="\t")
blat_res["tStart_adj"] = np.where(blat_res["strand"] == "-", blat_res["tStart"] + 3, blat_res["tStart"])
blat_res["tEnd_adj"] = np.where(blat_res["strand"] == "+", blat_res["tEnd"] - 3, blat_res["tEnd"])
blat_res["match_length"] = blat_res["tEnd_adj"] - blat_res["tStart_adj"] + 1
blat_res["tStrand"] = blat_res["strand"]
blat_res

Unnamed: 0,query,score,qStart,qEnd,qSize,identity,chrom,strand,tStart,tEnd,span,tStart_adj,tEnd_adj,match_length,tStrand
0,CD151#strong,21,1,22,22,100.0%,chr11,+,833006,833027,22,833006,833024,19,+
1,CD151#weak,21,1,22,22,100.0%,chr11,+,833006,833027,22,833006,833024,19,+
2,CD55#strong,21,1,22,22,100.0%,chr1,+,207321714,207321735,22,207321714,207321732,19,+
3,CD81#weak,21,1,22,22,100.0%,chr11,+,2377315,2377336,22,2377315,2377333,19,+
4,NGFRAP1#A,21,1,22,22,100.0%,chrX,+,103376320,103376341,22,103376320,103376338,19,+
5,NGFRAP1#B,21,1,22,22,100.0%,chrX,-,103376258,103376279,22,103376261,103376279,19,-
6,TFRC#A,21,1,22,22,100.0%,chr3,-,196082072,196082093,22,196082075,196082093,19,-


In [464]:
# Add in guide_chr, guide_start, guide_end from blat res to pc_metadata
pc_metadata = pc_metadata.merge(
    blat_res[["query", "chrom", "tStart", "tEnd_adj", "tStrand"]],
    left_on="new_guide_id",
    right_on="query",
    how="left"
).rename(columns={
    "chrom": "guide_chr_blat",
    "tStart": "guide_start_blat",
    "tEnd_adj": "guide_end_blat",
    "tStrand": "strand_blat"
}).drop(columns=["query"])
pc_metadata.head()

Unnamed: 0,guide_id,spacer,targeting,type,guide_chr,guide_start,guide_end,strand,PAM,intended_target_name,...,overlap_length,gene_name_mygene,gene_name,in_ref,guide_id_ref,new_guide_id,guide_chr_blat,guide_start_blat,guide_end_blat,strand_blat
0,CD81,GAGAGCCAGCGCGCAACGG,False,positive control,chrC,0,0,+,NGG,ENSG00000110651,...,,CD81,CD81,True,CD81#weak,CD81#weak,chr11,2377315,2377333,+
1,CD151,CCGGACTCGGACGCGTGGT,False,positive control,chrC,0,0,+,NGG,ENSG00000177697,...,,CD151,CD151,True,CD151#strong,CD151#strong,chr11,833006,833024,+
2,CD151,CCGCTCGGCCGAGCTGTCG,False,positive control,chrC,0,0,+,NGG,ENSG00000177697,...,,CD151,CD151,True,CD151#weak,CD151#weak,chr11,833006,833024,+
3,CD55,CTGCGACTCGGCGGAGTCC,False,positive control,chrC,0,0,+,NGG,ENSG00000196352,...,,CD55,CD55,True,CD55#strong,CD55#strong,chr1,207321714,207321732,+
4,NGFRAP1,GTTGGAGTTTGCCCTCCTC,False,positive control,chrC,0,0,+,NGG,ENSBTSG00005026991,...,,,ENSBTSG00005026991,False,,NGFRAP1#B,chrX,103376258,103376279,-


In [465]:
# Clean up
pc_metadata["targeting"] = True
pc_metadata["type"] = "targeting"
pc_metadata["label"] = "positive_control"
pc_metadata["intended_target_start"] = pc_metadata["guide_start_blat"]
pc_metadata["intended_target_end"] = pc_metadata["guide_end_blat"]
pc_metadata.loc[pc_metadata["gene_name"] == "ENSBTSG00005026991", "gene_name"] = "NGFRAP1"
pc_metadata

Unnamed: 0,guide_id,spacer,targeting,type,guide_chr,guide_start,guide_end,strand,PAM,intended_target_name,...,gene_name_mygene,gene_name,in_ref,guide_id_ref,new_guide_id,guide_chr_blat,guide_start_blat,guide_end_blat,strand_blat,label
0,CD81,GAGAGCCAGCGCGCAACGG,True,targeting,chrC,0,0,+,NGG,ENSG00000110651,...,CD81,CD81,True,CD81#weak,CD81#weak,chr11,2377315,2377333,+,positive_control
1,CD151,CCGGACTCGGACGCGTGGT,True,targeting,chrC,0,0,+,NGG,ENSG00000177697,...,CD151,CD151,True,CD151#strong,CD151#strong,chr11,833006,833024,+,positive_control
2,CD151,CCGCTCGGCCGAGCTGTCG,True,targeting,chrC,0,0,+,NGG,ENSG00000177697,...,CD151,CD151,True,CD151#weak,CD151#weak,chr11,833006,833024,+,positive_control
3,CD55,CTGCGACTCGGCGGAGTCC,True,targeting,chrC,0,0,+,NGG,ENSG00000196352,...,CD55,CD55,True,CD55#strong,CD55#strong,chr1,207321714,207321732,+,positive_control
4,NGFRAP1,GTTGGAGTTTGCCCTCCTC,True,targeting,chrC,0,0,+,NGG,ENSBTSG00005026991,...,,NGFRAP1,False,,NGFRAP1#B,chrX,103376258,103376279,-,positive_control
5,NGFRAP1,AGGACCGAGAAGAGTGACA,True,targeting,chrC,0,0,+,NGG,ENSBTSG00005026991,...,,NGFRAP1,True,NGFRAP1#A,NGFRAP1#A,chrX,103376320,103376338,+,positive_control
6,TFRC,CTCAGAGCGTCGGGATATC,True,targeting,chrC,0,0,+,NGG,ENSG00000072274,...,TFRC,TFRC,True,TFRC#A,TFRC#A,chr3,196082072,196082093,-,positive_control


In [467]:
pc_clean_metadata = pc_metadata[["new_guide_id", "spacer", "targeting", "type", 
             "guide_chr_blat", "guide_start_blat", "guide_end_blat", "strand_blat", "PAM", 
             "intended_target_name", "intended_target_start", "intended_target_end", 
             "gene_name", "label"]].rename(columns={"new_guide_id": "guide_id", "guide_chr_blat": "guide_chr",
                                                  "guide_start_blat": "guide_start", "guide_end_blat": "guide_end",
                                                  "strand_blat": "strand"})
pc_clean_metadata

Unnamed: 0,guide_id,spacer,targeting,type,guide_chr,guide_start,guide_end,strand,PAM,intended_target_name,intended_target_start,intended_target_end,gene_name,label
0,CD81#weak,GAGAGCCAGCGCGCAACGG,True,targeting,chr11,2377315,2377333,+,NGG,ENSG00000110651,2377315,2377333,CD81,positive_control
1,CD151#strong,CCGGACTCGGACGCGTGGT,True,targeting,chr11,833006,833024,+,NGG,ENSG00000177697,833006,833024,CD151,positive_control
2,CD151#weak,CCGCTCGGCCGAGCTGTCG,True,targeting,chr11,833006,833024,+,NGG,ENSG00000177697,833006,833024,CD151,positive_control
3,CD55#strong,CTGCGACTCGGCGGAGTCC,True,targeting,chr1,207321714,207321732,+,NGG,ENSG00000196352,207321714,207321732,CD55,positive_control
4,NGFRAP1#B,GTTGGAGTTTGCCCTCCTC,True,targeting,chrX,103376258,103376279,-,NGG,ENSBTSG00005026991,103376258,103376279,NGFRAP1,positive_control
5,NGFRAP1#A,AGGACCGAGAAGAGTGACA,True,targeting,chrX,103376320,103376338,+,NGG,ENSBTSG00005026991,103376320,103376338,NGFRAP1,positive_control
6,TFRC#A,CTCAGAGCGTCGGGATATC,True,targeting,chr3,196082072,196082093,-,NGG,ENSG00000072274,196082072,196082093,TFRC,positive_control


# Non-targeting control metadata

In [468]:
# Grab the non-targeting controls
nt_metadata = guide_metadata[guide_metadata["type"] == "non-targeting"]
nt_metadata

Unnamed: 0,guide_id,spacer,targeting,type,guide_chr,guide_start,guide_end,strand,PAM,intended_target_name,intended_target_chr,intended_target_start,intended_target_end,Photospacer,gene_id_gtf,gene_name_gtf,overlap_length,gene_name_mygene,gene_name
385,non-targeting_00642,GAGTTAAGGCCTCGTCTAG,False,non-targeting,chrC,0,0,+,NGG,non-targeting,chrPC,0,0,GGAGTTAAGGCCTCGTCTAG,,,,,non-targeting
386,non-targeting_00718,TCCCAGGCTCTCCACTATG,False,non-targeting,chrC,0,0,+,NGG,non-targeting,chrPC,0,0,GTCCCAGGCTCTCCACTATG,,,,,non-targeting
387,non-targeting_03631,GACGCGTCTGCAAGAACGT,False,non-targeting,chrC,0,0,+,NGG,non-targeting,chrPC,0,0,GGACGCGTCTGCAAGAACGT,,,,,non-targeting
388,non-targeting_03705,GGCATGGACCCGCGGCACG,False,non-targeting,chrC,0,0,+,NGG,non-targeting,chrPC,0,0,GGGCATGGACCCGCGGCACG,,,,,non-targeting
389,non-targeting_01469,CGTCCGAGGTACTGAATAA,False,non-targeting,chrC,0,0,+,NGG,non-targeting,chrPC,0,0,GCGTCCGAGGTACTGAATAA,,,,,non-targeting
390,non-targeting_02459,TCTGACTCTCCGTCCACCA,False,non-targeting,chrC,0,0,+,NGG,non-targeting,chrPC,0,0,GTCTGACTCTCCGTCCACCA,,,,,non-targeting
391,non-targeting_00166,AGCCCCGCCTGGGTACGCG,False,non-targeting,chrC,0,0,+,NGG,non-targeting,chrPC,0,0,GAGCCCCGCCTGGGTACGCG,,,,,non-targeting
392,non-targeting_01967,GGCACCGCCGTTAAAAGTT,False,non-targeting,chrC,0,0,+,NGG,non-targeting,chrPC,0,0,GGGCACCGCCGTTAAAAGTT,,,,,non-targeting
393,non-targeting_02577,CTACAACCGGCGAGCGATA,False,non-targeting,chrC,0,0,+,NGG,non-targeting,chrPC,0,0,GCTACAACCGGCGAGCGATA,,,,,non-targeting
394,non-targeting_00954,TAACTGCTACGGCGCCCAA,False,non-targeting,chrC,0,0,+,NGG,non-targeting,chrPC,0,0,GTAACTGCTACGGCGCCCAA,,,,,non-targeting


In [469]:
# 
len(nt_metadata)

30

In [470]:
# set guide_chr	guide_start	guide_end	strand	PAM to NaN
nt_metadata["guide_chr"] = np.nan
nt_metadata["guide_start"] = np.nan
nt_metadata["guide_end"] = np.nan
nt_metadata["strand"] = np.nan
nt_metadata["PAM"] = np.nan
nt_metadata["intended_target_name"] = np.nan
nt_metadata["intended_target_start"] = np.nan
nt_metadata["intended_target_end"] = np.nan
nt_metadata["gene"] = np.nan
nt_metadata["label"] = "non_targeting"

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  nt_metadata["guide_chr"] = np.nan
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  nt_metadata["guide_start"] = np.nan
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  nt_metadata["guide_end"] = np.nan
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_ind

In [471]:
nt_clean_metadata = nt_metadata[["guide_id", "spacer", "targeting", "type", "guide_start", "guide_end", "guide_chr", "strand", "PAM",
             "intended_target_name", "intended_target_start", "intended_target_end",
             "gene_name", "label"]]
nt_clean_metadata.head()

Unnamed: 0,guide_id,spacer,targeting,type,guide_start,guide_end,guide_chr,strand,PAM,intended_target_name,intended_target_start,intended_target_end,gene_name,label
385,non-targeting_00642,GAGTTAAGGCCTCGTCTAG,False,non-targeting,,,,,,,,,non-targeting,non_targeting
386,non-targeting_00718,TCCCAGGCTCTCCACTATG,False,non-targeting,,,,,,,,,non-targeting,non_targeting
387,non-targeting_03631,GACGCGTCTGCAAGAACGT,False,non-targeting,,,,,,,,,non-targeting,non_targeting
388,non-targeting_03705,GGCATGGACCCGCGGCACG,False,non-targeting,,,,,,,,,non-targeting,non_targeting
389,non-targeting_01469,CGTCCGAGGTACTGAATAA,False,non-targeting,,,,,,,,,non-targeting,non_targeting


# Inspect targeting metadata

In [472]:
# Grab only the guides targeting targetings
targeting_metadata = guide_metadata[guide_metadata["targeting"] == True]

# Check Photospacer column against all spacers in ref_targeting_long
targeting_metadata["in_tf_ref"] = targeting_metadata["Photospacer"].isin(ref_tf_long["Photospacer"])
targeting_metadata["in_nc_ref"] = targeting_metadata["Photospacer"].isin(ref_nc_all["Photospacer"])

# OR genes
targeting_metadata["is_OR_gene"] = (targeting_metadata["gene_name_mygene"].replace(np.nan, "NA").str.startswith("OR")) | (targeting_metadata["gene_name_gtf"].replace(np.nan, "NA").str.startswith("OR")) 

# Check
targeting_metadata[targeting_metadata.columns[:8]]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  targeting_metadata["in_tf_ref"] = targeting_metadata["Photospacer"].isin(ref_tf_long["Photospacer"])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  targeting_metadata["in_nc_ref"] = targeting_metadata["Photospacer"].isin(ref_nc_all["Photospacer"])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  targ

Unnamed: 0,guide_id,spacer,targeting,type,guide_chr,guide_start,guide_end,strand
0,chr5:132963540-132963558(+),GGGCGGGTTAACGAAGACC,True,targeting,chr5,132963540,132963540,+
1,chr5:132963579-132963597(-),TCGCCGCCGCCAGCGGACG,True,targeting,chr5,132963579,132963579,-
2,chr5:132963567-132963585(+),GGATCCCCGCCCCGTCCGC,True,targeting,chr5,132963567,132963567,+
3,chr5:132963620-132963638(+),AGGGCTGTGACTGACGCAG,True,targeting,chr5,132963620,132963620,+
4,chr5:132963570-132963588(+),TCCCCGCCCCGTCCGCTGG,True,targeting,chr5,132963570,132963570,+
...,...,...,...,...,...,...,...,...
373,chr2:121284700-121284718(+),CGATACGCGGGGGAGAGGA,True,targeting,chr2,121284700,121284700,+
374,chr2:121285141-121285159(+),CGGCAGCAAGCGCAGACGC,True,targeting,chr2,121285141,121285141,+
375,chr2:121284796-121284814(+),AGGAGCCCTCGCAGACATA,True,targeting,chr2,121284796,121284796,+
376,chr2:121285161-121285179(+),GGGCGCGCCGAGGACCCAG,True,targeting,chr2,121285161,121285161,+


In [473]:
# How many
targeting_metadata["in_nc_ref"].sum(), targeting_metadata["in_tf_ref"].sum()

(29, 288)

In [474]:
pd.crosstab(targeting_metadata["in_nc_ref"], targeting_metadata["is_OR_gene"])

is_OR_gene,False,True
in_nc_ref,Unnamed: 1_level_1,Unnamed: 2_level_1
False,324,25
True,0,29


In [475]:
# Clean
targeting_metadata["new_guide_id"] = targeting_metadata["gene_name"] + "#" + targeting_metadata["guide_id"]
targeting_metadata["guide_end"] = targeting_metadata["guide_id"].str.split("-").str[1].str.split("(").str[0]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  targeting_metadata["new_guide_id"] = targeting_metadata["gene_name"] + "#" + targeting_metadata["guide_id"]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  targeting_metadata["guide_end"] = targeting_metadata["guide_id"].str.split("-").str[1].str.split("(").str[0]


## Negative control targets

In [476]:
nc_metadata = targeting_metadata[targeting_metadata["is_OR_gene"] == True]
len(nc_metadata)

54

In [477]:
nc_metadata["in_tf_ref"].sum(), nc_metadata["in_nc_ref"].sum()

(0, 29)

In [478]:
nc_metadata["gene_name"].value_counts()

gene_name
OR10J3    6
OR2A25    6
OR2AG2    6
OR2D3     6
OR2F1     6
OR4D6     6
OR4X1     6
OR6A2     6
OR6T1     6
Name: count, dtype: int64

In [479]:
nc_metadata["label"] = "negative_control"

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  nc_metadata["label"] = "negative_control"


In [480]:
nc_clean_metadata = nc_metadata[["new_guide_id", "spacer", "targeting", "type",
             "guide_chr", "guide_start", "guide_end", "strand", "PAM",
             "intended_target_name", "intended_target_start", "intended_target_end",
             "gene_name", "label"]].rename(columns={"new_guide_id": "guide_id"})
nc_clean_metadata

Unnamed: 0,guide_id,spacer,targeting,type,guide_chr,guide_start,guide_end,strand,PAM,intended_target_name,intended_target_start,intended_target_end,gene_name,label
156,OR10J3#chr1:159314543-159314561(+),AGTCAGCAGGTACAAAGTT,True,targeting,chr1,159314543,159314561,+,NGG,ENSG00000291901,159314202,159314653,OR10J3,negative_control
157,OR10J3#chr1:159314622-159314640(+),AGGAACTCAGTCACAAAAG,True,targeting,chr1,159314622,159314640,+,NGG,ENSG00000291901,159314202,159314653,OR10J3,negative_control
158,OR10J3#chr1:159314586-159314604(+),TTGTGCTGCCGCCTGAAGC,True,targeting,chr1,159314586,159314604,+,NGG,ENSG00000291901,159314202,159314653,OR10J3,negative_control
159,OR10J3#chr1:159314450-159314468(+),AGAGATGGATAGCATGCAC,True,targeting,chr1,159314450,159314468,+,NGG,ENSG00000291901,159314202,159314653,OR10J3,negative_control
160,OR10J3#chr1:159314635-159314653(+),CAAAAGTGGAATTTAGCTT,True,targeting,chr1,159314635,159314653,+,NGG,ENSG00000291901,159314202,159314653,OR10J3,negative_control
161,OR10J3#chr1:159314202-159314220(+),CCAATCCCCAGTGATCCAG,True,targeting,chr1,159314202,159314220,+,NGG,ENSG00000291901,159314202,159314653,OR10J3,negative_control
162,OR2A25#chr7:144074196-144074214(+),GCTCCTTGTTTCTTCTACA,True,targeting,chr7,144074196,144074214,+,NGG,ENSG00000221933.3,144074196,144074622,OR2A25,negative_control
163,OR2A25#chr7:144074203-144074221(+),GTTTCTTCTACAGGGAAAT,True,targeting,chr7,144074203,144074221,+,NGG,ENSG00000221933.3,144074196,144074622,OR2A25,negative_control
164,OR2A25#chr7:144074434-144074452(-),GGCACCGTGCTGCAAGCAC,True,targeting,chr7,144074434,144074452,-,NGG,ENSG00000221933.3,144074196,144074622,OR2A25,negative_control
165,OR2A25#chr7:144074335-144074353(+),TGGGGAACGGGACAATCCT,True,targeting,chr7,144074335,144074353,+,NGG,ENSG00000221933.3,144074196,144074622,OR2A25,negative_control


## TF targets

In [481]:
tf_metadata = targeting_metadata[targeting_metadata["is_OR_gene"] == False]
tf_metadata

Unnamed: 0,guide_id,spacer,targeting,type,guide_chr,guide_start,guide_end,strand,PAM,intended_target_name,...,Photospacer,gene_id_gtf,gene_name_gtf,overlap_length,gene_name_mygene,gene_name,in_tf_ref,in_nc_ref,is_OR_gene,new_guide_id
0,chr5:132963540-132963558(+),GGGCGGGTTAACGAAGACC,True,targeting,chr5,132963540,132963558,+,NGG,ENSG00000072364,...,GGGGCGGGTTAACGAAGACC,ENSG00000072364.13,AFF4,94.0,AFF4,AFF4,True,False,False,AFF4#chr5:132963540-132963558(+)
1,chr5:132963579-132963597(-),TCGCCGCCGCCAGCGGACG,True,targeting,chr5,132963579,132963597,-,NGG,ENSG00000072364,...,GTCGCCGCCGCCAGCGGACG,ENSG00000072364.13,AFF4,94.0,AFF4,AFF4,True,False,False,AFF4#chr5:132963579-132963597(-)
2,chr5:132963567-132963585(+),GGATCCCCGCCCCGTCCGC,True,targeting,chr5,132963567,132963585,+,NGG,ENSG00000072364,...,GGGATCCCCGCCCCGTCCGC,ENSG00000072364.13,AFF4,94.0,AFF4,AFF4,True,False,False,AFF4#chr5:132963567-132963585(+)
3,chr5:132963620-132963638(+),AGGGCTGTGACTGACGCAG,True,targeting,chr5,132963620,132963638,+,NGG,ENSG00000072364,...,GAGGGCTGTGACTGACGCAG,ENSG00000072364.13,AFF4,94.0,AFF4,AFF4,True,False,False,AFF4#chr5:132963620-132963638(+)
4,chr5:132963570-132963588(+),TCCCCGCCCCGTCCGCTGG,True,targeting,chr5,132963570,132963588,+,NGG,ENSG00000072364,...,GTCCCCGCCCCGTCCGCTGG,ENSG00000072364.13,AFF4,94.0,AFF4,AFF4,True,False,False,AFF4#chr5:132963570-132963588(+)
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
373,chr2:121284700-121284718(+),CGATACGCGGGGGAGAGGA,True,targeting,chr2,121284700,121284718,+,NGG,ENSG00000115112,...,GCGATACGCGGGGGAGAGGA,ENSG00000115112.8,TFCP2L1,479.0,TFCP2L1,TFCP2L1,True,False,False,TFCP2L1#chr2:121284700-121284718(+)
374,chr2:121285141-121285159(+),CGGCAGCAAGCGCAGACGC,True,targeting,chr2,121285141,121285159,+,NGG,ENSG00000115112,...,GCGGCAGCAAGCGCAGACGC,ENSG00000115112.8,TFCP2L1,479.0,TFCP2L1,TFCP2L1,True,False,False,TFCP2L1#chr2:121285141-121285159(+)
375,chr2:121284796-121284814(+),AGGAGCCCTCGCAGACATA,True,targeting,chr2,121284796,121284814,+,NGG,ENSG00000115112,...,GAGGAGCCCTCGCAGACATA,ENSG00000115112.8,TFCP2L1,479.0,TFCP2L1,TFCP2L1,True,False,False,TFCP2L1#chr2:121284796-121284814(+)
376,chr2:121285161-121285179(+),GGGCGCGCCGAGGACCCAG,True,targeting,chr2,121285161,121285179,+,NGG,ENSG00000115112,...,GGGGCGCGCCGAGGACCCAG,ENSG00000115112.8,TFCP2L1,479.0,TFCP2L1,TFCP2L1,True,False,False,TFCP2L1#chr2:121285161-121285179(+)


In [482]:
tf_metadata["in_tf_ref"].sum(), tf_metadata["in_nc_ref"].sum()

(288, 0)

In [483]:
tf_metadata["gene_name"].value_counts()

gene_name
POU5F1     12
SMARCB1    12
ARID1B     12
TCF12      12
LEF1       12
SMARCD3    12
KLF4       12
REST       12
RUNX1       6
SALL4       6
SMAD3       6
SMAD4       6
SMARCA2     6
SMARCA4     6
AFF4        6
SMARCC1     6
SMARCC2     6
SMARCD2     6
SOX2        6
SUZ12       6
TBX3        6
TCF7L1      6
TCF7L2      6
TCF7        6
SMARCD1     6
PAX5        6
ARID1A      6
NFIB        6
BMAL1       6
ARNT        6
BATF        6
CLOCK       6
CREBBP      6
CTNNB1      6
EED         6
EP300       6
ESRRB       6
EZH2        6
HDAC1       6
HIF1A       6
HMGA2       6
JARID2      6
KLF6        6
MYC         6
NANOG       6
TFCP2L1     6
Name: count, dtype: int64

In [484]:
len(tf_metadata["gene_name"].unique())

46

In [485]:
tf_metadata["label"] = "tf_targeting"

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tf_metadata["label"] = "tf_targeting"


In [486]:
tf_clean_metadata = tf_metadata[["new_guide_id", "spacer", "targeting", "type",
             "guide_chr", "guide_start", "guide_end", "strand", "PAM",
             "intended_target_name", "intended_target_start", "intended_target_end",
             "gene_name", "label"]].rename(columns={"new_guide_id": "guide_id"})
tf_clean_metadata

Unnamed: 0,guide_id,spacer,targeting,type,guide_chr,guide_start,guide_end,strand,PAM,intended_target_name,intended_target_start,intended_target_end,gene_name,label
0,AFF4#chr5:132963540-132963558(+),GGGCGGGTTAACGAAGACC,True,targeting,chr5,132963540,132963558,+,NGG,ENSG00000072364,132963540,132963638,AFF4,tf_targeting
1,AFF4#chr5:132963579-132963597(-),TCGCCGCCGCCAGCGGACG,True,targeting,chr5,132963579,132963597,-,NGG,ENSG00000072364,132963540,132963638,AFF4,tf_targeting
2,AFF4#chr5:132963567-132963585(+),GGATCCCCGCCCCGTCCGC,True,targeting,chr5,132963567,132963585,+,NGG,ENSG00000072364,132963540,132963638,AFF4,tf_targeting
3,AFF4#chr5:132963620-132963638(+),AGGGCTGTGACTGACGCAG,True,targeting,chr5,132963620,132963638,+,NGG,ENSG00000072364,132963540,132963638,AFF4,tf_targeting
4,AFF4#chr5:132963570-132963588(+),TCCCCGCCCCGTCCGCTGG,True,targeting,chr5,132963570,132963588,+,NGG,ENSG00000072364,132963540,132963638,AFF4,tf_targeting
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
373,TFCP2L1#chr2:121284700-121284718(+),CGATACGCGGGGGAGAGGA,True,targeting,chr2,121284700,121284718,+,NGG,ENSG00000115112,121284700,121285179,TFCP2L1,tf_targeting
374,TFCP2L1#chr2:121285141-121285159(+),CGGCAGCAAGCGCAGACGC,True,targeting,chr2,121285141,121285159,+,NGG,ENSG00000115112,121284700,121285179,TFCP2L1,tf_targeting
375,TFCP2L1#chr2:121284796-121284814(+),AGGAGCCCTCGCAGACATA,True,targeting,chr2,121284796,121284814,+,NGG,ENSG00000115112,121284700,121285179,TFCP2L1,tf_targeting
376,TFCP2L1#chr2:121285161-121285179(+),GGGCGCGCCGAGGACCCAG,True,targeting,chr2,121285161,121285179,+,NGG,ENSG00000115112,121284700,121285179,TFCP2L1,tf_targeting


## Spacers that I couldn't find in ref

In [487]:
# Find any spacers that do not match
mismatched_spacers = targeting_metadata[(~targeting_metadata["in_tf_ref"]) & (~targeting_metadata["in_nc_ref"])]
mismatched_spacers[["intended_target_name", "gene_name_gtf", "gene_name_mygene", "Photospacer"]]

Unnamed: 0,intended_target_name,gene_name_gtf,gene_name_mygene,Photospacer
60,ENSG00000074266,EED,EED,GCGAAGGAACGGGCCAATTG
61,ENSG00000074266,EED,EED,GGGGTCGGAGATCGAAGGAA
62,ENSG00000074266,EED,EED,GCTGAAACGTCTTTGGAAGG
63,ENSG00000074266,EED,EED,GGCAGCGGGTCGGAGATCGA
64,ENSG00000074266,EED,EED,GGTCTTTGGAAGGAGGAAGG
...,...,...,...,...
319,ENSG00000082014,SMARCD3,SMARCD3,GGAGTTTCTGGTCCATGGGG
320,ENSG00000082014,SMARCD3,SMARCD3,GGAGCCCAGCAGGACTCAGA
321,ENSG00000082014,SMARCD3,SMARCD3,GTCCATGGGGTGGTGAGTGG
322,ENSG00000082014,SMARCD3,SMARCD3,GCTCCCCTCTGAGTCCTGCT


In [488]:
mismatched_gene_name["intended_target_name"].isna().sum()

0

In [489]:
mismatched_spacers["gene_name_mygene"].value_counts()

gene_name_mygene
SMARCD3    12
EED         6
JARID2      6
SMARCD1     6
SMARCD2     6
OR2D3       5
OR4D6       5
OR4X1       5
OR6T1       5
Name: count, dtype: int64

In [490]:
mismatched_spacers[mismatched_spacers["gene_name_mygene"] == "OR4X1"]

Unnamed: 0,guide_id,spacer,targeting,type,guide_chr,guide_start,guide_end,strand,PAM,intended_target_name,...,Photospacer,gene_id_gtf,gene_name_gtf,overlap_length,gene_name_mygene,gene_name,in_tf_ref,in_nc_ref,is_OR_gene,new_guide_id
192,chr11:48263880-48263898(+),TGACTGAAATAATTTTCGT,True,targeting,chr11,48263880,48263898,+,NGG,ENSG00000176567,...,GTGACTGAAATAATTTTCGT,ENSG00000176567.1,OR4X1,424.0,OR4X1,OR4X1,False,False,True,OR4X1#chr11:48263880-48263898(+)
193,chr11:48264090-48264108(-),GAGTCAAAGATAAGCTTGG,True,targeting,chr11,48264090,48264108,-,NGG,ENSG00000176567,...,GGAGTCAAAGATAAGCTTGG,ENSG00000176567.1,OR4X1,424.0,OR4X1,OR4X1,False,False,True,OR4X1#chr11:48264090-48264108(-)
194,chr11:48263953-48263971(-),CACAACAGCTGTGTACATG,True,targeting,chr11,48263953,48263971,-,NGG,ENSG00000176567,...,GCACAACAGCTGTGTACATG,ENSG00000176567.1,OR4X1,424.0,OR4X1,OR4X1,False,False,True,OR4X1#chr11:48263953-48263971(-)
195,chr11:48264286-48264304(-),GCCCCATGCTATCCTCACG,True,targeting,chr11,48264286,48264304,-,NGG,ENSG00000176567,...,GGCCCCATGCTATCCTCACG,ENSG00000176567.1,OR4X1,424.0,OR4X1,OR4X1,False,False,True,OR4X1#chr11:48264286-48264304(-)
197,chr11:48264281-48264299(+),CTCCTCGTGAGGATAGCAT,True,targeting,chr11,48264281,48264299,+,NGG,ENSG00000176567,...,GCTCCTCGTGAGGATAGCAT,ENSG00000176567.1,OR4X1,424.0,OR4X1,OR4X1,False,False,True,OR4X1#chr11:48264281-48264299(+)


# Make a final metadata table

In [500]:
print(f"Positive controls: {len(pc_clean_metadata)}, Non-targeting controls: {len(nt_clean_metadata)}, Negative controls: {len(nc_clean_metadata)}, Targeting guides: {len(tf_clean_metadata)}")
print(f"Total: {len(pc_clean_metadata) + len(nt_clean_metadata) + len(nc_clean_metadata) + len(tf_clean_metadata)}")

Positive controls: 7, Non-targeting controls: 30, Negative controls: 54, Targeting guides: 324
Total: 415


In [501]:
# Make final metadata
final_metadata = pd.concat([pc_clean_metadata, nt_clean_metadata, nc_clean_metadata, tf_clean_metadata], ignore_index=True)

# Make sure all coordinates are integers or NaN
cols = ["guide_start", "guide_end", "intended_target_start", "intended_target_end"]
for c in cols:
    final_metadata[c] = (
        pd.to_numeric(final_metadata[c], errors="coerce")  # convert to numeric, set invalid to NaN
        .astype("Int64")                                   # use pandas nullable int type
    )

In [502]:
# If intendend_target_name has .suffix, remove the suffix
final_metadata["intended_target_name"] = final_metadata["intended_target_name"].str.split(".").str[0]

In [503]:
final_metadata.to_csv("/cellar/users/aklie/data/datasets/tf_perturb_seq/scratch/2025_10_05/benchmark_guide_metadata.tsv", sep="\t", index=False)

# DONE!

---

In [None]:
path_ref_nc_metadata = "/cellar/users/aklie/data/datasets/tf_perturb_seq/ref/negative_controls.tsv"
spacer_cols = ['Photospacer 1', 'Photospacer 2', 'Photospacer 3', 'Photospacer 4', 'Photospacer 5', 'Photospacer 6']
ref_nc = pd.read_csv(path_ref_nc_metadata, sep="\t")
for col in spacer_cols:
    ref_nc[col] = ref_nc[col].str.upper()

# Melt the dataframe into long format
ref_nc_long = ref_nc.melt(
    id_vars=["Gene"],
    value_vars=[col for col in ref_nc.columns if col.startswith("Photospacer")],
    var_name="Photospacer_label",
    value_name="Photospacer"
)

# Extract the numeric part of the photospacer label
ref_nc_long["Photospacer_number"] = (
    ref_nc_long["Photospacer_label"]
    .str.extract(r"(\d+)").astype(int)
)

# Create guide_id
ref_nc_long["guide_id"] = ref_nc_long.apply(lambda x: f"{x['Gene']}#Set_A#Photospacer_{x['Photospacer_number']}", axis=1)
ref_nc_long = ref_nc_long[["Gene", "Photospacer", "Photospacer_number", "guide_id"]]
ref_nc_long.head()