# Set-up

In [12]:
import pandas as pd
from mygene import MyGeneInfo
import re
import requests
import pyranges as pr

In [None]:
# Example — change this to the full URL of the GTF file
url = "https://api.data.igvf.org/reference-files/IGVFFI9573KOZR/@@download/IGVFFI9573KOZR.gtf.gz"
out_path = "/cellar/users/aklie/data/datasets/tf_perturb_seq/ref/IGVFFI9573KOZR.gtf.gz"

# Stream download to avoid memory issues
with requests.get(url, stream=True) as r:
    r.raise_for_status()
    with open(out_path, "wb") as f:
        for chunk in r.iter_content(chunk_size=8192):
            if chunk:
                f.write(chunk)

print("Downloaded to", out_path)

In [None]:
gtf_pr = pr.read_gtf(out_path)

  return {k: v for k, v in df.groupby(grpby_key)}


In [None]:
# Filter to just gene-level annotations
genes_pr = gtf_pr[gtf_pr.Feature == "gene"]
genes_pr

Unnamed: 0,Chromosome,Source,Feature,Start,End,Score,Strand,Frame,gene_id,gene_type,...,transcript_support_level,havana_transcript,exon_number,exon_id,hgnc_id,havana_gene,ont,protein_id,ccdsid,artif_dupl
0,chr1,HAVANA,gene,11868,14409,.,+,.,ENSG00000290825.1,lncRNA,...,,,,,,,,,,
1,chr1,HAVANA,gene,12009,13670,.,+,.,ENSG00000223972.6,transcribed_unprocessed_pseudogene,...,,,,,HGNC:37102,OTTHUMG00000000961.2,,,,
2,chr1,HAVANA,gene,29553,31109,.,+,.,ENSG00000243485.5,lncRNA,...,,,,,HGNC:52482,OTTHUMG00000000959.2,,,,
3,chr1,ENSEMBL,gene,30365,30503,.,+,.,ENSG00000284332.1,miRNA,...,,,,,HGNC:35294,,,,,
4,chr1,HAVANA,gene,52472,53312,.,+,.,ENSG00000268020.3,unprocessed_pseudogene,...,,,,,HGNC:14822,OTTHUMG00000185779.1,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
67566,chrY,HAVANA,gene,57015104,57016096,.,-,.,ENSG00000237801.6_PAR_Y,processed_pseudogene,...,,,,,HGNC:460,OTTHUMG00000022673.2,,,,
67567,chrY,HAVANA,gene,57165511,57165845,.,-,.,ENSG00000228410.6_PAR_Y,processed_pseudogene,...,,,,,HGNC:38160,OTTHUMG00000040493.1,,,,
67568,chrY,HAVANA,gene,57171889,57172769,.,-,.,ENSG00000223484.7_PAR_Y,processed_pseudogene,...,,,,,HGNC:23270,OTTHUMG00000022681.1,,,,
67569,chrY,HAVANA,gene,57201142,57203357,.,-,.,ENSG00000185203.12_PAR_Y,lncRNA,...,,,,,HGNC:38513,OTTHUMG00000022676.3,,,,


# Load reference metadata

## Reference TF guides

In [3]:
path_ref_tf_metadata = "/cellar/users/aklie/data/datasets/tf_perturb_seq/ref/target_genes.tsv"
spacer_cols = ['Set A, Photospacer 1', 'Set A, Photospacer 2', 'Set B, Photospacer 3', 'Set B, Photospacer 4', 'Set C, Photospacer 5', 'Set C, Photospacer 6']
ref_tf = pd.read_csv(path_ref_tf_metadata, sep="\t")
ref_tf["Gene"] = ref_tf["Gene"].ffill()
for col in spacer_cols:
    ref_tf[col] = ref_tf[col].str.upper()

# Define mapping between sets, promoter columns, and photospacer columns
set_map = {
    "A": ("Set A, promoter ID", ["Set A, Photospacer 1", "Set A, Photospacer 2"]),
    "B": ("Set B, promoter ID", ["Set B, Photospacer 3", "Set B, Photospacer 4"]),
    "C": ("Set C, promoter ID", ["Set C, Photospacer 5", "Set C, Photospacer 6"]),
}

# Collect rows
rows = []
for _, row in ref_tf.iterrows():
    gene = row["Gene"]
    for set_name, (prom_col, spacer_cols) in set_map.items():
        promoter_id = row[prom_col]
        for spacer_col in spacer_cols:
            spacer_seq = row[spacer_col]
            if pd.notna(spacer_seq):
                # Extract photospacer number from column name
                match = re.search(r"Photospacer (\d+)", spacer_col)
                spacer_num = int(match.group(1)) if match else None
                rows.append({
                    "Gene": gene,
                    "promoter_ID": promoter_id,
                    "Set": set_name,
                    "Photospacer": spacer_seq,
                    "Photospacer_number": spacer_num,
                })

# Create the long df
ref_tf_long = pd.DataFrame(rows)[["Gene", "promoter_ID", "Set", "Photospacer", "Photospacer_number"]]

# Make a unique guide ID based on promoter_ID#Set_{Set}#Photospacer_{Photospacer_number}
ref_tf_long["guide_id"] = ref_tf_long.apply(lambda x: f"{x['promoter_ID']}#Set_{x['Set']}#Photospacer_{x['Photospacer_number']}", axis=1)
ref_tf_long.head()

Unnamed: 0,Gene,promoter_ID,Set,Photospacer,Photospacer_number,guide_id
0,AATF,AATF_-_35306286.23-P1P2,A,GAGTGGCCGGTCCAGAGCTG,1,AATF_-_35306286.23-P1P2#Set_A#Photospacer_1
1,AATF,AATF_-_35306286.23-P1P2,A,GGGATCAAGGCGAGAGGATC,2,AATF_-_35306286.23-P1P2#Set_A#Photospacer_2
2,AATF,AATF_-_35306351.23-P1P2,B,GAAGGCGAGAGGATCCGGCA,3,AATF_-_35306351.23-P1P2#Set_B#Photospacer_3
3,AATF,AATF_-_35306351.23-P1P2,B,GGGAATCGGATCAAGGCGAG,4,AATF_-_35306351.23-P1P2#Set_B#Photospacer_4
4,AATF,AATF_-_35306333.23-P1P2,C,GGAGTCGGGGAATCGGATCA,5,AATF_-_35306333.23-P1P2#Set_C#Photospacer_5


## Reference positive controls

In [4]:
def clean_gene_and_make_guide(x):
    x = x.strip()
    
    # --- Remove parentheses in gene names like "CD29 (ITGB1)" ---
    base = re.sub(r"\s*\([^)]*\)", "", x).strip()
    
    # --- Handle sgRNA naming ---
    if "sgRNA" in x:
        base_gene = x.split("sgRNA")[0].strip()
        suffix = x.split("sgRNA")[-1].strip().lower()
        if suffix == "main":
            guide_id = f"{base_gene}#A"
        elif suffix in ["a", "b", "c"]:
            guide_id = f"{base_gene}#{suffix.upper()}"
        else:
            guide_id = f"{base_gene}#A"
        return pd.Series([base_gene, guide_id])
    
    # --- Handle "strong"/"weak" ---
    if re.search(r"\bstrong\b|\bweak\b", x, re.IGNORECASE):
        base_gene = re.sub(r"\s*(strong|weak)\s*", "", base, flags=re.IGNORECASE)
        strength = "strong" if "strong" in x.lower() else "weak"
        guide_id = f"{base_gene}#{strength}"
        return pd.Series([base_gene, guide_id])
    
    # --- Default case: use cleaned base name ---
    return pd.Series([base, f"{base}#A"])

# Load reference 
path_ref_pc_metadata = "/cellar/users/aklie/data/datasets/tf_perturb_seq/ref/positive_controls.tsv"
ref_pcs = pd.read_csv(path_ref_pc_metadata, sep="\t")
ref_pcs = ref_pcs.rename(columns={"Photospacer (represent 10 times)": "Photospacer"})
ref_pcs[["Gene", "guide_id"]] = ref_pcs["Gene"].apply(clean_gene_and_make_guide)
ref_pcs.head()

Unnamed: 0,Gene,Photospacer,Reference,guide_id
0,CD81,GGAGAGCGAGCGCGCAACGG,"Horlbeck et al. 2016 ""Compact and highly activ...",CD81#strong
1,CD81,GGAGAGCCAGCGCGCAACGG,"Jost et al. 2020 ""Titrating gene expression us...",CD81#weak
2,CD151,GCCGGACTCGGACGCGTGGT,"Horlbeck et al. 2016 ""Compact and highly activ...",CD151#strong
3,CD151,GCCGCTCGGCCGAGCTGTCG,"Horlbeck et al. 2016 ""Compact and highly activ...",CD151#weak
4,CD55,GCTGCGACTCGGCGGAGTCC,"Horlbeck et al. 2016 ""Compact and highly activ...",CD55#strong


## Reference non-targeting controls

In [5]:
path_ref_nt_metadata = "/cellar/users/aklie/data/datasets/tf_perturb_seq/ref/non_targeting.tsv"
ref_nt = pd.read_csv(path_ref_nt_metadata, sep="\t")
ref_nt = ref_nt.rename(columns={"Unnamed: 0": "guide_id", "Photospacer (same for all 3 sets)": "Photospacer"})
ref_nt["Gene"] = "non-targeting"
ref_nt.head()

Unnamed: 0,guide_id,Photospacer,Gene
0,non-targeting_00642,GGAGTTAAGGCCTCGTCTAG,non-targeting
1,non-targeting_00718,GTCCCAGGCTCTCCACTATG,non-targeting
2,non-targeting_03631,GGACGCGTCTGCAAGAACGT,non-targeting
3,non-targeting_03705,GGGCATGGACCCGCGGCACG,non-targeting
4,non-targeting_01469,GCGTCCGAGGTACTGAATAA,non-targeting


## Reference negative controls (targeting)

In [6]:
path_ref_nc_metadata = "/cellar/users/aklie/data/datasets/tf_perturb_seq/ref/negative_controls.tsv"
spacer_cols = ['Photospacer 1', 'Photospacer 2', 'Photospacer 3', 'Photospacer 4', 'Photospacer 5', 'Photospacer 6']
ref_nc = pd.read_csv(path_ref_nc_metadata, sep="\t")
for col in spacer_cols:
    ref_nc[col] = ref_nc[col].str.upper()

# Melt the dataframe into long format
ref_nc_long = ref_nc.melt(
    id_vars=["Gene"],
    value_vars=[col for col in ref_nc.columns if col.startswith("Photospacer")],
    var_name="Photospacer_label",
    value_name="Photospacer"
)

# Extract the numeric part of the photospacer label
ref_nc_long["Photospacer_number"] = (
    ref_nc_long["Photospacer_label"]
    .str.extract(r"(\d+)").astype(int)
)

# Create guide_id
ref_nc_long["guide_id"] = ref_nc_long.apply(lambda x: f"{x['Gene']}#Set_A#Photospacer_{x['Photospacer_number']}", axis=1)
ref_nc_long = ref_nc_long[["Gene", "Photospacer", "Photospacer_number", "guide_id"]]
ref_nc_long.head()

Unnamed: 0,Gene,Photospacer,Photospacer_number,guide_id
0,OR1J4,GAGGAGGAGAGTGTGAGACA,1,OR1J4#Set_A#Photospacer_1
1,OR10K1,GCTTCTATAAAGGAGAGTCA,1,OR10K1#Set_A#Photospacer_1
2,OR5L2,GCTGCATAAATTGGAGACAT,1,OR5L2#Set_A#Photospacer_1
3,OR52W1,GCTCCTGACAGGGAAGATAA,1,OR52W1#Set_A#Photospacer_1
4,OR8K1,GTCACAGTGATAGGCAATCT,1,OR8K1#Set_A#Photospacer_1


# Load guide metadata

In [46]:
path_guide_metadata = "/cellar/users/aklie/data/datasets/tf_perturb_seq/datasets/Hon_WTC11-benchmark_TF-Perturb-seq/downloads/IGVFFI5765HMZH.tsv"
guide_metadata = pd.read_csv(path_guide_metadata)
guide_metadata["Photospacer"] = "G" + guide_metadata["spacer"].str.upper()
print(f"{len(guide_metadata)} total guides")
guide_metadata.head()

415 total guides


Unnamed: 0,guide_id,spacer,targeting,type,guide_chr,guide_start,guide_end,strand,PAM,intended_target_name,intended_target_chr,intended_target_start,intended_target_end,Photospacer
0,chr5:132963540-132963558(+),GGGCGGGTTAACGAAGACC,True,targeting,chr5,132963540,132963540,+,NGG,ENSG00000072364,chr5,132963540,132963638,GGGGCGGGTTAACGAAGACC
1,chr5:132963579-132963597(-),TCGCCGCCGCCAGCGGACG,True,targeting,chr5,132963579,132963579,-,NGG,ENSG00000072364,chr5,132963540,132963638,GTCGCCGCCGCCAGCGGACG
2,chr5:132963567-132963585(+),GGATCCCCGCCCCGTCCGC,True,targeting,chr5,132963567,132963567,+,NGG,ENSG00000072364,chr5,132963540,132963638,GGGATCCCCGCCCCGTCCGC
3,chr5:132963620-132963638(+),AGGGCTGTGACTGACGCAG,True,targeting,chr5,132963620,132963620,+,NGG,ENSG00000072364,chr5,132963540,132963638,GAGGGCTGTGACTGACGCAG
4,chr5:132963570-132963588(+),TCCCCGCCCCGTCCGCTGG,True,targeting,chr5,132963570,132963570,+,NGG,ENSG00000072364,chr5,132963540,132963638,GTCCCCGCCCCGTCCGCTGG


## Clean up positiive control metadata

In [47]:
# Clean up
# 1. For those that have type=="positive control", change "targeting" to True and type to "targeting"
guide_metadata.loc[guide_metadata["type"] == "positive control", "targeting"] = True
#guide_metadata.loc[guide_metadata["type"] == "positive control", "type"] = "targeting"

In [48]:
guide_metadata[~guide_metadata["targeting"]]

Unnamed: 0,guide_id,spacer,targeting,type,guide_chr,guide_start,guide_end,strand,PAM,intended_target_name,intended_target_chr,intended_target_start,intended_target_end,Photospacer
385,non-targeting_00642,GAGTTAAGGCCTCGTCTAG,False,non-targeting,chrC,0,0,+,NGG,non-targeting,chrPC,0,0,GGAGTTAAGGCCTCGTCTAG
386,non-targeting_00718,TCCCAGGCTCTCCACTATG,False,non-targeting,chrC,0,0,+,NGG,non-targeting,chrPC,0,0,GTCCCAGGCTCTCCACTATG
387,non-targeting_03631,GACGCGTCTGCAAGAACGT,False,non-targeting,chrC,0,0,+,NGG,non-targeting,chrPC,0,0,GGACGCGTCTGCAAGAACGT
388,non-targeting_03705,GGCATGGACCCGCGGCACG,False,non-targeting,chrC,0,0,+,NGG,non-targeting,chrPC,0,0,GGGCATGGACCCGCGGCACG
389,non-targeting_01469,CGTCCGAGGTACTGAATAA,False,non-targeting,chrC,0,0,+,NGG,non-targeting,chrPC,0,0,GCGTCCGAGGTACTGAATAA
390,non-targeting_02459,TCTGACTCTCCGTCCACCA,False,non-targeting,chrC,0,0,+,NGG,non-targeting,chrPC,0,0,GTCTGACTCTCCGTCCACCA
391,non-targeting_00166,AGCCCCGCCTGGGTACGCG,False,non-targeting,chrC,0,0,+,NGG,non-targeting,chrPC,0,0,GAGCCCCGCCTGGGTACGCG
392,non-targeting_01967,GGCACCGCCGTTAAAAGTT,False,non-targeting,chrC,0,0,+,NGG,non-targeting,chrPC,0,0,GGGCACCGCCGTTAAAAGTT
393,non-targeting_02577,CTACAACCGGCGAGCGATA,False,non-targeting,chrC,0,0,+,NGG,non-targeting,chrPC,0,0,GCTACAACCGGCGAGCGATA
394,non-targeting_00954,TAACTGCTACGGCGCCCAA,False,non-targeting,chrC,0,0,+,NGG,non-targeting,chrPC,0,0,GTAACTGCTACGGCGCCCAA


## Fix missing target names

In [49]:
missing_target_name = guide_metadata[guide_metadata["intended_target_name"].isna()]
len(missing_target_name)

42

In [50]:
# Convert guides to PyRanges
guides_pr = pr.PyRanges(
    df=guide_metadata.rename(columns={
        'intended_target_chr': 'Chromosome',
        'intended_target_start': 'Start',
        'intended_target_end': 'End',
        'guide_id': 'guide_id'
    })
)

  return {k: v for k, v in df.groupby(grpby_key)}


In [51]:
overlaps = guides_pr.join(genes_pr, suffix="_gene")

  empty_removed = df.groupby(["Chromosome", "Strand"])
join: Strand data from other will be added as strand data to self.
If this is undesired use the flag apply_strand_suffix=False.


In [52]:
# Convert to DataFrame
overlaps_df = overlaps.df

In [53]:
# Compute overlap length
overlaps_df["overlap_length"] = (
    overlaps_df[["End", "End_gene"]].min(axis=1)
    - overlaps_df[["Start", "Start_gene"]].max(axis=1)
)
# Only keep positive overlaps
overlaps_df = overlaps_df[overlaps_df["overlap_length"] > 0]

In [54]:
# Keep only the gene with the maximum overlap per guide_id
best_hits = (
    overlaps_df.sort_values("overlap_length", ascending=False)
    .groupby("guide_id", as_index=False)
    .first()
)

# Keep relevant columns
best_hits = best_hits[["guide_id", "gene_id", "gene_name", "overlap_length"]]

In [55]:
# Duplicate guide_ids?
best_hits[best_hits.duplicated(subset=["guide_id"], keep=False)].sort_values("guide_id")

Unnamed: 0,guide_id,gene_id,gene_name,overlap_length


In [56]:
# Merge with original df
guide_metadata = guide_metadata.merge(best_hits, on="guide_id", how="left")

In [57]:
# intended_target_name is gene_id for NaN enties
guide_metadata.loc[guide_metadata["intended_target_name"].isna(), "intended_target_name"] = guide_metadata.loc[guide_metadata["intended_target_name"].isna(), "gene_id"]

In [58]:
len(guide_metadata[guide_metadata["intended_target_name"].isna()])

0

## Add gene name

In [59]:
# Initialize MyGeneInfo
mg = MyGeneInfo()

In [60]:
# Map intended_target_name (Ensembl IDs) to gene names
out = mg.querymany(guide_metadata['intended_target_name'].tolist(), scopes='ensembl.gene', fields='symbol', species='human')
map_df = pd.DataFrame(out)[['query', 'symbol']].rename(columns={'query': 'intended_target_name', 'symbol': 'gene_name'})
guide_metadata = pd.concat([guide_metadata, map_df["gene_name"]], axis=1)
guide_metadata.head()

Input sequence provided is already in string format. No operation performed
Input sequence provided is already in string format. No operation performed
50 input query terms found dup hits:	[('ENSG00000072364', 6), ('ENSG00000117713', 6), ('ENSG00000049618', 12), ('ENSG00000143437', 6), ('
80 input query terms found no hit:	['ENSSDUG00000003529', 'ENSSDUG00000003529', 'ENSSDUG00000003529', 'ENSSDUG00000003529', 'ENSSDUG000


Unnamed: 0,guide_id,spacer,targeting,type,guide_chr,guide_start,guide_end,strand,PAM,intended_target_name,intended_target_chr,intended_target_start,intended_target_end,Photospacer,gene_id,gene_name,overlap_length,gene_name.1
0,chr5:132963540-132963558(+),GGGCGGGTTAACGAAGACC,True,targeting,chr5,132963540,132963540,+,NGG,ENSG00000072364,chr5,132963540,132963638,GGGGCGGGTTAACGAAGACC,ENSG00000072364.13,AFF4,94.0,AFF4
1,chr5:132963579-132963597(-),TCGCCGCCGCCAGCGGACG,True,targeting,chr5,132963579,132963579,-,NGG,ENSG00000072364,chr5,132963540,132963638,GTCGCCGCCGCCAGCGGACG,ENSG00000072364.13,AFF4,94.0,AFF4
2,chr5:132963567-132963585(+),GGATCCCCGCCCCGTCCGC,True,targeting,chr5,132963567,132963567,+,NGG,ENSG00000072364,chr5,132963540,132963638,GGGATCCCCGCCCCGTCCGC,ENSG00000072364.13,AFF4,94.0,AFF4
3,chr5:132963620-132963638(+),AGGGCTGTGACTGACGCAG,True,targeting,chr5,132963620,132963620,+,NGG,ENSG00000072364,chr5,132963540,132963638,GAGGGCTGTGACTGACGCAG,ENSG00000072364.13,AFF4,94.0,AFF4
4,chr5:132963570-132963588(+),TCCCCGCCCCGTCCGCTGG,True,targeting,chr5,132963570,132963570,+,NGG,ENSG00000072364,chr5,132963540,132963638,GTCCCCGCCCCGTCCGCTGG,ENSG00000072364.13,AFF4,94.0,AFF4


# Inspect TF target metadata

In [61]:
# Grab only the guides targeting TFs
tf_metadata = guide_metadata[guide_metadata["targeting"] == True]

# Check Photospacer column against all spacers in ref_tf_long
tf_metadata["in_ref"] = tf_metadata["Photospacer"].isin(ref_tf_long["Photospacer"])
tf_metadata["in_ref"].value_counts()

tf_metadata

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tf_metadata["in_ref"] = tf_metadata["Photospacer"].isin(ref_tf_long["Photospacer"])


Unnamed: 0,guide_id,spacer,targeting,type,guide_chr,guide_start,guide_end,strand,PAM,intended_target_name,intended_target_chr,intended_target_start,intended_target_end,Photospacer,gene_id,gene_name,overlap_length,gene_name.1,in_ref
0,chr5:132963540-132963558(+),GGGCGGGTTAACGAAGACC,True,targeting,chr5,132963540,132963540,+,NGG,ENSG00000072364,chr5,132963540,132963638,GGGGCGGGTTAACGAAGACC,ENSG00000072364.13,AFF4,94.0,AFF4,True
1,chr5:132963579-132963597(-),TCGCCGCCGCCAGCGGACG,True,targeting,chr5,132963579,132963579,-,NGG,ENSG00000072364,chr5,132963540,132963638,GTCGCCGCCGCCAGCGGACG,ENSG00000072364.13,AFF4,94.0,AFF4,True
2,chr5:132963567-132963585(+),GGATCCCCGCCCCGTCCGC,True,targeting,chr5,132963567,132963567,+,NGG,ENSG00000072364,chr5,132963540,132963638,GGGATCCCCGCCCCGTCCGC,ENSG00000072364.13,AFF4,94.0,AFF4,True
3,chr5:132963620-132963638(+),AGGGCTGTGACTGACGCAG,True,targeting,chr5,132963620,132963620,+,NGG,ENSG00000072364,chr5,132963540,132963638,GAGGGCTGTGACTGACGCAG,ENSG00000072364.13,AFF4,94.0,AFF4,True
4,chr5:132963570-132963588(+),TCCCCGCCCCGTCCGCTGG,True,targeting,chr5,132963570,132963570,+,NGG,ENSG00000072364,chr5,132963540,132963638,GTCCCCGCCCCGTCCGCTGG,ENSG00000072364.13,AFF4,94.0,AFF4,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
380,CD151,CCGCTCGGCCGAGCTGTCG,True,positive control,chrC,0,0,+,NGG,ENSG00000177697,chrPC,0,0,GCCGCTCGGCCGAGCTGTCG,,,,CD151,False
381,CD55,CTGCGACTCGGCGGAGTCC,True,positive control,chrC,0,0,+,NGG,ENSG00000196352,chrPC,0,0,GCTGCGACTCGGCGGAGTCC,,,,CD55,False
382,NGFRAP1,GTTGGAGTTTGCCCTCCTC,True,positive control,chrC,0,0,+,NGG,ENSBTSG00005026991,chrPC,0,0,GGTTGGAGTTTGCCCTCCTC,,,,,False
383,NGFRAP1,AGGACCGAGAAGAGTGACA,True,positive control,chrC,0,0,+,NGG,ENSBTSG00005026991,chrPC,0,0,GAGGACCGAGAAGAGTGACA,,,,,False


In [62]:
tf_metadata["intended_target_name"].nunique()

60

In [63]:
tf_metadata["gene_name"].nunique()

gene_name    61
gene_name    53
dtype: int64

In [64]:
# Find any spacers that do not match
mismatched_spacers = tf_metadata[~tf_metadata["in_ref"]]
mismatched_spacers[["intended_target_name", "gene_name", "Photospacer"]]

Unnamed: 0,intended_target_name,gene_name,gene_name.1,Photospacer
60,ENSG00000074266,EED,EED,GCGAAGGAACGGGCCAATTG
61,ENSG00000074266,EED,EED,GGGGTCGGAGATCGAAGGAA
62,ENSG00000074266,EED,EED,GCTGAAACGTCTTTGGAAGG
63,ENSG00000074266,EED,EED,GGCAGCGGGTCGGAGATCGA
64,ENSG00000074266,EED,EED,GGTCTTTGGAAGGAGGAAGG
...,...,...,...,...
380,ENSG00000177697,,CD151,GCCGCTCGGCCGAGCTGTCG
381,ENSG00000196352,,CD55,GCTGCGACTCGGCGGAGTCC
382,ENSBTSG00005026991,,,GGTTGGAGTTTGCCCTCCTC
383,ENSBTSG00005026991,,,GAGGACCGAGAAGAGTGACA


In [65]:
mismatched_spacers["gene_name"].value_counts()

ValueError: Grouper for 'gene_name' not 1-dimensional

# Positive control metadata

In [66]:
# Grab the positive controls
pc_metadata = guide_metadata[guide_metadata["type"] == "positive control"]
pc_metadata

Unnamed: 0,guide_id,spacer,targeting,type,guide_chr,guide_start,guide_end,strand,PAM,intended_target_name,intended_target_chr,intended_target_start,intended_target_end,Photospacer,gene_id,gene_name,overlap_length,gene_name.1
378,CD81,GAGAGCCAGCGCGCAACGG,True,positive control,chrC,0,0,+,NGG,ENSG00000110651,chrPC,0,0,GGAGAGCCAGCGCGCAACGG,,,,CD81
379,CD151,CCGGACTCGGACGCGTGGT,True,positive control,chrC,0,0,+,NGG,ENSG00000177697,chrPC,0,0,GCCGGACTCGGACGCGTGGT,,,,CD151
380,CD151,CCGCTCGGCCGAGCTGTCG,True,positive control,chrC,0,0,+,NGG,ENSG00000177697,chrPC,0,0,GCCGCTCGGCCGAGCTGTCG,,,,CD151
381,CD55,CTGCGACTCGGCGGAGTCC,True,positive control,chrC,0,0,+,NGG,ENSG00000196352,chrPC,0,0,GCTGCGACTCGGCGGAGTCC,,,,CD55
382,NGFRAP1,GTTGGAGTTTGCCCTCCTC,True,positive control,chrC,0,0,+,NGG,ENSBTSG00005026991,chrPC,0,0,GGTTGGAGTTTGCCCTCCTC,,,,
383,NGFRAP1,AGGACCGAGAAGAGTGACA,True,positive control,chrC,0,0,+,NGG,ENSBTSG00005026991,chrPC,0,0,GAGGACCGAGAAGAGTGACA,,,,
384,TFRC,CTCAGAGCGTCGGGATATC,True,positive control,chrC,0,0,+,NGG,ENSG00000072274,chrPC,0,0,GCTCAGAGCGTCGGGATATC,,,,TFRC


In [69]:
pc_metadata["in_ref"] = pc_metadata["Photospacer"].isin(ref_pcs["Photospacer"])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  pc_metadata["in_ref"] = pc_metadata["Photospacer"].isin(ref_pcs["Photospacer"])


In [71]:
pc_metadata[~pc_metadata["in_ref"]]

Unnamed: 0,guide_id,spacer,targeting,type,guide_chr,guide_start,guide_end,strand,PAM,intended_target_name,intended_target_chr,intended_target_start,intended_target_end,Photospacer,gene_id,gene_name,overlap_length,gene_name.1,in_ref
382,NGFRAP1,GTTGGAGTTTGCCCTCCTC,True,positive control,chrC,0,0,+,NGG,ENSBTSG00005026991,chrPC,0,0,GGTTGGAGTTTGCCCTCCTC,,,,,False


In [70]:
ref_pcs

Unnamed: 0,Gene,Photospacer,Reference,guide_id
0,CD81,GGAGAGCGAGCGCGCAACGG,"Horlbeck et al. 2016 ""Compact and highly activ...",CD81#strong
1,CD81,GGAGAGCCAGCGCGCAACGG,"Jost et al. 2020 ""Titrating gene expression us...",CD81#weak
2,CD151,GCCGGACTCGGACGCGTGGT,"Horlbeck et al. 2016 ""Compact and highly activ...",CD151#strong
3,CD151,GCCGCTCGGCCGAGCTGTCG,"Horlbeck et al. 2016 ""Compact and highly activ...",CD151#weak
4,CD55,GCTGCGACTCGGCGGAGTCC,"Horlbeck et al. 2016 ""Compact and highly activ...",CD55#strong
5,CD29,GAGAGGCCCAGCGGGAGTCG,"Horlbeck et al. 2016 ""Compact and highly activ...",CD29#A
6,B2M,GGCGAGCACAGCTAAGGCCA,"Horlbeck et al. 2016 ""Compact and highly activ...",B2M#A
7,AARS,GTCTGCGGGAATAGGTGCAG,"Horlbeck et al. 2016 ""Compact and highly activ...",AARS#A
8,AARS,GCGGCGACCCTAGGAGAGGT,"Horlbeck et al. 2016 ""Compact and highly activ...",AARS#B
9,AARS,GCCGCCCTCGGAGAGCTCTG,"Horlbeck et al. 2016 ""Compact and highly activ...",AARS#C


# Non-targeting control metadata

In [45]:
# Grab the non-targeting controls
nt_metadata = guide_metadata[guide_metadata["type"] == "non-targeting"]
nt_metadata

Unnamed: 0,guide_id,spacer,targeting,type,guide_chr,guide_start,guide_end,strand,PAM,intended_target_name,intended_target_chr,intended_target_start,intended_target_end,Photospacer,gene_id,gene_name,overlap_length,gene_name.1
385,non-targeting_00642,GAGTTAAGGCCTCGTCTAG,False,non-targeting,chrC,0,0,+,NGG,non-targeting,chrPC,0,0,GGAGTTAAGGCCTCGTCTAG,,,,
386,non-targeting_00718,TCCCAGGCTCTCCACTATG,False,non-targeting,chrC,0,0,+,NGG,non-targeting,chrPC,0,0,GTCCCAGGCTCTCCACTATG,,,,
387,non-targeting_03631,GACGCGTCTGCAAGAACGT,False,non-targeting,chrC,0,0,+,NGG,non-targeting,chrPC,0,0,GGACGCGTCTGCAAGAACGT,,,,
388,non-targeting_03705,GGCATGGACCCGCGGCACG,False,non-targeting,chrC,0,0,+,NGG,non-targeting,chrPC,0,0,GGGCATGGACCCGCGGCACG,,,,
389,non-targeting_01469,CGTCCGAGGTACTGAATAA,False,non-targeting,chrC,0,0,+,NGG,non-targeting,chrPC,0,0,GCGTCCGAGGTACTGAATAA,,,,
390,non-targeting_02459,TCTGACTCTCCGTCCACCA,False,non-targeting,chrC,0,0,+,NGG,non-targeting,chrPC,0,0,GTCTGACTCTCCGTCCACCA,,,,
391,non-targeting_00166,AGCCCCGCCTGGGTACGCG,False,non-targeting,chrC,0,0,+,NGG,non-targeting,chrPC,0,0,GAGCCCCGCCTGGGTACGCG,,,,
392,non-targeting_01967,GGCACCGCCGTTAAAAGTT,False,non-targeting,chrC,0,0,+,NGG,non-targeting,chrPC,0,0,GGGCACCGCCGTTAAAAGTT,,,,
393,non-targeting_02577,CTACAACCGGCGAGCGATA,False,non-targeting,chrC,0,0,+,NGG,non-targeting,chrPC,0,0,GCTACAACCGGCGAGCGATA,,,,
394,non-targeting_00954,TAACTGCTACGGCGCCCAA,False,non-targeting,chrC,0,0,+,NGG,non-targeting,chrPC,0,0,GTAACTGCTACGGCGCCCAA,,,,


In [28]:
# 
len(nt_metadata)

30