# Create harmonized sgRNA guide annotation file for use with the CRISPR pipeline (2025)
This notebook describes the creation of a unified annotation file from the guide annotation files provided by the Hon, Huangfu, and Gersbach labs, according to the specification described in: https://github.com/pinellolab/CRISPR_Pipeline/blob/main/example_data/guide_metadata.tsv 

# Install libraries and set paths

In [1265]:
#%pip install pandas
#%pip install matplotlib
#%pip install numpy
#%pip install seaborn
#%pip install biomart

In [1266]:
# Imports
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns

In [1267]:
# Paths: TODO update if necessary
#local_path = "/cellar/users/aklie/data/datasets/tf_perturb_seq/ref/"
local_path = "C:/Users/seg95/Documents/tf_perturb_seq/"
#local_path = "/hpc/group/gersbachlab/seg95/tf_perturb_seq/ref/"
#local_path = "D:/tf_perturb_seq/"

# Import merged guide reference file, along with guide index file

In [1268]:
# Merged guide ref file
merged_guide_file = pd.read_csv(local_path + "outer_merged_file.csv")
print(merged_guide_file.head())

merged_guide_file_poolabcd = pd.read_csv(local_path + "outer_merged_file_poolabcd.csv")
merged_guide_file_poolf = pd.read_csv(local_path + "outer_merged_file_poolf.csv")

                         id_hon           protospacer       type  \
0    FOXN1_-_26833391.23-P1P2-1  GCACAGGACGGCCGAGCTGA  targeting   
1     EN2_-_155251011.23-P1P2-1  GCTCCGTGTGCGCCGCGGGA  targeting   
2  BCLAF1_-_136610510.23-P1P2-2  GCTCCGTTGCAACCACACAG  targeting   
3      KLF6_-_3827130.23-P1P2-2  GCTGGAGGATCGATCGGCGG  targeting   
4     ELF1_+_41593362.23-P1P2-2  GTGAGCTGATAAACAGAGGG  targeting   

  intended_target_name_hon    reverse_compliment genomic_element  \
0                    FOXN1  TCAGCTCGGCCGTCCTGTGC        promoter   
1                      EN2  TCCCGCGGCGCACACGGAGC        promoter   
2                   BCLAF1  CTGTGTGGTTGCAACGGAGC        promoter   
3                     KLF6  CCGCCGATCGATCCTCCAGC        promoter   
4                     ELF1  CCCTCTGTTTATCAGCTCAC        promoter   

                    id_gersbach intended_target_name_gersbach id_huangfu  \
0    FOXN1_-_26833391.23-P1P2-1                         FOXN1    FOXN1_1   
1     EN2_-_155251011.23-P1P2-

In [1269]:
# sgRNA index files
sgrna_index_poolabcd = pd.read_csv(local_path + "sgRNA_index_v0.csv", sep = "\t")
sgrna_index_poolf = pd.read_csv(local_path + "igvf_poolF_annotation.csv", sep = "\t")

sgrna_index_dacc_annot = pd.read_csv(local_path + "sgRNA_index_dacc_annot_reference.csv", sep = "\t")
print(len(set(sgrna_index_dacc_annot['protospacer']).intersection(set(merged_guide_file_poolabcd['protospacer']))))

def adjust_index_file(sgrna_index, name_sgrna_seq = 'sgRNA_seq', add_leading_G = True):
    if(name_sgrna_seq == "sgRNA_seq"):
        sgrna_index['strand'] = sgrna_index['target_loc'].str.extract(r'\((\+|\-)\)')
        sgrna_index['oligo'] = sgrna_index['oligo'].str.upper()
    else:
        sgrna_index['oligo_sequence'] = sgrna_index['oligo_sequence'].str.upper()
    sgrna_index[name_sgrna_seq] = sgrna_index[name_sgrna_seq].str.upper()
    # Adjust the index file to add leading Gs if needed
    if(add_leading_G):
        sgrna_index[name_sgrna_seq] = 'G' + sgrna_index[name_sgrna_seq]
    return sgrna_index

sgrna_index_poolabcd = adjust_index_file(sgrna_index_poolabcd)
sgrna_index_poolf = adjust_index_file(sgrna_index_poolf, name_sgrna_seq= 'protospacer', add_leading_G = False)

sgrna_index_dacc_annot['protospacer'] = sgrna_index_dacc_annot['protospacer'].str.upper()
sgrna_index_poolf['protospacer'] = sgrna_index_poolf['protospacer'].str.upper()
#sgrna_index_dacc_annot['protospacer'] = [s[1:] if len(s) > 0 else s for s in sgrna_index_dacc_annot['protospacer']]
#sgrna_index_dacc_annot['reverse_compliment'] = sgrna_index_dacc_annot['reverse_compliment'].str.rstrip('C')

13188


In [1270]:
# Add a reverse compliment if needed
def reverse_compliment(sequence):
    complement = {'A': 'T', 'T': 'A', 'C': 'G', 'G': 'C'}
    return "".join(complement.get(base, base) for base in reversed(sequence.upper()))

sgrna_index_poolabcd['reverse_compliment'] = sgrna_index_poolabcd['sgRNA_seq'].apply(reverse_compliment)
sgrna_index_poolf.rename(columns={"antisense_sequence": "reverse_compliment"})

print("Index:")
print(sgrna_index_poolabcd.head())
print(sgrna_index_poolf.head())
print("Annot:")
print(sgrna_index_dacc_annot.head())

Index:
                   target_loc              element_seq     target source  \
0  chr17:36948966-36948984(+)  chr17:36948966-36949088  AATF_P1P2     TF   
1  chr17:36949026-36949044(+)  chr17:36948966-36949088  AATF_P1P2     TF   
2  chr17:36949013-36949031(+)  chr17:36948966-36949088  AATF_P1P2     TF   
3  chr17:36949070-36949088(-)  chr17:36948966-36949088  AATF_P1P2     TF   
4  chr17:36949031-36949049(+)  chr17:36948966-36949088  AATF_P1P2     TF   

              sgRNA_seq                                              oligo  \
0  GAGTGGCCGGTCCAGAGCTG  GTGGAAAGGACGAAACACCGAGTGGCCGGTCCAGAGCTGGTTTAAG...   
1  GGGATCAAGGCGAGAGGATC  GTGGAAAGGACGAAACACCGGGATCAAGGCGAGAGGATCGTTTAAG...   
2  GGAGTCGGGGAATCGGATCA  GTGGAAAGGACGAAACACCGGAGTCGGGGAATCGGATCAGTTTAAG...   
3  GAAATGTGCGGCCCAACCCC  GTGGAAAGGACGAAACACCGAAATGTGCGGCCCAACCCCGTTTAAG...   
4  GAAGGCGAGAGGATCCGGCA  GTGGAAAGGACGAAACACCGAAGGCGAGAGGATCCGGCAGTTTAAG...   

  gene_target chr_target  chr_start_target  chr_end_target chr_elem

In [1271]:
sgrna_index_dacc_annot["protospacer_upper"] = sgrna_index_dacc_annot["protospacer"].str.upper() 

print(len(set(sgrna_index_poolabcd['sgRNA_seq']).intersection(sgrna_index_dacc_annot['protospacer_upper'])))

13470


In [1272]:
# Remove leading 'G' from the DACC annot file
def harmonize_leading_G(df_left, df_right, left_col, right_col, debug=True):
    """
    Compare sequence lengths between two DataFrames.
    If all (non‑NaN) sequences in one DataFrame start with 'G' and are one base
    longer than the other, remove the leading 'G' to harmonize lengths.
    If both DataFrames contain a 'reverse_compliment' column, remove one trailing
    'C' from that column as well when trimming Gs.

    Returns (left_fixed, right_fixed)
    """
    import numpy as np
    left = df_left.copy()
    right = df_right.copy()

    # Normalize sequence text
    left[left_col] = left[left_col].astype(str).str.strip().str.upper()
    right[right_col] = right[right_col].astype(str).str.strip().str.upper()

    # Compute basic stats
    left_lens = left[left_col].dropna().str.len()
    right_lens = right[right_col].dropna().str.len()
    avg_left, avg_right = left_lens.mean(), right_lens.mean()

    if debug:
        print(f"Average seq length: left={avg_left:.1f}, right={avg_right:.1f}")
        print("Value_counts of left lengths:", left_lens.value_counts().head().to_dict())
        print("Value_counts of right lengths:", right_lens.value_counts().head().to_dict())

    # Decide which side to trim 
    trimmed = None
    if np.nanmedian(left_lens) == np.nanmedian(right_lens) + 1:
        starts_with_G = left[left_col].dropna().str.startswith("G").all()
        if starts_with_G:
            left[left_col] = left[left_col].str.replace(r"^G", "", regex=True)
            trimmed = "left"
            if debug:
                print(f"Removed leading 'G' from all non‑NaN sequences in '{left_col}'.")
        else:
            if debug:
                mism = left.loc[~left[left_col].dropna().str.startswith("G"), left_col].head(10).tolist()
                print(f"Not all left sequences start with 'G'. Examples: {mism}")

    elif np.nanmedian(right_lens) == np.nanmedian(left_lens) + 1:
        starts_with_G = right[right_col].dropna().str.startswith("G").all()
        if starts_with_G:
            right[right_col] = right[right_col].str.replace(r"^G", "", regex=True)
            trimmed = "right"
            if debug:
                print(f"Removed leading 'G' from all non‑NaN sequences in '{right_col}'.")
        else:
            if debug:
                mism = right.loc[~right[right_col].dropna().str.startswith("G"), right_col].head(10).tolist()
                print(f"Not all right sequences start with 'G'. Examples: {mism}")
    else:
        if debug:
            print("No consistent 19 / 20 bp offset found; no trimming performed.")

    # --- If both have reverse_compliment, trim trailing C accordingly ---
    if "reverse_compliment" in left.columns and "reverse_compliment" in right.columns:
        if trimmed == "left":
            left["reverse_compliment"] = left["reverse_compliment"].astype(str).str.replace(r"C$", "", regex=True)
            if debug:
                print("Trimmed trailing 'C' from left.reverse_compliment.")
        elif trimmed == "right":
            right["reverse_compliment"] = right["reverse_compliment"].astype(str).str.replace(r"C$", "", regex=True)
            if debug:
                print("Trimmed trailing 'C' from right.reverse_compliment.")

    return left, right

In [1273]:
sgrna_index_dacc_annot, sgrna_index_poolabcd = harmonize_leading_G(
    df_left=sgrna_index_dacc_annot,
    df_right=sgrna_index_poolabcd,
    left_col="protospacer_upper",
    right_col="sgRNA_seq",
    debug=True
)

Average seq length: left=20.0, right=20.0
Value_counts of left lengths: {20: 14358}
Value_counts of right lengths: {20: 13563, 21: 18}
No consistent 19 / 20 bp offset found; no trimming performed.


In [1274]:
# Investigate the 18 with 21 bp
extra_long = sgrna_index_poolabcd[sgrna_index_poolabcd["sgRNA_seq"].str.len() == 21]
print(len(extra_long))
print(extra_long["sgRNA_seq"].head())

18
13563    GGGAGAGCCAGCGCGCAACGG
13564    GGCCGGACTCGGACGCGTGGT
13565    GGCCGCTCGGCCGAGCTGTCG
13566    GGCTGCGACTCGGCGGAGTCC
13567    GGAGAGGCCCAGCGGGAGTCG
Name: sgRNA_seq, dtype: object


In [1275]:
# Trim the leading 'G' from those 18
mask = sgrna_index_poolabcd["sgRNA_seq"].str.len() > 20
sgrna_index_poolabcd.loc[mask, "sgRNA_seq"] = (
    sgrna_index_poolabcd.loc[mask, "sgRNA_seq"].str[1:]
)
print(sgrna_index_poolabcd["sgRNA_seq"].str.len().value_counts().head())

if "reverse_compliment" in sgrna_index_poolabcd.columns:
    sgrna_index_poolabcd.loc[mask, "reverse_compliment"] = (
        sgrna_index_poolabcd.loc[mask, "reverse_compliment"].astype(str).str[:-1]
    )
    print("Also removed trailing base from reverse_compliment for those rows.")

sgRNA_seq
20    13581
Name: count, dtype: int64
Also removed trailing base from reverse_compliment for those rows.


In [1276]:
# Merge pool A-D index and DACC files into one; pool F file has sufficient info for matching
sgrna_index_merged = pd.merge(
    sgrna_index_dacc_annot,
    sgrna_index_poolabcd,
    left_on=['protospacer_upper', 'reverse_compliment'],
    right_on=['sgRNA_seq', 'reverse_compliment'],
    how="outer"
)
print(sgrna_index_merged.head())
print(sgrna_index_merged.shape)

               protospacer_ID           protospacer intended_target_name  \
0                     OR5K2-2  GAAAAAATTGTAGAGGAATA                OR5K2   
1    SP1_+_53773993.23-P1P2-1  GAAAAACGCGGACGCTGACG                  SP1   
2    SP8_-_20826141.23-P1P2-2  GAAAAAGATCCTCTGAGAGG                  SP8   
3    FOXN3_-_89883583.23-P2-1  GAAAAAGGCGACACATGACC                FOXN3   
4  ZNF85_+_21106076.23-P1P2-1  GAAAACAAGACCTAGAGCTC                ZNF85   

        type genomic_element    reverse_compliment     protospacer_upper  \
0  targeting        promoter  TATTCCTCTACAATTTTTTC  GAAAAAATTGTAGAGGAATA   
1  targeting        promoter  CGTCAGCGTCCGCGTTTTTC  GAAAAACGCGGACGCTGACG   
2  targeting        promoter  CCTCTCAGAGGATCTTTTTC  GAAAAAGATCCTCTGAGAGG   
3  targeting        promoter  GGTCATGTGTCGCCTTTTTC  GAAAAAGGCGACACATGACC   
4  targeting        promoter  GAGCTCTAGGTCTTGTTTTC  GAAAACAAGACCTAGAGCTC   

                   target_loc              element_seq      target  ...  \
0          

# Check out positive/ negative controls

In [1277]:
neg_controls = pd.read_csv(local_path + "negative_controls.tsv", sep = "\t")
pos_controls = pd.read_csv(local_path + "positive_controls.tsv", sep = "\t")
non_targeting = pd.read_csv(local_path + "non_targeting.tsv", sep = "\t")

print(non_targeting.head())
print(pos_controls.head())
print(neg_controls.head())

            Unnamed: 0 Photospacer (same for all 3 sets)
0  non-targeting_00642              GGAGTTAAGGCCTCGTCTAG
1  non-targeting_00718              GTCCCAGGCTCTCCACTATG
2  non-targeting_03631              GGACGCGTCTGCAAGAACGT
3  non-targeting_03705              GGGCATGGACCCGCGGCACG
4  non-targeting_01469              GCGTCCGAGGTACTGAATAA
           Gene Photospacer (represent 10 times)  \
0   CD81 strong             GGAGAGCGAGCGCGCAACGG   
1     CD81 weak             GGAGAGCCAGCGCGCAACGG   
2  CD151 strong             GCCGGACTCGGACGCGTGGT   
3    CD151 weak             GCCGCTCGGCCGAGCTGTCG   
4   CD55 strong             GCTGCGACTCGGCGGAGTCC   

                                           Reference  
0  Horlbeck et al. 2016 "Compact and highly activ...  
1  Jost et al. 2020 "Titrating gene expression us...  
2  Horlbeck et al. 2016 "Compact and highly activ...  
3  Horlbeck et al. 2016 "Compact and highly activ...  
4  Horlbeck et al. 2016 "Compact and highly activ...  
     Gene      

In [1278]:
print(len(set(sgrna_index_merged['protospacer_upper']).intersection(set(non_targeting['Photospacer (same for all 3 sets)']))))
print(len(set(sgrna_index_merged['protospacer_upper']).intersection(set(pos_controls['Photospacer (represent 10 times)']))))  
cols = [c for c in neg_controls.columns if c.startswith('Photospacer')]
neg_spacers = pd.concat([neg_controls[c] for c in cols]).dropna().astype(str)

len(set(sgrna_index_merged['protospacer_upper']).intersection(set(neg_spacers)))

600
19


577

In [1279]:
# Import additional file to add missing coordinates
poolD_coords = pd.read_csv(local_path + "pool_D_controls.csv", sep = "\t")
print(poolD_coords.head())

                     guide_id   intended_target_region gene_target source  \
0  chr16:70289419-70289437(-)  chr16:70289409-70289495   AARS_P1P2     PC   
1  chr16:70289477-70289495(-)  chr16:70289409-70289495   AARS_P1P2     PC   
2  chr16:70289454-70289472(-)  chr16:70289409-70289495   AARS_P1P2     PC   
3  chr16:70289423-70289441(-)  chr16:70289409-70289495   AARS_P1P2     PC   
4  chr16:70289463-70289481(-)  chr16:70289409-70289495   AARS_P1P2     PC   

                spacer                                     oligo_sequence  \
0  CGGCGACCCTAGGAGAGGT  GTGGAAAGGACGAAACACCGcggcgaccctaggagaggtGTTTAAG...   
1  TCTGCGGGAATAGGTGCAG  GTGGAAAGGACGAAACACCGtctgcgggaataggtgcagGTTTAAG...   
2  CCCTTGGCGGGGGACTCTG  GTGGAAAGGACGAAACACCGcccttggcgggggactctgGTTTAAG...   
3  GGGACGGCGACCCTAGGAG  GTGGAAAGGACGAAACACCGgggacggcgaccctaggagGTTTAAG...   
4  TGCAGCGGGCCCTTGGCGG  GTGGAAAGGACGAAACACCGtgcagcgggcccttggcggGTTTAAG...   

    reverse_compliment intended_target  intended_target_start  \
0  ACCTCT

In [1280]:
sgrna_index_merged, poolD_coords = harmonize_leading_G(
    df_left=sgrna_index_merged,
    df_right=poolD_coords,
    left_col="protospacer_upper",
    right_col="spacer",
    debug=True
)
print(sgrna_index_merged["protospacer_upper"].str.len().value_counts().head())
print(poolD_coords["spacer"].str.len().value_counts().head())

Average seq length: left=20.0, right=19.0
Value_counts of left lengths: {20: 14451}
Value_counts of right lengths: {19: 570}
Removed leading 'G' from all non‑NaN sequences in 'protospacer_upper'.
Trimmed trailing 'C' from left.reverse_compliment.
protospacer_upper
19    14451
Name: count, dtype: int64
spacer
19    570
Name: count, dtype: int64


In [1281]:
def debug_poolD_matches(sgrna_df, poolD_df):
    sgrna_df = sgrna_df.copy()
    poolD_df = poolD_df.copy()

    # Normalize spacers
    sgrna_df["spacer_norm"] = sgrna_df["protospacer_upper"].astype(str).str.strip().str.upper()
    poolD_df["spacer_norm"] = poolD_df["spacer"].astype(str).str.strip().str.upper()

    # Define "broken" or placeholder coordinates
    placeholder_vals = ["chrPC", "chrPC:0-0", "0", "chrNA", "NA", "nan", ""]

    broken_mask = (
        sgrna_df["chr_target"].astype(str).isin(placeholder_vals)
        | sgrna_df["chr_start_target"].astype(str).isin(placeholder_vals)
        | sgrna_df["chr_end_target"].astype(str).isin(placeholder_vals)
    )
    broken_rows = sgrna_df[broken_mask]

    print(f"Total rows with placeholder coordinates: {broken_rows.shape[0]}")

    # How many of these have a matching Pool D spacer?
    matched = broken_rows["spacer_norm"].isin(poolD_df["spacer_norm"])
    print(f"Of those, {matched.sum()} have a matching spacer in Pool D")

    # Inspect a few examples of matched vs unmatched
    print("\nExample matched spacers:")
    print(broken_rows.loc[matched, ["protospacer_upper", "chr_target", "protospacer_ID", "target_loc"]].head())
    print(broken_rows.loc[matched, ["protospacer_ID"]])

    print("\nExample unmatched spacers:")
    print(broken_rows.loc[~matched, ["protospacer_upper", "chr_target", "protospacer_ID", "target_loc"]].head())

    return broken_rows.loc[matched]

broken_with_matches = debug_poolD_matches(sgrna_index_merged, poolD_coords)

Total rows with placeholder coordinates: 888
Of those, 13 have a matching spacer in Pool D

Example matched spacers:
        protospacer_upper chr_target protospacer_ID target_loc
1360  AGAGGCCCAGCGGGAGTCG      chrPC           CD29  chrPC:0-0
4412  CCGCTCGGCCGAGCTGTCG      chrPC     CD151_weak  chrPC:0-0
4437  CCGGACTCGGACGCGTGGT      chrPC   CD151_strong  chrPC:0-0
5657  CGGCGACCCTAGGAGAGGT      chrPC         AARS_B  chrPC:0-0
6268  CTCAGAGCGTCGGGATATC      chrPC           TFRC  chrPC:0-0
      protospacer_ID
1360            CD29
4412      CD151_weak
4437    CD151_strong
5657          AARS_B
6268            TFRC
6639            CD55
7566     CD81_strong
9026             B2M
9907      DNAJC19_ B
9920       POLR1D_ B
10289      DNAJC19_C
13185      AARS_main
13970    POLR1D_main

Example unmatched spacers:
      protospacer_upper chr_target             protospacer_ID target_loc
0   AAAAAATTGTAGAGGAATA        NaN                    OR5K2-2        NaN
15  AAAAGCGGCGCAGTATTTG        NaN   

In [1282]:
def fill_from_poolD(sgrna_df, poolD_df):
    sgrna_df = sgrna_df.copy()
    poolD_df = poolD_df.copy()

    # Normalize spacers
    sgrna_df["spacer_norm"] = sgrna_df["protospacer_upper"].astype(str).str.strip().str.upper()
    poolD_df["spacer_norm"] = poolD_df["spacer"].astype(str).str.strip().str.upper()

    # Columns to fill and their mapping from Pool D
    column_mapping = {
        "guide_chr"             : "chr_target",
        "guide_start"           : "chr_start_target",
        "guide_end"             : "chr_end_target",
        "strand_pd"             : "strand",
        "intended_target_chr"   : "chr_element",
        "intended_target_start" : "chr_start_element",
        "intended_target_end"   : "chr_end_element",
    }

    # Keep only relevant columns from Pool D
    pool_lookup = poolD_df[["spacer_norm"] + list(column_mapping.keys())]

    # Merge Pool D into sgrna_df
    merged = pd.merge(sgrna_df, pool_lookup, on="spacer_norm", how="left", suffixes=('', '_poolD'))

    # Identify rows with placeholders / missing data
    invalid_vals = ["chrPC", "chrPC:0-0", "0", "chrNA", "NA", "nan", ""]
    broken_mask = (
        merged["chr_target"].isna() |
        merged["chr_target"].astype(str).isin(invalid_vals) |
        merged["chr_start_target"].isna() |
        merged["chr_start_target"].astype(str).isin(invalid_vals) |
        merged["chr_end_target"].isna() |
        merged["chr_end_target"].astype(str).isin(invalid_vals) 
    )

    total_broken = broken_mask.sum()
    print(f"Total broken rows: {total_broken}")

    # Rows that have a matching Pool D spacer
    poolD_matches = broken_mask & merged["guide_chr"].notna()
    print(f"Broken rows with Pool D match: {poolD_matches.sum()}")

    # Fill all columns at once using mapping
    for pool_col, sgrna_col in column_mapping.items():
        if pool_col in merged.columns and sgrna_col in merged.columns:
            merged.loc[poolD_matches, sgrna_col] = merged.loc[poolD_matches, pool_col]

    merged["target_loc"] = merged["chr_target"].combine_first(pd.Series(invalid_vals)) + ":" + \
                       merged["chr_start_target"].astype(str) + "-" + merged["chr_end_target"].astype(str)

    merged["element_seq"] = merged["chr_element"].combine_first(pd.Series(invalid_vals)) + ":" + \
                        merged["chr_start_element"].astype(str) + "-" + merged["chr_end_element"].astype(str)

    # Drop temporary columns
    merged.drop(columns=["spacer_norm"] + list(column_mapping.keys()), inplace=True, errors="ignore")

    return merged

invalid_vals = ["chrPC", "chrPC:0-0", "0", "chrNA", "NA", "nan", ""]
missing_before = sgrna_index_merged[
    sgrna_index_merged["chr_target"].isna() |
    sgrna_index_merged["chr_target"].astype(str).isin(invalid_vals) |
    sgrna_index_merged["chr_start_target"].isna() |
    sgrna_index_merged["chr_start_target"].astype(str).isin(invalid_vals) |
    sgrna_index_merged["chr_end_target"].isna() |
    sgrna_index_merged["chr_end_target"].astype(str).isin(invalid_vals)
]
print("Rows missing before:", missing_before.shape[0])
broken_indices = set(missing_before.index)

poolD_coords = poolD_coords.rename(columns={"strand": "strand_pd"})
sgrna_index_merged = fill_from_poolD(sgrna_index_merged, poolD_coords)

missing_after = sgrna_index_merged[
    sgrna_index_merged["chr_target"].isna() |
    sgrna_index_merged["chr_target"].astype(str).isin(invalid_vals) |
    sgrna_index_merged["chr_start_target"].isna() |
    sgrna_index_merged["chr_start_target"].astype(str).isin(invalid_vals) |
    sgrna_index_merged["chr_end_target"].isna() |
    sgrna_index_merged["chr_end_target"].astype(str).isin(invalid_vals)
]

print("Rows missing after:", missing_after.shape[0])
fixed_indices = list(broken_indices - set(missing_after.index))
print(f"Rows newly filled ({len(fixed_indices)}):")
print(sgrna_index_merged.loc[fixed_indices])

sgrna_index_merged.to_csv(local_path + "sgRNA_index_v0_dacc_merged.csv", index=False)

Rows missing before: 888
Total broken rows: 888
Broken rows with Pool D match: 13
Rows missing after: 875
Rows newly filled (13):
      protospacer_ID           protospacer intended_target_name       type  \
9920       POLR1D_ B  GGGAAGCAAGGACCGACCGA               POLR1D  targeting   
13185      AARS_main  GTCTGCGGGAATAGGTGCAG                 AARS  targeting   
9026             B2M  GGCGAGCACAGCTAAGGCCA                  B2M  targeting   
7566     CD81_strong  GGAGAGCGAGCGCGCAACGG                 CD81  targeting   
6639            CD55  GCTGCGACTCGGCGGAGTCC                 CD55  targeting   
1360            CD29  GAGAGGCCCAGCGGGAGTCG                 CD29  targeting   
10289      DNAJC19_C  GGGATGAGCCGTGCTCCCGG              DNAJC19  targeting   
13970    POLR1D_main  GTGTCCCATAGCGCGAGGCG               POLR1D  targeting   
9907      DNAJC19_ B  GGGAACTCCTGTAAGGTCAG              DNAJC19  targeting   
4437    CD151_strong  GCCGGACTCGGACGCGTGGT                CD151  targeting   
4412      CD

# Reformat to resemble input to the CRISPR pipeline

In [1283]:
# Import example file for the CRISPR pipeline
example_crispr_file = pd.read_csv(local_path + "crispr_annot_sample.tsv", sep = "\t")

In [1284]:
example_crispr_file.head()

Unnamed: 0,guide_id,spacer,targeting,type,guide_chr,guide_start,guide_end,strand,pam,intended_target_name,intended_target_chr,intended_target_start,intended_target_end
0,AFF4_sg1,CCAGCGGACGGGGCGGGGAC,True,targeting,chr5,132299282.0,132299302.0,-,NGG,AFF4,chr5,132875395.0,132963634.0
1,AFF4_sg2,CCGCCAGCGGACGGGGCGGC,True,targeting,chr5,132299282.0,132299302.0,-,NGG,AFF4,chr5,132875395.0,132963634.0
2,AFF4_sg3,CGTCCGCTGGCGGCGGCGAC,True,targeting,chr5,132299252.0,132299272.0,-,NGG,AFF4,chr5,132875395.0,132963634.0
3,AFF4_sg4,CTGCGTCAGTCACAGCCCTC,True,targeting,chr5,132299279.0,132299299.0,-,NGG,AFF4,chr5,132875395.0,132963634.0
4,AFF4_sg5,GCGGACGGGGCGGGGATCCC,True,targeting,chr5,132299279.0,132299299.0,-,NGG,AFF4,chr5,132875395.0,132963634.0


In [1285]:
# Keep only necessary columns and reorder them to match 
def prune_and_rename_cols(merged_guide_file, is_pool_f=False):
    # start from full table so you don't drop rows prematurely
    df = merged_guide_file.copy()

    if is_pool_f:
        df["guide_id"] = (
            df.get("id_gersbach").combine_first(df.get("id_engreitz"))
        )
        df["intended_target_name"] = (
            df.get("intended_target_name_gersbach")
            .combine_first(df.get("intended_target_name_engreitz"))
        )
    else:
        df["guide_id"] = (
            df.get("id_hon")
            .combine_first(df.get("id_gersbach"))
            .combine_first(df.get("id_engreitz"))
            .combine_first(df.get("id_huangfu"))
        )
        df["intended_target_name"] = (
            df.get("intended_target_name_hon")
            .combine_first(df.get("intended_target_name_gersbach"))
            .combine_first(df.get("intended_target_name_engreitz"))
            .combine_first(df.get("intended_target_name_huangfu"))
        )

    # Fallbacks for control / non‑targeting rows.
    #df["guide_id"] = df["guide_id"].fillna(df.get("id", df.get("protospacer")))
    
    # if 'intended_target_name' is NA, set explicitly to np.nan; otherwise fill from 'type' if available
    df["intended_target_name"] = np.where(
        df["intended_target_name"].isna(),
        df.get("type", np.nan),
        df["intended_target_name"]
    )

    # Rename after all adjustments
    if "protospacer" in df.columns:
        df = df.rename(columns={"protospacer": "spacer"})

    keep_cols = [c for c in ["guide_id", "spacer", "type", "intended_target_name", "reverse_compliment"] if c in df.columns]
    ref_clean_sub = df[keep_cols].copy()

    print(f"Retained rows: {ref_clean_sub.shape[0]}")
    return ref_clean_sub

# Call function
ref_clean_sub = prune_and_rename_cols(merged_guide_file)
ref_clean_sub_poolabcd = prune_and_rename_cols(merged_guide_file_poolabcd)
ref_clean_sub_poolf = prune_and_rename_cols(merged_guide_file_poolf, is_pool_f=True)

Retained rows: 19956
Retained rows: 17364
Retained rows: 2592


In [1286]:
# Remove leading 'G' if all spacers have one
def strip_leading_G(df, column="spacer"):
    df = df.copy()

    # Normalize to uppercase strings for checking
    seqs = df[column].astype(str).str.strip().str.upper()
    non_empty = seqs[seqs != ""]

    # Split by length
    lens = non_empty.str.len()
    long_seqs = non_empty[lens >= 20]
    short_seqs = non_empty[lens < 20]

    print(f"Found {len(long_seqs)} sequences with ≥20 nt and {len(short_seqs)} sequences with <20.")

    if long_seqs.empty:
        print("No sequences ≥20 nt — nothing to test.")
        return df
    # For 20 bp sequences, check if they all start with G
    starts_with_G = long_seqs.str.startswith("G").all()

    if starts_with_G:
        df[column] = seqs.str.replace(r"^G", "", regex=True)
        print(f"All ≥20‑nt sequences start with 'G' - removed leading 'G' from every {column}.")
    else:
        print("Not all ≥20‑nt sequences start with 'G' - leaving data unchanged.")
        non_g = long_seqs[~long_seqs.str.startswith("G")]
        if not non_g.empty:
            show_n = min(10, len(non_g))
            print(f"{len(non_g)} long sequences lack leading 'G'; examples:")
            for s in non_g.head(show_n):
                print(" ", s)

    return df

ref_clean_sub = strip_leading_G(ref_clean_sub)
ref_clean_sub_poolabcd = strip_leading_G(ref_clean_sub_poolabcd)
ref_clean_sub_poolf = strip_leading_G(ref_clean_sub_poolf)

Found 17250 sequences with ≥20 nt and 2706 sequences with <20.
All ≥20‑nt sequences start with 'G' - removed leading 'G' from every spacer.
Found 17250 sequences with ≥20 nt and 114 sequences with <20.
All ≥20‑nt sequences start with 'G' - removed leading 'G' from every spacer.
Found 0 sequences with ≥20 nt and 2592 sequences with <20.
No sequences ≥20 nt — nothing to test.


In [1287]:
# Change 'type' column to targeting/ non-targeting (optional), and then add a 'label' column containing information about positive/ negative controls
def simplify_type_column(df):
    df = df.copy()
    # Copy original type values into 'label'
    # Simplify 'type' values, if needed
    #df["type"] = np.where(df["type"] == "non_targeting", "non_targeting", "targeting")

    df["label"] = df["type"].replace({
        "targeting": "tf_targeting"
    })

        # Map to human-readable versions for 'type'
    type_map = {
        "non_targeting": "non-targeting",
        "negative_control": "negative control",
        "positive_control": "positive control",
        "tf_targeting": "targeting",
    }
    
    df["type"] = df["type"].map(type_map).fillna(df["type"])

    return df
    
ref_clean_sub = simplify_type_column(ref_clean_sub)
ref_clean_sub_poolabcd = simplify_type_column(ref_clean_sub_poolabcd)
ref_clean_sub_poolf = simplify_type_column(ref_clean_sub_poolf)

In [1288]:
# Add 'targeting' column; if type == targeting, set to True, otherwise False
def check_targeting(value):
    return value.lower() in {"tf_targeting", "negative_control", "positive_control"}

def add_targeting_col(ref_clean_sub):
    ref_clean_sub['targeting'] = ref_clean_sub['label'].apply(check_targeting)
    order = ['guide_id', 'spacer', 'targeting', 'type', 'intended_target_name', 'label']
    ref_clean_sub = ref_clean_sub[order]
    print(ref_clean_sub.head())
    return ref_clean_sub

ref_clean_sub = add_targeting_col(ref_clean_sub)
ref_clean_sub_poolabcd = add_targeting_col(ref_clean_sub_poolabcd)
ref_clean_sub_poolf = add_targeting_col(ref_clean_sub_poolf)

                       guide_id               spacer  targeting       type  \
0    FOXN1_-_26833391.23-P1P2-1  CACAGGACGGCCGAGCTGA       True  targeting   
1     EN2_-_155251011.23-P1P2-1  CTCCGTGTGCGCCGCGGGA       True  targeting   
2  BCLAF1_-_136610510.23-P1P2-2  CTCCGTTGCAACCACACAG       True  targeting   
3      KLF6_-_3827130.23-P1P2-2  CTGGAGGATCGATCGGCGG       True  targeting   
4     ELF1_+_41593362.23-P1P2-2  TGAGCTGATAAACAGAGGG       True  targeting   

  intended_target_name         label  
0                FOXN1  tf_targeting  
1                  EN2  tf_targeting  
2               BCLAF1  tf_targeting  
3                 KLF6  tf_targeting  
4                 ELF1  tf_targeting  
                     guide_id               spacer  targeting  \
0  TFEC_-_115670779.23-P1P2-1  CATATGCACCATGCCAGAA       True   
1  NR2C1_-_95467292.23-P1P2-2  GATGTGGGATCGAGATTCA       True   
2   NANOG_+_7942459.23-P1P2-2  TTTTTCCATTATAACTTGG       True   
3                     OR8B3-5  TTTTTG

In [1289]:
# Remove "-" and appendix from intended target name
def clean_gene_names(df, col='intended_target_name'):
    df = df.copy()

    # Regex pattern: dash followed by a single digit at the end
    # Exclude NKX family genes
    pattern = r'^(?!NKX)[A-Za-z0-9]+-(\d)$'
    
    # Boolean mask for rows that match the pattern
    mask = df[col].str.contains(pattern, regex=True, na=False)
    
    # Print the original values that will be modified
    print("Modified gene names:")
    print(df.loc[mask, col].tolist())
    
    # Remove the dash and the single digit
    df.loc[mask, col] = df.loc[mask, col].str.replace(
        r'-(\d)$', '', regex=True
    )
    
    return df

ref_clean_sub = clean_gene_names(ref_clean_sub)
ref_clean_sub_poolabcd = clean_gene_names(ref_clean_sub_poolabcd)
ref_clean_sub_poolf = clean_gene_names(ref_clean_sub_poolf)

Modified gene names:
['OR2C3-6', 'OR1N2-5', 'OR9Q1-1', 'OR2C3-2', 'OR2H1-1', 'OR11H6-5', 'OR9Q1-2', 'OR2W1-1', 'OR2C3-3', 'OR56A4-2', 'OR2W1-2', 'OR9Q1-6', 'OR2C3-4', 'OR2C3-1', 'OR56A4-4', 'OR11H6-1', 'OR2W1-5', 'OR56A4-3', 'OR9Q1-3', 'OR9Q1-5', 'OR2C3-5']
Modified gene names:
['OR2C3-6', 'OR56A4-4', 'OR11H6-1', 'OR2W1-5', 'OR9Q1-5', 'OR1N2-5', 'OR9Q1-1', 'OR2C3-2', 'OR2H1-1', 'OR2C3-3', 'OR56A4-3', 'OR2W1-2', 'OR9Q1-6', 'OR9Q1-3', 'OR2C3-5', 'OR11H6-5', 'OR9Q1-2', 'OR2W1-1', 'OR2C3-4', 'OR56A4-2', 'OR2C3-1']
Modified gene names:
[]


  mask = df[col].str.contains(pattern, regex=True, na=False)
  mask = df[col].str.contains(pattern, regex=True, na=False)
  mask = df[col].str.contains(pattern, regex=True, na=False)


In [1290]:
# Add PAM
def add_pam(ref_clean_sub):
    ref_clean_sub['pam'] = np.nan
     # Assign 'NGG' only to targeting rows 
    ref_clean_sub.loc[ref_clean_sub["targeting"] == True, "pam"] = "NGG"
    print(ref_clean_sub.head())
    print(ref_clean_sub.shape)
    return ref_clean_sub

ref_clean_sub = add_pam(ref_clean_sub)
ref_clean_sub_poolabcd = add_pam(ref_clean_sub_poolabcd)
ref_clean_sub_poolf = add_pam(ref_clean_sub_poolf)

                       guide_id               spacer  targeting       type  \
0    FOXN1_-_26833391.23-P1P2-1  CACAGGACGGCCGAGCTGA       True  targeting   
1     EN2_-_155251011.23-P1P2-1  CTCCGTGTGCGCCGCGGGA       True  targeting   
2  BCLAF1_-_136610510.23-P1P2-2  CTCCGTTGCAACCACACAG       True  targeting   
3      KLF6_-_3827130.23-P1P2-2  CTGGAGGATCGATCGGCGG       True  targeting   
4     ELF1_+_41593362.23-P1P2-2  TGAGCTGATAAACAGAGGG       True  targeting   

  intended_target_name         label  pam  
0                FOXN1  tf_targeting  NGG  
1                  EN2  tf_targeting  NGG  
2               BCLAF1  tf_targeting  NGG  
3                 KLF6  tf_targeting  NGG  
4                 ELF1  tf_targeting  NGG  
(19956, 7)
                     guide_id               spacer  targeting  \
0  TFEC_-_115670779.23-P1P2-1  CATATGCACCATGCCAGAA       True   
1  NR2C1_-_95467292.23-P1P2-2  GATGTGGGATCGAGATTCA       True   
2   NANOG_+_7942459.23-P1P2-2  TTTTTCCATTATAACTTGG       True

  ref_clean_sub.loc[ref_clean_sub["targeting"] == True, "pam"] = "NGG"
  ref_clean_sub.loc[ref_clean_sub["targeting"] == True, "pam"] = "NGG"
  ref_clean_sub.loc[ref_clean_sub["targeting"] == True, "pam"] = "NGG"


In [1291]:
# Add genomic element column (promoters for everything by non-targeting)
def add_genomic_element(ref_clean_sub):
    ref_clean_sub['genomic_element'] = pd.Series(
        ['promoter' if x != 'non_targeting' else pd.NA for x in ref_clean_sub['label']],
        dtype="object"
    )
    return ref_clean_sub

ref_clean_sub = add_genomic_element(ref_clean_sub)
ref_clean_sub_poolabcd = add_genomic_element(ref_clean_sub_poolabcd)
ref_clean_sub_poolf = add_genomic_element(ref_clean_sub_poolf)

In [1292]:
# Check for repeated spacer sequences in index file
print(sgrna_index_merged['protospacer_upper'].value_counts().loc[lambda x: x > 1])

protospacer_upper
CAGCCACGCGAGAGTAGAA    3
ATTGTCACGGCACATTCCA    2
CCTGCGGGCGGGACAGAGG    2
CAATGGTTAGGCTCTTACA    2
AACACCGGATGTGGGGGAG    2
                      ..
TGGGGAGGAAGCGGTTCTA    2
TGGTGCGACCACCACACCC    2
TGTCGGAGGACGAGGACCG    2
TTCCCGGTTCGCTCGGCCG    2
TTCCGCTCCAGGGAAGAGG    2
Name: count, Length: 92, dtype: int64


In [1293]:
# Remove multiple mappings from sgrna_index_merged
def deduplicate_index_file(df):
    def chrom_rank(chrom):
        if pd.isna(chrom):
            return 100
        if isinstance(chrom, str) and chrom.startswith("chr"):
            c = chrom[3:]
            if c.isdigit():
                return int(c)
            elif c == "X":
                return 23
            elif c == "Y":
                return 24
        return 100  # fallback
    
    df = df.copy()
    
    # Rank and sorting
    df["chrom_rank"] = df["chr_target"].map(chrom_rank)
    df["sort_key"] = (
        df["chrom_rank"].fillna(100) * 1e12 +
        df["chr_start_target"].fillna(0) * 1e6 +
        (df["chr_end_target"].fillna(0) - df["chr_start_target"].fillna(0))
    )
    
    # Group by spacer sequence
    grouped = df.groupby("protospacer_upper", group_keys=False)
    
    # Keep only groups where all key columns are the same across rows
    key_cols = [
        "chr_target", "chr_start_target", "chr_end_target",
        "chr_element", "chr_start_element", "chr_end_element"
    ]
    
    def is_consistent(group):
        return all(group[col].nunique(dropna=False) == 1 for col in key_cols)
    
    consistent_df = grouped.filter(is_consistent)
    
    # Deduplicate remaining consistent rows by keeping best ranked
    deduped_df = (
        consistent_df.sort_values("sort_key")
                     .drop_duplicates(subset="protospacer_upper", keep="first")
                     .drop(columns=["chrom_rank", "sort_key"])
    )
    
    return deduped_df

# Apply deduplication before merging
sgrna_index_merged = deduplicate_index_file(sgrna_index_merged)

In [1294]:
# Add the 'guide_chr', 'guide_start', and 'guide_end' values, which are given as 'chr_target', 'chr_start_target', 'chr_end_target', and 'strand'
def add_guide_coords(ref_clean_sub, sgrna_index_merged):
    ref_clean_sub = pd.merge(
        ref_clean_sub,
        sgrna_index_merged[['protospacer_upper', 'chr_target', 'chr_start_target', 'chr_end_target', 'strand']],
        left_on='spacer',
        right_on='protospacer_upper',
        how='left'
    )
    # Remove protospacer_upper column
    ref_clean_sub = ref_clean_sub.drop(columns=['protospacer_upper'])
    # Rename intended guide names
    ref_clean_sub.rename(columns={'chr_target': 'guide_chr', 
                                  'chr_start_target': 'guide_start',
                                  'chr_end_target': 'guide_end'},
                                  inplace=True)


    #print(ref_clean_sub.head())
    return ref_clean_sub


ref_clean_sub_poolabcd = add_guide_coords(ref_clean_sub_poolabcd, sgrna_index_merged)
print(ref_clean_sub_poolabcd.head())
#print(sgrna_index_merged[sgrna_index_merged['protospacer_upper'] == 'CGGCGACCCTAGGAGAGGT'])
#print(ref_clean_sub_poolabcd[ref_clean_sub_poolabcd['spacer'] == 'CGGCGACCCTAGGAGAGGT'])

# Columns are already correctly labeled for pool F
ref_clean_sub_poolf = pd.merge(
    ref_clean_sub_poolf,
    sgrna_index_poolf[['protospacer', 'guide_chr', 'guide_start', 'guide_end', 'strand']],
    left_on='spacer',
    right_on='protospacer',
    how='left'
)
print(ref_clean_sub_poolf.head())

                     guide_id               spacer  targeting  \
0  TFEC_-_115670779.23-P1P2-1  CATATGCACCATGCCAGAA       True   
1  NR2C1_-_95467292.23-P1P2-2  GATGTGGGATCGAGATTCA       True   
2   NANOG_+_7942459.23-P1P2-2  TTTTTCCATTATAACTTGG       True   
3                     OR8B3-5  TTTTTGTCTTCAAAAATCT       True   
4  ZNF48_+_30406782.23-P1P2-1  CTCCGCGCCAAGCCGGGAG       True   

               type intended_target_name             label  pam  \
0         targeting                 TFEC      tf_targeting  NGG   
1         targeting                NR2C1      tf_targeting  NGG   
2         targeting                NANOG      tf_targeting  NGG   
3  negative control                OR8B3  negative_control  NGG   
4         targeting                ZNF48      tf_targeting  NGG   

  genomic_element guide_chr  guide_start    guide_end strand  
0        promoter      chr7  116030705.0  116030723.0      +  
1        promoter     chr12   95073493.0   95073511.0      +  
2        promoter

In [1295]:
# Add the intended_target_chr/intended_target_start/intended_target_end values, which are given as 'chr_element', 'chr_start_element', 'chr_end_element'
# Note that this refers to the element being targeted, not the gene itself
def add_element_coords(ref_clean_sub, sgrna_index_merged):
    ref_clean_sub = pd.merge(
        ref_clean_sub,
        sgrna_index_merged[['protospacer_upper', 'chr_element', 'chr_start_element', 'chr_end_element']],
        left_on='spacer',
        right_on='protospacer_upper',
        how='left'
    )
    # Remove protospacer_upper column
    ref_clean_sub = ref_clean_sub.drop(columns=['protospacer_upper'])
    # Rename intended target names
    ref_clean_sub.rename(columns={'chr_element': 'intended_target_chr', 
                                  'chr_start_element': 'intended_target_start',
                                  'chr_end_element': 'intended_target_end'},
                                  inplace=True)
    print(ref_clean_sub.head())
    return ref_clean_sub


ref_clean_sub_poolabcd = add_element_coords(ref_clean_sub_poolabcd, sgrna_index_merged)
#print(ref_clean_sub_poolabcd[ref_clean_sub_poolabcd['spacer'] == 'CGGCGACCCTAGGAGAGGT'])

ref_clean_sub_poolabcd.head()

# Columns are already correctly labeled for pool F
ref_clean_sub_poolf = pd.merge(
    ref_clean_sub_poolf,
    sgrna_index_poolf[['protospacer', 'intended_target_chr', 'intended_target_start', 'intended_target_end']],
    left_on='spacer',
    right_on='protospacer',
    how='left'
)

                     guide_id               spacer  targeting  \
0  TFEC_-_115670779.23-P1P2-1  CATATGCACCATGCCAGAA       True   
1  NR2C1_-_95467292.23-P1P2-2  GATGTGGGATCGAGATTCA       True   
2   NANOG_+_7942459.23-P1P2-2  TTTTTCCATTATAACTTGG       True   
3                     OR8B3-5  TTTTTGTCTTCAAAAATCT       True   
4  ZNF48_+_30406782.23-P1P2-1  CTCCGCGCCAAGCCGGGAG       True   

               type intended_target_name             label  pam  \
0         targeting                 TFEC      tf_targeting  NGG   
1         targeting                NR2C1      tf_targeting  NGG   
2         targeting                NANOG      tf_targeting  NGG   
3  negative control                OR8B3  negative_control  NGG   
4         targeting                ZNF48      tf_targeting  NGG   

  genomic_element guide_chr  guide_start    guide_end strand  \
0        promoter      chr7  116030705.0  116030723.0      +   
1        promoter     chr12   95073493.0   95073511.0      +   
2        promo

In [1296]:
print(example_crispr_file.head())

# Reorganize columns to match
new_order = ['guide_id', 'spacer', 'targeting', 'type', 'guide_chr', 'guide_start', 'guide_end', 'strand', 'pam', 'intended_target_name', 'intended_target_chr', 'intended_target_start', 'intended_target_end', 'label', 'genomic_element']
ref_clean_sub_poolabcd = ref_clean_sub_poolabcd[new_order].drop_duplicates()
print(ref_clean_sub_poolabcd.head())
ref_clean_sub_poolf = ref_clean_sub_poolf[new_order].drop_duplicates()
print(ref_clean_sub_poolf.head())

controls_in_ref = ref_clean_sub_poolabcd[
    ref_clean_sub_poolabcd['spacer'].isin(non_targeting['Photospacer (same for all 3 sets)'])
    | ref_clean_sub_poolabcd['spacer'].isin(pos_controls['Photospacer (represent 10 times)'])
    | ref_clean_sub_poolabcd['spacer'].isin(neg_spacers)
]
print(len(controls_in_ref))
print(ref_clean_sub_poolabcd['label'].value_counts(dropna=False))
#controls_in_ref.sample(10)

   guide_id                spacer  targeting       type guide_chr  \
0  AFF4_sg1  CCAGCGGACGGGGCGGGGAC       True  targeting      chr5   
1  AFF4_sg2  CCGCCAGCGGACGGGGCGGC       True  targeting      chr5   
2  AFF4_sg3  CGTCCGCTGGCGGCGGCGAC       True  targeting      chr5   
3  AFF4_sg4  CTGCGTCAGTCACAGCCCTC       True  targeting      chr5   
4  AFF4_sg5  GCGGACGGGGCGGGGATCCC       True  targeting      chr5   

   guide_start    guide_end strand  pam intended_target_name  \
0  132299282.0  132299302.0      -  NGG                 AFF4   
1  132299282.0  132299302.0      -  NGG                 AFF4   
2  132299252.0  132299272.0      -  NGG                 AFF4   
3  132299279.0  132299299.0      -  NGG                 AFF4   
4  132299279.0  132299299.0      -  NGG                 AFF4   

  intended_target_chr  intended_target_start  intended_target_end  
0                chr5            132875395.0          132963634.0  
1                chr5            132875395.0          132963634.

In [1297]:
# Remove any rows with non-standard chromosomes (i.e. chr1, chr2, chrX, etc., not chrU) by making sure chr is not followed by a letter other than X or Y
control_types = ['non_targeting', 'negative_control', 'positive_control']

# Mask for control and targeting guides
is_control = ref_clean_sub_poolabcd['label'].str.lower().isin(control_types)
is_targeting = ~is_control

# Standard chromosome pattern
standard_chr_pattern = r'^chr(\d+|X|Y)$'

# Split, filter targeting only
controls_df = ref_clean_sub_poolabcd[is_control]
targets_df = ref_clean_sub_poolabcd[is_targeting]

filtered_targets_df = targets_df[
    targets_df['guide_chr'].notna()
    & targets_df['intended_target_chr'].notna()
    & targets_df['guide_chr'].str.match(standard_chr_pattern)
    & targets_df['intended_target_chr'].str.match(standard_chr_pattern)
]

# Recombine targets + controls
ref_clean_sub_poolabcd = pd.concat([filtered_targets_df, controls_df], ignore_index=True)
ref_clean_sub_poolf = pd.concat([ref_clean_sub_poolf, controls_df])

In [1298]:
# There are certain examples where a target has the same protospacer sequence but multiple local_target_start and local_target_end
# values, which throws errors with the pipeline. This takes the min/max of those values (dependent on strand) and collapses into a single row

# Collapse groups of targeting guides that have identical metadata but
# multiple start/end coordinates. For negative‑strand entries, we take the
# max(start) / min(end); for positive‑strand or mixed, min(start) / max(end).
# Control or non‑targeting rows are passed through unchanged.
def collapse_grouped_targets(df):

    def collapse_group(subdf):
        # Handle both guide_ and intended_target_ coordinates
        if (subdf["strand"] == "-").all():
            g_start = subdf["guide_start"].max()
            g_end   = subdf["guide_end"].min()
            t_start = subdf["intended_target_start"].max()
            t_end   = subdf["intended_target_end"].min()
        else:
            g_start = subdf["guide_start"].min()
            g_end   = subdf["guide_end"].max()
            t_start = subdf["intended_target_start"].min()
            t_end   = subdf["intended_target_end"].max()

        row = subdf.iloc[0].copy()
        row["guide_start"] = g_start
        row["guide_end"] = g_end
        row["intended_target_start"] = t_start
        row["intended_target_end"] = t_end
        return row

    # Identify control / non-targeting rows (pass through unchanged)
    is_control = (
        df["type"].str.contains("non", case=False, na=False)
        #| df["type"].str.contains("control", case=False, na=False)
    )

    controls_df = df[is_control].copy()
    targets_df  = df[~is_control].copy()

    # Exclude all coordinate columns from grouping
    coord_cols = [
        "guide_start", "guide_end",
        "intended_target_start", "intended_target_end"
    ]
    group_cols = [c for c in targets_df.columns if c not in coord_cols]

    collapsed_targets = (
        targets_df
        .groupby(group_cols, dropna=False)
        .apply(collapse_group)
        .reset_index(drop=True)
    )

    combined = pd.concat([collapsed_targets, controls_df], ignore_index=True)
    return combined

ref_clean_sub_poolabcd = collapse_grouped_targets(ref_clean_sub_poolabcd)
ref_clean_sub_poolf    = collapse_grouped_targets(ref_clean_sub_poolf)
print(ref_clean_sub_poolabcd.shape)
print(ref_clean_sub_poolf.shape)
print(ref_clean_sub_poolabcd['label'].value_counts(dropna=False))

  .apply(collapse_group)


(14172, 15)
(3862, 15)
label
tf_targeting        12934
negative_control      619
non_targeting         600
positive_control       19
Name: count, dtype: int64


  .apply(collapse_group)


In [1299]:
# Also write a version without mostly NA values, duplicate rows
#print(ref_clean_sub_poolabcd.shape)
#ref_clean_sub_poolabcd_clean = ref_clean_sub_poolabcd.dropna(thresh = (len(ref_clean_sub_poolabcd.columns)/2)).drop_duplicates()
#print(ref_clean_sub_poolabcd_clean.shape)
#ref_clean_sub_poolabcd_clean.to_csv(local_path + "harmonized_guide_file_poolabcd_nomissing.csv")

In [1300]:
# Interrogate duplicate spacers
duplicate_spacers_poolf = ref_clean_sub_poolf[ref_clean_sub_poolf["spacer"].duplicated(keep=False)]
print(duplicate_spacers_poolf["spacer"].value_counts())

duplicate_spacers_poolf_diff = duplicate_spacers_poolf.loc[:, duplicate_spacers_poolf.nunique() > 1]
print(duplicate_spacers_poolf_diff.head())
print(duplicate_spacers_poolf_diff.shape)

spacer
CGCCGGCGCGCCTGCGAGG    4
TCCTGCGATATCCAGGCGA    4
CTCCTTGCAGCCACCACGG    4
GCTCACGTCATCCCGACCG    4
CGACACTACCAGCTGCTGT    4
CAAATCCTCCTGTCTTTCG    4
CGTGCAAAACCCTGTGCCT    4
CAACTTGCCACTCAAACGC    2
GCCGGAGCTACCGGCAGCC    2
GCTCCGCCGCTCGGCCCCT    2
CACGTAACGGGACCACACA    2
GACGCCCCCGGCCAGGTGA    2
CACTTGCAGGGGCGCGAGG    2
GTCCTTCCCGTCGCCTGCA    2
AGTGAGGACTAACGGGGCA    2
GGAAACCGCCAGACACCAA    2
CACGCCAGACCACGACGGA    2
GCTCCACCCTTTCCGGGCG    2
Name: count, dtype: int64
    guide_id               spacer                guide_chr  guide_start  \
654    DGCR6  CGCCGGCGCGCCTGCGAGG                    chr22   18905981.0   
655    DGCR6  CGCCGGCGCGCCTGCGAGG                    chr22   18905981.0   
656    DGCR6  CGCCGGCGCGCCTGCGAGG  chr22_KI270734v1_random     131252.0   
657    DGCR6  CGCCGGCGCGCCTGCGAGG  chr22_KI270734v1_random     131252.0   
658  DGCR6.2  TCCTGCGATATCCAGGCGA                    chr22   18906057.0   

      guide_end strand intended_target_name      intended_target_c

In [1301]:
# Drop alternate contigs if a canonical chr exists (e.g. chr22_KI270731v1_random)
import re
def remove_random_contigs(df):
    df = df.copy()
    keep_rows = []

    canonical_pattern = re.compile(r"^chr(\d+|X|Y)$", re.IGNORECASE)

    for spacer, subdf in df.groupby("spacer", group_keys=False):
        has_main = (
            subdf["guide_chr"].astype(str).str.match(canonical_pattern).any() or
            subdf["intended_target_chr"].astype(str).str.match(canonical_pattern).any()
        )

        if has_main:
            mask = (
                subdf["guide_chr"].astype(str).str.match(canonical_pattern)
                & subdf["intended_target_chr"].astype(str).str.match(canonical_pattern)
            )
            keep_rows.append(subdf[mask])
        else:
            # If no canonical version exists, keep all
            keep_rows.append(subdf)

    cleaned = pd.concat(keep_rows, ignore_index=True)
    return cleaned

before = len(ref_clean_sub_poolf)
ref_clean_sub_poolf = remove_random_contigs(ref_clean_sub_poolf)
after = len(ref_clean_sub_poolf)

print(f"Removed {before - after} '_random' contig rows covered by canonical entries.")

duplicate_spacers_poolf = ref_clean_sub_poolf[ref_clean_sub_poolf["spacer"].duplicated(keep=False)]
print(duplicate_spacers_poolf["spacer"].value_counts())
print(duplicate_spacers_poolf.head(10))

duplicate_spacers_poolf_diff = duplicate_spacers_poolf.loc[:, duplicate_spacers_poolf.nunique() > 1]
print(duplicate_spacers_poolf_diff)
print(duplicate_spacers_poolf_diff.shape)

Removed 21 '_random' contig rows covered by canonical entries.
spacer
AGTGAGGACTAACGGGGCA    2
CAACTTGCCACTCAAACGC    2
CACGCCAGACCACGACGGA    2
CACGTAACGGGACCACACA    2
CACTTGCAGGGGCGCGAGG    2
GACGCCCCCGGCCAGGTGA    2
GCCGGAGCTACCGGCAGCC    2
GCTCCACCCTTTCCGGGCG    2
GCTCCGCCGCTCGGCCCCT    2
GGAAACCGCCAGACACCAA    2
GTCCTTCCCGTCGCCTGCA    2
Name: count, dtype: int64
           guide_id               spacer  targeting       type guide_chr  \
594         NCF1B.6  AGTGAGGACTAACGGGGCA       True  targeting      chr7   
595         NCF1B.6  AGTGAGGACTAACGGGGCA       True  targeting      chr7   
750        GATSL2.2  CAACTTGCCACTCAAACGC       True  targeting      chr7   
751        GATSL2.2  CAACTTGCCACTCAAACGC       True  targeting      chr7   
799       STAG3L2.3  CACGCCAGACCACGACGGA       True  targeting      chr7   
800       STAG3L2.3  CACGCCAGACCACGACGGA       True  targeting      chr7   
810  LOC100101148.2  CACGTAACGGGACCACACA       True  targeting      chr7   
811  LOC100101148.2  

In [1302]:
# Manually remove 11 strand mismatches in Pool F to select the most canonical one
canonical_strand_choices = {
    "AGTGAGGACTAACGGGGCA": "+",  # NCF1B.6
    "CAACTTGCCACTCAAACGC": "+",  # GATSL2.3
    "CACGCCAGACCACGACGGA": "-",  # STAG3L2.3
    "CACGTAACGGGACCACACA": "+",  # LOC100101148.2
    "CACTTGCAGGGGCGCGAGG": "+", # LOC541473.2
    "GACGCCCCCGGCCAGGTGA": "+",  # LOC100101148.6
    "GCCGGAGCTACCGGCAGCC": "-",  # GTF2IP1.2
    "GCTCCACCCTTTCCGGGCG": "-",  # STAG3L3.5
    "GCTCCGCCGCTCGGCCCCT": "-",  # GTF2IP1.9
    "GGAAACCGCCAGACACCAA": "-",  # STAG3L2.2
    "GTCCTTCCCGTCGCCTGCA": "+",  # NCF1B
}
mask_keep = pd.Series(True, index=ref_clean_sub_poolf.index)

for spacer, strand in canonical_strand_choices.items():
    # Identify all rows for this spacer
    idx_all = ref_clean_sub_poolf.index[ref_clean_sub_poolf["spacer"] == spacer]
    # Identify those on non-canonical strands
    idx_wrong = ref_clean_sub_poolf.index[
        (ref_clean_sub_poolf["spacer"] == spacer)
        & (ref_clean_sub_poolf["strand"] != strand)
    ]
    # Mark wrong-strand rows for removal
    mask_keep.loc[idx_wrong] = False
    print(f"Keeping {spacer} ({strand}), removing {len(idx_wrong)} opposite-strand rows")

# Apply mask
before = len(ref_clean_sub_poolf)
ref_clean_sub_poolf = ref_clean_sub_poolf.loc[mask_keep].reset_index(drop=True)
after = len(ref_clean_sub_poolf)

print(f"Removed {before - after} strand-mismatched rows.")
print(f"{ref_clean_sub_poolf['spacer'].duplicated().sum()} duplicate spacers remain.")

Keeping AGTGAGGACTAACGGGGCA (+), removing 1 opposite-strand rows
Keeping CAACTTGCCACTCAAACGC (+), removing 1 opposite-strand rows
Keeping CACGCCAGACCACGACGGA (-), removing 1 opposite-strand rows
Keeping CACGTAACGGGACCACACA (+), removing 1 opposite-strand rows
Keeping CACTTGCAGGGGCGCGAGG (+), removing 1 opposite-strand rows
Keeping GACGCCCCCGGCCAGGTGA (+), removing 1 opposite-strand rows
Keeping GCCGGAGCTACCGGCAGCC (-), removing 1 opposite-strand rows
Keeping GCTCCACCCTTTCCGGGCG (-), removing 1 opposite-strand rows
Keeping GCTCCGCCGCTCGGCCCCT (-), removing 1 opposite-strand rows
Keeping GGAAACCGCCAGACACCAA (-), removing 1 opposite-strand rows
Keeping GTCCTTCCCGTCGCCTGCA (+), removing 1 opposite-strand rows
Removed 11 strand-mismatched rows.
0 duplicate spacers remain.


In [1303]:
# Reformat the guide IDs 
# e.g. ARID1A_+_27022504.23-P1P2-1 --> ARID1A#chr1:26696017-26696035(-)
# new format: <target_name>#<guide_chr>:<guide_start>-<guide_end>(<strand>)
# Only for targeting guides
def reformat_guide_ids(df):
    df = df.copy()
    required_cols = ["intended_target_name", "guide_chr", "guide_start", "guide_end", "strand"]

    mask = (
        ~df["guide_id"].str.contains("non-targeting", case=False, na=False)
        & df[required_cols].notna().all(axis=1)
    )

    # Build new guide_id strings only for masked rows
    df["guide_start"] = df["guide_start"].astype("Int64")
    df["guide_end"] = df["guide_end"].astype("Int64")
    df.loc[mask, "guide_id"] = (
        df.loc[mask, "intended_target_name"].astype(str)
        + "#" + df.loc[mask, "guide_chr"].astype(str)
        + ":" + df.loc[mask, "guide_start"].astype(str)
        + "-" + df.loc[mask, "guide_end"].astype(str)
        + "(" + df.loc[mask, "strand"].astype(str) + ")"
    )

    return df

ref_clean_sub_poolabcd = reformat_guide_ids(ref_clean_sub_poolabcd)
ref_clean_sub_poolf = reformat_guide_ids(ref_clean_sub_poolf)

print(ref_clean_sub_poolabcd.head())
print(ref_clean_sub_poolabcd['label'].value_counts(dropna=False))

                          guide_id               spacer  targeting  \
0  AARS#chr16:70289419-70289437(-)  CGGCGACCCTAGGAGAGGT       True   
1                           AARS_C  CCGCCCTCGGAGAGCTCTG       True   
2  AARS#chr16:70289477-70289495(-)  TCTGCGGGAATAGGTGCAG       True   
3  AATF#chr17:36948966-36948984(+)  AGTGGCCGGTCCAGAGCTG       True   
4  AATF#chr17:36949026-36949044(+)  GGATCAAGGCGAGAGGATC       True   

               type guide_chr  guide_start  guide_end strand  pam  \
0  positive control     chr16     70289419   70289437      -  NGG   
1  positive control     chrPC            0          0    NaN  NGG   
2  positive control     chr16     70289477   70289495      -  NGG   
3         targeting     chr17     36948966   36948984      +  NGG   
4         targeting     chr17     36949026   36949044      +  NGG   

  intended_target_name intended_target_chr  intended_target_start  \
0                 AARS               chr16             70289409.0   
1                 AARS    

In [1304]:
# For positive controls, replace '_' with '#' in guide ID
mask = ref_clean_sub_poolabcd["label"] == "positive_control"
ref_clean_sub_poolabcd.loc[mask, "guide_id"] = ref_clean_sub_poolabcd.loc[mask, "guide_id"].astype(str).str.replace("_", "#", regex=False)

mask = ref_clean_sub_poolf["label"] == "positive_control"
ref_clean_sub_poolf.loc[mask, "guide_id"] = ref_clean_sub_poolf.loc[mask, "guide_id"].astype(str).str.replace("_", "#", regex=False)

In [1305]:
# Supplement with coordinates from benchmarking annotation file for controls missing data
benchmark_annot = pd.read_csv(local_path + "benchmark_guide_metadata_v1 - benchmark_guide_metadata_v1.csv")
print(benchmark_annot.head())    

       guide_id               spacer  targeting       type guide_chr  \
0     CD81#weak  GAGAGCCAGCGCGCAACGG       True  targeting     chr11   
1  CD151#strong  CCGGACTCGGACGCGTGGT       True  targeting     chr11   
2    CD151#weak  CCGCTCGGCCGAGCTGTCG       True  targeting     chr11   
3   CD55#strong  CTGCGACTCGGCGGAGTCC       True  targeting      chr1   
4     NGFRAP1#B  GTTGGAGTTTGCCCTCCTC       True  targeting      chrX   

   guide_start    guide_end strand  PAM intended_target_name  \
0    2377315.0    2377333.0      +  NGG      ENSG00000110651   
1     833006.0     833024.0      +  NGG      ENSG00000177697   
2     833006.0     833024.0      +  NGG      ENSG00000177697   
3  207321714.0  207321732.0      +  NGG      ENSG00000196352   
4  103376258.0  103376279.0      -  NGG      ENSG00000166681   

  intended_target_chr  intended_target_start  intended_target_end gene_name  \
0               chr11              2377315.0            2377333.0      CD81   
1               chr11   

In [1306]:
def fill_controls_from_benchmark(main_df, benchmark_df):
    df = main_df.copy()
    bench = benchmark_df.copy()

    fill_cols = ["guide_chr", "guide_start", "guide_end", "strand", "intended_target_chr", "intended_target_start", "intended_target_end"]

    # Standardize spacers
    df["spacer_norm"] = df["spacer"].str.strip().str.upper()
    bench["spacer_norm"] = bench["spacer"].str.strip().str.upper()

    # Merge benchmark data
    merged = pd.merge(
        df,
        bench[["spacer_norm"] + fill_cols],
        on="spacer_norm",
        how="left",
        suffixes=("", "_bench"),
    )

    print("Matched rows with benchmark:", merged["guide_chr_bench"].notna().sum())

    # Fill only for controls
    is_control = merged["label"].isin(["positive_control", "negative_control"])

    for col in fill_cols:
        benchcol = f"{col}_bench"
        # Only fill where original is NaN and benchmark has a real value
        merged.loc[is_control & merged[col].isna() & merged[benchcol].notna(), col] = \
            merged.loc[is_control & merged[col].isna() & merged[benchcol].notna(), benchcol]

    # Clean up
    merged.drop(columns=[f"{c}_bench" for c in fill_cols] + ["spacer_norm"], inplace=True, errors="ignore")

    return merged

controls_with_nans = ref_clean_sub_poolabcd[
    ref_clean_sub_poolabcd["label"].isin(["positive_control", "negative_control"])
    & (
        ref_clean_sub_poolabcd["guide_chr"].isna()
        | ref_clean_sub_poolabcd["guide_start"].isna()
        | ref_clean_sub_poolabcd["guide_end"].isna()
    )
]
print("Controls missing before:", controls_with_nans.shape[0])
ref_clean_sub_poolabcd = fill_controls_from_benchmark(ref_clean_sub_poolabcd, benchmark_annot)

controls_with_nans = ref_clean_sub_poolabcd[
    ref_clean_sub_poolabcd["label"].isin(["positive_control", "negative_control"]) &
    (
        ref_clean_sub_poolabcd["guide_chr"].isna() |
        ref_clean_sub_poolabcd["guide_start"].isna() |
        ref_clean_sub_poolabcd["guide_end"].isna()
    )
]
print("Controls missing after :", controls_with_nans.shape[0])

Controls missing before: 127
Matched rows with benchmark: 294
Controls missing after : 127


In [1307]:
# Create a merged Pool ABCD and Pool F file
base = ref_clean_sub_poolabcd.copy()
targeting_from_f = ref_clean_sub_poolf[ref_clean_sub_poolf["label"] == "targeting"]

ref_clean_sub_poolabcdf = pd.concat([base, targeting_from_f], ignore_index=True)
ref_clean_sub_poolabcdf = ref_clean_sub_poolabcdf.drop_duplicates(subset=["spacer"], keep="first")

print(ref_clean_sub_poolabcdf.head())
print(ref_clean_sub_poolabcdf.shape)
print(ref_clean_sub_poolabcdf['label'].value_counts(dropna=False))

                          guide_id               spacer  targeting  \
0  AARS#chr16:70289419-70289437(-)  CGGCGACCCTAGGAGAGGT       True   
1                           AARS#C  CCGCCCTCGGAGAGCTCTG       True   
2  AARS#chr16:70289477-70289495(-)  TCTGCGGGAATAGGTGCAG       True   
3  AATF#chr17:36948966-36948984(+)  AGTGGCCGGTCCAGAGCTG       True   
4  AATF#chr17:36949026-36949044(+)  GGATCAAGGCGAGAGGATC       True   

               type guide_chr  guide_start  guide_end strand  pam  \
0  positive control     chr16     70289419   70289437      -  NGG   
1  positive control     chrPC            0          0    NaN  NGG   
2  positive control     chr16     70289477   70289495      -  NGG   
3         targeting     chr17     36948966   36948984      +  NGG   
4         targeting     chr17     36949026   36949044      +  NGG   

  intended_target_name intended_target_chr  intended_target_start  \
0                 AARS               chr16             70289409.0   
1                 AARS    

In [1308]:
# Another check for duplicates in the concatenated file
duplicate_spacers = ref_clean_sub_poolabcdf[ref_clean_sub_poolabcdf.duplicated(subset=['spacer'])]
#print(duplicate_spacers.head())
#print(duplicate_spacers.shape)

# Find the columns that are different between the duplicate spacers
duplicate_spacers_diff = duplicate_spacers.loc[:, duplicate_spacers.nunique() > 1]
#print(duplicate_spacers_diff.head())
#print(duplicate_spacers_diff.shape)

In [1309]:
# Fix any Excel-style gene names converted to dates
import re
month_gene_map = {
    "JAN": "JAN", "FEB": "FEB", "MAR": "MARCH", "APR": "APR",
    "MAY": "MAY", "JUN": "JUN", "JUL": "JUL", "AUG": "AUG",
    "SEP": "SEPT", "OCT": "OCT", "NOV": "NOV", "DEC": "DEC"
}

def fix_excel_date_genes(symbol):
    if not isinstance(symbol, str):
        return symbol  # leave NaN or other types alone

    m = re.match(r"^(\d{1,2})-([A-Za-z]{3})$", symbol.strip())
    if m:
        num, month = m.groups()
        month = month.upper()
        if month in month_gene_map:
            return f"{month_gene_map[month]}{num}"
    return symbol

ref_clean_sub_poolabcdf['intended_target_name'] = ref_clean_sub_poolabcdf['intended_target_name'].apply(fix_excel_date_genes)
ref_clean_sub_poolabcd['intended_target_name'] = ref_clean_sub_poolabcd['intended_target_name'].apply(fix_excel_date_genes)
ref_clean_sub_poolf['intended_target_name'] = ref_clean_sub_poolf['intended_target_name'].apply(fix_excel_date_genes)

print(ref_clean_sub_poolabcd['label'].value_counts(dropna=False))

label
tf_targeting        12934
negative_control      619
non_targeting         600
positive_control       19
Name: count, dtype: int64


In [1310]:
# Fix one-off instances of strange behavior

# Fix TBXT, which is mistakenly just 'T' in this data
def replace_T_with_TBXT(df):
    df = df.copy()
    df["guide_id"] = df["guide_id"].str.replace(
        r"^T#", "TBXT#", regex=True
    )
    return df

ref_clean_sub_poolabcdf = replace_T_with_TBXT(ref_clean_sub_poolabcdf)
ref_clean_sub_poolabcd = replace_T_with_TBXT(ref_clean_sub_poolabcd)

In [1311]:
# Sort to put controls at the top of the file
control_order = ["positive_control", "negative_control", "non_targeting"]
ref_clean_sub_poolabcdf["label"] = pd.Categorical(ref_clean_sub_poolabcdf["label"], categories=control_order + ["tf_targeting"], ordered=True)
ref_clean_sub_poolabcd["label"] = pd.Categorical(ref_clean_sub_poolabcd["label"], categories=control_order + ["tf_targeting"], ordered=True)
ref_clean_sub_poolf["label"] = pd.Categorical(ref_clean_sub_poolf["label"], categories=control_order + ["tf_targeting"], ordered=True)

ref_clean_sub_poolabcdf = ref_clean_sub_poolabcdf.sort_values(by=["label", "guide_id"], ascending=[True, True]).reset_index(drop=True)
ref_clean_sub_poolabcd = ref_clean_sub_poolabcd.sort_values(by=["label", "guide_id"], ascending=[True, True]).reset_index(drop=True)
ref_clean_sub_poolf = ref_clean_sub_poolf.sort_values(by=["label", "guide_id"], ascending=[True, True]).reset_index(drop=True)
print(ref_clean_sub_poolabcd['label'].value_counts(dropna=False))

label
tf_targeting        12934
negative_control      619
non_targeting         600
positive_control       19
Name: count, dtype: int64


In [1312]:
# Add empty columns for putative_target_genes, reporter, and imperfect
def add_placeholder_cols(ref_clean_sub):
    ref_clean_sub['putative_target_genes'] = np.nan
    ref_clean_sub['reporter'] = np.nan
    ref_clean_sub['imperfect'] = np.nan
    return ref_clean_sub

ref_clean_sub_poolabcdf = add_placeholder_cols(ref_clean_sub_poolabcdf)
ref_clean_sub_poolabcd = add_placeholder_cols(ref_clean_sub_poolabcd)
ref_clean_sub_poolf = add_placeholder_cols(ref_clean_sub_poolf)
print(ref_clean_sub_poolabcdf.columns)

Index(['guide_id', 'spacer', 'targeting', 'type', 'guide_chr', 'guide_start',
       'guide_end', 'strand', 'pam', 'intended_target_name',
       'intended_target_chr', 'intended_target_start', 'intended_target_end',
       'label', 'genomic_element', 'putative_target_genes', 'reporter',
       'imperfect'],
      dtype='object')


In [1313]:
# Write to file
ref_clean_sub_poolabcd.to_csv(local_path + "harmonized_guide_file_poolabcd.csv", index=False)
ref_clean_sub_poolabcd.to_csv(
    local_path + "harmonized_guide_file_poolabcd.tsv",
    sep='\t',
    index=False,
    lineterminator='\n'
)
ref_clean_sub_poolf.to_csv(local_path + "harmonized_guide_file_poolf.csv", index=False)
ref_clean_sub_poolf.to_csv(
    local_path + "harmonized_guide_file_poolf.tsv",
    sep='\t',
    index=False,
    lineterminator='\n'
)

In [1314]:
ref_clean_sub_poolabcdf.to_csv(local_path + "harmonized_guide_file_poolabcdf.csv", index=False)

# Write to tsv file, including header
ref_clean_sub_poolabcdf.to_csv(
    local_path + "harmonized_guide_file_poolabcdf.tsv",
    sep='\t',
    index=False,
    lineterminator='\n'
)

---

In [1315]:
#%pip install pybiomart

In [1316]:
# Convert intended_target_name to Ensembl ID using pyBiomart
from pybiomart import Dataset

dataset = Dataset(name='hsapiens_gene_ensembl', host='http://www.ensembl.org')

# Fetch mapping
mapping = dataset.query(attributes=['hgnc_symbol', 'ensembl_gene_id', 'external_synonym'])
mapping.columns = ['intended_target_name', 'ensembl_gene_id', 'external_synonym']
mapping.head()

Unnamed: 0,intended_target_name,ensembl_gene_id,external_synonym
0,MT-TF,ENSG00000210049,MTTF
1,MT-TF,ENSG00000210049,TRNF
2,MT-RNR1,ENSG00000211459,12S
3,MT-RNR1,ENSG00000211459,MOTS-C
4,MT-RNR1,ENSG00000211459,MTRNR1


In [1317]:
# Combine HGNC symbol and synonyms into a single mapping dataframe
# Melt external_synonym if it's a comma-separated list
mapping_expanded = mapping.copy()
mapping_expanded['external_synonym'] = mapping_expanded['external_synonym'].fillna('')
mapping_expanded = mapping_expanded.assign(
    synonym_list=mapping_expanded['external_synonym'].str.split(',')
).explode('synonym_list')
mapping_expanded.head()

Unnamed: 0,intended_target_name,ensembl_gene_id,external_synonym,synonym_list
0,MT-TF,ENSG00000210049,MTTF,MTTF
1,MT-TF,ENSG00000210049,TRNF,TRNF
2,MT-RNR1,ENSG00000211459,12S,12S
3,MT-RNR1,ENSG00000211459,MOTS-C,MOTS-C
4,MT-RNR1,ENSG00000211459,MTRNR1,MTRNR1


In [1318]:
# Combine intended_target_name and synonym_list into one lookup table
lookup_biomart = pd.concat([
    mapping_expanded[['intended_target_name', 'ensembl_gene_id']].rename(columns={'intended_target_name': 'symbol'}),
    mapping_expanded[['synonym_list', 'ensembl_gene_id']].rename(columns={'synonym_list': 'symbol'})
]).drop_duplicates()
#print(lookup.head())
lookup_biomart['symbol'] = lookup_biomart['symbol'].apply(lambda x: str(x).upper().replace('-', '').replace('_',''))
lookup_biomart = lookup_biomart.drop_duplicates(subset=["symbol"], keep="first")
print(lookup_biomart.head())

    symbol  ensembl_gene_id
0     MTTF  ENSG00000210049
2   MTRNR1  ENSG00000211459
5     MTTV  ENSG00000210077
7   MTRNR2  ENSG00000210082
10   MTTL1  ENSG00000209082


In [1319]:
def clean_symbol(s):
    s = str(s).upper()
    # Remove dashes for relaxed matching
    s = s.replace('-', '').replace('_', '')
    return s

In [1320]:
# Merge with data frame and replace intended_target_name with Ensembl IDs
def replace_w_ensembl(ref_clean, mapping):
    # Make all symbols uppercase for matching
    ref_clean = ref_clean.copy()
    ref_clean['intended_target_name'] = ref_clean['intended_target_name'].apply(clean_symbol)

    mapping = mapping.copy()
    mapping['symbol'] = mapping['symbol'].apply(clean_symbol)

    # Merge by symbol
    ref_clean = ref_clean.merge(mapping, left_on='intended_target_name',
                                right_on='symbol', how='left')

    # Identify missing mappings
    missing_mask = ref_clean['ensembl_gene_id'].isna()
    num_missing = missing_mask.sum()
    missing_genes = ref_clean.loc[missing_mask, 'intended_target_name'].unique()

    # Print summary
    print(f"Number of rows with missing Ensembl mapping: {num_missing}")
    print(f"Gene symbols with no mapping:\n{missing_genes}")

    mapped = ref_clean['ensembl_gene_id'].notna()
    gtf_mapped = (mapped & (ref_clean['source'] == 'gtf')).sum()
    biomart_mapped = (mapped & (ref_clean['source'] == 'biomart')).sum()
    unmapped = (~mapped).sum()
    print(f"Mapped via GTF: {gtf_mapped}")
    print(f"Mapped via Biomart only: {biomart_mapped}")
    print(f"Unmapped: {unmapped}")

    # Keep original gene name in a new column
    ref_clean['gene_name'] = ref_clean['intended_target_name']

    # Replace intended_target_name with Ensembl ID where available,
    # otherwise keep the original gene name
    ref_clean['intended_target_name'] = ref_clean.apply(
        lambda row: row['ensembl_gene_id'] if pd.notna(row['ensembl_gene_id']) 
                    else row['intended_target_name'],
        axis=1
    )

    # Identify duplicates of ENSG IDs
    grouped = ref_clean.groupby('intended_target_name')['gene_name'].nunique()
    duplicate_ensg_distinct = grouped[grouped > 1].index
    
    # Append gene name only for ENSG IDs with distinct gene names
    mask = ref_clean['intended_target_name'].isin(duplicate_ensg_distinct) & pd.notna(ref_clean['intended_target_name'])
    ref_clean.loc[mask, 'intended_target_name'] = ref_clean.loc[mask].apply(
        lambda row: f"{row['intended_target_name']}#{row['gene_name']}", axis=1
    )

    # Drop temp columns if you don’t need them later
    ref_clean.drop(columns=['ensembl_gene_id', 'symbol'], inplace=True, errors='ignore')

    desired_order = [
        'guide_id','spacer','targeting','type','guide_chr','guide_start','guide_end','strand','pam',
        'genomic_element','intended_target_name','intended_target_chr','intended_target_start',
        'intended_target_end','putative_target_genes','reporter','imperfect','gene_name','label'
    ]
    existing_order = [c for c in desired_order if c in ref_clean.columns]
    ref_clean = ref_clean[existing_order]

    return ref_clean

#ref_clean_sub_poolabcdf_ensembl = replace_w_ensembl(ref_clean_sub_poolabcdf, lookup_biomart)
#ref_clean_sub_poolabcd_ensembl = replace_w_ensembl(ref_clean_sub_poolabcd, lookup_biomart)
#ref_clean_sub_poolf_ensembl = replace_w_ensembl(ref_clean_sub_poolf, lookup_biomart)

In [1321]:
# Alternatively, derive ENSG IDs from IGVF GTF file
gtf_cols = [
    "seqname", "source", "feature", "start", "end",
    "score", "strand", "frame", "attribute"
]

gtf = pd.read_csv(
    local_path + "gencode.v43.chr_patch_hapl_scaff.annotation.gtf",
    sep="\t",
    comment="#",
    names=gtf_cols,
    dtype={"seqname": str}
)
gtf.head()

Unnamed: 0,seqname,source,feature,start,end,score,strand,frame,attribute
0,chr1,HAVANA,gene,11869,14409,.,+,.,"gene_id ""ENSG00000290825.1""; gene_type ""lncRNA..."
1,chr1,HAVANA,transcript,11869,14409,.,+,.,"gene_id ""ENSG00000290825.1""; transcript_id ""EN..."
2,chr1,HAVANA,exon,11869,12227,.,+,.,"gene_id ""ENSG00000290825.1""; transcript_id ""EN..."
3,chr1,HAVANA,exon,12613,12721,.,+,.,"gene_id ""ENSG00000290825.1""; transcript_id ""EN..."
4,chr1,HAVANA,exon,13221,14409,.,+,.,"gene_id ""ENSG00000290825.1""; transcript_id ""EN..."


In [1322]:
genes = gtf[gtf["feature"] == "gene"].copy()

def extract_attr(attr, key):
    match = re.search(fr'{key} "([^"]+)"', attr)
    return match.group(1) if match else None


genes["ensembl_gene_id"] = genes["attribute"].apply(
    lambda x: extract_attr(x, "gene_id")
)
genes["intended_target_name"] = genes["attribute"].apply(
    lambda x: extract_attr(x, "gene_name")
)

genes["external_synonym"] = genes["attribute"].apply(
    lambda x: extract_attr(x, "gene_synonym")
)

# Strip version numbers from ENSG IDs
genes["ensembl_gene_id"] = genes["ensembl_gene_id"].str.split(".").str[0]
genes.head()

Unnamed: 0,seqname,source,feature,start,end,score,strand,frame,attribute,ensembl_gene_id,intended_target_name,external_synonym
0,chr1,HAVANA,gene,11869,14409,.,+,.,"gene_id ""ENSG00000290825.1""; gene_type ""lncRNA...",ENSG00000290825,DDX11L2,
5,chr1,HAVANA,gene,12010,13670,.,+,.,"gene_id ""ENSG00000223972.6""; gene_type ""transc...",ENSG00000223972,DDX11L1,
13,chr1,HAVANA,gene,14404,29570,.,-,.,"gene_id ""ENSG00000227232.5""; gene_type ""unproc...",ENSG00000227232,WASH7P,
26,chr1,ENSEMBL,gene,17369,17436,.,-,.,"gene_id ""ENSG00000278267.1""; gene_type ""miRNA""...",ENSG00000278267,MIR6859-1,
29,chr1,HAVANA,gene,29554,31109,.,+,.,"gene_id ""ENSG00000243485.5""; gene_type ""lncRNA...",ENSG00000243485,MIR1302-2HG,


In [1323]:
# Build GTF-based lookup table
mapping = genes[["intended_target_name", "ensembl_gene_id", "external_synonym"]]
mapping_expanded = mapping.copy()
mapping_expanded["external_synonym"] = mapping_expanded["external_synonym"].fillna("")
mapping_expanded = mapping_expanded.assign(
    synonym_list=mapping_expanded["external_synonym"].str.split(",")
).explode("synonym_list")
lookup = pd.concat([
    mapping_expanded[["intended_target_name", "ensembl_gene_id"]]
        .rename(columns={"intended_target_name": "symbol"}),
    mapping_expanded[["synonym_list", "ensembl_gene_id"]]
        .rename(columns={"synonym_list": "symbol"})
]).drop_duplicates()
lookup["symbol"] = (
    lookup["symbol"]
    .astype(str)
    .str.upper()
    .str.replace("-", "", regex=False)
    .str.replace("_", "", regex=False)
)

lookup = lookup.drop_duplicates(subset=["symbol"], keep="first")
lookup.head()

Unnamed: 0,symbol,ensembl_gene_id
0,DDX11L2,ENSG00000290825
5,DDX11L1,ENSG00000223972
13,WASH7P,ENSG00000227232
26,MIR68591,ENSG00000278267
29,MIR13022HG,ENSG00000243485


In [1324]:
# Add ENSG IDs for missing genes from Biomart
def supplement_lookup(lookup_gtf, lookup_biomart):
    lookup_gtf = lookup_gtf.copy()
    lookup_biomart = lookup_biomart.copy()
    
    lookup_gtf["source"] = "gtf"
    lookup_biomart["source"] = "biomart"
    
    # Identify symbols missing from GTF
    missing_symbols = (set(lookup_biomart["symbol"]) - set(lookup_gtf["symbol"]))
    biomart_missing = lookup_biomart[lookup_biomart["symbol"].isin(missing_symbols)]
    print(f"Added from Biomart: {biomart_missing.shape[0]}")
    
    # Combine, prioritizing GTF
    combined = pd.concat([lookup_gtf, biomart_missing], ignore_index=True)
    
    # Final safety de-duplication
    combined = combined.drop_duplicates(subset=["symbol"], keep="first")
    
    return combined

print(f"Total symbols (GTF): {lookup.shape[0]}")
lookup_full = supplement_lookup(lookup, lookup_biomart)

Total symbols (GTF): 62797
Added from Biomart: 53895


In [1325]:
# Call Ensembl replacement function with new GTF-based lookup table
ref_clean_sub_poolabcdf_ensembl = replace_w_ensembl(ref_clean_sub_poolabcdf, lookup_full)
ref_clean_sub_poolabcd_ensembl = replace_w_ensembl(ref_clean_sub_poolabcd, lookup_full)
ref_clean_sub_poolf_ensembl = replace_w_ensembl(ref_clean_sub_poolf, lookup_full)

Number of rows with missing Ensembl mapping: 600
Gene symbols with no mapping:
['NONTARGETING']
Mapped via GTF: 13335
Mapped via Biomart only: 237
Unmapped: 600
Number of rows with missing Ensembl mapping: 600
Gene symbols with no mapping:
['NONTARGETING']
Mapped via GTF: 13335
Mapped via Biomart only: 237
Unmapped: 600
Number of rows with missing Ensembl mapping: 706
Gene symbols with no mapping:
['NONTARGETING' 'CTD2515O10' 'CTD2574D22' 'GREGOR' 'LOC100101148'
 'LOC100133091' 'LOC101926943' 'LOC284865' 'LOC388849' 'LOC541473'
 'SEPT5GP1BB' 'XXBACB562F10']
Mapped via GTF: 2990
Mapped via Biomart only: 134
Unmapped: 706


In [1326]:
# Write to file
ref_clean_sub_poolabcd_ensembl.to_csv(local_path + "harmonized_guide_file_poolabcd_ensg.csv", index=False)
ref_clean_sub_poolabcd_ensembl.to_csv(
    local_path + "harmonized_guide_file_poolabcd_ensg.tsv",
    sep='\t',
    index=False,
    lineterminator='\n'
)
ref_clean_sub_poolf_ensembl.to_csv(local_path + "harmonized_guide_file_poolf_ensg.csv", index=False)
ref_clean_sub_poolf_ensembl.to_csv(
    local_path + "harmonized_guide_file_poolf_ensg.tsv",
    sep='\t',
    index=False,
    lineterminator='\n'
)
ref_clean_sub_poolabcdf_ensembl.to_csv(local_path + "harmonized_guide_file_poolabcdf_ensg.csv", index=False)

# Write to tsv file, including header
ref_clean_sub_poolabcdf_ensembl.to_csv(
    local_path + "harmonized_guide_file_poolabcdf_ensg.tsv",
    sep='\t',
    index=False,
    lineterminator='\n'
)

In [1327]:
# Quick unit tests to make sure everything is kosher
def run_integrity_checks(df, pool_label=""):
    print(f"\nRunning integrity checks for {pool_label}")

    # Check for duplicate spacers
    duplicates = df[df["spacer"].duplicated()]
    assert len(duplicates) == 0, f"{len(duplicates)} duplicate spacers found in {pool_label}"

    # Check NA values are np.nan (not 'NA', 'None', or empty strings)
    bad_na = df.isin(["NA", "None", ""]).any().sum()
    assert bad_na == 0, f"{bad_na} non-numeric NA placeholders found in {pool_label}"

    # Check for strange characters in spacer or guide_id
    pattern_ok = re.compile(r"^[ACGTN]+$", re.IGNORECASE)
    bad_spacers = df[~df["spacer"].astype(str).str.match(pattern_ok)]
    assert len(bad_spacers) == 0, f"Unexpected characters in {len(bad_spacers)} spacers"

    # Confirm control guides are present
    control_types = ["non_targeting", "positive_control", "negative_control"]
    found_controls = {ct: (df["label"].str.lower() == ct).sum() for ct in control_types}
    missing_controls = [ct for ct, count in found_controls.items() if count == 0]
    assert not missing_controls, f"Missing control types: {missing_controls}"

    # Confirm coordinate columns are numeric or np.nan
    coord_cols = ["guide_start", "guide_end", "intended_target_start", "intended_target_end"]
    for col in coord_cols:
        if col in df.columns:
            bad_coords = df[col].dropna().apply(lambda x: isinstance(x, (int, float)))
            assert bad_coords.all(), f"Non-numeric entries in {col}"

    # Confirm chromosome format (allow chr1–22, chrX/Y, and *_random)
    chr_cols = ["guide_chr", "intended_target_chr"]
    chr_pattern = re.compile(r"^chr(\d+|X|Y)(_.*_random)?$", re.IGNORECASE)
    for col in chr_cols:
        if col in df.columns:
            # Convert all entries to string once
            chr_values = df[col].astype(str)
    
            # Pick out the 'chrPC' rows
            pc_mask = chr_values.str.upper() == "CHRPC"
            if pc_mask.any():
                print(f"Note: {pc_mask.sum()} rows in {col} are labeled 'chrPC' (placeholder coordinates).")
    
            # Normal validity check, ignoring NaNs and chrPC
            bad_mask = (
                df[col].notna()
                & ~pc_mask
                & ~chr_values.str.match(chr_pattern)
            )
    
            if bad_mask.any():
                print(f"\nInvalid chromosome values found in {col} for {pool_label}:")
                print(df.loc[bad_mask, [col, "guide_id"]].head(10))
                print(df.loc[bad_mask, col].value_counts().head(20))
    
            assert not bad_mask.any(), f"Invalid chromosome names in {col}"

    print(f"All checks passed for {pool_label}")

run_integrity_checks(ref_clean_sub_poolabcd_ensembl, "ABCD")
run_integrity_checks(ref_clean_sub_poolf_ensembl, "F")
run_integrity_checks(ref_clean_sub_poolabcdf_ensembl, "ABCDF")


Running integrity checks for ABCD
Note: 6 rows in guide_chr are labeled 'chrPC' (placeholder coordinates).
Note: 6 rows in intended_target_chr are labeled 'chrPC' (placeholder coordinates).
All checks passed for ABCD

Running integrity checks for F
Note: 6 rows in guide_chr are labeled 'chrPC' (placeholder coordinates).
Note: 6 rows in intended_target_chr are labeled 'chrPC' (placeholder coordinates).
All checks passed for F

Running integrity checks for ABCDF
Note: 6 rows in guide_chr are labeled 'chrPC' (placeholder coordinates).
Note: 6 rows in intended_target_chr are labeled 'chrPC' (placeholder coordinates).
All checks passed for ABCDF
