# Create harmonized sgRNA guide annotation file for use with the CRISPR pipeline (2025)
This notebook creates a unified guide annotation file from the guide annotation files provided by the Hon, Huangfu, and Gersbach labs, following the format specified in: https://github.com/pinellolab/CRISPR_Pipeline/blob/main/example_data/guide_metadata.tsv

# Install libraries and set paths

In [2]:
# Install the third-party packages used by this notebook into the kernel's environment.
# NOTE(review): versions are unpinned, and `biomart` is not imported in the cells shown
# here — confirm it is still required.
%pip install pandas
%pip install matplotlib
%pip install numpy
%pip install seaborn
%pip install biomart

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Collecting biomart
  Downloading biomart-0.9.2-py3-none-any.whl.metadata (3.3 kB)
Collecting requests>=2.2 (from biomart)
  Downloading requests-2.32.3-py3-none-any.whl.metadata (4.6 kB)
Collecting charset-normalizer<4,>=2 (from requests>=2.2->biomart)
  Downloading charset_normalizer-3.4.2-cp313-cp313-win_amd64.whl.metadata (36 kB)
Collecting idna<4,>=2.5 (from requests>=2.2->biomart)
  Downloading idna-3.10-py3-none-any.whl.metadata (10 kB)
Collecting urllib3<3,>=1.21.1 (from requests>=2.2->biomart)
  Downloading urllib3-2.4.0-py3-none-any.whl.metadata (6.5 kB)
Collecting certifi>=2017.4.17 (from requests>=2.2->biomart)
  Downloading certifi-2025.4.26-py3-none-any.whl.metadata (2.5 kB)
Downloading biomart-0.9.2-py3-none-any.

In [3]:
# Imports
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns

In [4]:
# Paths: TODO update if necessary
# Base directory for every input and output file in this notebook. File names are
# appended to it with `+` in later cells, so the value must keep its trailing "/".
#local_path = "/cellar/users/aklie/data/datasets/tf_perturb_seq/ref/"
#local_path = "C:/Users/seg95/Documents/tf_perturb_seq/"
local_path = "D:/tf_perturb_seq/"

# Import merged guide reference files, along with guide index files

In [90]:
# Merged guide ref files (one combined table, plus the pool A-D and pool F tables,
# going by the file names).
# No mid-cell `.head()` here: a notebook only displays the last expression of a cell,
# so a preview placed between the reads would be evaluated and silently discarded.
merged_guide_file = pd.read_csv(local_path + "outer_merged_file.csv")
merged_guide_file_poolabcd = pd.read_csv(local_path + "outer_merged_file_poolabcd.csv")
merged_guide_file_poolf = pd.read_csv(local_path + "outer_merged_file_poolf.csv")

In [165]:
# sgRNA index files
# All three are parsed as tab-separated text even though the file names end in ".csv".
sgrna_index_poolabcd = pd.read_csv(local_path + "sgRNA_index_v0.csv", sep = "\t")
sgrna_index_poolf = pd.read_csv(local_path + "igvf_poolF_annotation.csv", sep = "\t")

sgrna_index_dacc_annot = pd.read_csv(local_path + "sgRNA_index_dacc_annot_reference.csv", sep = "\t")
# Number of distinct protospacers shared by the DACC annotation and the pool A-D merged file.
# NOTE(review): this count runs before `protospacer` is upper-cased further down in this
# cell, so it compares the raw values case-sensitively — confirm that is intended.
print(len(set(sgrna_index_dacc_annot['protospacer']).intersection(set(merged_guide_file_poolabcd['protospacer']))))

def adjust_index_file(sgrna_index, name_sgrna_seq = 'sgRNA_seq', add_leading_G = True):
    """Normalise an sgRNA index table in place and return it.

    Parameters
    ----------
    sgrna_index : pandas.DataFrame
        Index table to normalise. It is modified in place.
    name_sgrna_seq : str
        Name of the column holding the guide sequence. With the default
        ``'sgRNA_seq'`` the table must also provide ``target_loc`` and ``oligo``;
        with any other name it must provide ``oligo_sequence``.
    add_leading_G : bool
        When truthy, a ``'G'`` is prepended to every guide sequence.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in.
    """
    default_layout = name_sgrna_seq == "sgRNA_seq"

    if default_layout:
        # The strand is the "+" or "-" enclosed in parentheses inside target_loc.
        sgrna_index['strand'] = sgrna_index['target_loc'].str.extract(r'\((\+|\-)\)')

    # The oligo column is named differently in the two supported layouts.
    oligo_column = 'oligo' if default_layout else 'oligo_sequence'
    sgrna_index[oligo_column] = sgrna_index[oligo_column].str.upper()

    # Upper-case the guide sequence, optionally prefixing it with a G.
    guide_upper = sgrna_index[name_sgrna_seq].str.upper()
    sgrna_index[name_sgrna_seq] = ('G' + guide_upper) if add_leading_G else guide_upper
    return sgrna_index

# Pool A-D: default column layout, with a leading "G" prepended to every sgRNA sequence.
sgrna_index_poolabcd = adjust_index_file(sgrna_index_poolabcd)
# Pool F: the sequences live in `protospacer` and no "G" is added.
sgrna_index_poolf = adjust_index_file(sgrna_index_poolf, name_sgrna_seq= 'protospacer', add_leading_G = False)

# Upper-case the DACC protospacers so they compare equal to the normalised index sequences.
sgrna_index_dacc_annot['protospacer'] = sgrna_index_dacc_annot['protospacer'].str.upper()
# NOTE(review): adjust_index_file already upper-cased this column for pool F, so this
# line repeats that step.
sgrna_index_poolf['protospacer'] = sgrna_index_poolf['protospacer'].str.upper()
#sgrna_index_dacc_annot['protospacer'] = [s[1:] if len(s) > 0 else s for s in sgrna_index_dacc_annot['protospacer']]
#sgrna_index_dacc_annot['reverse_compliment'] = sgrna_index_dacc_annot['reverse_compliment'].str.rstrip('C')

13188


In [166]:
# Add a reverse compliment if needed
def reverse_compliment(sequence):
    """Return the reverse complement of `sequence`, upper-cased.

    A/T and C/G are swapped; any other character (for example N) is kept as is.
    """
    base_swap = str.maketrans("ACGT", "TGCA")
    return sequence.upper()[::-1].translate(base_swap)

# Pool A-D has no reverse-complement column, so derive it from the sgRNA sequence.
sgrna_index_poolabcd['reverse_compliment'] = sgrna_index_poolabcd['sgRNA_seq'].apply(reverse_compliment)
# DataFrame.rename returns a new frame rather than modifying in place, so the result
# has to be assigned back for the pool F column to actually be renamed.
sgrna_index_poolf = sgrna_index_poolf.rename(columns={"antisense_sequence": "reverse_compliment"})

print("Index:")
print(sgrna_index_poolabcd.head())
print(sgrna_index_poolf.head())
print("Annot:")
print(sgrna_index_dacc_annot.head())

Index:
                   target_loc              element_seq     target source  \
0  chr17:36948966-36948984(+)  chr17:36948966-36949088  AATF_P1P2     TF   
1  chr17:36949026-36949044(+)  chr17:36948966-36949088  AATF_P1P2     TF   
2  chr17:36949013-36949031(+)  chr17:36948966-36949088  AATF_P1P2     TF   
3  chr17:36949070-36949088(-)  chr17:36948966-36949088  AATF_P1P2     TF   
4  chr17:36949031-36949049(+)  chr17:36948966-36949088  AATF_P1P2     TF   

              sgRNA_seq                                              oligo  \
0  GAGTGGCCGGTCCAGAGCTG  GTGGAAAGGACGAAACACCGAGTGGCCGGTCCAGAGCTGGTTTAAG...   
1  GGGATCAAGGCGAGAGGATC  GTGGAAAGGACGAAACACCGGGATCAAGGCGAGAGGATCGTTTAAG...   
2  GGAGTCGGGGAATCGGATCA  GTGGAAAGGACGAAACACCGGAGTCGGGGAATCGGATCAGTTTAAG...   
3  GAAATGTGCGGCCCAACCCC  GTGGAAAGGACGAAACACCGAAATGTGCGGCCCAACCCCGTTTAAG...   
4  GAAGGCGAGAGGATCCGGCA  GTGGAAAGGACGAAACACCGAAGGCGAGAGGATCCGGCAGTTTAAG...   

  gene_target chr_target  chr_start_target  chr_end_target chr_elem

In [167]:
# Upper-cased copy of the DACC protospacers; later cells use it as the join/lookup key.
sgrna_index_dacc_annot["protospacer_upper"] = sgrna_index_dacc_annot["protospacer"].str.upper() 

# Count of distinct pool A-D sgRNA sequences that also occur among the DACC protospacers.
print(len(set(sgrna_index_poolabcd['sgRNA_seq']).intersection(sgrna_index_dacc_annot['protospacer_upper'])))

13470


In [168]:
# Merge pool A-D index and DACC files into one; pool F file has sufficient info for matching
# Outer join on the sequence together with its reverse complement, so rows found in only
# one of the two tables are kept as well.
# NOTE(review): for rows that exist only in the pool A-D index, the DACC-side columns
# (including `protospacer_upper`, which later cells use as the lookup key) are presumably
# NaN — confirm those guides are not expected to receive coordinates.
sgrna_index_merged = pd.merge(
    sgrna_index_dacc_annot,
    sgrna_index_poolabcd,
    left_on=['protospacer_upper', 'reverse_compliment'],
    right_on=['sgRNA_seq', 'reverse_compliment'],
    how="outer"
)
print(sgrna_index_merged.head())
print(sgrna_index_merged.shape)

               protospacer_ID           protospacer intended_target_name  \
0                     OR5K2-2  GAAAAAATTGTAGAGGAATA                OR5K2   
1    SP1_+_53773993.23-P1P2-1  GAAAAACGCGGACGCTGACG                  SP1   
2    SP8_-_20826141.23-P1P2-2  GAAAAAGATCCTCTGAGAGG                  SP8   
3    FOXN3_-_89883583.23-P2-1  GAAAAAGGCGACACATGACC                FOXN3   
4  ZNF85_+_21106076.23-P1P2-1  GAAAACAAGACCTAGAGCTC                ZNF85   

        type genomic_element    reverse_compliment     protospacer_upper  \
0  targeting        promoter  TATTCCTCTACAATTTTTTC  GAAAAAATTGTAGAGGAATA   
1  targeting        promoter  CGTCAGCGTCCGCGTTTTTC  GAAAAACGCGGACGCTGACG   
2  targeting        promoter  CCTCTCAGAGGATCTTTTTC  GAAAAAGATCCTCTGAGAGG   
3  targeting        promoter  GGTCATGTGTCGCCTTTTTC  GAAAAAGGCGACACATGACC   
4  targeting        promoter  GAGCTCTAGGTCTTGTTTTC  GAAAACAAGACCTAGAGCTC   

                   target_loc              element_seq      target  ...  \
0          

# Reformat to resemble input to the CRISPR pipeline

In [9]:
# Import example file for the CRISPR pipeline
# Read as tab-separated text; it serves as the column-layout reference for the harmonized output.
example_crispr_file = pd.read_csv(local_path + "crispr_annot_sample.tsv", sep = "\t")

In [10]:
# Preview the example file to see the columns the harmonized file should follow.
example_crispr_file.head()

Unnamed: 0,guide_id,spacer,targeting,type,guide_chr,guide_start,guide_end,strand,pam,intended_target_name,intended_target_chr,intended_target_start,intended_target_end
0,AFF4_sg1,CCAGCGGACGGGGCGGGGAC,True,targeting,chr5,132299282.0,132299302.0,-,NGG,AFF4,chr5,132875395.0,132963634.0
1,AFF4_sg2,CCGCCAGCGGACGGGGCGGC,True,targeting,chr5,132299282.0,132299302.0,-,NGG,AFF4,chr5,132875395.0,132963634.0
2,AFF4_sg3,CGTCCGCTGGCGGCGGCGAC,True,targeting,chr5,132299252.0,132299272.0,-,NGG,AFF4,chr5,132875395.0,132963634.0
3,AFF4_sg4,CTGCGTCAGTCACAGCCCTC,True,targeting,chr5,132299279.0,132299299.0,-,NGG,AFF4,chr5,132875395.0,132963634.0
4,AFF4_sg5,GCGGACGGGGCGGGGATCCC,True,targeting,chr5,132299279.0,132299299.0,-,NGG,AFF4,chr5,132875395.0,132963634.0


In [169]:
# Keep only necessary columns and reorder them to match 
def prune_and_rename_cols(merged_guide_file, is_pool_f=False):
    """Reduce a merged guide table to one ID and one target name per guide.

    Each guide gets a single ``guide_id`` and ``intended_target_name``, taken from
    the first lab column (in priority order) that has a value for that row.

    Parameters
    ----------
    merged_guide_file : pandas.DataFrame
        Merged table providing ``protospacer``, ``type``, ``reverse_compliment``
        and the per-lab ``id_*`` / ``intended_target_name_*`` columns.
    is_pool_f : bool
        When true, only the Gersbach and Engreitz columns are consulted;
        otherwise the priority is Hon, Gersbach, Engreitz, Huangfu.

    Returns
    -------
    pandas.DataFrame
        Columns ``guide_id``, ``spacer``, ``type``, ``intended_target_name`` and
        ``reverse_compliment``. The first rows are also printed.
    """
    # Source columns, listed from highest to lowest priority.
    if is_pool_f:
        id_sources = ['id_gersbach', 'id_engreitz']
        target_sources = ['intended_target_name_gersbach', 'intended_target_name_engreitz']
    else:
        id_sources = ['id_hon', 'id_gersbach', 'id_engreitz', 'id_huangfu']
        target_sources = [
            'intended_target_name_hon',
            'intended_target_name_gersbach',
            'intended_target_name_engreitz',
            'intended_target_name_huangfu',
        ]

    def _first_available(column_names):
        # Fill the gaps of the leading column with each later column in turn.
        filled = merged_guide_file[column_names[0]]
        for fallback in column_names[1:]:
            filled = filled.combine_first(merged_guide_file[fallback])
        return filled

    pruned = merged_guide_file[['protospacer', 'type', 'reverse_compliment']].copy()
    pruned["guide_id"] = _first_available(id_sources)
    pruned["intended_target_name"] = _first_available(target_sources)

    pruned = pruned.rename(columns={'protospacer': 'spacer'})
    pruned = pruned[['guide_id', 'spacer', 'type', 'intended_target_name', 'reverse_compliment']]
    print(pruned.head())
    return pruned

# Call function
# Build the pruned table for the full merged file and for each pool; only the pool F
# call restricts the ID/target lookup to the Gersbach and Engreitz columns.
ref_clean_sub = prune_and_rename_cols(merged_guide_file)
ref_clean_sub_poolabcd = prune_and_rename_cols(merged_guide_file_poolabcd)
ref_clean_sub_poolf = prune_and_rename_cols(merged_guide_file_poolf, is_pool_f=True)

                       guide_id                spacer       type  \
0    FOXN1_-_26833391.23-P1P2-1  GCACAGGACGGCCGAGCTGA  targeting   
1     EN2_-_155251011.23-P1P2-1  GCTCCGTGTGCGCCGCGGGA  targeting   
2  BCLAF1_-_136610510.23-P1P2-2  GCTCCGTTGCAACCACACAG  targeting   
3      KLF6_-_3827130.23-P1P2-2  GCTGGAGGATCGATCGGCGG  targeting   
4     ELF1_+_41593362.23-P1P2-2  GTGAGCTGATAAACAGAGGG  targeting   

  intended_target_name    reverse_compliment  
0                FOXN1  TCAGCTCGGCCGTCCTGTGC  
1                  EN2  TCCCGCGGCGCACACGGAGC  
2               BCLAF1  CTGTGTGGTTGCAACGGAGC  
3                 KLF6  CCGCCGATCGATCCTCCAGC  
4                 ELF1  CCCTCTGTTTATCAGCTCAC  
                     guide_id                spacer              type  \
0  TFEC_-_115670779.23-P1P2-1  GCATATGCACCATGCCAGAA         targeting   
1  NR2C1_-_95467292.23-P1P2-2  GGATGTGGGATCGAGATTCA         targeting   
2   NANOG_+_7942459.23-P1P2-2  GTTTTTCCATTATAACTTGG         targeting   
3                

In [170]:
# Add 'targeting' column; if type == targeting, set to True, otherwise False
def check_targeting(value):
    """Return True when `value` equals the string "targeting", otherwise False."""
    return bool(value == "targeting")

def add_targeting_col(ref_clean_sub):
    """Return a table with a boolean ``targeting`` column in pipeline column order.

    ``targeting`` is True where ``type`` equals ``"targeting"`` and False otherwise
    (including missing values). The input frame is left untouched, and the result
    is an independent copy so that later column assignments on it do not raise
    pandas' SettingWithCopyWarning.

    Parameters
    ----------
    ref_clean_sub : pandas.DataFrame
        Table providing ``guide_id``, ``spacer``, ``type`` and ``intended_target_name``.

    Returns
    -------
    pandas.DataFrame
        Columns ``guide_id``, ``spacer``, ``targeting``, ``type``,
        ``intended_target_name``; every other column is dropped. The first rows
        are also printed.
    """
    order = ['guide_id', 'spacer', 'targeting', 'type', 'intended_target_name']
    # assign() builds a new frame, so the caller's table is not modified.
    flagged = ref_clean_sub.assign(targeting=ref_clean_sub['type'].eq("targeting"))
    ref_clean_sub = flagged[order].copy()
    print(ref_clean_sub.head())
    return ref_clean_sub

# Add the boolean `targeting` column to each of the three pruned tables.
ref_clean_sub = add_targeting_col(ref_clean_sub)
ref_clean_sub_poolabcd = add_targeting_col(ref_clean_sub_poolabcd)
ref_clean_sub_poolf = add_targeting_col(ref_clean_sub_poolf)

                       guide_id                spacer  targeting       type  \
0    FOXN1_-_26833391.23-P1P2-1  GCACAGGACGGCCGAGCTGA       True  targeting   
1     EN2_-_155251011.23-P1P2-1  GCTCCGTGTGCGCCGCGGGA       True  targeting   
2  BCLAF1_-_136610510.23-P1P2-2  GCTCCGTTGCAACCACACAG       True  targeting   
3      KLF6_-_3827130.23-P1P2-2  GCTGGAGGATCGATCGGCGG       True  targeting   
4     ELF1_+_41593362.23-P1P2-2  GTGAGCTGATAAACAGAGGG       True  targeting   

  intended_target_name  
0                FOXN1  
1                  EN2  
2               BCLAF1  
3                 KLF6  
4                 ELF1  
                     guide_id                spacer  targeting  \
0  TFEC_-_115670779.23-P1P2-1  GCATATGCACCATGCCAGAA       True   
1  NR2C1_-_95467292.23-P1P2-2  GGATGTGGGATCGAGATTCA       True   
2   NANOG_+_7942459.23-P1P2-2  GTTTTTCCATTATAACTTGG       True   
3                     OR8B3-5  GTTTTTGTCTTCAAAAATCT      False   
4  ZNF48_+_30406782.23-P1P2-1  GCTCCGCGCCAAGC

In [171]:
# Add PAM
def add_pam(ref_clean_sub, pam='NGG'):
    """Return a copy of the table with a constant ``pam`` column appended.

    The input frame is not modified; building the result with ``assign`` also
    avoids pandas' SettingWithCopyWarning when the input is a slice of another frame.

    Parameters
    ----------
    ref_clean_sub : pandas.DataFrame
        Guide table to extend.
    pam : str
        PAM value written to every row. Defaults to ``'NGG'``.

    Returns
    -------
    pandas.DataFrame
        New table with the additional ``pam`` column. Its first rows and its
        shape are also printed.
    """
    ref_clean_sub = ref_clean_sub.assign(pam=pam)
    print(ref_clean_sub.head())
    print(ref_clean_sub.shape)
    return ref_clean_sub

# Attach the PAM column to each of the three tables.
ref_clean_sub = add_pam(ref_clean_sub)
ref_clean_sub_poolabcd = add_pam(ref_clean_sub_poolabcd)
ref_clean_sub_poolf = add_pam(ref_clean_sub_poolf)

                       guide_id                spacer  targeting       type  \
0    FOXN1_-_26833391.23-P1P2-1  GCACAGGACGGCCGAGCTGA       True  targeting   
1     EN2_-_155251011.23-P1P2-1  GCTCCGTGTGCGCCGCGGGA       True  targeting   
2  BCLAF1_-_136610510.23-P1P2-2  GCTCCGTTGCAACCACACAG       True  targeting   
3      KLF6_-_3827130.23-P1P2-2  GCTGGAGGATCGATCGGCGG       True  targeting   
4     ELF1_+_41593362.23-P1P2-2  GTGAGCTGATAAACAGAGGG       True  targeting   

  intended_target_name  pam  
0                FOXN1  NGG  
1                  EN2  NGG  
2               BCLAF1  NGG  
3                 KLF6  NGG  
4                 ELF1  NGG  
(19956, 6)
                     guide_id                spacer  targeting  \
0  TFEC_-_115670779.23-P1P2-1  GCATATGCACCATGCCAGAA       True   
1  NR2C1_-_95467292.23-P1P2-2  GGATGTGGGATCGAGATTCA       True   
2   NANOG_+_7942459.23-P1P2-2  GTTTTTCCATTATAACTTGG       True   
3                     OR8B3-5  GTTTTTGTCTTCAAAAATCT      False   
4  Z

In [172]:
# Sanity check: preview the merged index, then count how many distinct spacers of each
# harmonized table are present in the corresponding index (pool A-D first, then pool F).
print(sgrna_index_merged.head())
print(len(set(sgrna_index_merged['protospacer_upper']).intersection(set(ref_clean_sub_poolabcd['spacer']))))
print(len(set(sgrna_index_poolf['protospacer']).intersection(set(ref_clean_sub_poolf['spacer']))))  

               protospacer_ID           protospacer intended_target_name  \
0                     OR5K2-2  GAAAAAATTGTAGAGGAATA                OR5K2   
1    SP1_+_53773993.23-P1P2-1  GAAAAACGCGGACGCTGACG                  SP1   
2    SP8_-_20826141.23-P1P2-2  GAAAAAGATCCTCTGAGAGG                  SP8   
3    FOXN3_-_89883583.23-P2-1  GAAAAAGGCGACACATGACC                FOXN3   
4  ZNF85_+_21106076.23-P1P2-1  GAAAACAAGACCTAGAGCTC                ZNF85   

        type genomic_element    reverse_compliment     protospacer_upper  \
0  targeting        promoter  TATTCCTCTACAATTTTTTC  GAAAAAATTGTAGAGGAATA   
1  targeting        promoter  CGTCAGCGTCCGCGTTTTTC  GAAAAACGCGGACGCTGACG   
2  targeting        promoter  CCTCTCAGAGGATCTTTTTC  GAAAAAGATCCTCTGAGAGG   
3  targeting        promoter  GGTCATGTGTCGCCTTTTTC  GAAAAAGGCGACACATGACC   
4  targeting        promoter  GAGCTCTAGGTCTTGTTTTC  GAAAACAAGACCTAGAGCTC   

                   target_loc              element_seq      target  ...  \
0          

In [173]:
# Add the 'guide_chr', 'guide_start', and 'guide_end' values, which are given as 'chr_target', 'chr_start_target', 'chr_end_target', and 'strand'
def add_guide_coords(ref_clean_sub, sgrna_index_merged):
    """Attach guide coordinates and strand to each guide by spacer sequence.

    Left-joins the guide table to the index on ``spacer`` == ``protospacer_upper``
    and renames ``chr_target`` / ``chr_start_target`` / ``chr_end_target`` to
    ``guide_chr`` / ``guide_start`` / ``guide_end``. Guides without a match keep
    missing values in the new columns.

    Parameters
    ----------
    ref_clean_sub : pandas.DataFrame
        Guide table with a ``spacer`` column.
    sgrna_index_merged : pandas.DataFrame
        Index providing ``protospacer_upper``, ``chr_target``,
        ``chr_start_target``, ``chr_end_target`` and ``strand``.

    Returns
    -------
    pandas.DataFrame
        The guide table with ``guide_chr``, ``guide_start``, ``guide_end`` and
        ``strand`` appended. The first rows are also printed.
    """
    lookup = (
        sgrna_index_merged[['protospacer_upper', 'chr_target', 'chr_start_target', 'chr_end_target', 'strand']]
        # pandas matches null keys against each other; a lookup row without a
        # sequence must never lend its coordinates to a guide without a spacer.
        .dropna(subset=['protospacer_upper'])
        # Identical lookup rows would only duplicate guide rows, and every further
        # lookup merge on the same key would multiply them again.
        .drop_duplicates()
    )
    ref_clean_sub = pd.merge(
        ref_clean_sub,
        lookup,
        left_on='spacer',
        right_on='protospacer_upper',
        how='left'
    )
    # The join key duplicates `spacer`, so drop it and switch to the pipeline's column names.
    ref_clean_sub = ref_clean_sub.drop(columns=['protospacer_upper']).rename(
        columns={'chr_target': 'guide_chr',
                 'chr_start_target': 'guide_start',
                 'chr_end_target': 'guide_end'}
    )

    print(ref_clean_sub.head())
    return ref_clean_sub


ref_clean_sub_poolabcd = add_guide_coords(ref_clean_sub_poolabcd, sgrna_index_merged)
print(ref_clean_sub_poolabcd.head())

# Columns are already correctly labeled for pool F
# The lookup key is renamed to `spacer` so the merge adds no extra `protospacer` column
# (which would clash with the key of the next lookup merge). Rows without a sequence and
# exact duplicate rows are removed first so the left join cannot match null keys or
# multiply guide rows.
poolf_guide_coords = (
    sgrna_index_poolf[['protospacer', 'guide_chr', 'guide_start', 'guide_end', 'strand']]
    .dropna(subset=['protospacer'])
    .drop_duplicates()
    .rename(columns={'protospacer': 'spacer'})
)
ref_clean_sub_poolf = pd.merge(
    ref_clean_sub_poolf,
    poolf_guide_coords,
    on='spacer',
    how='left'
)
print(ref_clean_sub_poolf.head())

                     guide_id                spacer  targeting  \
0  TFEC_-_115670779.23-P1P2-1  GCATATGCACCATGCCAGAA       True   
1  NR2C1_-_95467292.23-P1P2-2  GGATGTGGGATCGAGATTCA       True   
2   NANOG_+_7942459.23-P1P2-2  GTTTTTCCATTATAACTTGG       True   
3                     OR8B3-5  GTTTTTGTCTTCAAAAATCT      False   
4  ZNF48_+_30406782.23-P1P2-1  GCTCCGCGCCAAGCCGGGAG       True   

               type intended_target_name  pam guide_chr  guide_start  \
0         targeting                 TFEC  NGG      chr7  116030705.0   
1         targeting                NR2C1  NGG     chr12   95073493.0   
2         targeting                NANOG  NGG     chr12    7789912.0   
3  negative_control                OR8B3  NGG       NaN          NaN   
4         targeting                ZNF48  NGG     chr16   30395465.0   

     guide_end strand  
0  116030723.0      +  
1   95073511.0      +  
2    7789930.0      +  
3          NaN    NaN  
4   30395483.0      -  
                     guide

In [174]:
# Add the intended_target_chr/intended_target_start/intended_target_end values, which are given as 'chr_element', 'chr_start_element', 'chr_end_element'
# Note that this refers to the element being targeted, not the gene itself
def add_element_coords(ref_clean_sub, sgrna_index_merged):
    """Attach the coordinates of the targeted element to each guide by spacer sequence.

    Left-joins the guide table to the index on ``spacer`` == ``protospacer_upper``
    and renames ``chr_element`` / ``chr_start_element`` / ``chr_end_element`` to
    ``intended_target_chr`` / ``intended_target_start`` / ``intended_target_end``.
    Guides without a match keep missing values in the new columns.

    Parameters
    ----------
    ref_clean_sub : pandas.DataFrame
        Guide table with a ``spacer`` column.
    sgrna_index_merged : pandas.DataFrame
        Index providing ``protospacer_upper``, ``chr_element``,
        ``chr_start_element`` and ``chr_end_element``.

    Returns
    -------
    pandas.DataFrame
        The guide table with the three ``intended_target_*`` coordinate columns
        appended. The first rows are also printed.
    """
    lookup = (
        sgrna_index_merged[['protospacer_upper', 'chr_element', 'chr_start_element', 'chr_end_element']]
        # pandas matches null keys against each other; a lookup row without a
        # sequence must never lend its coordinates to a guide without a spacer.
        .dropna(subset=['protospacer_upper'])
        # Identical lookup rows would only duplicate guide rows in the output.
        .drop_duplicates()
    )
    ref_clean_sub = pd.merge(
        ref_clean_sub,
        lookup,
        left_on='spacer',
        right_on='protospacer_upper',
        how='left'
    )
    # The join key duplicates `spacer`, so drop it and switch to the pipeline's column names.
    ref_clean_sub = ref_clean_sub.drop(columns=['protospacer_upper']).rename(
        columns={'chr_element': 'intended_target_chr',
                 'chr_start_element': 'intended_target_start',
                 'chr_end_element': 'intended_target_end'}
    )
    print(ref_clean_sub.head())
    return ref_clean_sub


# add_element_coords already prints a preview, so no separate mid-cell `.head()` is
# needed (a bare expression that is not the last line of a cell displays nothing).
ref_clean_sub_poolabcd = add_element_coords(ref_clean_sub_poolabcd, sgrna_index_merged)

# Columns are already correctly labeled for pool F
# The lookup key is renamed to `spacer` so the merge adds no `protospacer` column of its
# own. Rows without a sequence and exact duplicate rows are removed first so the left
# join cannot match null keys or multiply guide rows.
poolf_element_coords = (
    sgrna_index_poolf[['protospacer', 'intended_target_chr', 'intended_target_start', 'intended_target_end']]
    .dropna(subset=['protospacer'])
    .drop_duplicates()
    .rename(columns={'protospacer': 'spacer'})
)
ref_clean_sub_poolf = pd.merge(
    ref_clean_sub_poolf,
    poolf_element_coords,
    on='spacer',
    how='left'
)
print(ref_clean_sub_poolf.head())

                     guide_id                spacer  targeting  \
0  TFEC_-_115670779.23-P1P2-1  GCATATGCACCATGCCAGAA       True   
1  NR2C1_-_95467292.23-P1P2-2  GGATGTGGGATCGAGATTCA       True   
2   NANOG_+_7942459.23-P1P2-2  GTTTTTCCATTATAACTTGG       True   
3                     OR8B3-5  GTTTTTGTCTTCAAAAATCT      False   
4  ZNF48_+_30406782.23-P1P2-1  GCTCCGCGCCAAGCCGGGAG       True   

               type intended_target_name  pam guide_chr  guide_start  \
0         targeting                 TFEC  NGG      chr7  116030705.0   
1         targeting                NR2C1  NGG     chr12   95073493.0   
2         targeting                NANOG  NGG     chr12    7789912.0   
3  negative_control                OR8B3  NGG       NaN          NaN   
4         targeting                ZNF48  NGG     chr16   30395465.0   

     guide_end strand intended_target_chr  intended_target_start  \
0  116030723.0      +                chr7            116030682.0   
1   95073511.0      +             

In [176]:
# Show the example file again as the reference for the target column layout.
print(example_crispr_file.head())

# Reorganize columns to match
new_order = ['guide_id', 'spacer', 'targeting', 'type', 'guide_chr', 'guide_start', 'guide_end', 'strand', 'pam', 'intended_target_name', 'intended_target_chr', 'intended_target_start', 'intended_target_end']
# Selecting by `new_order` also drops every column that is not listed in it.
ref_clean_sub_poolabcd = ref_clean_sub_poolabcd[new_order]
print(ref_clean_sub_poolabcd.head())
ref_clean_sub_poolf = ref_clean_sub_poolf[new_order]
print(ref_clean_sub_poolf.head())

   guide_id                spacer  targeting       type guide_chr  \
0  AFF4_sg1  CCAGCGGACGGGGCGGGGAC       True  targeting      chr5   
1  AFF4_sg2  CCGCCAGCGGACGGGGCGGC       True  targeting      chr5   
2  AFF4_sg3  CGTCCGCTGGCGGCGGCGAC       True  targeting      chr5   
3  AFF4_sg4  CTGCGTCAGTCACAGCCCTC       True  targeting      chr5   
4  AFF4_sg5  GCGGACGGGGCGGGGATCCC       True  targeting      chr5   

   guide_start    guide_end strand  pam intended_target_name  \
0  132299282.0  132299302.0      -  NGG                 AFF4   
1  132299282.0  132299302.0      -  NGG                 AFF4   
2  132299252.0  132299272.0      -  NGG                 AFF4   
3  132299279.0  132299299.0      -  NGG                 AFF4   
4  132299279.0  132299299.0      -  NGG                 AFF4   

  intended_target_chr  intended_target_start  intended_target_end  
0                chr5            132875395.0          132963634.0  
1                chr5            132875395.0          132963634.

In [177]:
# Write to file
# NOTE(review): `to_csv` is called with its defaults (comma separator, row index written
# as an unnamed first column), while the pipeline example file used in this notebook is a
# tab-separated .tsv — confirm the consumer accepts this layout.
ref_clean_sub_poolabcd.to_csv(local_path + "harmonized_guide_file_poolabcd.csv")
ref_clean_sub_poolf.to_csv(local_path + "harmonized_guide_file_poolf.csv")

In [180]:
# Also write a version without mostly NA values
# Keep only rows in which at least half of the columns are populated. `dropna(thresh=...)`
# is documented as taking an integer count of required non-NA values, so the half-way
# point is rounded up explicitly; this keeps exactly the rows a fractional
# `len(columns) / 2` threshold would keep.
print(ref_clean_sub_poolabcd.shape)
min_non_na = (len(ref_clean_sub_poolabcd.columns) + 1) // 2
ref_clean_sub_poolabcd_clean = ref_clean_sub_poolabcd.dropna(thresh = min_non_na)
print(ref_clean_sub_poolabcd_clean.shape)
ref_clean_sub_poolabcd_clean.to_csv(local_path + "harmonized_guide_file_poolabcd_nomissing.csv")

(17645, 13)
(13751, 13)


---