Data Cleaning 
- Remove overlapping genes: genes whose TSS lies within 1 kb of the TSS of a selected gene and that have at least one overlap in category 2 or 3
- TSS position is defined by ‘cut’ site in CRISPick


In [1]:
import pandas as pd
import numpy as np
from copy import deepcopy


In [2]:
def standardize_chromosome(chromosome):
    """Return a ``chr``-prefixed chromosome label for a mixed-type input.

    The combined table mixes labels such as ``'chr12'`` with float values such
    as ``12.0``; this collapses both to ``'chr12'``.

    Parameters
    ----------
    chromosome : str, int, float or missing
        Chromosome identifier in any of the forms above.

    Returns
    -------
    str or missing
        ``'chr'``-prefixed label. Missing values (NaN/None) are returned
        unchanged instead of being turned into ``'chrnan'`` / ``'chrNone'``.
    """
    if pd.isnull(chromosome):
        return chromosome
    label = str(chromosome)
    # Strip only a trailing float artefact ('12.0' -> '12'); a plain
    # str.replace would also delete '.0' from the middle of a name.
    if label.endswith('.0'):
        label = label[:-2]
    if label.startswith('chr'):
        return label
    # Chromosome numbers parsed from RefSeq accessions use 23/24 for the sex
    # chromosomes (NC_000023 = X, NC_000024 = Y); map them to the usual
    # labels so they agree with tables that already spell 'chrX' / 'chrY'.
    label = {'23': 'X', '24': 'Y'}.get(label, label)
    return 'chr' + label

In [3]:
# In house CRISPRi Tiling
path_out = '../Data/PrimaryLibrary/processed_z_score/'


def _load_inhouse_zscores(folder, cell_line, domain):
    """Read one in-house z-score table, keep Essential/Non-essential guides and tag the domain.

    Reads ``<folder><cell_line>_<domain>_zscore.csv``, keeps rows whose
    ``Category`` is 'Essential' or 'Non-essential', and adds a ``Domain``
    column holding ``domain``.
    """
    screen = pd.read_csv(folder + f'{cell_line}_{domain}_zscore.csv')
    in_gene_sets = screen.Category.isin(['Essential', 'Non-essential'])
    # .copy() makes the filtered frame independent, so adding 'Domain' below
    # does not write into a slice of `screen` (SettingWithCopyWarning).
    screen = screen[in_gene_sets].copy()
    screen['Domain'] = domain
    return screen


annot_guide_lfcs_A549_Kox1 = _load_inhouse_zscores(path_out, 'A549', 'Kox1')
annot_guide_lfcs_A549_Zim3 = _load_inhouse_zscores(path_out, 'A549', 'Zim3')
annot_guide_lfcs_HCT116_Kox1 = _load_inhouse_zscores(path_out, 'HCT116', 'Kox1')
annot_guide_lfcs_HCT116_Zim3 = _load_inhouse_zscores(path_out, 'HCT116', 'Zim3')

InHouseCleanData = pd.concat([annot_guide_lfcs_A549_Kox1, annot_guide_lfcs_A549_Zim3,
                              annot_guide_lfcs_HCT116_Kox1, annot_guide_lfcs_HCT116_Zim3])

# Additional information for In house data
ess_noness = pd.read_csv('../Data/PrimaryLibrary/design/essentials_nonessentials_CRISPRi_tiling_designs_v2.csv')
# Chromosome number = integer part of the RefSeq accession ('NC_000012.12' -> 12).
# NOTE(review): this yields 23/24 for NC_000023/NC_000024 (chrX/chrY) - confirm
# downstream labelling treats them consistently with the external datasets.
ess_noness['chromosome'] = ess_noness['Reference Sequence'].apply(
    lambda x: int(x.split("NC_")[1].split(".")[0]) if pd.notnull(x) else x)

ess_noness = ess_noness[['Target Gene Symbol', 'Target Gene ID', 'Category', 'chromosome', 'sgRNA Sequence',
                         'sgRNA Context Sequence', 'TSS Position',
                         'sgRNA \'Cut\' Site TSS Offset', 'sgRNA \'Cut\' Position',
                         'Strand of Target', 'Strand of sgRNA', 'Orientation',
                         'On-Target Ruleset', 'On-Target Efficacy Score']]
# No `on=` is given, so pandas joins on every column name the two frames share.
# NOTE(review): if the shared columns include float-valued ones, matching
# relies on exact float equality - consider an explicit key list.
InHouseCleanData = pd.merge(InHouseCleanData, ess_noness, how='inner')

InHouseCleanData.head()

Unnamed: 0,sgRNA Sequence,condition,avg_lfc,n_obs,Target Gene Symbol,Target Gene ID,Category,TSS Position,sgRNA 'Cut' Site TSS Offset,sgRNA Context Sequence,On-Target Ruleset,On-Target Efficacy Score,z_scored_avg_lfc,TSS Bucket,Domain,chromosome,sgRNA 'Cut' Position,Strand of Target,Strand of sgRNA,Orientation
0,AAAAAAAAAAAGGGCAGAAG,A549,-0.376159,2,APOBEC1,339.0,Non-essential,7665908.0,-999.0,AAAAAAAAAAAAAAAGGGCAGAAGTGGACT,RS3seq-Chen2013,-0.7248,-1.389004,"[-1000, -500)",Kox1,12.0,7666908.0,-,-,sense
1,AAAAAAAAAAATTAGACCTC,A549,-0.021644,2,MRGPRD,116512.0,Non-essential,68980986.0,-735.0,AAAAAAAAAAAAAAATTAGACCTCAGGGCA,RS3seq-Chen2013,-0.6309,-0.52814,"[-1000, -500)",Kox1,11.0,68981722.0,-,+,antisense
2,AAAAAAAAAAATTAGACCTC,HCT116,-0.257163,2,MRGPRD,116512.0,Non-essential,68980986.0,-735.0,AAAAAAAAAAAAAAATTAGACCTCAGGGCA,RS3seq-Chen2013,-0.6309,-1.381448,"[-1000, -500)",Kox1,11.0,68981722.0,-,+,antisense
3,AAAAAAAAAAATTAGACCTC,HCT116,-0.277625,2,MRGPRD,116512.0,Non-essential,68980986.0,-735.0,AAAAAAAAAAAAAAATTAGACCTCAGGGCA,RS3seq-Chen2013,-0.6309,-1.315569,"[-1000, -500)",Zim3,11.0,68981722.0,-,+,antisense
4,AAAAAAAAAACAGGACACAG,A549,0.03181,2,LARS2,23395.0,Essential,45388577.0,667.0,CAGAAAAAAAAAAACAGGACACAGGGGAGA,RS3seq-Chen2013,0.014,-0.398339,"[500, 1000)",Kox1,3.0,45389244.0,+,-,antisense


In [4]:
# Nunez dataset: every row is labelled with condition 'K562' and domain 'Kox1'
NunzeCleanData = pd.read_csv('../Data/CleanedExternalData/Nunez2020Cleaned2024_v1.csv').assign(
    condition='K562',
    Domain='Kox1',
)
# Number of guides per value of 'strand' within each target gene
NunzeCleanData.groupby('Target Gene Symbol')['strand'].value_counts()

Target Gene Symbol  strand
ACTR6               +         159
                    -         146
AFG3L2              -         165
                    +         119
ANAPC13             +         139
                             ... 
ZNF574              +         110
ZNF830              +         105
                    -          93
ZNHIT6              +         127
                    -          52
Name: strand, Length: 925, dtype: int64

In [5]:
# Gilbert Ricin Tiling: every row is labelled with condition 'K562' and domain 'Kox1'
GilbertRicinCleanData = pd.read_csv('../Data/CleanedExternalData/Gilbert2014Cleaned2024_v1.csv').assign(
    condition='K562',
    Domain='Kox1',
)
# Number of guides per value of 'strand targeted' within each target gene
GilbertRicinCleanData.groupby('Target Gene Symbol')['strand targeted'].value_counts()

Target Gene Symbol  strand targeted
ARCN1               -                  154
                    +                  126
ARF1                -                  521
                    +                  485
ARL1                +                  159
                                      ... 
VPS54               -                  165
WDR11               -                  129
                    +                   99
YIPF5               -                  185
                    +                  166
Name: strand targeted, Length: 94, dtype: int64

In [6]:
# Row count of the Gilbert table as loaded (before TSS annotation and distance filtering)
GilbertRicinCleanData.shape[0]

24791

# Cut Site reassign 

In [7]:
# cut site crispick
def CRISPickCutSite(df, strand_col, point_col, context_seq_col):
    """Compute one cut-site coordinate per row of ``df``.

    The cut site is ``point + 7`` when the row's strand is '+' and
    ``point - 4`` when it is '-'.

    Parameters
    ----------
    df : pandas.DataFrame
        Guide table.
    strand_col : str
        Column holding the strand ('+' or '-').
    point_col : str
        Column holding the reference coordinate the offsets are applied to
        (the callers pass the PAM / hg38 coordinate column).
    context_seq_col : str
        Column holding the context sequence; only used to identify rows in
        the warning message.

    Returns
    -------
    list
        Cut sites in row order, always ``len(df)`` long. Rows whose strand is
        neither '+' nor '-' are reported on stdout and get ``numpy.nan``, so
        the list can be assigned straight back as a column.
    """
    CutSiteList = []
    for strand, point, context in zip(df[strand_col], df[point_col], df[context_seq_col]):
        if strand == '+':
            CutSiteList.append(point + 7)
        elif strand == '-':
            CutSiteList.append(point - 4)
        else:
            # f-string formatting also copes with a missing (NaN) context sequence.
            print(f'{context} strand is neither + nor -')
            # Placeholder keeps the list aligned with df's rows.
            CutSiteList.append(np.nan)
    return CutSiteList




In [8]:
# Genes targeted in either external dataset. dict.fromkeys keeps first-seen
# order and lists a gene present in both datasets only once.
geneListGilbertAndNunez = list(dict.fromkeys(
    GilbertRicinCleanData['Target Gene Symbol'].unique().tolist()
    + NunzeCleanData['Target Gene Symbol'].unique().tolist()
))

pd.DataFrame(geneListGilbertAndNunez).to_csv('../Data/ExternalData/CRISPick/geneListGilbertAndNunez.csv',
                              index = False)
#ran CRISPick https://portals.broadinstitute.org/gppx/crispick/platform/results/1da3859a-902d-471a-b06b-7aa43da31abd
# NCBI GRCh38 is used because it has MANE select TSS Position
CRISPick_geneListGilbertAndNunez = pd.read_table('../Data/ExternalData/CRISPick/geneListGilbertAndNunez-sgrna-designs.txt')
# Keep only the per-gene annotation. Identical rows are dropped because this
# table is merged onto the guide tables by gene symbol, and repeated
# annotation rows would silently duplicate every guide of that gene.
CRISPick_geneListGilbertAndNunez = (
    CRISPick_geneListGilbertAndNunez[[
        'Target Gene Symbol', 'Reference Sequence', 'Strand of Target',
        'TSS Position']]
    .drop_duplicates()
    .reset_index(drop=True)
)

# Gilbert

In [9]:
# Derive the cut site from the PAM coordinate, then attach the CRISPick
# annotation (reference sequence, target strand, TSS position) per gene.
GilbertRicinCleanDataCutSite = CRISPickCutSite(
    df=GilbertRicinCleanData,
    strand_col='strand targeted',
    point_col='PAM genomic coordinate [hg38]',
    context_seq_col='context seq',
)
GilbertRicinCleanData['CRISPick CutSite'] = GilbertRicinCleanDataCutSite
GilbertRicinCleanData = GilbertRicinCleanData.merge(
    CRISPick_geneListGilbertAndNunez,
    how='inner',
    on='Target Gene Symbol',
)
GilbertRicinCleanData.head()


Unnamed: 0,Target Gene Symbol,chromosome,strand targeted,PAM genomic coordinate [hg38],context seq,guide id,oligo,sgRNA sequence,dCas9-KRAB_rho_ave_LG2_LG_MH2_MH,zscoreRelativeNC,...,Target Gene Function,rhoSignChangedAndAvged,zscoreRelativeNC_signed,Avg_LFC_signed,condition,Domain,CRISPick CutSite,Reference Sequence,Strand of Target,TSS Position
0,ARCN1,chr11,+,118567638.0,GCACTTTCCTTGTTTACCTCTGGTAGGTTT,ARCN1_w_118438354.27,cccttggagaaCCAcctTGTTGGCACTTTCCTTGTTTACCTCTGGT...,GCACTTTCCTTGTTTACCTCTGGT,0.001556,0.250358,...,Resistance,-0.001556,-0.250358,0.134928,K562,Kox1,118567645.0,NC_000011.10,+,118572410
1,ARCN1,chr11,+,118568026.0,GCAACCCTGTAAGCATGCTTCTTGAGGAGT,ARCN1_w_118438742.27,cccttggagaaCCAcctTGTTGGCAACCCTGTAAGCATGCTTCTTG...,GCAACCCTGTAAGCATGCTTCTTG,-0.011286,0.157641,...,Resistance,0.011286,-0.157641,0.166789,K562,Kox1,118568033.0,NC_000011.10,+,118572410
2,ARCN1,chr11,+,118568080.0,GATGCTGCTTCTTCGGAAATATAACGGTAG,ARCN1_w_118438796.24,cccttggagaaCCAcctTGTTGGCTGCTTCTTCGGAAATATAAGTT...,GCTGCTTCTTCGGAAATATAA,0.01815,0.21634,...,Resistance,-0.01815,-0.21634,0.146618,K562,Kox1,118568087.0,NC_000011.10,+,118572410
3,ARCN1,chr11,+,118568134.0,TAGCACCTTCCAGGGCTTCCAACTTGGATC,ARCN1_w_118438850.25,cccttggagaaCCAcctTGTTGGCACCTTCCAGGGCTTCCAACTGT...,GCACCTTCCAGGGCTTCCAACT,0.021472,0.320467,...,Resistance,-0.021472,-0.320467,0.110835,K562,Kox1,118568141.0,NC_000011.10,+,118572410
4,ARCN1,chr11,+,118568146.0,ATTGCCTTATTGTAGCACCTTCCAGGGCTT,ARCN1_w_118438862.24,cccttggagaaCCAcctTGTTGGCCTTATTGTAGCACCTTCCAGTT...,GCCTTATTGTAGCACCTTCCA,-0.018377,-0.708019,...,Resistance,0.018377,0.708019,0.464269,K562,Kox1,118568153.0,NC_000011.10,+,118572410


In [10]:
# Calculate guide distance from the MANE Select annotated TSS:
# cut - TSS when the target strand is '+', TSS - cut otherwise.
# Vectorized with np.where instead of a row-wise apply; same values, computed
# on whole columns at once.
GilbertRicinCleanData["Distance to TSS"] = np.where(
    GilbertRicinCleanData["Strand of Target"] == "+",
    GilbertRicinCleanData["CRISPick CutSite"] - GilbertRicinCleanData["TSS Position"],
    GilbertRicinCleanData["TSS Position"] - GilbertRicinCleanData["CRISPick CutSite"],
)
# Keep only guides whose cut site is within 5 kb of the TSS.
GilbertRicinCleanData = GilbertRicinCleanData[GilbertRicinCleanData['Distance to TSS'].abs() <= 5000]
GilbertRicinCleanData.head()

Unnamed: 0,Target Gene Symbol,chromosome,strand targeted,PAM genomic coordinate [hg38],context seq,guide id,oligo,sgRNA sequence,dCas9-KRAB_rho_ave_LG2_LG_MH2_MH,zscoreRelativeNC,...,rhoSignChangedAndAvged,zscoreRelativeNC_signed,Avg_LFC_signed,condition,Domain,CRISPick CutSite,Reference Sequence,Strand of Target,TSS Position,Distance to TSS
0,ARCN1,chr11,+,118567638.0,GCACTTTCCTTGTTTACCTCTGGTAGGTTT,ARCN1_w_118438354.27,cccttggagaaCCAcctTGTTGGCACTTTCCTTGTTTACCTCTGGT...,GCACTTTCCTTGTTTACCTCTGGT,0.001556,0.250358,...,-0.001556,-0.250358,0.134928,K562,Kox1,118567645.0,NC_000011.10,+,118572410,-4765.0
1,ARCN1,chr11,+,118568026.0,GCAACCCTGTAAGCATGCTTCTTGAGGAGT,ARCN1_w_118438742.27,cccttggagaaCCAcctTGTTGGCAACCCTGTAAGCATGCTTCTTG...,GCAACCCTGTAAGCATGCTTCTTG,-0.011286,0.157641,...,0.011286,-0.157641,0.166789,K562,Kox1,118568033.0,NC_000011.10,+,118572410,-4377.0
2,ARCN1,chr11,+,118568080.0,GATGCTGCTTCTTCGGAAATATAACGGTAG,ARCN1_w_118438796.24,cccttggagaaCCAcctTGTTGGCTGCTTCTTCGGAAATATAAGTT...,GCTGCTTCTTCGGAAATATAA,0.01815,0.21634,...,-0.01815,-0.21634,0.146618,K562,Kox1,118568087.0,NC_000011.10,+,118572410,-4323.0
3,ARCN1,chr11,+,118568134.0,TAGCACCTTCCAGGGCTTCCAACTTGGATC,ARCN1_w_118438850.25,cccttggagaaCCAcctTGTTGGCACCTTCCAGGGCTTCCAACTGT...,GCACCTTCCAGGGCTTCCAACT,0.021472,0.320467,...,-0.021472,-0.320467,0.110835,K562,Kox1,118568141.0,NC_000011.10,+,118572410,-4269.0
4,ARCN1,chr11,+,118568146.0,ATTGCCTTATTGTAGCACCTTCCAGGGCTT,ARCN1_w_118438862.24,cccttggagaaCCAcctTGTTGGCCTTATTGTAGCACCTTCCAGTT...,GCCTTATTGTAGCACCTTCCA,-0.018377,-0.708019,...,0.018377,0.708019,0.464269,K562,Kox1,118568153.0,NC_000011.10,+,118572410,-4257.0


# Nunez

In [11]:
# Derive the cut site from the hg38 coordinate, then attach the CRISPick
# annotation (reference sequence, target strand, TSS position) per gene.
NunzeCleanDataCutSite = CRISPickCutSite(
    df=NunzeCleanData,
    strand_col='strand',
    point_col='hg38_coord',
    context_seq_col='context_seq',
)
NunzeCleanData['CRISPick CutSite'] = NunzeCleanDataCutSite
NunzeCleanData = NunzeCleanData.merge(
    CRISPick_geneListGilbertAndNunez,
    how='inner',
    on=['Target Gene Symbol'],
)
NunzeCleanData.head(3)

Unnamed: 0,ID,Target Gene Symbol,chromosome,strand,sequence,context_seq,hg38_coord,start_coord_hg38,end_coord_hg38,Phenotype scores-ave_Rep1_Rep2,Avg_LFC-Tfinal,z-score relative to NC,condition,Domain,CRISPick CutSite,Reference Sequence,Strand of Target,TSS Position
0,ACTR6_+_100592059,ACTR6,chr12,+,GCGAGCACATAGAGGGATCT,TGCTTCGAGCACATAGAGGGATCTGGGCCC,100198281,100198284,100198303,-0.018881,0.035104,-0.889828,K562,Kox1,100198288,NC_000012.12,+,100200816
1,ACTR6_+_100592060,ACTR6,chr12,+,GTCGAGCACATAGAGGGATC,CTGCTTCGAGCACATAGAGGGATCTGGGCC,100198282,100198285,100198304,0.009292,0.295731,0.580559,K562,Kox1,100198289,NC_000012.12,+,100200816
2,ACTR6_+_100592066,ACTR6,chr12,+,GCCTGCTTCGAGCACATAGA,GTCCACCTGCTTCGAGCACATAGAGGGATC,100198288,100198291,100198310,0.000734,0.214023,0.119582,K562,Kox1,100198295,NC_000012.12,+,100200816


In [12]:
# Calculate guide distance from the MANE Select annotated TSS:
# cut - TSS when the target strand is '+', TSS - cut otherwise.
# Vectorized with np.where instead of a row-wise apply; same values, computed
# on whole columns at once.
NunzeCleanData["Distance to TSS"] = np.where(
    NunzeCleanData["Strand of Target"] == "+",
    NunzeCleanData["CRISPick CutSite"] - NunzeCleanData["TSS Position"],
    NunzeCleanData["TSS Position"] - NunzeCleanData["CRISPick CutSite"],
)
# Keep only guides whose cut site is within 5 kb of the TSS.
NunzeCleanData = NunzeCleanData[NunzeCleanData['Distance to TSS'].abs() <= 5000]
NunzeCleanData.head(3)

Unnamed: 0,ID,Target Gene Symbol,chromosome,strand,sequence,context_seq,hg38_coord,start_coord_hg38,end_coord_hg38,Phenotype scores-ave_Rep1_Rep2,Avg_LFC-Tfinal,z-score relative to NC,condition,Domain,CRISPick CutSite,Reference Sequence,Strand of Target,TSS Position,Distance to TSS
0,ACTR6_+_100592059,ACTR6,chr12,+,GCGAGCACATAGAGGGATCT,TGCTTCGAGCACATAGAGGGATCTGGGCCC,100198281,100198284,100198303,-0.018881,0.035104,-0.889828,K562,Kox1,100198288,NC_000012.12,+,100200816,-2528
1,ACTR6_+_100592060,ACTR6,chr12,+,GTCGAGCACATAGAGGGATC,CTGCTTCGAGCACATAGAGGGATCTGGGCC,100198282,100198285,100198304,0.009292,0.295731,0.580559,K562,Kox1,100198289,NC_000012.12,+,100200816,-2527
2,ACTR6_+_100592066,ACTR6,chr12,+,GCCTGCTTCGAGCACATAGA,GTCCACCTGCTTCGAGCACATAGAGGGATC,100198288,100198291,100198310,0.000734,0.214023,0.119582,K562,Kox1,100198295,NC_000012.12,+,100200816,-2521


# Combine datasets together

In [13]:
# Keep the columns shared with the other datasets plus the Gilbert phenotype
# columns, and rename them to the CRISPick-style names used in the in-house table.
GilbertRicinCleanData = (
    GilbertRicinCleanData[[
        'Target Gene Symbol', 'chromosome', 'sgRNA sequence', 'context seq',
        'CRISPick CutSite', 'Strand of Target', 'TSS Position', 'Distance to TSS',
        'rhoSignChangedAndAvged', 'zscoreRelativeNC_signed', 'Avg_LFC_signed',
        'condition', 'Domain',
    ]]
    .rename(columns={
        'sgRNA sequence': 'sgRNA Sequence',
        'context seq': 'sgRNA Context Sequence',
        'CRISPick CutSite': 'sgRNA \'Cut\' Position',
        'Distance to TSS': 'sgRNA \'Cut\' Site TSS Offset',
    })
    .copy()
)

In [14]:
# Keep the columns shared with the other datasets plus the Nunez phenotype
# columns, and rename them to the CRISPick-style names used in the in-house table.
NunzeCleanData = (
    NunzeCleanData[[
        'Target Gene Symbol', 'chromosome', 'sequence', 'context_seq',
        'CRISPick CutSite', 'Strand of Target', 'TSS Position', 'Distance to TSS',
        'Phenotype scores-ave_Rep1_Rep2', 'Avg_LFC-Tfinal',
        'z-score relative to NC', 'condition', 'Domain',
    ]]
    .rename(columns={
        'sequence': 'sgRNA Sequence',
        'context_seq': 'sgRNA Context Sequence',
        'CRISPick CutSite': 'sgRNA \'Cut\' Position',
        'Distance to TSS': 'sgRNA \'Cut\' Site TSS Offset',
    })
    .copy()
)

In [15]:
# Reduce the in-house table to the shared columns plus its phenotype column.
InHouseCleanData = InHouseCleanData[[
    'Target Gene Symbol', 'chromosome', 'sgRNA Sequence',
    'sgRNA Context Sequence', 'sgRNA \'Cut\' Position',
    'Strand of Target', 'TSS Position', 'sgRNA \'Cut\' Site TSS Offset',
    'z_scored_avg_lfc',
    'condition', 'Domain',
]].copy()



In [16]:
# Tag each table with its source so rows stay traceable after concatenation.
for source_frame, source_label in (
    (NunzeCleanData, 'Nunez'),
    (GilbertRicinCleanData, 'Gilbert'),
    (InHouseCleanData, 'InHouse'),
):
    source_frame['DataSet'] = source_label

In [17]:
# Stack the three datasets with a fresh 0..n-1 index.
# NaN in a phenotype column means the row's dataset does not have that phenotype.
DatasetCombine = pd.concat(
    [NunzeCleanData, GilbertRicinCleanData, InHouseCleanData],
    ignore_index=True,
)

# Quality Check

In [18]:
# How often each context sequence occurs, restricted to sequences seen more than once
DatasetCombine.loc[
    DatasetCombine['sgRNA Context Sequence'].duplicated(keep=False),
    'sgRNA Context Sequence',
].value_counts()

GGGACCTCATATTGTGCCCGCCTAGGGACT    6
GCCGTCCCCAGGCAGAATTAGAGTCGGGGT    6
TCCTGGGCAGAGGCTAGTGAGGCCGGGGTC    6
GGAGTGGAGAGATATCCTGGGCAGAGGCTA    6
CAGGGAGGAGTGGAGAGATATCCTGGGCAG    6
                                 ..
GTATCCTGAACTTTGAGGAGTTATGGGAGG    2
TGTATCCTGAACTTTGAGGAGTTATGGGAG    2
GCGCCTGGTGTATCCTGAACTTTGAGGAGT    2
GACTTTATTGTGAATCACAGCGCCTGGTGT    2
GCCTTCGCCGCTCGGGCCGCCCGGGGGAAA    2
Name: sgRNA Context Sequence, Length: 107393, dtype: int64

In [19]:
# Example of gene overlap: one context sequence annotated to two different target genes
DatasetCombine.loc[DatasetCombine['sgRNA Context Sequence'].eq('CTGAATATTTATTGCAACCTGCGGTGGCTG')]

Unnamed: 0,Target Gene Symbol,chromosome,sgRNA Sequence,sgRNA Context Sequence,sgRNA 'Cut' Position,Strand of Target,TSS Position,sgRNA 'Cut' Site TSS Offset,Phenotype scores-ave_Rep1_Rep2,Avg_LFC-Tfinal,z-score relative to NC,condition,Domain,DataSet,rhoSignChangedAndAvged,zscoreRelativeNC_signed,Avg_LFC_signed,z_scored_avg_lfc
105792,PRKDC,chr8,GAATATTTATTGCAACCTGCGG,CTGAATATTTATTGCAACCTGCGGTGGCTG,47961937.0,-,47960136.0,-1801.0,,,,K562,Kox1,Gilbert,0.049736,0.464529,0.380595,
198691,MCM4,8.0,ATATTTATTGCAACCTGCGG,CTGAATATTTATTGCAACCTGCGGTGGCTG,47961937.0,+,47960942.0,995.0,,,,A549,Kox1,InHouse,,,,-2.4795
198692,MCM4,8.0,ATATTTATTGCAACCTGCGG,CTGAATATTTATTGCAACCTGCGGTGGCTG,47961937.0,+,47960942.0,995.0,,,,A549,Zim3,InHouse,,,,-1.501068
198693,MCM4,8.0,ATATTTATTGCAACCTGCGG,CTGAATATTTATTGCAACCTGCGGTGGCTG,47961937.0,+,47960942.0,995.0,,,,HCT116,Kox1,InHouse,,,,-1.70009
198694,MCM4,8.0,ATATTTATTGCAACCTGCGG,CTGAATATTTATTGCAACCTGCGGTGGCTG,47961937.0,+,47960942.0,995.0,,,,HCT116,Zim3,InHouse,,,,-1.084287


In [20]:
# The scale of the overlap problem in the combined dataset: all rows whose
# context sequence is annotated with more than one target gene.
# Counting genes per context once and selecting with isin avoids a Python
# callback per group (and a lambda argument that shadowed DatasetCombine).
genes_per_context = DatasetCombine.groupby('sgRNA Context Sequence')['Target Gene Symbol'].nunique()
multi_gene_contexts = genes_per_context.index[genes_per_context > 1]
conseq_with_varied_gene = DatasetCombine[DatasetCombine['sgRNA Context Sequence'].isin(multi_gene_contexts)]
conseq_with_varied_gene.sort_values(by = 'sgRNA Context Sequence')

Unnamed: 0,Target Gene Symbol,chromosome,sgRNA Sequence,sgRNA Context Sequence,sgRNA 'Cut' Position,Strand of Target,TSS Position,sgRNA 'Cut' Site TSS Offset,Phenotype scores-ave_Rep1_Rep2,Avg_LFC-Tfinal,z-score relative to NC,condition,Domain,DataSet,rhoSignChangedAndAvged,zscoreRelativeNC_signed,Avg_LFC_signed,z_scored_avg_lfc
25811,EMG1,chr12,GCGCGTGGTGAGGGGAAACC,AAAAACGCGTGGTGAGGGGAAACCGGGCAA,6969946.0,+,6970914.0,-968.0,-0.032545,-0.094882,-1.623168,K562,Kox1,Nunez,,,,
155817,PHB2,12.0,ACGCGTGGTGAGGGGAAACC,AAAAACGCGTGGTGAGGGGAAACCGGGCAA,6969946.0,-,6970664.0,719.0,,,,A549,Kox1,InHouse,,,,-4.518868
155818,PHB2,12.0,ACGCGTGGTGAGGGGAAACC,AAAAACGCGTGGTGAGGGGAAACCGGGCAA,6969946.0,-,6970664.0,719.0,,,,A549,Zim3,InHouse,,,,-6.784053
155819,PHB2,12.0,ACGCGTGGTGAGGGGAAACC,AAAAACGCGTGGTGAGGGGAAACCGGGCAA,6969946.0,-,6970664.0,719.0,,,,HCT116,Kox1,InHouse,,,,-8.892879
155820,PHB2,12.0,ACGCGTGGTGAGGGGAAACC,AAAAACGCGTGGTGAGGGGAAACCGGGCAA,6969946.0,-,6970664.0,719.0,,,,HCT116,Zim3,InHouse,,,,-8.323199
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
351648,PTCD3,2.0,GATCTGTGATAGTTATAGCT,TTTTGATCTGTGATAGTTATAGCTTGGAAC,86106851.0,+,86106236.0,615.0,,,,A549,Kox1,InHouse,,,,-0.048226
351650,PTCD3,2.0,GATCTGTGATAGTTATAGCT,TTTTGATCTGTGATAGTTATAGCTTGGAAC,86106851.0,+,86106236.0,615.0,,,,HCT116,Kox1,InHouse,,,,-1.620939
351651,PTCD3,2.0,GATCTGTGATAGTTATAGCT,TTTTGATCTGTGATAGTTATAGCTTGGAAC,86106851.0,+,86106236.0,615.0,,,,HCT116,Zim3,InHouse,,,,-1.605212
56877,POLR1A,chr2,GATCTGTGATAGTTATAGCT,TTTTGATCTGTGATAGTTATAGCTTGGAAC,86106851.0,-,86105886.0,-965.0,-0.006604,0.115769,-0.434737,K562,Kox1,Nunez,,,,


In [21]:
# How does my calculated CRISPick cut site compare with the one stored in the in-house dataset?
DatasetCombine.loc[DatasetCombine['sgRNA Context Sequence'].eq('GACGCTAGGGAAAAGCTGGCTCTCTGGGAT')]

Unnamed: 0,Target Gene Symbol,chromosome,sgRNA Sequence,sgRNA Context Sequence,sgRNA 'Cut' Position,Strand of Target,TSS Position,sgRNA 'Cut' Site TSS Offset,Phenotype scores-ave_Rep1_Rep2,Avg_LFC-Tfinal,z-score relative to NC,condition,Domain,DataSet,rhoSignChangedAndAvged,zscoreRelativeNC_signed,Avg_LFC_signed,z_scored_avg_lfc
13674,COG3,chr13,GTAGGGAAAAGCTGGCTCTC,GACGCTAGGGAAAAGCTGGCTCTCTGGGAT,45465106.0,+,45464940.0,166.0,-0.143685,-0.927616,-6.321215,K562,Kox1,Nunez,,,,
99671,COG3,chr13,GCTAGGGAAAAGCTGGCTCTC,GACGCTAGGGAAAAGCTGGCTCTCTGGGAT,45465106.0,+,45464940.0,166.0,,,,K562,Kox1,Gilbert,0.047545,0.646435,0.443106,
299591,COG3,13.0,CTAGGGAAAAGCTGGCTCTC,GACGCTAGGGAAAAGCTGGCTCTCTGGGAT,45465106.0,+,45464940.0,166.0,,,,A549,Kox1,InHouse,,,,-3.71682
299592,COG3,13.0,CTAGGGAAAAGCTGGCTCTC,GACGCTAGGGAAAAGCTGGCTCTCTGGGAT,45465106.0,+,45464940.0,166.0,,,,A549,Zim3,InHouse,,,,-7.612488
299593,COG3,13.0,CTAGGGAAAAGCTGGCTCTC,GACGCTAGGGAAAAGCTGGCTCTCTGGGAT,45465106.0,+,45464940.0,166.0,,,,HCT116,Kox1,InHouse,,,,-3.602514
299594,COG3,13.0,CTAGGGAAAAGCTGGCTCTC,GACGCTAGGGAAAAGCTGGCTCTCTGGGAT,45465106.0,+,45464940.0,166.0,,,,HCT116,Zim3,InHouse,,,,-3.972307


In [22]:
# Same comparison for a second context sequence present in all three datasets
DatasetCombine.loc[DatasetCombine['sgRNA Context Sequence'].eq('AACGCTTGCGCGCTCACCGTGGATAGGCGC')]

Unnamed: 0,Target Gene Symbol,chromosome,sgRNA Sequence,sgRNA Context Sequence,sgRNA 'Cut' Position,Strand of Target,TSS Position,sgRNA 'Cut' Site TSS Offset,Phenotype scores-ave_Rep1_Rep2,Avg_LFC-Tfinal,z-score relative to NC,condition,Domain,DataSet,rhoSignChangedAndAvged,zscoreRelativeNC_signed,Avg_LFC_signed,z_scored_avg_lfc
7527,CCT7,chr2,GTTGCGCGCTCACCGTGGAT,AACGCTTGCGCGCTCACCGTGGATAGGCGC,73233747.0,+,73234309.0,-562.0,-0.035603,-0.127112,-1.805003,K562,Kox1,Nunez,,,,
98609,CCT7,chr2,GCTTGCGCGCTCACCGTGGAT,AACGCTTGCGCGCTCACCGTGGATAGGCGC,73233747.0,+,73234309.0,-562.0,,,,K562,Kox1,Gilbert,0.015746,0.735152,0.473593,
323572,CCT7,2.0,CTTGCGCGCTCACCGTGGAT,AACGCTTGCGCGCTCACCGTGGATAGGCGC,73233747.0,+,73234309.0,-562.0,,,,A549,Kox1,InHouse,,,,-4.647993
323573,CCT7,2.0,CTTGCGCGCTCACCGTGGAT,AACGCTTGCGCGCTCACCGTGGATAGGCGC,73233747.0,+,73234309.0,-562.0,,,,A549,Zim3,InHouse,,,,-6.681559
323574,CCT7,2.0,CTTGCGCGCTCACCGTGGAT,AACGCTTGCGCGCTCACCGTGGATAGGCGC,73233747.0,+,73234309.0,-562.0,,,,HCT116,Kox1,InHouse,,,,-0.56599
323575,CCT7,2.0,CTTGCGCGCTCACCGTGGAT,AACGCTTGCGCGCTCACCGTGGATAGGCGC,73233747.0,+,73234309.0,-562.0,,,,HCT116,Zim3,InHouse,,,,0.258887


In [23]:
# Same comparison for a context sequence whose target gene is on the '-' strand
DatasetCombine.loc[DatasetCombine['sgRNA Context Sequence'].eq('GACACCCGCGTTGTGTGTCCATGACGGTGC')]
#Nunez

Unnamed: 0,Target Gene Symbol,chromosome,sgRNA Sequence,sgRNA Context Sequence,sgRNA 'Cut' Position,Strand of Target,TSS Position,sgRNA 'Cut' Site TSS Offset,Phenotype scores-ave_Rep1_Rep2,Avg_LFC-Tfinal,z-score relative to NC,condition,Domain,DataSet,rhoSignChangedAndAvged,zscoreRelativeNC_signed,Avg_LFC_signed,z_scored_avg_lfc
32372,HEATR1,chr1,GCCGCGTTGTGTGTCCATGA,GACACCCGCGTTGTGTGTCCATGACGGTGC,236604217.0,-,236604516.0,299.0,-0.10189,-0.731931,-5.217218,K562,Kox1,Nunez,,,,
103561,HEATR1,chr1,GACACCCGCGTTGTGTGTCCATGA,GACACCCGCGTTGTGTGTCCATGACGGTGC,236604217.0,-,236604516.0,299.0,,,,K562,Kox1,Gilbert,0.010282,-0.400256,0.083416,
255154,HEATR1,1.0,CCCGCGTTGTGTGTCCATGA,GACACCCGCGTTGTGTGTCCATGACGGTGC,236604217.0,-,236604516.0,300.0,,,,A549,Kox1,InHouse,,,,-5.703113
255155,HEATR1,1.0,CCCGCGTTGTGTGTCCATGA,GACACCCGCGTTGTGTGTCCATGACGGTGC,236604217.0,-,236604516.0,300.0,,,,A549,Zim3,InHouse,,,,-7.45427
255156,HEATR1,1.0,CCCGCGTTGTGTGTCCATGA,GACACCCGCGTTGTGTGTCCATGACGGTGC,236604217.0,-,236604516.0,300.0,,,,HCT116,Kox1,InHouse,,,,-8.94344
255157,HEATR1,1.0,CCCGCGTTGTGTGTCCATGA,GACACCCGCGTTGTGTGTCCATGACGGTGC,236604217.0,-,236604516.0,300.0,,,,HCT116,Zim3,InHouse,,,,-7.069832


In [24]:
# Remove overlap: drop every gene that appears, as either the selected or the
# overlapping gene, in an overlap record of type 2 or 3.
overlapping_gene = pd.read_csv('../Data/CRISPRiChallenges/geneOverlap/GW_ensembl_protein_coding_df_1kb.csv')
overlapping_gene_2_3 = overlapping_gene[overlapping_gene['overlap type'].isin([2, 3])]
gene_2_3_unique = set(overlapping_gene_2_3['Selected Gene name']).union(
    overlapping_gene_2_3['Overlapping Gene name']
)
is_overlapping_target = DatasetCombine['Target Gene Symbol'].isin(gene_2_3_unique)
NewDatasetCombine = DatasetCombine[~is_overlapping_target].copy()

In [25]:
# Check the overlap issue after removing overlap genes of type 2 or 3:
# rows whose context sequence is still annotated with more than one target gene.
# Counting genes per context once and selecting with isin avoids a Python
# callback per group (and a lambda argument that shadowed NewDatasetCombine).
remaining_genes_per_context = NewDatasetCombine.groupby('sgRNA Context Sequence')['Target Gene Symbol'].nunique()
remaining_multi_gene_contexts = remaining_genes_per_context.index[remaining_genes_per_context > 1]
conseq_with_varied_gene = NewDatasetCombine[
    NewDatasetCombine['sgRNA Context Sequence'].isin(remaining_multi_gene_contexts)
]
len(conseq_with_varied_gene)

0

In [26]:
# How often each context sequence occurs after overlap removal, restricted to sequences seen more than once
NewDatasetCombine.loc[
    NewDatasetCombine['sgRNA Context Sequence'].duplicated(keep=False),
    'sgRNA Context Sequence',
].value_counts()

AGTCGTACCCATTTAAAGCCACGTCGGTAC    6
GCGGGACCGAGGCCAGTACCGACGTGGCTT    6
CTGGGAATTTGGGTATATCTTGGAAGGCAA    6
GGAACTGGGAATTTGGGTATATCTTGGAAG    6
GCTCTGTGCCGCTTACCTGGAACTGGGAAT    6
                                 ..
TGGGGGCAAGACAGTGAAAACAGGTGGGCA    2
GAATGCCTGTGAAAGATTGTATAAAGGAAC    2
GAAAGATTGTATAAAGGAACAAACAGGCCC    2
GGTTTGCTTTCAGCTTTTGAGTACTGGGCT    2
GCCTTCGCCGCTCGGGCCGCCCGGGGGAAA    2
Name: sgRNA Context Sequence, Length: 86955, dtype: int64

In [27]:
# Compare the TSS offsets reported by the three datasets for one shared context sequence
NewDatasetCombine.loc[NewDatasetCombine['sgRNA Context Sequence'].eq('GGAACTGGGAATTTGGGTATATCTTGGAAG')]

Unnamed: 0,Target Gene Symbol,chromosome,sgRNA Sequence,sgRNA Context Sequence,sgRNA 'Cut' Position,Strand of Target,TSS Position,sgRNA 'Cut' Site TSS Offset,Phenotype scores-ave_Rep1_Rep2,Avg_LFC-Tfinal,z-score relative to NC,condition,Domain,DataSet,rhoSignChangedAndAvged,zscoreRelativeNC_signed,Avg_LFC_signed,z_scored_avg_lfc
32471,HEATR1,chr1,GTGGGAATTTGGGTATATCT,GGAACTGGGAATTTGGGTATATCTTGGAAG,236604445.0,-,236604516.0,71.0,-0.387244,-1.270996,-8.25847,K562,Kox1,Nunez,,,,
103722,HEATR1,chr1,GGAACTGGGAATTTGGGTATATCT,GGAACTGGGAATTTGGGTATATCTTGGAAG,236604445.0,-,236604516.0,71.0,,,,K562,Kox1,Gilbert,0.131272,3.193287,1.318318,
316247,HEATR1,1.0,CTGGGAATTTGGGTATATCT,GGAACTGGGAATTTGGGTATATCTTGGAAG,236604445.0,-,236604516.0,72.0,,,,A549,Kox1,InHouse,,,,-8.086635
316248,HEATR1,1.0,CTGGGAATTTGGGTATATCT,GGAACTGGGAATTTGGGTATATCTTGGAAG,236604445.0,-,236604516.0,72.0,,,,A549,Zim3,InHouse,,,,-8.911877
316249,HEATR1,1.0,CTGGGAATTTGGGTATATCT,GGAACTGGGAATTTGGGTATATCTTGGAAG,236604445.0,-,236604516.0,72.0,,,,HCT116,Kox1,InHouse,,,,-9.114155
316250,HEATR1,1.0,CTGGGAATTTGGGTATATCT,GGAACTGGGAATTTGGGTATATCTTGGAAG,236604445.0,-,236604516.0,72.0,,,,HCT116,Zim3,InHouse,,,,-9.410548


In [28]:
# Compare the TSS offsets reported by the three datasets for a second shared context sequence
NewDatasetCombine.loc[NewDatasetCombine['sgRNA Context Sequence'].eq('CTGCTGCCATCGTCTTTGCATCTCCGGGGT')]

Unnamed: 0,Target Gene Symbol,chromosome,sgRNA Sequence,sgRNA Context Sequence,sgRNA 'Cut' Position,Strand of Target,TSS Position,sgRNA 'Cut' Site TSS Offset,Phenotype scores-ave_Rep1_Rep2,Avg_LFC-Tfinal,z-score relative to NC,condition,Domain,DataSet,rhoSignChangedAndAvged,zscoreRelativeNC_signed,Avg_LFC_signed,z_scored_avg_lfc
32374,HEATR1,chr1,GGCCATCGTCTTTGCATCTC,CTGCTGCCATCGTCTTTGCATCTCCGGGGT,236604260.0,-,236604516.0,256.0,0.001632,0.222083,0.165053,K562,Kox1,Nunez,,,,
103565,HEATR1,chr1,GCTGCCATCGTCTTTGCATCTC,CTGCTGCCATCGTCTTTGCATCTCCGGGGT,236604260.0,-,236604516.0,256.0,,,,K562,Kox1,Gilbert,-0.057501,-0.69649,-0.018383,
500476,HEATR1,1.0,TGCCATCGTCTTTGCATCTC,CTGCTGCCATCGTCTTTGCATCTCCGGGGT,236604260.0,-,236604516.0,257.0,,,,A549,Kox1,InHouse,,,,-4.694686
500477,HEATR1,1.0,TGCCATCGTCTTTGCATCTC,CTGCTGCCATCGTCTTTGCATCTCCGGGGT,236604260.0,-,236604516.0,257.0,,,,A549,Zim3,InHouse,,,,-4.814877
500478,HEATR1,1.0,TGCCATCGTCTTTGCATCTC,CTGCTGCCATCGTCTTTGCATCTCCGGGGT,236604260.0,-,236604516.0,257.0,,,,HCT116,Kox1,InHouse,,,,-4.79297
500479,HEATR1,1.0,TGCCATCGTCTTTGCATCTC,CTGCTGCCATCGTCTTTGCATCTCCGGGGT,236604260.0,-,236604516.0,257.0,,,,HCT116,Zim3,InHouse,,,,-4.389959


In [29]:
# Compare the TSS offsets reported by the three datasets for a third shared context sequence
NewDatasetCombine.loc[NewDatasetCombine['sgRNA Context Sequence'].eq('GGCTCTGTGCCGCTTACCTGGAACTGGGAA')]

Unnamed: 0,Target Gene Symbol,chromosome,sgRNA Sequence,sgRNA Context Sequence,sgRNA 'Cut' Position,Strand of Target,TSS Position,sgRNA 'Cut' Site TSS Offset,Phenotype scores-ave_Rep1_Rep2,Avg_LFC-Tfinal,z-score relative to NC,condition,Domain,DataSet,rhoSignChangedAndAvged,zscoreRelativeNC_signed,Avg_LFC_signed,z_scored_avg_lfc
32469,HEATR1,chr1,GTGTGCCGCTTACCTGGAAC,GGCTCTGTGCCGCTTACCTGGAACTGGGAA,236604426.0,-,236604516.0,90.0,0.001559,0.216603,0.134137,K562,Kox1,Nunez,,,,
103719,HEATR1,chr1,GCTCTGTGCCGCTTACCTGGAAC,GGCTCTGTGCCGCTTACCTGGAACTGGGAA,236604426.0,-,236604516.0,90.0,,,,K562,Kox1,Gilbert,0.037329,1.38622,0.697329,
319086,HEATR1,1.0,CTGTGCCGCTTACCTGGAAC,GGCTCTGTGCCGCTTACCTGGAACTGGGAA,236604426.0,-,236604516.0,91.0,,,,A549,Kox1,InHouse,,,,-1.873853
319087,HEATR1,1.0,CTGTGCCGCTTACCTGGAAC,GGCTCTGTGCCGCTTACCTGGAACTGGGAA,236604426.0,-,236604516.0,91.0,,,,A549,Zim3,InHouse,,,,-1.657091
319088,HEATR1,1.0,CTGTGCCGCTTACCTGGAAC,GGCTCTGTGCCGCTTACCTGGAACTGGGAA,236604426.0,-,236604516.0,91.0,,,,HCT116,Kox1,InHouse,,,,-4.566195
319089,HEATR1,1.0,CTGTGCCGCTTACCTGGAAC,GGCTCTGTGCCGCTTACCTGGAACTGGGAA,236604426.0,-,236604516.0,91.0,,,,HCT116,Zim3,InHouse,,,,-1.330788


In [30]:
# Temp fix for why CRISPick is one off for '-' strand target genes: for Nunez
# and Gilbert rows the offset was computed as TSS - cut, which is one less than
# the offset the in-house (CRISPick) rows carry for the same guide.
# The corrected value is recomputed from the position columns rather than
# incremented in place, so re-running this cell cannot shift the offsets twice.
external_neg_target = (
    NewDatasetCombine['DataSet'].isin(['Nunez', 'Gilbert'])
    & (NewDatasetCombine['Strand of Target'] == '-')
)
NewDatasetCombine.loc[external_neg_target, 'sgRNA \'Cut\' Site TSS Offset'] = (
    NewDatasetCombine.loc[external_neg_target, 'TSS Position']
    - NewDatasetCombine.loc[external_neg_target, 'sgRNA \'Cut\' Position']
    + 1
)


In [31]:
# Re-inspect the same context sequence after the '-' strand offset correction
NewDatasetCombine.loc[NewDatasetCombine['sgRNA Context Sequence'].eq('CTGCTGCCATCGTCTTTGCATCTCCGGGGT')]

Unnamed: 0,Target Gene Symbol,chromosome,sgRNA Sequence,sgRNA Context Sequence,sgRNA 'Cut' Position,Strand of Target,TSS Position,sgRNA 'Cut' Site TSS Offset,Phenotype scores-ave_Rep1_Rep2,Avg_LFC-Tfinal,z-score relative to NC,condition,Domain,DataSet,rhoSignChangedAndAvged,zscoreRelativeNC_signed,Avg_LFC_signed,z_scored_avg_lfc
32374,HEATR1,chr1,GGCCATCGTCTTTGCATCTC,CTGCTGCCATCGTCTTTGCATCTCCGGGGT,236604260.0,-,236604516.0,257.0,0.001632,0.222083,0.165053,K562,Kox1,Nunez,,,,
103565,HEATR1,chr1,GCTGCCATCGTCTTTGCATCTC,CTGCTGCCATCGTCTTTGCATCTCCGGGGT,236604260.0,-,236604516.0,257.0,,,,K562,Kox1,Gilbert,-0.057501,-0.69649,-0.018383,
500476,HEATR1,1.0,TGCCATCGTCTTTGCATCTC,CTGCTGCCATCGTCTTTGCATCTCCGGGGT,236604260.0,-,236604516.0,257.0,,,,A549,Kox1,InHouse,,,,-4.694686
500477,HEATR1,1.0,TGCCATCGTCTTTGCATCTC,CTGCTGCCATCGTCTTTGCATCTCCGGGGT,236604260.0,-,236604516.0,257.0,,,,A549,Zim3,InHouse,,,,-4.814877
500478,HEATR1,1.0,TGCCATCGTCTTTGCATCTC,CTGCTGCCATCGTCTTTGCATCTCCGGGGT,236604260.0,-,236604516.0,257.0,,,,HCT116,Kox1,InHouse,,,,-4.79297
500479,HEATR1,1.0,TGCCATCGTCTTTGCATCTC,CTGCTGCCATCGTCTTTGCATCTCCGGGGT,236604260.0,-,236604516.0,257.0,,,,HCT116,Zim3,InHouse,,,,-4.389959


In [32]:
# Number of distinct (non-missing) context sequences in the cleaned table
NewDatasetCombine.loc[:, 'sgRNA Context Sequence'].nunique()

166624

In [33]:
# Total number of rows in the cleaned table
NewDatasetCombine.shape[0]

431095

In [34]:
# Bring the mixed chromosome labels (e.g. 'chr8' and 8.0) to one 'chr'-prefixed form
NewDatasetCombine['chromosome'] = NewDatasetCombine['chromosome'].map(standardize_chromosome)


In [35]:
# Write the cleaned, combined table to disk without the row index
outpath = '../Data/CleanedExternalData/'
NewDatasetCombine.to_csv(f'{outpath}DatasetCombineCleaned_v3.csv', index=False)