### Klingbeil et al Screen

This notebook preprocesses CRISPR screen data from *Klingbeil et al., 2024*.  
It loads GEMINI data from the paper’s supplementary tables (Excel format), cleans and formats the data, and prepares the dataset for downstream feature annotation and training of the Random Forest classifier.

**Inputs:**  
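- Excel files from the *Klingbeil et al., 2024* supplementary materials containing GEMINI scores (Table S4), log fold changes (Table S2), and FDR values (Table S6).
- A gene symbol mapping file (approved and previous symbols with Entrez IDs) and the DepMap 22Q4 `sample_info.csv` used to map cell line names to DepMap IDs.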

**Outputs:**  
- A cleaned and processed dataset saved as a CSV file, ready for feature annotation and model training.


In [1]:
# import modules
import os
import pandas as pd
import numpy as np
from natsort import natsorted

In [2]:
# Set the base directory for the project: two levels above the current
# working directory. This assumes the kernel is started from the folder that
# contains this notebook.
cwd = os.getcwd()
BASE_DIR = os.path.abspath(os.path.join(cwd, "..", ".."))


def get_data_path(folders, fname):
    """Build a normalised path to a file inside the repository.

    Parameters
    ----------
    folders : sequence of str
        Sub-folder names below ``BASE_DIR``, outermost first.
    fname : str
        File name inside the innermost folder.

    Returns
    -------
    str
        ``BASE_DIR/<folders...>/<fname>`` passed through ``os.path.normpath``.
    """
    return os.path.normpath(os.path.join(BASE_DIR, *folders, fname))


# Inputs: supplementary tables of the screen (GEMINI scores, LFC, FDR)
file_path_gemini_data = get_data_path(['input', 'CRISPR_screens'], 'cd-23-1529_supplementary_table_s4_suppst4.xlsx')
file_path_lfc = get_data_path(['input', 'CRISPR_screens'], 'cd-23-1529_supplementary_table_s2_suppst2.xlsx')
file_path_fdr = get_data_path(['input', 'CRISPR_screens'], 'cd-23-1529_supplementary_table_s6_suppst6.xlsx')

# Inputs: gene symbol <-> Entrez ID mapping
file_path_genenames = get_data_path(['input', 'other'], 'approved_and_previous_symbols.csv')

# Inputs: DepMap cell line annotation
file_path_sample_info = get_data_path(['input', 'DepMap22Q4'], 'sample_info.csv')

# Output: processed screen written by the last cell
file_path_processed_klingbeil_df = get_data_path(['output', 'processed_CRISPR_screens'], 'processed_klingbeil_df.csv')

### Load and Parse Raw Data

**Load GEMINI scores and parse gene combinations:**
- Extract individual gene symbols (A1, A2) from GENE_COMBINATION column
- Extract domain information for each gene pair

In [3]:
# GEMINI score table: one row per gene/domain combination, one column per cell line.
klingbeil_gemini_data = pd.read_excel(io=file_path_gemini_data, header=0)
klingbeil_gemini_data.head(n=5)

Unnamed: 0,GENE_COMBINATION,A549,ASPC1,CORL311,H1048,H1299,H1436,H1836,H209,H211,...,MDAMB231,MOLM13,NOMO1,PATU8902,RD,RH30,SET2,T3M4,THP1,YAPC
0,AAK1:Kinase_domain;BMP2K:Kinase_domain,0.149179,0.013571,-1.197798,0.261443,0.02399,-0.441891,-0.034478,-0.386031,0.185165,...,0.324981,0.042842,0.148144,0.303483,0.292166,-0.400358,0.303414,0.205641,0.031737,0.631192
1,AAK1:Kinase_domain;GAK:Kinase_domain,-0.138258,0.090206,0.798102,0.215741,-0.44103,-0.296563,0.079513,-0.882233,-0.432478,...,0.203387,-0.017059,-0.285411,0.678253,-0.194935,-0.436718,-0.0495,-0.283085,0.372368,0.368154
2,AATK:Kinase_domain;LMTK2:Kinase_domain,0.135116,0.431222,1.929479,0.27519,0.588202,0.07616,-0.11262,-0.279744,0.418585,...,0.10787,0.160368,0.269045,0.52292,0.643115,-0.360885,0.10128,0.05421,-0.248565,0.013551
3,AATK:Kinase_domain;LMTK3:Kinase_domain,0.719263,0.327775,1.239107,0.964623,0.491682,0.405034,0.254769,0.419086,0.354852,...,0.21434,0.322302,0.157412,0.377417,-0.046904,0.934033,0.233265,0.468751,0.371488,0.240569
4,ABL1:Kinase_domain;ABL2:Kinase_domain,0.167744,-0.075911,-0.372658,0.116252,0.119891,0.391952,0.046169,0.074462,0.064427,...,0.348412,0.285462,0.345206,-0.405309,0.271277,-0.090763,0.15959,0.137751,-0.022266,0.308193


In [4]:
# GENE_COMBINATION has the form "GENE1:domain;GENE2:domain".
# Split on ';' into the two halves, then keep the text before ':' as the gene symbol.
gemini_gene_halves = klingbeil_gemini_data['GENE_COMBINATION'].str.split(';', expand=True)
gemini_gene_halves.columns = ['A1', 'A2']  # raises unless there are exactly two halves
for gene_col in gemini_gene_halves.columns:
    klingbeil_gemini_data[gene_col] = gemini_gene_halves[gene_col].str.split(':').str[0]
klingbeil_gemini_data.head(1)

Unnamed: 0,GENE_COMBINATION,A549,ASPC1,CORL311,H1048,H1299,H1436,H1836,H209,H211,...,NOMO1,PATU8902,RD,RH30,SET2,T3M4,THP1,YAPC,A1,A2
0,AAK1:Kinase_domain;BMP2K:Kinase_domain,0.149179,0.013571,-1.197798,0.261443,0.02399,-0.441891,-0.034478,-0.386031,0.185165,...,0.148144,0.303483,0.292166,-0.400358,0.303414,0.205641,0.031737,0.631192,AAK1,BMP2K


In [5]:
# Same split as for the gene symbols, but keep the text after ':' (the domain name).
gemini_domain_halves = klingbeil_gemini_data['GENE_COMBINATION'].str.split(';', expand=True)
gemini_domain_halves.columns = ['domain_1', 'domain_2']  # raises unless there are exactly two halves
for domain_col in gemini_domain_halves.columns:
    klingbeil_gemini_data[domain_col] = gemini_domain_halves[domain_col].str.split(':').str[1]

# Key used later to join with the LFC table, which has no GENE_COMBINATION column.
klingbeil_gemini_data['domain_combination'] = (
    klingbeil_gemini_data['domain_1'] + '_' + klingbeil_gemini_data['domain_2']
)
klingbeil_gemini_data.head(1)

Unnamed: 0,GENE_COMBINATION,A549,ASPC1,CORL311,H1048,H1299,H1436,H1836,H209,H211,...,RH30,SET2,T3M4,THP1,YAPC,A1,A2,domain_1,domain_2,domain_combination
0,AAK1:Kinase_domain;BMP2K:Kinase_domain,0.149179,0.013571,-1.197798,0.261443,0.02399,-0.441891,-0.034478,-0.386031,0.185165,...,-0.400358,0.303414,0.205641,0.031737,0.631192,AAK1,BMP2K,Kinase_domain,Kinase_domain,Kinase_domain_Kinase_domain


In [6]:
# LFC table: genes and domains already come in separate columns, so only the
# gene columns need renaming and the combined domain key needs building.
klingbeil_lfc_data = (
    pd.read_excel(file_path_lfc, header=0)
    .rename(columns={'GENE 1': 'A1', 'GENE 2': 'A2'})
    .assign(domain_combination=lambda lfc: lfc['domain_1'] + '_' + lfc['domain_2'])
)
klingbeil_lfc_data.head()

Unnamed: 0,A1,domain_1,A2,domain_2,A549,ASPC1,CORL311,H1048,H1299,H1436,...,MOLM13,NOMO1,PATU8902,RD,RH30,SET2,T3M4,THP1,YAPC,domain_combination
0,ACAA1,acetyltransf_domain,ACAT2,acetyltransf_domain,0.142126,0.241221,-0.106777,0.229456,-0.107029,0.104877,...,-0.234771,0.033042,-0.548081,0.206175,-0.437077,-0.054353,0.323201,0.111964,0.481936,acetyltransf_domain_acetyltransf_domain
1,ACAA1,acetyltransf_domain,CUTTING_CONTROL,nodomain,0.431196,0.111248,0.489067,0.615033,-0.568595,0.420573,...,0.31832,0.008801,-1.806178,-0.108865,1.880924,-0.05781,0.037888,0.480528,0.596389,acetyltransf_domain_nodomain
2,ACAA1,acetyltransf_domain,HADHB,acetyltransf_domain,-0.31498,0.188896,-0.071631,0.278144,-0.029604,0.525712,...,0.312417,0.148599,-1.801456,-0.053281,-0.49469,0.016748,0.075818,-0.785397,-0.11098,acetyltransf_domain_acetyltransf_domain
3,ACAA1,acetyltransf_domain,NONCUTTING_CONTROL,nodomain,0.846503,0.670848,1.50505,0.907308,0.073176,0.762751,...,0.68128,0.806813,-0.786907,0.445177,1.429772,0.259646,0.326572,0.395359,0.998877,acetyltransf_domain_nodomain
4,ACAA2,acetyltransf_domain,ACAT2,acetyltransf_domain,0.501288,0.047508,-0.576668,0.434734,0.386746,-0.399537,...,-0.35837,-0.136637,1.217597,0.176455,-0.418444,-0.040247,0.171509,0.147735,0.470465,acetyltransf_domain_acetyltransf_domain


In [7]:
# Optional inspection (disabled): show the first LFC rows whose first gene is a
# control label (CUTTING_CONTROL / NONCUTTING_CONTROL). Uncomment to run.
#klingbeil_lfc_data.loc[klingbeil_lfc_data['A1'] == 'CUTTING_CONTROL',][:3]
#klingbeil_lfc_data.loc[klingbeil_lfc_data['A1'] == 'NONCUTTING_CONTROL',][:3]

In [8]:
# FDR table: same layout as the GEMINI table (GENE_COMBINATION plus one column per cell line).
klingbeil_fdr_data = pd.read_excel(io=file_path_fdr, header=0)
klingbeil_fdr_data.head(n=5)

Unnamed: 0,GENE_COMBINATION,A549,ASPC1,CORL311,H1048,H1299,H1436,H1836,H209,H211,...,MDAMB231,MOLM13,NOMO1,PATU8902,RD,RH30,SET2,T3M4,THP1,YAPC
0,AAK1:Kinase_domain;BMP2K:Kinase_domain,0.998805,1.0,0.990087,0.999833,0.99871,0.984006,1.0,1.0,0.77976,...,0.999043,0.913584,0.890476,0.976135,1.0,0.992589,0.796562,0.844509,0.994081,0.872423
1,AAK1:Kinase_domain;GAK:Kinase_domain,0.99924,0.929591,0.939242,0.999833,0.99871,0.967486,0.983954,1.0,0.944687,...,0.999043,0.950717,0.987791,0.911162,1.0,0.992589,0.992691,0.997798,0.719461,0.879377
2,AATK:Kinase_domain;LMTK2:Kinase_domain,0.998805,0.639809,0.939242,0.999833,0.906256,0.911471,1.0,0.98508,0.719806,...,0.999043,0.810098,0.822935,0.947698,0.926708,0.992589,0.949712,0.968886,1.0,0.988758
3,AATK:Kinase_domain;LMTK3:Kinase_domain,0.974176,0.639809,0.939242,0.504708,0.997423,0.898025,0.922539,0.430594,0.737916,...,0.999043,0.715139,0.886476,0.968578,1.0,0.779967,0.862029,0.62618,0.720471,0.910421
4,ABL1:Kinase_domain;ABL2:Kinase_domain,0.998805,1.0,0.987293,0.999833,0.99871,0.898025,0.984334,0.803632,0.813953,...,0.999043,0.733614,0.764939,0.999081,1.0,0.981676,0.914717,0.922679,1.0,0.896882


In [9]:
# GENE_COMBINATION has the form "GENE1:domain;GENE2:domain".
# Split on ';' into the two halves, then keep the text before ':' as the gene symbol.
fdr_gene_halves = klingbeil_fdr_data['GENE_COMBINATION'].str.split(';', expand=True)
fdr_gene_halves.columns = ['A1', 'A2']  # raises unless there are exactly two halves
for gene_col in fdr_gene_halves.columns:
    klingbeil_fdr_data[gene_col] = fdr_gene_halves[gene_col].str.split(':').str[0]
klingbeil_fdr_data.head(1)

Unnamed: 0,GENE_COMBINATION,A549,ASPC1,CORL311,H1048,H1299,H1436,H1836,H209,H211,...,NOMO1,PATU8902,RD,RH30,SET2,T3M4,THP1,YAPC,A1,A2
0,AAK1:Kinase_domain;BMP2K:Kinase_domain,0.998805,1.0,0.990087,0.999833,0.99871,0.984006,1.0,1.0,0.77976,...,0.890476,0.976135,1.0,0.992589,0.796562,0.844509,0.994081,0.872423,AAK1,BMP2K


In [10]:
# Same split as for the gene symbols, but keep the text after ':' (the domain name).
fdr_domain_halves = klingbeil_fdr_data['GENE_COMBINATION'].str.split(';', expand=True)
fdr_domain_halves.columns = ['domain_1', 'domain_2']  # raises unless there are exactly two halves
for domain_col in fdr_domain_halves.columns:
    klingbeil_fdr_data[domain_col] = fdr_domain_halves[domain_col].str.split(':').str[1]

# Combined domain key, built the same way as for the GEMINI and LFC tables.
klingbeil_fdr_data['domain_combination'] = (
    klingbeil_fdr_data['domain_1'] + '_' + klingbeil_fdr_data['domain_2']
)
klingbeil_fdr_data.head(1)

Unnamed: 0,GENE_COMBINATION,A549,ASPC1,CORL311,H1048,H1299,H1436,H1836,H209,H211,...,RH30,SET2,T3M4,THP1,YAPC,A1,A2,domain_1,domain_2,domain_combination
0,AAK1:Kinase_domain;BMP2K:Kinase_domain,0.998805,1.0,0.990087,0.999833,0.99871,0.984006,1.0,1.0,0.77976,...,0.992589,0.796562,0.844509,0.994081,0.872423,AAK1,BMP2K,Kinase_domain,Kinase_domain,Kinase_domain_Kinase_domain


### Reshape Data from Wide to Long Format

**Melt data to long format for all three datasets:**
- Convert wide format (one column per cell line) to long format
- Create separate DataFrames for GEMINI, LFC, and FDR scores

In [11]:
# The 22 cell-line columns shared by the GEMINI, LFC and FDR tables.
# Kept in one place so the three melts below cannot drift apart.
CELL_LINE_COLUMNS = [
    'A549', 'ASPC1', 'CORL311', 'H1048', 'H1299',
    'H1436', 'H1836', 'H209', 'H211', 'HEL', 'HPAFII', 'K562', 'MDAMB231',
    'MOLM13', 'NOMO1', 'PATU8902', 'RD', 'RH30', 'SET2', 'T3M4', 'THP1',
    'YAPC',
]

# Identifier columns present in every table; GEMINI and FDR additionally carry
# the raw GENE_COMBINATION string, which the LFC table does not have.
PAIR_ID_COLUMNS = ['domain_combination', 'A1', 'A2']

# Wide -> long: one row per (gene/domain combination, cell line) with a single score column.
klingbeil_gemini_df = klingbeil_gemini_data.melt(
    id_vars=['GENE_COMBINATION'] + PAIR_ID_COLUMNS,
    value_vars=CELL_LINE_COLUMNS,
    var_name='cell_line',
    value_name='GEMINI',
)

klingbeil_lfc_df = klingbeil_lfc_data.melt(
    id_vars=PAIR_ID_COLUMNS,
    value_vars=CELL_LINE_COLUMNS,
    var_name='cell_line',
    value_name='LFC',
)

klingbeil_fdr_df = klingbeil_fdr_data.melt(
    id_vars=['GENE_COMBINATION'] + PAIR_ID_COLUMNS,
    value_vars=CELL_LINE_COLUMNS,
    var_name='cell_line',
    value_name='FDR',
)

### Merge Datasets

**Combine GEMINI, FDR, and LFC data:**

In [12]:
# GEMINI and FDR share the full identifier set, so join on all of it.
gemini_fdr_keys = ['GENE_COMBINATION', 'domain_combination', 'A1', 'A2', 'cell_line']
klingbeil_df = klingbeil_gemini_df.merge(klingbeil_fdr_df, how='inner', on=gemini_fdr_keys)
klingbeil_df.head(3)

Unnamed: 0,GENE_COMBINATION,domain_combination,A1,A2,cell_line,GEMINI,FDR
0,AAK1:Kinase_domain;BMP2K:Kinase_domain,Kinase_domain_Kinase_domain,AAK1,BMP2K,A549,0.149179,0.998805
1,AAK1:Kinase_domain;GAK:Kinase_domain,Kinase_domain_Kinase_domain,AAK1,GAK,A549,-0.138258,0.99924
2,AATK:Kinase_domain;LMTK2:Kinase_domain,Kinase_domain_Kinase_domain,AATK,LMTK2,A549,0.135116,0.998805


In [13]:
# The LFC table has no GENE_COMBINATION column, so join on genes + domains + cell line.
# Left join: every GEMINI/FDR row is kept even if no LFC value matches.
lfc_keys = ['domain_combination', 'A1', 'A2', 'cell_line']
klingbeil_df = klingbeil_df.merge(klingbeil_lfc_df, how='left', on=lfc_keys)
klingbeil_df.head(3)

Unnamed: 0,GENE_COMBINATION,domain_combination,A1,A2,cell_line,GEMINI,FDR,LFC
0,AAK1:Kinase_domain;BMP2K:Kinase_domain,Kinase_domain_Kinase_domain,AAK1,BMP2K,A549,0.149179,0.998805,0.662254
1,AAK1:Kinase_domain;GAK:Kinase_domain,Kinase_domain_Kinase_domain,AAK1,GAK,A549,-0.138258,0.99924,-0.442264
2,AATK:Kinase_domain;LMTK2:Kinase_domain,Kinase_domain_Kinase_domain,AATK,LMTK2,A549,0.135116,0.998805,0.231514


In [14]:
# Sanity check: the merged table has as many distinct combinations as the FDR table has rows.
len(klingbeil_fdr_data) == klingbeil_df['GENE_COMBINATION'].nunique()

True

### Map Gene Symbols to Entrez IDs

**Load gene mapping data and create mapping dictionaries:**

In [15]:
# Gene name mapping table (approved symbol, previous symbol, Entrez ID).
id_map = pd.read_csv(file_path_genenames)

# Symbol -> Entrez ID lookups, one keyed by approved symbols and one by previous symbols.
# If a symbol occurs on several rows, the last row wins.
approved_sym_to_entrez_id = {
    symbol: entrez for symbol, entrez in zip(id_map['Approved symbol'], id_map['entrez_id'])
}
prev_sym_to_entrez_id = {
    symbol: entrez for symbol, entrez in zip(id_map['Previous symbol'], id_map['entrez_id'])
}

In [16]:
# First pass: look up Entrez (NCBI Gene) IDs for A1 and A2 via approved symbols.
# Symbols that are not approved symbols stay NaN for now.
entrez_from_approved = {
    f'{gene_col}_entrez': klingbeil_df[gene_col].map(approved_sym_to_entrez_id)
    for gene_col in ('A1', 'A2')
}
klingbeil_df = klingbeil_df.assign(**entrez_from_approved)

In [17]:
# Symbols that could not be mapped through approved symbols, per gene column.
unmapped_a1 = klingbeil_df.loc[klingbeil_df['A1_entrez'].isna(), 'A1'].unique()
unmapped_a2 = klingbeil_df.loc[klingbeil_df['A2_entrez'].isna(), 'A2'].unique()
display(unmapped_a1)
display(unmapped_a2)

# Union of both lists.
missing_gene_symbols = set(unmapped_a1) | set(unmapped_a2)

array(['ACPP', 'C16orf13', 'C19orf35', 'C3orf58', 'COL4A3BP', 'DUPD1',
       'DUSP13', 'FAM109A', 'FAM69A', 'G6PC', 'GRAMD2', 'GSG2', 'HDGFRP2',
       'HDGFRP3', 'ICK', 'KIAA1456', 'KIAA1804', 'LOC100996792',
       'LOC102724428', 'LOC107986532', 'METTL11B', 'METTL7A', 'MUM1'],
      dtype=object)

array(['LOC107984026', 'ACPP', 'ACPT', 'KIAA1456', 'ALPPL2', 'IMPAD1',
       'KIAA2026', 'WBSCR22', 'WBSCR27', 'CXorf36', 'PRPF4B', 'DUSP13',
       'DUSP27', 'METTL12', 'FAM109B', 'FAM69B', 'FAM69C', 'HDGFRP3',
       'GRAMD3', 'ZAK', 'RRNAD1', 'METTL7A', 'METTL7B', 'MUM1L1',
       'PPP5D1', 'SGK494'], dtype=object)

In [18]:
# Second pass: where the approved-symbol lookup gave NaN, try the previous-symbol lookup.
entrez_with_previous = {
    f'{gene_col}_entrez': klingbeil_df[f'{gene_col}_entrez'].fillna(
        klingbeil_df[gene_col].map(prev_sym_to_entrez_id)
    )
    for gene_col in ('A1', 'A2')
}
klingbeil_df = klingbeil_df.assign(**entrez_with_previous)

In [19]:
# Symbols still without an Entrez ID after both lookups (A1 first, then A2).
for gene_col in ('A1', 'A2'):
    display(klingbeil_df.loc[klingbeil_df[f'{gene_col}_entrez'].isna(), gene_col].unique())

array(['HDGFRP2', 'HDGFRP3', 'KIAA1804', 'LOC100996792', 'LOC102724428',
       'LOC107986532'], dtype=object)

array(['LOC107984026', 'HDGFRP3', 'ZAK', 'SGK494'], dtype=object)

### Standardize Gene Symbols

**Convert Entrez IDs back to approved gene symbols for consistency:**

In [20]:
# Entrez ID -> approved symbol, used to replace the screen's symbols with current ones.
entrezid_to_symbol = dict(zip(id_map['entrez_id'], id_map['Approved symbol']))

klingbeil_df = (
    klingbeil_df
    # keep the symbols exactly as they appear in the screen
    .rename(columns={'A1': 'org_A1', 'A2': 'org_A2'})
    # approved symbols derived from the mapped Entrez IDs
    .assign(
        A1=lambda frame: frame['A1_entrez'].map(entrezid_to_symbol),
        A2=lambda frame: frame['A2_entrez'].map(entrezid_to_symbol),
    )
    # drop pairs where either gene has no Entrez ID
    .dropna(subset=['A1_entrez', 'A2_entrez'], how='any')
    .reset_index(drop=True)
)

### Create Sorted Gene Pairs

**Generate standardized gene pair identifiers:**

In [21]:
# Order-independent pair identifier: natural-sort the two symbols and join with '_'.
list_c = [list(pair) for pair in zip(klingbeil_df['A1'], klingbeil_df['A2'])]
genepairs = ['_'.join(natsorted(gene_pair)) for gene_pair in list_c]
klingbeil_df.insert(loc=2, column='sorted_genepair', value=genepairs, allow_duplicates=True)

In [22]:
# Preview the first rows after symbol standardisation.
klingbeil_df.head(3)

Unnamed: 0,GENE_COMBINATION,domain_combination,sorted_genepair,org_A1,org_A2,cell_line,GEMINI,FDR,LFC,A1_entrez,A2_entrez,A1,A2
0,AAK1:Kinase_domain;BMP2K:Kinase_domain,Kinase_domain_Kinase_domain,AAK1_BMP2K,AAK1,BMP2K,A549,0.149179,0.998805,0.662254,22848.0,55589.0,AAK1,BMP2K
1,AAK1:Kinase_domain;GAK:Kinase_domain,Kinase_domain_Kinase_domain,AAK1_GAK,AAK1,GAK,A549,-0.138258,0.99924,-0.442264,22848.0,2580.0,AAK1,GAK
2,AATK:Kinase_domain;LMTK2:Kinase_domain,Kinase_domain_Kinase_domain,AATK_LMTK2,AATK,LMTK2,A549,0.135116,0.998805,0.231514,9625.0,22853.0,AATK,LMTK2


**Test specific gene pairs:**

In [23]:
# Spot check: all rows (domain combinations x cell lines) for AKT1/AKT2.
klingbeil_df.loc[klingbeil_df['sorted_genepair'].eq('AKT1_AKT2')]

Unnamed: 0,GENE_COMBINATION,domain_combination,sorted_genepair,org_A1,org_A2,cell_line,GEMINI,FDR,LFC,A1_entrez,A2_entrez,A1,A2
140,AKT1:Kinase_domain;AKT2:Kinase_domain,Kinase_domain_Kinase_domain,AKT1_AKT2,AKT1,AKT2,A549,1.002888,0.9741756,-1.301688,207.0,208.0,AKT1,AKT2
150,AKT1:phosphatase_and_phosphat_signalling_domai...,phosphatase_and_phosphat_signalling_domain_pho...,AKT1_AKT2,AKT1,AKT2,A549,0.797873,0.9741756,-0.963037,207.0,208.0,AKT1,AKT2
2830,AKT1:Kinase_domain;AKT2:Kinase_domain,Kinase_domain_Kinase_domain,AKT1_AKT2,AKT1,AKT2,ASPC1,3.431455,2.591273e-15,-3.806678,207.0,208.0,AKT1,AKT2
2840,AKT1:phosphatase_and_phosphat_signalling_domai...,phosphatase_and_phosphat_signalling_domain_pho...,AKT1_AKT2,AKT1,AKT2,ASPC1,2.62175,1.409003e-08,-2.620907,207.0,208.0,AKT1,AKT2
5520,AKT1:Kinase_domain;AKT2:Kinase_domain,Kinase_domain_Kinase_domain,AKT1_AKT2,AKT1,AKT2,CORL311,-0.626738,0.9899218,-0.418664,207.0,208.0,AKT1,AKT2
5530,AKT1:phosphatase_and_phosphat_signalling_domai...,phosphatase_and_phosphat_signalling_domain_pho...,AKT1_AKT2,AKT1,AKT2,CORL311,-0.748804,0.9899218,-0.022227,207.0,208.0,AKT1,AKT2
8210,AKT1:Kinase_domain;AKT2:Kinase_domain,Kinase_domain_Kinase_domain,AKT1_AKT2,AKT1,AKT2,H1048,2.613706,3.390035e-08,-3.67511,207.0,208.0,AKT1,AKT2
8220,AKT1:phosphatase_and_phosphat_signalling_domai...,phosphatase_and_phosphat_signalling_domain_pho...,AKT1_AKT2,AKT1,AKT2,H1048,1.934156,0.0003238647,-2.30809,207.0,208.0,AKT1,AKT2
10900,AKT1:Kinase_domain;AKT2:Kinase_domain,Kinase_domain_Kinase_domain,AKT1_AKT2,AKT1,AKT2,H1299,-0.074551,0.9987095,-1.378366,207.0,208.0,AKT1,AKT2
10910,AKT1:phosphatase_and_phosphat_signalling_domai...,phosphatase_and_phosphat_signalling_domain_pho...,AKT1_AKT2,AKT1,AKT2,H1299,0.529044,0.9741154,-1.350943,207.0,208.0,AKT1,AKT2


In [24]:
# Spot check: all rows (domain combinations x cell lines) for KMT2A/KMT2B.
klingbeil_df.loc[klingbeil_df['sorted_genepair'].eq('KMT2A_KMT2B')]

Unnamed: 0,GENE_COMBINATION,domain_combination,sorted_genepair,org_A1,org_A2,cell_line,GEMINI,FDR,LFC,A1_entrez,A2_entrez,A1,A2
1556,KMT2A:BROMO_domain;KMT2B:BROMO_domain,BROMO_domain_BROMO_domain,KMT2A_KMT2B,KMT2A,KMT2B,A549,1.470380,0.621634,-1.343546,4297.0,9757.0,KMT2A,KMT2B
1557,KMT2A:CXXC_domain;KMT2B:CXXC_domain,CXXC_domain_CXXC_domain,KMT2A_KMT2B,KMT2A,KMT2B,A549,1.763172,0.226488,-1.904824,4297.0,9757.0,KMT2A,KMT2B
1559,KMT2A:PHD_domain;KMT2B:PHD_domain,PHD_domain_PHD_domain,KMT2A_KMT2B,KMT2A,KMT2B,A549,1.314344,0.832033,-1.633927,4297.0,9757.0,KMT2A,KMT2B
4246,KMT2A:BROMO_domain;KMT2B:BROMO_domain,BROMO_domain_BROMO_domain,KMT2A_KMT2B,KMT2A,KMT2B,ASPC1,0.725337,0.564369,-1.575733,4297.0,9757.0,KMT2A,KMT2B
4247,KMT2A:CXXC_domain;KMT2B:CXXC_domain,CXXC_domain_CXXC_domain,KMT2A_KMT2B,KMT2A,KMT2B,ASPC1,0.804111,0.466182,-2.295248,4297.0,9757.0,KMT2A,KMT2B
...,...,...,...,...,...,...,...,...,...,...,...,...,...
55357,KMT2A:CXXC_domain;KMT2B:CXXC_domain,CXXC_domain_CXXC_domain,KMT2A_KMT2B,KMT2A,KMT2B,THP1,,,-3.077443,4297.0,9757.0,KMT2A,KMT2B
55359,KMT2A:PHD_domain;KMT2B:PHD_domain,PHD_domain_PHD_domain,KMT2A_KMT2B,KMT2A,KMT2B,THP1,0.379737,0.710570,-1.129063,4297.0,9757.0,KMT2A,KMT2B
58046,KMT2A:BROMO_domain;KMT2B:BROMO_domain,BROMO_domain_BROMO_domain,KMT2A_KMT2B,KMT2A,KMT2B,YAPC,0.833282,0.872423,-1.622147,4297.0,9757.0,KMT2A,KMT2B
58047,KMT2A:CXXC_domain;KMT2B:CXXC_domain,CXXC_domain_CXXC_domain,KMT2A_KMT2B,KMT2A,KMT2B,YAPC,1.517005,0.872423,-2.978127,4297.0,9757.0,KMT2A,KMT2B


### Add Synthetic Lethality Labels

**Define synthetic lethality based on an FDR threshold (SL = FDR < 0.05). Rows with a missing FDR are labelled as not SL:**

In [25]:
# Rows without an FDR value.
klingbeil_df[klingbeil_df['FDR'].isnull()]

Unnamed: 0,GENE_COMBINATION,domain_combination,sorted_genepair,org_A1,org_A2,cell_line,GEMINI,FDR,LFC,A1_entrez,A2_entrez,A1,A2
51,ACAP3:phosphatase_and_phosphat_signalling_doma...,phosphatase_and_phosphat_signalling_domain_pho...,ACAP3_AGAP5,ACAP3,AGAP5,A549,,,-3.874116,116983.0,729092.0,ACAP3,AGAP5
97,AGAP1:phosphatase_and_phosphat_signalling_doma...,phosphatase_and_phosphat_signalling_domain_pho...,AGAP1_AGAP5,AGAP1,AGAP5,A549,,,-3.481294,116987.0,729092.0,AGAP1,AGAP5
103,AGAP11:phosphatase_and_phosphat_signalling_dom...,phosphatase_and_phosphat_signalling_domain_pho...,AGAP5_AGAP11,AGAP11,AGAP5,A549,,,-3.299841,119385.0,729092.0,AGAP11,AGAP5
108,AGAP2:phosphatase_and_phosphat_signalling_doma...,phosphatase_and_phosphat_signalling_domain_pho...,AGAP2_AGAP5,AGAP2,AGAP5,A549,,,-3.850253,116986.0,729092.0,AGAP2,AGAP5
112,AGAP3:phosphatase_and_phosphat_signalling_doma...,phosphatase_and_phosphat_signalling_domain_pho...,AGAP3_AGAP5,AGAP3,AGAP5,A549,,,-3.451532,116988.0,729092.0,AGAP3,AGAP5
...,...,...,...,...,...,...,...,...,...,...,...,...,...
59165,WNK1:Kinase_domain;WNK2:Kinase_domain,Kinase_domain_Kinase_domain,WNK1_WNK2,WNK1,WNK2,YAPC,,,-5.383580,65125.0,65268.0,WNK1,WNK2
59166,WNK1:Kinase_domain;WNK3:Kinase_domain,Kinase_domain_Kinase_domain,WNK1_WNK3,WNK1,WNK3,YAPC,,,-3.895242,65125.0,65267.0,WNK1,WNK3
59167,WNK1:Kinase_domain;WNK4:Kinase_domain,Kinase_domain_Kinase_domain,WNK1_WNK4,WNK1,WNK4,YAPC,,,-5.139171,65125.0,65266.0,WNK1,WNK4
59171,YEATS2:YEATS_domain;YEATS4:YEATS_domain,YEATS_domain_YEATS_domain,YEATS2_YEATS4,YEATS2,YEATS4,YAPC,,,-4.083892,55689.0,8089.0,YEATS2,YEATS4


In [26]:
# Label a row synthetic lethal when its FDR is below 0.05.
# A missing FDR compares as False, so those rows are labelled SL == False.
SL_condition = klingbeil_df['FDR'].lt(0.05)
klingbeil_df['SL'] = SL_condition

In [27]:
# Class balance of the SL label.
klingbeil_df.loc[:, 'SL'].value_counts()

SL
False    58277
True       903
Name: count, dtype: int64

In [28]:
def sort_gene_pairs(row):
    """Return a row's two genes in natural-sort order with matching Entrez IDs.

    The symbols in ``row['A1']`` and ``row['A2']`` are ordered with
    ``natsorted`` (natural sort, so e.g. ZMYND8 comes before ZMYND11). The
    Entrez IDs are swapped along with the symbols so that each ID stays
    attached to its gene.

    Returns a ``pd.Series`` of four values:
    ``[first_symbol, second_symbol, first_entrez, second_entrez]``.
    """
    first_gene, second_gene = natsorted([row['A1'], row['A2']])

    if first_gene == row['A1']:
        # A1 already comes first: keep the Entrez IDs in their original order
        entrez_in_order = [row['A1_entrez'], row['A2_entrez']]
    else:
        # the genes were swapped: swap the Entrez IDs too
        entrez_in_order = [row['A2_entrez'], row['A1_entrez']]

    return pd.Series([first_gene, second_gene, *entrez_in_order])

# Run the per-row sort on a copy, writing the results to temporary "*_sorted" columns.
sorted_columns = ['A1_sorted', 'A2_sorted', 'A1_entrez_sorted', 'A2_entrez_sorted']
df = klingbeil_df.copy()
df[sorted_columns] = df.apply(sort_gene_pairs, axis=1)

# Replace the unsorted columns with the sorted ones by stripping the "_sorted" suffix.
klingbeil_df_labelled = (
    df
    .drop(columns=['A1', 'A2', 'A1_entrez', 'A2_entrez'])
    .rename(columns={col: col[:-len('_sorted')] for col in sorted_columns})
)

In [29]:
# Test pair: the screen lists ZMYND11 first, but natural sorting puts ZMYND8 first.
klingbeil_df_labelled.loc[klingbeil_df_labelled['sorted_genepair'].eq('ZMYND8_ZMYND11')]

Unnamed: 0,GENE_COMBINATION,domain_combination,sorted_genepair,org_A1,org_A2,cell_line,GEMINI,FDR,LFC,SL,A1,A2,A1_entrez,A2_entrez
2686,ZMYND11:ADD_domain;ZMYND8:ADD_domain,ADD_domain_ADD_domain,ZMYND8_ZMYND11,ZMYND11,ZMYND8,A549,0.097400,0.998805,-0.200981,False,ZMYND8,ZMYND11,23613.0,10771.0
2687,ZMYND11:BROMO_domain;ZMYND8:BROMO_domain,BROMO_domain_BROMO_domain,ZMYND8_ZMYND11,ZMYND11,ZMYND8,A549,0.260723,0.988616,-0.167358,False,ZMYND8,ZMYND11,23613.0,10771.0
2688,ZMYND11:PHD_domain;ZMYND8:PHD_domain,PHD_domain_PHD_domain,ZMYND8_ZMYND11,ZMYND11,ZMYND8,A549,-0.334091,0.999240,-0.132602,False,ZMYND8,ZMYND11,23613.0,10771.0
2689,ZMYND11:PWWP_domain;ZMYND8:PWWP_domain,PWWP_domain_PWWP_domain,ZMYND8_ZMYND11,ZMYND11,ZMYND8,A549,-0.245971,0.999240,-0.212266,False,ZMYND8,ZMYND11,23613.0,10771.0
5376,ZMYND11:ADD_domain;ZMYND8:ADD_domain,ADD_domain_ADD_domain,ZMYND8_ZMYND11,ZMYND11,ZMYND8,ASPC1,0.771619,0.505559,-1.179333,False,ZMYND8,ZMYND11,23613.0,10771.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
56489,ZMYND11:PWWP_domain;ZMYND8:PWWP_domain,PWWP_domain_PWWP_domain,ZMYND8_ZMYND11,ZMYND11,ZMYND8,THP1,-0.104612,1.000000,-1.954398,False,ZMYND8,ZMYND11,23613.0,10771.0
59176,ZMYND11:ADD_domain;ZMYND8:ADD_domain,ADD_domain_ADD_domain,ZMYND8_ZMYND11,ZMYND11,ZMYND8,YAPC,-0.478275,1.000000,-1.460720,False,ZMYND8,ZMYND11,23613.0,10771.0
59177,ZMYND11:BROMO_domain;ZMYND8:BROMO_domain,BROMO_domain_BROMO_domain,ZMYND8_ZMYND11,ZMYND11,ZMYND8,YAPC,-0.279498,1.000000,-1.920873,False,ZMYND8,ZMYND11,23613.0,10771.0
59178,ZMYND11:PHD_domain;ZMYND8:PHD_domain,PHD_domain_PHD_domain,ZMYND8_ZMYND11,ZMYND11,ZMYND8,YAPC,0.012366,0.989382,-1.866205,False,ZMYND8,ZMYND11,23613.0,10771.0


### Add Cell Line Mapping

**Map cell line names to DepMap IDs:**

In [30]:
# DepMap cell line annotation table.
sample_info = pd.read_csv(filepath_or_buffer=file_path_sample_info)
sample_info.head(2)

Unnamed: 0,DepMap_ID,cell_line_name,stripped_cell_line_name,CCLE_Name,alias,COSMICID,sex,source,RRID,WTSI_Master_Cell_ID,...,primary_disease,Subtype,age,Sanger_Model_ID,depmap_public_comments,lineage,lineage_subtype,lineage_sub_subtype,lineage_molecular_subtype,culture_type
0,ACH-000001,NIH:OVCAR-3,NIHOVCAR3,NIHOVCAR3_OVARY,OVCAR3,905933.0,Female,ATCC,CVCL_0465,2201.0,...,Ovarian Cancer,"Adenocarcinoma, high grade serous",60.0,SIDM00105,,ovary,ovary_adenocarcinoma,high_grade_serous,,Adherent
1,ACH-000002,HL-60,HL60,HL60_HAEMATOPOIETIC_AND_LYMPHOID_TISSUE,,905938.0,Female,ATCC,CVCL_0002,55.0,...,Leukemia,"Acute Myelogenous Leukemia (AML), M3 (Promyelo...",35.0,SIDM00829,,blood,AML,M3,,Suspension


In [31]:
# Screen cell line names that need an explicit DepMap ID; these override or
# extend the entries derived from sample_info below.
renamed_cell_lines = {
    'H1048': 'ACH-000866',
    'H1299': 'ACH-000510',
    'H1436': 'ACH-000830',
    'H1836': 'ACH-000559',
    'H209': 'ACH-000290',
    'H211': 'ACH-000639',
}

# stripped_cell_line_name -> DepMap_ID, with the explicit entries applied on top.
CCLE_name_to_DepMapID = {
    **dict(zip(sample_info['stripped_cell_line_name'], sample_info['DepMap_ID'])),
    **renamed_cell_lines,
}

In [32]:
# Add the DepMap ID for every row's cell line as the column at position 5.
depmap_ids = klingbeil_df_labelled["cell_line"].map(CCLE_name_to_DepMapID)
klingbeil_df_labelled.insert(loc=5, column="DepMap_ID", value=depmap_ids)

In [33]:
# Final column selection and order for the exported table.
final_columns = [
    'GENE_COMBINATION', 'domain_combination', 'sorted_genepair',
    'A1', 'A2', 'A1_entrez', 'A2_entrez',
    'cell_line', 'DepMap_ID',
    'GEMINI', 'LFC', 'FDR', 'SL',
    'org_A1', 'org_A2',
]

klingbeil_df_labelled = (
    klingbeil_df_labelled
    .loc[:, final_columns]
    .rename(columns={'sorted_genepair': 'genepair'})
    .sort_values(by=['GENE_COMBINATION', 'domain_combination', 'genepair', 'cell_line'])
    .reset_index(drop=True)
)

In [34]:
# Spot check: rows per domain combination and total row count for AKT1/AKT2.
akt_pair_rows = klingbeil_df_labelled.loc[klingbeil_df_labelled['genepair'] == 'AKT1_AKT2']
print(akt_pair_rows[['domain_combination']].value_counts())
print(akt_pair_rows.shape[0])

domain_combination                                                                   
Kinase_domain_Kinase_domain                                                              22
phosphatase_and_phosphat_signalling_domain_phosphatase_and_phosphat_signalling_domain    22
Name: count, dtype: int64
44


In [35]:
# Missing values per column in the final table.
klingbeil_df_labelled.isnull().sum()

GENE_COMBINATION         0
domain_combination       0
genepair                 0
A1                       0
A2                       0
A1_entrez                0
A2_entrez                0
cell_line                0
DepMap_ID                0
GEMINI                3431
LFC                      0
FDR                   3431
SL                       0
org_A1                   0
org_A2                   0
dtype: int64

### Save Processed Data

**Export the final processed dataset:**

In [36]:
# Create the output folder if needed: to_csv does not create missing parent
# directories, so the export would otherwise fail on a fresh checkout.
os.makedirs(os.path.dirname(file_path_processed_klingbeil_df), exist_ok=True)

# Write the processed table without the integer index.
klingbeil_df_labelled.to_csv(file_path_processed_klingbeil_df, index=False)