### Process data from the DepMap project

In [2]:
# import modules
import os
import re
import pandas as pd
import numpy as np

In [3]:
# The repository root is taken to be the parent of the current working
# directory, i.e. the notebook is expected to run from a sub-folder of the repo.
cwd = os.getcwd()
BASE_DIR = os.path.abspath(os.path.join(cwd, ".."))


def get_data_path(folders, fname):
    """Build a normalised path to a file (or folder) inside the repo.

    Parameters:
        folders (list of str): Folder names below BASE_DIR, outermost first.
        fname (str): File name to append; an empty string yields the folder
            path itself (normpath strips the trailing separator).

    Returns:
        str: Normalised path rooted at BASE_DIR.
    """
    return os.path.normpath(os.path.join(BASE_DIR, *folders, fname))


# Input tables used by this notebook
file_path_training_data = get_data_path(['output', 'processed_CRISPR_screens'], 'processed_ito_df_labelled.csv')
file_path_testing_data = get_data_path(['output', 'processed_CRISPR_screens'], 'processed_klingbeil_df_labelled.csv')
dekegel_table8_path = get_data_path(['input', 'other'], 'processed_DeKegel_TableS8.csv')

In [4]:
# Load the labelled training table (processed_ito_df_labelled.csv).
# low_memory=False makes pandas parse the file in one pass instead of in
# chunks, which avoids mixed-dtype inference within a column.
training_df = pd.read_csv(file_path_training_data, low_memory=False)
training_df.head()

Unnamed: 0,genepair,A1,A2,A1_entrez,A2_entrez,DepMap_ID,cell_line,Gemini_FDR,raw_LFC,SL,...,colocalisation,interact,n_total_ppi,fet_ppi_overlap,gtex_spearman_corr,gtex_min_mean_expr,gtex_max_mean_expr,GEMINI,LFC,SL_new
0,A3GALT2_ABO,A3GALT2,ABO,127550,28,ACH-000022,PATU8988S_PANCREAS,0.998944,0.088856,False,...,0.0,False,3.0,0.0,0.114847,0.258739,11.702,0.118768,0.088856,False
1,A3GALT2_ABO,A3GALT2,ABO,127550,28,ACH-000307,PK1_PANCREAS,0.986587,0.201704,False,...,0.0,False,3.0,0.0,0.114847,0.258739,11.702,0.132501,0.201704,False
2,A3GALT2_ABO,A3GALT2,ABO,127550,28,ACH-000632,HS944T_SKIN,1.0,0.069772,False,...,0.0,False,3.0,0.0,0.114847,0.258739,11.702,0.024593,0.069772,False
3,A3GALT2_ABO,A3GALT2,ABO,127550,28,ACH-000681,A549_LUNG,0.977988,0.379455,False,...,0.0,False,3.0,0.0,0.114847,0.258739,11.702,-0.241323,0.379455,False
4,A3GALT2_ABO,A3GALT2,ABO,127550,28,ACH-000756,GI1_CENTRAL_NERVOUS_SYSTEM,0.999586,-0.077118,False,...,0.0,False,3.0,0.0,0.114847,0.258739,11.702,0.299715,-0.077118,False


In [5]:
# Load the labelled testing table (processed_klingbeil_df_labelled.csv).
# low_memory=False makes pandas parse the file in one pass instead of in
# chunks, which avoids mixed-dtype inference within a column.
testing_df = pd.read_csv(file_path_testing_data, low_memory=False)
testing_df.head()

Unnamed: 0,GENE_COMBINATION,domain_combination,genepair,A1,A2,A1_entrez,A2_entrez,cell_line,DepMap_ID,GEMINI,...,either_in_complex,mean_complex_essentiality,colocalisation,interact,n_total_ppi,fet_ppi_overlap,gtex_spearman_corr,gtex_min_mean_expr,gtex_max_mean_expr,SL_new
0,AAK1:Kinase_domain;BMP2K:Kinase_domain,Kinase_domain_Kinase_domain,AAK1_BMP2K,AAK1,BMP2K,22848,55589,HEL,ACH-000004,0.218665,...,False,0.0,0.0,False,77.0,21.867726,0.261701,6.713555,6.761786,False
1,AAK1:Kinase_domain;BMP2K:Kinase_domain,Kinase_domain_Kinase_domain,AAK1_BMP2K,AAK1,BMP2K,22848,55589,T3M4,ACH-000085,0.205641,...,False,0.0,0.0,False,77.0,21.867726,0.261701,6.713555,6.761786,False
2,AAK1:Kinase_domain;BMP2K:Kinase_domain,Kinase_domain_Kinase_domain,AAK1_BMP2K,AAK1,BMP2K,22848,55589,HPAFII,ACH-000094,0.044486,...,False,0.0,0.0,False,77.0,21.867726,0.261701,6.713555,6.761786,False
3,AAK1:Kinase_domain;BMP2K:Kinase_domain,Kinase_domain_Kinase_domain,AAK1_BMP2K,AAK1,BMP2K,22848,55589,THP1,ACH-000146,0.031737,...,False,0.0,0.0,False,77.0,21.867726,0.261701,6.713555,6.761786,False
4,AAK1:Kinase_domain;BMP2K:Kinase_domain,Kinase_domain_Kinase_domain,AAK1_BMP2K,AAK1,BMP2K,22848,55589,NOMO1,ACH-000168,0.148144,...,False,0.0,0.0,False,77.0,21.867726,0.261701,6.713555,6.761786,False


In [6]:
# Load processed_DeKegel_TableS8.csv; its 'genepair' column is used further
# down as the set of gene pairs that are kept in the training/testing data.
paralog_pairs = pd.read_csv(dekegel_table8_path)

In [7]:
# Number of missing values per feature column in the raw training data
summary_cols = [
    'rMaxExp_A1A2', 'rMinExp_A1A2',
    'max_ranked_A1A2', 'min_ranked_A1A2',
    'max_cn', 'min_cn', 'Protein_Altering', 'Damaging',
    'min_sequence_identity',
    'prediction_score',
    'weighted_PPI_essentiality', 'weighted_PPI_expression',
    'smallest_GO_ranked_ess', 'smallest_GO_CC_ranked_ess',
    'smallest_gene_expression', 'go_CC_expression',
    'closest', 'WGD', 'family_size',
    'cds_length_ratio', 'shared_domains',
    'has_pombe_ortholog', 'has_essential_pombe_ortholog',
    'has_cerevisiae_ortholog', 'has_essential_cerevisiae_ortholog',
    'conservation_score', 'mean_age',
    'either_in_complex', 'mean_complex_essentiality', 'colocalisation',
    'interact', 'n_total_ppi', 'fet_ppi_overlap',
    'gtex_spearman_corr', 'gtex_min_mean_expr', 'gtex_max_mean_expr',
]
training_df[summary_cols].isna().sum()

rMaxExp_A1A2                            11
rMinExp_A1A2                            11
max_ranked_A1A2                       4926
min_ranked_A1A2                       4926
max_cn                                   5
min_cn                                   5
Protein_Altering                         0
Damaging                                 0
min_sequence_identity                 3347
prediction_score                      3347
weighted_PPI_essentiality             7901
weighted_PPI_expression               3793
smallest_GO_ranked_ess               26137
smallest_GO_CC_ranked_ess            40788
smallest_gene_expression             23970
go_CC_expression                     39944
closest                                  0
WGD                                      0
family_size                           3347
cds_length_ratio                      3347
shared_domains                        3347
has_pombe_ortholog                       0
has_essential_pombe_ortholog          3347
has_cerevis

In [8]:
def preprocess_dataset(df, old_df,
                       required_genepairs_col='genepair',
                       dropna_cols=None,
                       fillna_zero_cols=None,
                       fillna_large_cols=None,
                       fillna_large_value=18000):
    """
    Clean a training or testing dataset in three stages:
    1. Restrict rows to gene pairs that also occur in `old_df`
    2. Discard rows that have a NaN in any of `dropna_cols`
    3. Impute remaining NaNs with 0 (`fillna_zero_cols`) or with
       `fillna_large_value` (`fillna_large_cols`)

    The input frame is not modified; a filtered copy is returned.

    Parameters:
        df (pd.DataFrame): Dataset to process
        old_df (pd.DataFrame): Dataset providing the allowed genepairs
        required_genepairs_col (str): Genepair column present in both frames
        dropna_cols (list): Columns in which a NaN causes the row to be dropped
        fillna_zero_cols (list): Columns whose NaNs are replaced with 0
        fillna_large_cols (list): Columns whose NaNs are replaced with
            `fillna_large_value`
        fillna_large_value (int or float): Replacement constant (default: 18000)

    Returns:
        pd.DataFrame: Processed copy with a fresh 0..n-1 index
    """
    # Stage 1: keep only the gene pairs listed in old_df
    allowed_pairs = old_df[required_genepairs_col]
    keep_mask = df[required_genepairs_col].isin(allowed_pairs)
    cleaned = df.loc[keep_mask].copy()

    # Stage 2: remove rows with a missing value in any required column
    if dropna_cols:
        cleaned = cleaned.dropna(axis=0, how='any', subset=dropna_cols)

    # Stage 3: impute; zero-fill is applied before the large-value fill
    fill_rules = (
        (fillna_zero_cols, 0),
        (fillna_large_cols, fillna_large_value),
    )
    for cols, fill_value in fill_rules:
        if cols:
            cleaned[cols] = cleaned[cols].fillna(fill_value)

    # Renumber rows so the result has a contiguous index
    cleaned = cleaned.reset_index(drop=True)
    return cleaned

In [9]:
# Rows lacking any of these values are discarded
drop_na_values = ['rMaxExp_A1A2', 'rMinExp_A1A2', 'max_ranked_A1A2', 'min_ranked_A1A2']
# Missing values in these columns are replaced with 0
fillna_values = ['weighted_PPI_expression', 'smallest_gene_expression', 'go_CC_expression']
# Missing values in these columns are replaced with the function's default large constant
fillna_values_v2 = ['weighted_PPI_essentiality', 'smallest_GO_ranked_ess', 'smallest_GO_CC_ranked_ess']

# Both datasets go through exactly the same cleaning configuration
shared_preprocess_kwargs = dict(
    old_df=paralog_pairs,
    dropna_cols=drop_na_values,
    fillna_zero_cols=fillna_values,
    fillna_large_cols=fillna_values_v2,
)

training_df_clean = preprocess_dataset(df=training_df, **shared_preprocess_kwargs)
testing_df_clean = preprocess_dataset(df=testing_df, **shared_preprocess_kwargs)

In [34]:
# Class balance and coverage of the cleaned training dataset
n_train_rows = training_df_clean.shape[0]
train_sl_mask = training_df_clean['SL_new'] == True
train_non_sl_mask = training_df_clean['SL_new'] == False

print('Num SL:', len(training_df_clean[train_sl_mask]), '/', n_train_rows)
print('Num non-SL:', len(training_df_clean[train_non_sl_mask]), '/', n_train_rows)
print(f'Number of unique gene pairs: {training_df_clean["genepair"].nunique()}')
print(f'Number of unique cell lines: {training_df_clean["cell_line"].nunique()}')
training_df_clean.head(3)

Num SL: 958 / 41244
Num non-SL: 40286 / 41244
Number of unique gene pairs: 4170
Number of unique cell lines: 10


Unnamed: 0,genepair,A1,A2,A1_entrez,A2_entrez,DepMap_ID,cell_line,Gemini_FDR,raw_LFC,SL,...,colocalisation,interact,n_total_ppi,fet_ppi_overlap,gtex_spearman_corr,gtex_min_mean_expr,gtex_max_mean_expr,GEMINI,LFC,SL_new
0,A3GALT2_ABO,A3GALT2,ABO,127550,28,ACH-000022,PATU8988S_PANCREAS,0.998944,0.088856,False,...,0.0,False,3.0,0.0,0.114847,0.258739,11.702,0.118768,0.088856,False
1,A3GALT2_ABO,A3GALT2,ABO,127550,28,ACH-000307,PK1_PANCREAS,0.986587,0.201704,False,...,0.0,False,3.0,0.0,0.114847,0.258739,11.702,0.132501,0.201704,False
2,A3GALT2_ABO,A3GALT2,ABO,127550,28,ACH-000632,HS944T_SKIN,1.0,0.069772,False,...,0.0,False,3.0,0.0,0.114847,0.258739,11.702,0.024593,0.069772,False


In [36]:
# Missing values per feature column in the cleaned training data
# (after dropping / filling NA values)
summary_cols = [
    'rMaxExp_A1A2', 'rMinExp_A1A2',
    'max_ranked_A1A2', 'min_ranked_A1A2',
    'max_cn', 'min_cn', 'Protein_Altering', 'Damaging',
    'min_sequence_identity',
    'prediction_score',
    'weighted_PPI_essentiality', 'weighted_PPI_expression',
    'smallest_GO_ranked_ess', 'smallest_GO_CC_ranked_ess',
    'smallest_gene_expression', 'go_CC_expression',
    'closest', 'WGD', 'family_size',
    'cds_length_ratio', 'shared_domains',
    'has_pombe_ortholog', 'has_essential_pombe_ortholog',
    'has_cerevisiae_ortholog', 'has_essential_cerevisiae_ortholog',
    'conservation_score', 'mean_age',
    'either_in_complex', 'mean_complex_essentiality', 'colocalisation',
    'interact', 'n_total_ppi', 'fet_ppi_overlap',
    'gtex_spearman_corr', 'gtex_min_mean_expr', 'gtex_max_mean_expr',
]
training_df_clean[summary_cols].isna().sum()

rMaxExp_A1A2                         0
rMinExp_A1A2                         0
max_ranked_A1A2                      0
min_ranked_A1A2                      0
max_cn                               0
min_cn                               0
Protein_Altering                     0
Damaging                             0
min_sequence_identity                0
prediction_score                     0
weighted_PPI_essentiality            0
weighted_PPI_expression              0
smallest_GO_ranked_ess               0
smallest_GO_CC_ranked_ess            0
smallest_gene_expression             0
go_CC_expression                     0
closest                              0
WGD                                  0
family_size                          0
cds_length_ratio                     0
shared_domains                       0
has_pombe_ortholog                   0
has_essential_pombe_ortholog         0
has_cerevisiae_ortholog              0
has_essential_cerevisiae_ortholog    0
conservation_score       

In [10]:
# Class balance and coverage of the cleaned testing dataset
n_test_rows = testing_df_clean.shape[0]
test_sl_mask = testing_df_clean['SL_new'] == True
test_non_sl_mask = testing_df_clean['SL_new'] == False

print('Num SL:', len(testing_df_clean[test_sl_mask]), '/', n_test_rows)
print('Num non-SL:', len(testing_df_clean[test_non_sl_mask]), '/', n_test_rows)
print(f'Number of unique gene pairs: {testing_df_clean["genepair"].nunique()}')
print(f'Number of unique cell lines: {testing_df_clean["cell_line"].nunique()}')
testing_df_clean.head(3)

Num SL: 1608 / 38342
Num non-SL: 36734 / 38342
Number of unique gene pairs: 1704
Number of unique cell lines: 21


Unnamed: 0,GENE_COMBINATION,domain_combination,genepair,A1,A2,A1_entrez,A2_entrez,cell_line,DepMap_ID,GEMINI,...,either_in_complex,mean_complex_essentiality,colocalisation,interact,n_total_ppi,fet_ppi_overlap,gtex_spearman_corr,gtex_min_mean_expr,gtex_max_mean_expr,SL_new
0,AAK1:Kinase_domain;BMP2K:Kinase_domain,Kinase_domain_Kinase_domain,AAK1_BMP2K,AAK1,BMP2K,22848,55589,HEL,ACH-000004,0.218665,...,False,0.0,0.0,False,77.0,21.867726,0.261701,6.713555,6.761786,False
1,AAK1:Kinase_domain;BMP2K:Kinase_domain,Kinase_domain_Kinase_domain,AAK1_BMP2K,AAK1,BMP2K,22848,55589,T3M4,ACH-000085,0.205641,...,False,0.0,0.0,False,77.0,21.867726,0.261701,6.713555,6.761786,False
2,AAK1:Kinase_domain;BMP2K:Kinase_domain,Kinase_domain_Kinase_domain,AAK1_BMP2K,AAK1,BMP2K,22848,55589,HPAFII,ACH-000094,0.044486,...,False,0.0,0.0,False,77.0,21.867726,0.261701,6.713555,6.761786,False


In [11]:
# Missing values per feature column in the cleaned testing data
summary_cols = [
    'rMaxExp_A1A2', 'rMinExp_A1A2',
    'max_ranked_A1A2', 'min_ranked_A1A2',
    'max_cn', 'min_cn', 'Protein_Altering', 'Damaging',
    'min_sequence_identity',
    'prediction_score',
    'weighted_PPI_essentiality', 'weighted_PPI_expression',
    'smallest_GO_ranked_ess', 'smallest_GO_CC_ranked_ess',
    'smallest_gene_expression', 'go_CC_expression',
    'closest', 'WGD', 'family_size',
    'cds_length_ratio', 'shared_domains',
    'has_pombe_ortholog', 'has_essential_pombe_ortholog',
    'has_cerevisiae_ortholog', 'has_essential_cerevisiae_ortholog',
    'conservation_score', 'mean_age',
    'either_in_complex', 'mean_complex_essentiality', 'colocalisation',
    'interact', 'n_total_ppi', 'fet_ppi_overlap',
    'gtex_spearman_corr', 'gtex_min_mean_expr', 'gtex_max_mean_expr',
]
testing_df_clean[summary_cols].isna().sum()

rMaxExp_A1A2                         0
rMinExp_A1A2                         0
max_ranked_A1A2                      0
min_ranked_A1A2                      0
max_cn                               0
min_cn                               0
Protein_Altering                     0
Damaging                             0
min_sequence_identity                0
prediction_score                     0
weighted_PPI_essentiality            0
weighted_PPI_expression              0
smallest_GO_ranked_ess               0
smallest_GO_CC_ranked_ess            0
smallest_gene_expression             0
go_CC_expression                     0
closest                              0
WGD                                  0
family_size                          0
cds_length_ratio                     0
shared_domains                       0
has_pombe_ortholog                   0
has_essential_pombe_ortholog         0
has_cerevisiae_ortholog              0
has_essential_cerevisiae_ortholog    0
conservation_score       

In [12]:
# Save the cleaned datasets under <repo>/output/models
output_path = get_data_path(['output', 'models'], '')
# DataFrame.to_csv does not create missing parent directories, so make sure
# the target folder exists (no-op when it is already there).
os.makedirs(output_path, exist_ok=True)
# index=False: the index is a plain 0..n-1 range after preprocessing, so it is not written
training_df_clean.to_csv(os.path.join(output_path, 'training_data.csv'), index=False)
testing_df_clean.to_csv(os.path.join(output_path, 'testing_data.csv'), index=False)