In [7]:
# import modules
import os
import pandas as pd
import numpy as np
import pyarrow.parquet as pq
from natsort import natsorted

In [8]:
# set the base directory for the project
cwd = os.getcwd()
BASE_DIR = os.path.abspath(os.path.join(cwd, "..", ".."))

# build paths inside the repo
get_data_path = lambda folders, fname: os.path.normpath(
    os.path.join(BASE_DIR, *folders, fname)
)

crispr_screens_path = get_data_path(['output', 'processed_CRISPR_screens'], '')
lfc_files_path = get_data_path(['input', 'LFC'], 'ITO_LFC.csv')
gemini_files_path = get_data_path(['input', 'GEMINI'], 'Gemini_ITO_Sensitive_Lethality.csv')
file_path_genenames = get_data_path(['input', 'other'], 'approved_and_previous_symbols.csv')

##### Processing GEMINI scores of the datasets

In [9]:
# read GEMINI scores for Ito Screen
gemini_ito = pd.read_csv(gemini_files_path)

gemini_ito_df = gemini_ito.rename(columns={'Unnamed: 0':'genepair',
                                'Meljuso':'ACH-000881', 
                                'GI1_004':'ACH-000756',
                                'MEL202_003':'ACH-001554',
                                'PK1':'ACH-000307',
                                'MEWO':'ACH-000987',
                                'HS944T':'ACH-000632',
                                'IPC298':'ACH-000915',
                                'A549':'ACH-000681',
                                'HSC5':'ACH-001524',
                                'HS936T':'ACH-000801',
                                'PATU8988S':'ACH-000022'})
gemini_ito_df['genepair'] = gemini_ito_df['genepair'].apply(lambda x: x.replace(';', '_'))
gemini_ito_df = pd.melt(gemini_ito_df, id_vars=['genepair'], var_name='DepMap_ID', value_name='GEMINI').sort_values(by='genepair').reset_index(drop=True)
print('number of unique gene pairs:', gemini_ito_df['genepair'].nunique())
gemini_ito_df.head()

number of unique gene pairs: 5065


Unnamed: 0,genepair,DepMap_ID,GEMINI
0,A3GALT2_ABO,ACH-000881,-0.137329
1,A3GALT2_ABO,ACH-000307,0.132501
2,A3GALT2_ABO,ACH-000915,0.083927
3,A3GALT2_ABO,ACH-001554,-0.077458
4,A3GALT2_ABO,ACH-001524,0.218901


#### Processing LFC of the datasets

In [10]:
# read GEMINI scores for Ito Screen
lfc_ito = pd.read_csv(lfc_files_path, index_col=0)

lfc_ito_df = lfc_ito.rename(columns={'gene_pair':'genepair',
                                'Meljuso':'ACH-000881', 
                                'GI1_004':'ACH-000756',
                                'MEL202_003':'ACH-001554',
                                'PK1':'ACH-000307',
                                'MEWO':'ACH-000987',
                                'HS944T':'ACH-000632',
                                'IPC298':'ACH-000915',
                                'A549':'ACH-000681',
                                'HSC5':'ACH-001524',
                                'HS936T':'ACH-000801',
                                'PATU8988S':'ACH-000022'})

lfc_ito_df = pd.melt(lfc_ito_df, id_vars=['genepair'], var_name='DepMap_ID', value_name='LFC').sort_values(by='genepair').reset_index(drop=True)
lfc_ito_df[:3]

Unnamed: 0,genepair,DepMap_ID,LFC
0,A3GALT2_AAVS1,ACH-000881,0.343378
1,A3GALT2_AAVS1,ACH-000681,0.166162
2,A3GALT2_AAVS1,ACH-000987,0.216834


In [11]:
ito_df = pd.merge(gemini_ito_df, lfc_ito_df, on=['genepair', 'DepMap_ID'], how='left')
ito_df[:3]

Unnamed: 0,genepair,DepMap_ID,GEMINI,LFC
0,A3GALT2_ABO,ACH-000881,-0.137329,0.283725
1,A3GALT2_ABO,ACH-000307,0.132501,0.201704
2,A3GALT2_ABO,ACH-000915,0.083927,-0.026881


In [12]:
# read the gene names mapping file
id_map = pd.read_csv(file_path_genenames)

# create dictionaries to map gene symbols to Entrez IDs
approved_sym_to_entrez_id = dict(zip(id_map['Approved symbol'], id_map['entrez_id']))
entrezid_to_symbol = dict(zip(id_map['entrez_id'], id_map['Approved symbol']))

# create dictionaries to map previous gene symbols to Entrez IDs
id_map_cleaned = id_map.dropna(axis=0, how='any', subset=['Previous symbol', 'entrez_id']).reset_index(drop=True)
prev_sym_to_entrez_id = dict(zip(id_map_cleaned['Previous symbol'], id_map_cleaned['entrez_id']))

In [13]:
def process_gene_symbols(gemini_df, id_map):
    # Split gene pairs into two columns
    gemini_df.insert(1, "A1", gemini_df['genepair'].apply(lambda x: x.split("_", 1)[0]))
    gemini_df.insert(2, "A2", gemini_df['genepair'].apply(lambda x: x.split("_", 1)[1]))

    # Assign mapped NCBI Gene IDs to A1 and A2
    gemini_df = gemini_df.assign(
        A1_entrez = gemini_df['A1'].map(approved_sym_to_entrez_id),
        A2_entrez = gemini_df['A2'].map(approved_sym_to_entrez_id))

    gemini_df['A1_entrez'] = gemini_df['A1_entrez'].fillna(gemini_df['A1'].map(prev_sym_to_entrez_id))
    gemini_df['A2_entrez'] = gemini_df['A2_entrez'].fillna(gemini_df['A2'].map(prev_sym_to_entrez_id))

    # Drop rows with unresolved NCBI Gene IDs
    gemini_df = gemini_df.dropna(subset=['A1_entrez', 'A2_entrez'], how='any').reset_index(drop=True)
    gemini_df = gemini_df.drop('genepair', axis=1)

    gemini_df.rename(columns={'A1': 'org_A1', 'A2': 'org_A2'}, inplace=True)

    gemini_df.insert(1, 'A1', gemini_df['A1_entrez'].map(entrezid_to_symbol))
    gemini_df.insert(2, 'A2', gemini_df['A2_entrez'].map(entrezid_to_symbol))

    list_c = [[x, y] for x, y in zip(gemini_df.A1, gemini_df.A2)]
    genepairs = ['_'.join(natsorted(pair)) for pair in list_c]
    gemini_df.insert(0, 'genepair', genepairs, True)

    gemini_df = gemini_df[['genepair', 'A1', 'A2', 'A1_entrez', 'A2_entrez', 'DepMap_ID', 'GEMINI', 'LFC', 'org_A1', 'org_A2']]
    return gemini_df
    

In [14]:
gemini_ito_df_melt = process_gene_symbols(ito_df, id_map).sort_values(by='genepair').reset_index(drop=True)

In [15]:
# Function to sort each pair of gene symbols and their Entrez IDs
def sort_gene_pairs(row):
    # Sort the genes alphabetically and determine new order
    sorted_genes = natsorted([row['A1'], row['A2']])
    
    # Match the sorted genes to the original ones and rearrange Entrez IDs accordingly
    if sorted_genes[0] == row['A1']:
        return pd.Series([sorted_genes[0], sorted_genes[1], row['A1_entrez'], row['A2_entrez']])
    else:
        return pd.Series([sorted_genes[0], sorted_genes[1], row['A2_entrez'], row['A1_entrez']])

In [16]:
# Define the function to process the DataFrame
def process_dataframe(df):
    # Apply the sorting to each row
    df[['A1_sorted', 'A2_sorted', 'A1_entrez_sorted', 'A2_entrez_sorted']] = df.apply(sort_gene_pairs, axis=1)
    
    # Drop the old columns and rename the new ones
    df = df.drop(columns=['A1', 'A2', 'A1_entrez', 'A2_entrez']).copy()
    df = df.rename(columns={
        'A1_sorted': 'A1',
        'A2_sorted': 'A2',
        'A1_entrez_sorted': 'A1_entrez',
        'A2_entrez_sorted': 'A2_entrez'
    })
    
    return df

In [17]:
gemini_ito_df = process_dataframe(gemini_ito_df_melt)

In [18]:
display(gemini_ito_df.loc[gemini_ito_df['genepair'] == 'ADAM8_ADAM12'])

Unnamed: 0,genepair,DepMap_ID,GEMINI,LFC,org_A1,org_A2,A1,A2,A1_entrez,A2_entrez
1661,ADAM8_ADAM12,ACH-000801,0.542718,-0.401475,ADAM12,ADAM8,ADAM8,ADAM12,101.0,8038.0
1662,ADAM8_ADAM12,ACH-000881,0.138926,-0.04363,ADAM12,ADAM8,ADAM8,ADAM12,101.0,8038.0
1663,ADAM8_ADAM12,ACH-001554,0.222868,-0.064732,ADAM12,ADAM8,ADAM8,ADAM12,101.0,8038.0
1664,ADAM8_ADAM12,ACH-001524,0.372903,-0.209467,ADAM12,ADAM8,ADAM8,ADAM12,101.0,8038.0
1665,ADAM8_ADAM12,ACH-000681,0.4825,-0.51374,ADAM12,ADAM8,ADAM8,ADAM12,101.0,8038.0
1666,ADAM8_ADAM12,ACH-000632,0.298943,-0.170243,ADAM12,ADAM8,ADAM8,ADAM12,101.0,8038.0
1667,ADAM8_ADAM12,ACH-000987,0.189274,-0.063444,ADAM12,ADAM8,ADAM8,ADAM12,101.0,8038.0
1668,ADAM8_ADAM12,ACH-000756,0.041007,0.049023,ADAM12,ADAM8,ADAM8,ADAM12,101.0,8038.0
1669,ADAM8_ADAM12,ACH-000915,0.13391,0.010868,ADAM12,ADAM8,ADAM8,ADAM12,101.0,8038.0
1670,ADAM8_ADAM12,ACH-000307,0.079471,-0.104993,ADAM12,ADAM8,ADAM8,ADAM12,101.0,8038.0


In [20]:
def get_target_files(folder, pattern_suffix='_scored'):
    """Return sorted list of CSV file paths and their base names (without _scored suffix) from a folder."""
    csv_files = [
        os.path.join(folder, f)
        for f in os.listdir(folder)
        if f.endswith('.csv') and pattern_suffix in f
    ]
    csv_files = sorted(csv_files)
    # Extract base names without the pattern suffix and .csv extension
    filenames = [
        os.path.splitext(os.path.basename(f))[0].replace(pattern_suffix, '') 
        for f in csv_files
    ]
    return csv_files, filenames

In [21]:
crispr_files, filenames = get_target_files(crispr_screens_path)
crispr_files

['/Users/narod/Library/CloudStorage/GoogleDrive-narod.kebabci@ucdconnect.ie/My Drive/GitRepos/context_specific_SL_prediction/output/processed_CRISPR_screens/processed_ito_df_scored.csv',
 '/Users/narod/Library/CloudStorage/GoogleDrive-narod.kebabci@ucdconnect.ie/My Drive/GitRepos/context_specific_SL_prediction/output/processed_CRISPR_screens/processed_klingbeil_df_scored.csv']

In [22]:
ito_annotated = pd.read_csv(crispr_files[0])
ito_annotated = pd.merge(ito_annotated, gemini_ito_df.drop(['A1', 'A2', 'org_A2', 'org_A1', 'A1_entrez', 'A2_entrez'], axis=1), 
                         on=['genepair', 'DepMap_ID'], how='left')
ito_annotated[:3]

Unnamed: 0,genepair,A1,A2,A1_entrez,A2_entrez,DepMap_ID,cell_line,Gemini_FDR,raw_LFC,SL,...,mean_complex_essentiality,colocalisation,interact,n_total_ppi,fet_ppi_overlap,gtex_spearman_corr,gtex_min_mean_expr,gtex_max_mean_expr,GEMINI,LFC
0,A3GALT2_ABO,A3GALT2,ABO,127550.0,28.0,ACH-000022,PATU8988S_PANCREAS,0.998944,0.088856,False,...,0.0,0.0,False,3.0,0.0,0.114847,0.258739,11.702,0.118768,0.088856
1,A3GALT2_ABO,A3GALT2,ABO,127550.0,28.0,ACH-000307,PK1_PANCREAS,0.986587,0.201704,False,...,0.0,0.0,False,3.0,0.0,0.114847,0.258739,11.702,0.132501,0.201704
2,A3GALT2_ABO,A3GALT2,ABO,127550.0,28.0,ACH-000632,HS944T_SKIN,1.0,0.069772,False,...,0.0,0.0,False,3.0,0.0,0.114847,0.258739,11.702,0.024593,0.069772


In [23]:
klignbeil_annotated = pd.read_csv(crispr_files[1])
klignbeil_annotated[:3]

Unnamed: 0,GENE_COMBINATION,domain_combination,genepair,A1,A2,A1_entrez,A2_entrez,cell_line,DepMap_ID,GEMINI,...,mean_age,either_in_complex,mean_complex_essentiality,colocalisation,interact,n_total_ppi,fet_ppi_overlap,gtex_spearman_corr,gtex_min_mean_expr,gtex_max_mean_expr
0,AAK1:Kinase_domain;BMP2K:Kinase_domain,Kinase_domain_Kinase_domain,AAK1_BMP2K,AAK1,BMP2K,22848.0,55589.0,HEL,ACH-000004,0.218665,...,976.25,False,0.0,0.0,False,77.0,21.867726,0.261701,6.713555,6.761786
1,AAK1:Kinase_domain;BMP2K:Kinase_domain,Kinase_domain_Kinase_domain,AAK1_BMP2K,AAK1,BMP2K,22848.0,55589.0,T3M4,ACH-000085,0.205641,...,976.25,False,0.0,0.0,False,77.0,21.867726,0.261701,6.713555,6.761786
2,AAK1:Kinase_domain;BMP2K:Kinase_domain,Kinase_domain_Kinase_domain,AAK1_BMP2K,AAK1,BMP2K,22848.0,55589.0,HPAFII,ACH-000094,0.044486,...,976.25,False,0.0,0.0,False,77.0,21.867726,0.261701,6.713555,6.761786


In [24]:
annotated_datasets = [ito_annotated, klignbeil_annotated]

In [25]:
def label_top_10(df, lfc_threshold, label):
    
    # Drop rows where GEMINI is NaN
    df = df.dropna(subset=['GEMINI']).copy()

    # Check if the DataFrame is empty after dropping NaN values
    if df.empty:
        raise ValueError("The DataFrame is empty after dropping rows with NaN values in 'GEMINI'.")
    
    # Calculate the threshold for top 10%  
    percentile = 90
    threshold = np.percentile(df['GEMINI'], percentile)

    # Label the top 10% 
    df[label] = (df['GEMINI'] >= threshold) & (df['LFC'] < lfc_threshold)

    return df

In [26]:
# ito
ito_lfc_threshold = -0.51
annotated_datasets[0] = label_top_10(annotated_datasets[0], ito_lfc_threshold, 'SL_new')

print('ito')
display(annotated_datasets[0][:3])
print('')

# parrish
#parrish_lfc_threshold = -0.63
#annotated_target_pairs[1] = label_top_10(annotated_target_pairs[1], parrish_lfc_threshold, 'SL_new')

#print('parrish')
#display(annotated_target_pairs[1][:3])
#print('')

# kln
kln_lfc_threshold = -1.03
annotated_datasets[1] = label_top_10(annotated_datasets[1], kln_lfc_threshold, 'SL_new')

print('Kln')
display(annotated_datasets[1][:3])
print('')

ito


Unnamed: 0,genepair,A1,A2,A1_entrez,A2_entrez,DepMap_ID,cell_line,Gemini_FDR,raw_LFC,SL,...,colocalisation,interact,n_total_ppi,fet_ppi_overlap,gtex_spearman_corr,gtex_min_mean_expr,gtex_max_mean_expr,GEMINI,LFC,SL_new
0,A3GALT2_ABO,A3GALT2,ABO,127550.0,28.0,ACH-000022,PATU8988S_PANCREAS,0.998944,0.088856,False,...,0.0,False,3.0,0.0,0.114847,0.258739,11.702,0.118768,0.088856,False
1,A3GALT2_ABO,A3GALT2,ABO,127550.0,28.0,ACH-000307,PK1_PANCREAS,0.986587,0.201704,False,...,0.0,False,3.0,0.0,0.114847,0.258739,11.702,0.132501,0.201704,False
2,A3GALT2_ABO,A3GALT2,ABO,127550.0,28.0,ACH-000632,HS944T_SKIN,1.0,0.069772,False,...,0.0,False,3.0,0.0,0.114847,0.258739,11.702,0.024593,0.069772,False



Kln


Unnamed: 0,GENE_COMBINATION,domain_combination,genepair,A1,A2,A1_entrez,A2_entrez,cell_line,DepMap_ID,GEMINI,...,either_in_complex,mean_complex_essentiality,colocalisation,interact,n_total_ppi,fet_ppi_overlap,gtex_spearman_corr,gtex_min_mean_expr,gtex_max_mean_expr,SL_new
0,AAK1:Kinase_domain;BMP2K:Kinase_domain,Kinase_domain_Kinase_domain,AAK1_BMP2K,AAK1,BMP2K,22848.0,55589.0,HEL,ACH-000004,0.218665,...,False,0.0,0.0,False,77.0,21.867726,0.261701,6.713555,6.761786,False
1,AAK1:Kinase_domain;BMP2K:Kinase_domain,Kinase_domain_Kinase_domain,AAK1_BMP2K,AAK1,BMP2K,22848.0,55589.0,T3M4,ACH-000085,0.205641,...,False,0.0,0.0,False,77.0,21.867726,0.261701,6.713555,6.761786,False
2,AAK1:Kinase_domain;BMP2K:Kinase_domain,Kinase_domain_Kinase_domain,AAK1_BMP2K,AAK1,BMP2K,22848.0,55589.0,HPAFII,ACH-000094,0.044486,...,False,0.0,0.0,False,77.0,21.867726,0.261701,6.713555,6.761786,False





In [27]:
print('ito')
display(annotated_datasets[0]['SL_new'].value_counts())
print('')

#print('parrish')
#display(annotated_target_pairs[1]['SL_new'].value_counts())
#print('')

print('Kln')
display(annotated_datasets[1]['SL_new'].value_counts())
print('')

ito


SL_new
False    48138
True      1018
Name: count, dtype: int64


Kln


SL_new
False    50878
True      2009
Name: count, dtype: int64




In [28]:
output_dir = get_data_path(['output', 'processed_CRISPR_screens'], '')

for i, df in enumerate(annotated_datasets):
    base_filename = filenames[i]
    output_path = os.path.join(output_dir, f"{base_filename}_labelled.csv")
    df.to_csv(output_path, index=False)
    print(f"Saved: {output_path}")

Saved: /Users/narod/Library/CloudStorage/GoogleDrive-narod.kebabci@ucdconnect.ie/My Drive/GitRepos/context_specific_SL_prediction/output/processed_CRISPR_screens/processed_ito_df_labelled.csv
Saved: /Users/narod/Library/CloudStorage/GoogleDrive-narod.kebabci@ucdconnect.ie/My Drive/GitRepos/context_specific_SL_prediction/output/processed_CRISPR_screens/processed_klingbeil_df_labelled.csv
