In [8]:
# import modules
import os
import pandas as pd
import numpy as np
import pyarrow.parquet as pq
from natsort import natsorted

In [18]:
# Project root: two directory levels above the current working directory
cwd = os.getcwd()
BASE_DIR = os.path.abspath(os.path.join(cwd, os.pardir, os.pardir))


def get_data_path(folders, fname):
    """Return the normalised path ``BASE_DIR/<folders...>/fname``.

    Parameters
    ----------
    folders : iterable of str
        Directory names below ``BASE_DIR``, outermost first.
    fname : str
        File name appended last; pass '' to get the directory itself.
    """
    parts = [BASE_DIR, *folders, fname]
    return os.path.normpath(os.path.join(*parts))


# Locations of the input and output files used in this notebook
crispr_screens_path = get_data_path(['output', 'processed_CRISPR_screens'], '')
lfc_files_path = get_data_path(['input', 'LFC'], 'ITO_LFC.csv')
gemini_files_path = get_data_path(['input', 'GEMINI'], 'Gemini_ITO_Sensitive_Lethality.csv')
file_path_genenames = get_data_path(['input', 'other'], 'genenames.txt')

##### Processing GEMINI scores of the datasets

In [14]:
# Load the GEMINI score matrix of the Ito screen (one column per cell line)
gemini_ito = pd.read_csv(gemini_files_path)

# The unnamed first CSV column holds the gene pair; the cell-line columns are
# relabelled with their 'ACH-...' identifiers.
gemini_ito_df = gemini_ito.rename(columns={
    'Unnamed: 0': 'genepair',
    'Meljuso': 'ACH-000881',
    'GI1_004': 'ACH-000756',
    'MEL202_003': 'ACH-001554',
    'PK1': 'ACH-000307',
    'MEWO': 'ACH-000987',
    'HS944T': 'ACH-000632',
    'IPC298': 'ACH-000915',
    'A549': 'ACH-000681',
    'HSC5': 'ACH-001524',
    'HS936T': 'ACH-000801',
    'PATU8988S': 'ACH-000022',
})

# Gene pairs are written 'A;B' in the file; use 'A_B' instead
gemini_ito_df['genepair'] = gemini_ito_df['genepair'].apply(lambda x: x.replace(';', '_'))

# Wide -> long: one row per (gene pair, cell line), ordered by gene pair
gemini_ito_df = (
    gemini_ito_df
    .melt(id_vars=['genepair'], var_name='DepMap_ID', value_name='GEMINI')
    .sort_values(by='genepair')
    .reset_index(drop=True)
)
print('number of unique gene pairs:', gemini_ito_df['genepair'].nunique())
gemini_ito_df.head()

number of unique gene pairs: 5065


Unnamed: 0,genepair,DepMap_ID,GEMINI
0,A3GALT2_ABO,ACH-000881,-0.137329
1,A3GALT2_ABO,ACH-000307,0.132501
2,A3GALT2_ABO,ACH-000915,0.083927
3,A3GALT2_ABO,ACH-001554,-0.077458
4,A3GALT2_ABO,ACH-001524,0.218901


#### Processing LFC of the datasets

In [15]:
# Load the LFC matrix of the Ito screen (first CSV column is used as the index)
lfc_ito = pd.read_csv(lfc_files_path, index_col=0)

# Rename the gene-pair column and relabel the cell-line columns with their
# 'ACH-...' identifiers.
lfc_ito_df = lfc_ito.rename(columns={
    'gene_pair': 'genepair',
    'Meljuso': 'ACH-000881',
    'GI1_004': 'ACH-000756',
    'MEL202_003': 'ACH-001554',
    'PK1': 'ACH-000307',
    'MEWO': 'ACH-000987',
    'HS944T': 'ACH-000632',
    'IPC298': 'ACH-000915',
    'A549': 'ACH-000681',
    'HSC5': 'ACH-001524',
    'HS936T': 'ACH-000801',
    'PATU8988S': 'ACH-000022',
})

# Wide -> long: one row per (gene pair, cell line), ordered by gene pair
lfc_ito_df = (
    lfc_ito_df
    .melt(id_vars=['genepair'], var_name='DepMap_ID', value_name='LFC')
    .sort_values(by='genepair')
    .reset_index(drop=True)
)
lfc_ito_df.head(3)

Unnamed: 0,genepair,DepMap_ID,LFC
0,A3GALT2_AAVS1,ACH-000881,0.343378
1,A3GALT2_AAVS1,ACH-000681,0.166162
2,A3GALT2_AAVS1,ACH-000987,0.216834


In [17]:
# Attach the LFC of each (gene pair, cell line) to its GEMINI score. The left join
# keeps every GEMINI row; LFC is NaN where no matching LFC row exists.
ito_df = gemini_ito_df.merge(lfc_ito_df, how='left', on=['genepair', 'DepMap_ID'])
ito_df.head(3)

Unnamed: 0,genepair,DepMap_ID,GEMINI,LFC
0,A3GALT2_ABO,ACH-000881,-0.137329,0.283725
1,A3GALT2_ABO,ACH-000307,0.132501,0.201704
2,A3GALT2_ABO,ACH-000915,0.083927,-0.026881


In [19]:
# Read the gene-name table used for symbol/ID mapping; every column is loaded as a string
id_map_raw = pd.read_table(file_path_genenames, dtype = "str")

# Keep only the identifier columns needed for the mapping. The explicit .copy() makes
# id_map an independent frame, so the column assignments performed on it afterwards
# do not raise pandas' SettingWithCopyWarning.
id_map = id_map_raw[['HGNC ID', 'Approved symbol', 'NCBI Gene ID', 'NCBI Gene ID(supplied by NCBI)', 'Ensembl gene ID', 'Ensembl ID(supplied by Ensembl)', 'Previous symbols']].copy()

# Helper for the 'Previous symbols' column: comma-separated string -> list
def process_value(value):
    """Split a comma-separated string into a list of its parts.

    Anything that is not a string, the literal 'NA', and strings without a
    comma are returned unchanged. The parts are not stripped of whitespace.
    """
    has_several = isinstance(value, str) and value != 'NA' and ',' in value
    return value.split(',') if has_several else value
    
# Turn comma-separated 'Previous symbols' strings into lists (see process_value)
id_map.loc[:,'Previous symbols'] = id_map['Previous symbols'].apply(process_value)
# One row per previous symbol; entries that are not lists stay as single rows
id_map = id_map.explode('Previous symbols')
# Remove the whitespace left around the symbols after splitting on ','
id_map['Previous symbols'] = id_map['Previous symbols'].str.strip()
# explode() repeats the index label for every list element, so renumber the rows
id_map = id_map.reset_index(drop = True)

id_map = id_map.copy()
# One Entrez ID and one Ensembl ID per row: take the "supplied by ..." column and
# fall back to the other ID column where that value is missing
id_map['entrez_id'] = id_map['NCBI Gene ID(supplied by NCBI)'].combine_first(id_map['NCBI Gene ID'])
id_map['ensembl_id'] = id_map['Ensembl ID(supplied by Ensembl)'].combine_first(id_map['Ensembl gene ID'])

id_map.head()

Unnamed: 0,HGNC ID,Approved symbol,NCBI Gene ID,NCBI Gene ID(supplied by NCBI),Ensembl gene ID,Ensembl ID(supplied by Ensembl),Previous symbols,entrez_id,ensembl_id
0,HGNC:5,A1BG,1,1,ENSG00000121410,ENSG00000121410,,1,ENSG00000121410
1,HGNC:37133,A1BG-AS1,503538,503538,ENSG00000268895,ENSG00000268895,NCRNA00181,503538,ENSG00000268895
2,HGNC:37133,A1BG-AS1,503538,503538,ENSG00000268895,ENSG00000268895,A1BGAS,503538,ENSG00000268895
3,HGNC:37133,A1BG-AS1,503538,503538,ENSG00000268895,ENSG00000268895,A1BG-AS,503538,ENSG00000268895
4,HGNC:24086,A1CF,29974,29974,ENSG00000148584,ENSG00000148584,,29974,ENSG00000148584


In [None]:
""" def process_gene_symbols(gemini_df, id_map):
    # Split gene pairs into two columns
    gemini_df.insert(1, "A1", gemini_df['genepair'].apply(lambda x: x.split("_", 1)[0]))
    gemini_df.insert(2, "A2", gemini_df['genepair'].apply(lambda x: x.split("_", 1)[1]))

    # Build dictionary for mapping Approved symbols to NCBI Gene IDs
    id_map_approved = id_map.dropna(subset=['Approved symbol', 'NCBI Gene ID(supplied by NCBI)'], how='any').reset_index(drop=True)
    symbol_to_entrezid = dict(zip(id_map_approved['Approved symbol'], id_map_approved['NCBI Gene ID(supplied by NCBI)']))

    # Assign mapped NCBI Gene IDs to A1 and A2
    gemini_df = gemini_df.assign(
        A1_entrez = gemini_df['A1'].map(symbol_to_entrezid),
        A2_entrez = gemini_df['A2'].map(symbol_to_entrezid))

    # Map previous symbols to NCBI Gene IDs if the approved symbol mapping resulted in NA values
    id_map_previous = id_map.dropna(subset=['Previous symbols', 'NCBI Gene ID(supplied by NCBI)'], how='any').reset_index(drop=True)
    prev_sym_to_entrez_id = dict(zip(id_map_previous['Previous symbols'], id_map_previous['NCBI Gene ID(supplied by NCBI)']))

    gemini_df['A1_entrez'] = gemini_df['A1_entrez'].fillna(gemini_df['A1'].map(prev_sym_to_entrez_id))
    gemini_df['A2_entrez'] = gemini_df['A2_entrez'].fillna(gemini_df['A2'].map(prev_sym_to_entrez_id))

    # Drop rows with unresolved NCBI Gene IDs
    gemini_df = gemini_df.dropna(subset=['A1_entrez', 'A2_entrez'], how='any').reset_index(drop=True)
    gemini_df = gemini_df.drop('genepair', axis=1)

    # Dynamically determine value_vars: all columns except for the specified id_vars
    id_vars = ['A1', 'A2', 'A1_entrez', 'A2_entrez']
    value_vars = [col for col in gemini_df.columns if col not in id_vars]

    # Melt the DataFrame using dynamic value_vars
    gemini_df_melt = pd.melt(gemini_df, id_vars=id_vars,
                             value_vars=value_vars,
                             var_name='DepMap_ID', value_name='GEMINI_score')
    
    # Continue with renaming, remapping, and natural sorting as previously described
    gemini_df_melt.rename(columns={'A1': 'org_A1', 'A2': 'org_A2'}, inplace=True)
    id_map_approved = id_map.dropna(subset=['Approved symbol', 'NCBI Gene ID(supplied by NCBI)'], how='any').reset_index(drop=True)
    entrezid_to_symbol = dict(zip(id_map_approved['NCBI Gene ID(supplied by NCBI)'], id_map_approved['Approved symbol']))

    gemini_df_melt.insert(1, 'A1', gemini_df_melt['A1_entrez'].map(entrezid_to_symbol))
    gemini_df_melt.insert(2, 'A2', gemini_df_melt['A2_entrez'].map(entrezid_to_symbol))

    list_c = [[x, y] for x, y in zip(gemini_df_melt.A1, gemini_df_melt.A2)]
    genepairs = ['_'.join(natsorted(pair)) for pair in list_c]
    gemini_df_melt.insert(0, 'genepair', genepairs, True)

    gemini_df_melt = gemini_df_melt[['genepair', 'A1', 'A2', 'A1_entrez', 'A2_entrez', 'DepMap_ID', 'GEMINI_score', 'org_A1', 'org_A2']]
    return gemini_df_melt """


In [20]:
def process_gene_symbols(gemini_df, id_map):
    """Resolve both genes of every pair to Entrez IDs and current approved symbols.

    Parameters
    ----------
    gemini_df : pandas.DataFrame
        Long-format table with the columns 'genepair' (two gene symbols joined by
        '_'), 'DepMap_ID', 'GEMINI' and 'LFC'. The frame is not modified.
    id_map : pandas.DataFrame
        Mapping table with the columns 'Approved symbol', 'Previous symbols' and
        'entrez_id'.

    Returns
    -------
    pandas.DataFrame
        Columns 'genepair', 'A1', 'A2', 'A1_entrez', 'A2_entrez', 'DepMap_ID',
        'GEMINI', 'LFC', 'org_A1', 'org_A2'. 'org_A1'/'org_A2' are the symbols as
        they appeared in the input, 'A1'/'A2' the approved symbols looked up from
        the Entrez IDs, and 'genepair' is rebuilt from 'A1'/'A2' in natural sort
        order. Rows where either gene has no Entrez ID are dropped.
    """
    # Work on a copy: inserting columns into the caller's frame would make a second
    # call on the same input fail with "cannot insert A1, already exists".
    pairs = gemini_df.copy()

    # Split gene pairs into two columns (only on the first '_')
    pairs.insert(1, "A1", pairs['genepair'].apply(lambda x: x.split("_", 1)[0]))
    pairs.insert(2, "A2", pairs['genepair'].apply(lambda x: x.split("_", 1)[1]))

    # Lookup tables between approved symbols and Entrez IDs (both directions)
    id_map_approved = id_map.dropna(subset=['Approved symbol', 'entrez_id'], how='any').reset_index(drop=True)
    symbol_to_entrezid = dict(zip(id_map_approved['Approved symbol'], id_map_approved['entrez_id']))
    entrezid_to_symbol = dict(zip(id_map_approved['entrez_id'], id_map_approved['Approved symbol']))

    # Lookup table from previous symbols to Entrez IDs, used as a fallback
    id_map_previous = id_map.dropna(subset=['Previous symbols', 'entrez_id'], how='any').reset_index(drop=True)
    prev_sym_to_entrez_id = dict(zip(id_map_previous['Previous symbols'], id_map_previous['entrez_id']))

    # Approved-symbol mapping first; previous symbols fill what is still missing
    for gene_col in ('A1', 'A2'):
        by_approved = pairs[gene_col].map(symbol_to_entrezid)
        by_previous = pairs[gene_col].map(prev_sym_to_entrez_id)
        pairs[f'{gene_col}_entrez'] = by_approved.fillna(by_previous)

    # Drop rows with unresolved Entrez IDs
    pairs = pairs.dropna(subset=['A1_entrez', 'A2_entrez'], how='any').reset_index(drop=True)

    # Keep the input symbols under org_*; A1/A2 become the approved symbols
    pairs = pairs.drop(columns='genepair').rename(columns={'A1': 'org_A1', 'A2': 'org_A2'})
    pairs['A1'] = pairs['A1_entrez'].map(entrezid_to_symbol)
    pairs['A2'] = pairs['A2_entrez'].map(entrezid_to_symbol)

    # Rebuild the pair name from the approved symbols in natural sort order
    pairs['genepair'] = ['_'.join(natsorted(pair)) for pair in zip(pairs['A1'], pairs['A2'])]

    return pairs[['genepair', 'A1', 'A2', 'A1_entrez', 'A2_entrez', 'DepMap_ID', 'GEMINI', 'LFC', 'org_A1', 'org_A2']]
    

In [21]:
# Map the gene symbols of the merged GEMINI/LFC table, then order rows by gene pair
gemini_ito_df_melt = (
    process_gene_symbols(ito_df, id_map)
    .sort_values(by='genepair')
    .reset_index(drop=True)
)

In [22]:
# Row-wise helper: put the two genes of a pair (and their Entrez IDs) in natural sort order
def sort_gene_pairs(row):
    """Return the row's genes in natural sort order with matching Entrez IDs.

    ``row`` must provide 'A1', 'A2', 'A1_entrez' and 'A2_entrez'. The result is a
    4-element Series: first gene, second gene, Entrez ID of the first gene,
    Entrez ID of the second gene.
    """
    first_gene, second_gene = natsorted([row['A1'], row['A2']])

    # If A1 is already first the IDs keep their order, otherwise they are swapped
    if first_gene == row['A1']:
        entrez_ids = [row['A1_entrez'], row['A2_entrez']]
    else:
        entrez_ids = [row['A2_entrez'], row['A1_entrez']]

    return pd.Series([first_gene, second_gene, *entrez_ids])

In [23]:
# Define the function to process the DataFrame
def process_dataframe(df):
    """Return a copy of ``df`` whose A1/A2 (and Entrez IDs) are sorted within each pair.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'A1', 'A2', 'A1_entrez' and 'A2_entrez'.
        The frame is not modified.

    Returns
    -------
    pandas.DataFrame
        All other columns of ``df`` in their original order, followed by 'A1',
        'A2', 'A1_entrez', 'A2_entrez' as produced by ``sort_gene_pairs``.
    """
    gene_cols = ['A1', 'A2', 'A1_entrez', 'A2_entrez']

    # Nothing to sort in an empty frame; only reproduce the output column order
    if df.empty:
        other_cols = [col for col in df.columns if col not in gene_cols]
        return df[other_cols + gene_cols].copy()

    # Compute the sorted values separately instead of writing temporary
    # '*_sorted' columns into the caller's frame
    sorted_values = df.apply(sort_gene_pairs, axis=1)
    sorted_values.columns = gene_cols

    # Replace the old columns; the new ones are appended at the end
    result = df.drop(columns=gene_cols).copy()
    for col in gene_cols:
        result[col] = sorted_values[col]

    return result

In [24]:
# Sort A1/A2 and their Entrez IDs within each pair. This rebinds gemini_ito_df,
# which earlier in the notebook held the melted GEMINI-only table.
gemini_ito_df = process_dataframe(gemini_ito_df_melt)

In [25]:
# Spot-check a single gene pair after symbol mapping and sorting
display(gemini_ito_df.loc[gemini_ito_df['genepair'] == 'ADAM8_ADAM12'])

Unnamed: 0,genepair,DepMap_ID,GEMINI,LFC,org_A1,org_A2,A1,A2,A1_entrez,A2_entrez
1661,ADAM8_ADAM12,ACH-000801,0.542718,-0.401475,ADAM12,ADAM8,ADAM8,ADAM12,101,8038
1662,ADAM8_ADAM12,ACH-000881,0.138926,-0.04363,ADAM12,ADAM8,ADAM8,ADAM12,101,8038
1663,ADAM8_ADAM12,ACH-001554,0.222868,-0.064732,ADAM12,ADAM8,ADAM8,ADAM12,101,8038
1664,ADAM8_ADAM12,ACH-001524,0.372903,-0.209467,ADAM12,ADAM8,ADAM8,ADAM12,101,8038
1665,ADAM8_ADAM12,ACH-000681,0.4825,-0.51374,ADAM12,ADAM8,ADAM8,ADAM12,101,8038
1666,ADAM8_ADAM12,ACH-000632,0.298943,-0.170243,ADAM12,ADAM8,ADAM8,ADAM12,101,8038
1667,ADAM8_ADAM12,ACH-000987,0.189274,-0.063444,ADAM12,ADAM8,ADAM8,ADAM12,101,8038
1668,ADAM8_ADAM12,ACH-000756,0.041007,0.049023,ADAM12,ADAM8,ADAM8,ADAM12,101,8038
1669,ADAM8_ADAM12,ACH-000915,0.13391,0.010868,ADAM12,ADAM8,ADAM8,ADAM12,101,8038
1670,ADAM8_ADAM12,ACH-000307,0.079471,-0.104993,ADAM12,ADAM8,ADAM8,ADAM12,101,8038


In [25]:
# Collect the annotated target-pair tables in a list (nothing is read from disk here).
# NOTE(review): ito_annotated and parrish_annotated are not defined in the cells
# visible in this notebook section; confirm they are loaded before this cell runs.
target_pairs_df = [ito_annotated, parrish_annotated]

In [26]:
# List (not a dictionary) of the GEMINI/LFC tables, in the same order as target_pairs_df.
# NOTE(review): gemini_parrish_df is not defined in the cells visible in this notebook
# section; confirm it is created before this cell runs.
gemini_df = [gemini_ito_df, gemini_parrish_df]

In [32]:
# Rebind both lists to single-element lists so that only the Ito screen is processed
target_pairs_df = [ito_annotated]
gemini_df = [gemini_ito_df]

In [34]:
def merge_datasets(target_pairs_list, df_melt_list):
    """Left-join each target-pair table with its score table.

    The two lists are paired positionally. From every score table the columns
    'A1', 'A2', 'org_A2', 'org_A1', 'A1_entrez' and 'A2_entrez' are removed, and
    the remainder is merged onto the target pairs on ['genepair', 'DepMap_ID'].

    Returns a list with one merged DataFrame per input pair.
    """
    redundant_cols = ['A1', 'A2', 'org_A2', 'org_A1', 'A1_entrez', 'A2_entrez']
    return [
        target_pairs.merge(
            df_melt.drop(columns=redundant_cols),
            how='left',
            on=['genepair', 'DepMap_ID'],
        )
        for target_pairs, df_melt in zip(target_pairs_list, df_melt_list)
    ]

In [35]:
# Add the GEMINI and LFC columns to every target-pair table (one merged frame per screen)
annotated_target_pairs = merge_datasets(target_pairs_df, gemini_df)

In [29]:
# Add klingbeil_df as a further entry of the list.
# NOTE(review): klingbeil_df is not defined in the cells visible in this notebook
# section, and re-running this cell appends the frame again.
annotated_target_pairs.append(klingbeil_df)

In [37]:
def label_top_10(df, lfc_threshold, label, percentile=90):
    """Flag rows with a high GEMINI score and an LFC below a cut-off.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'GEMINI' and 'LFC'. The frame is not modified.
    lfc_threshold : float
        A row is only flagged when its 'LFC' is strictly below this value.
    label : str
        Name of the boolean column added to the result.
    percentile : float, default 90
        Percentile of the non-NaN 'GEMINI' values used as the score cut-off;
        the default of 90 flags scores in the top 10%.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` WITHOUT the rows whose 'GEMINI' is NaN, plus the boolean
        column ``label`` (True where GEMINI >= cut-off and LFC < lfc_threshold).

    Raises
    ------
    ValueError
        If no rows remain after dropping NaN 'GEMINI' values.
    """
    # Rows without a GEMINI score cannot be ranked and are removed from the result
    scored = df.dropna(subset=['GEMINI']).copy()

    if scored.empty:
        raise ValueError("The DataFrame is empty after dropping rows with NaN values in 'GEMINI'.")

    # Score cut-off for the requested upper tail
    gemini_cutoff = np.percentile(scored['GEMINI'], percentile)

    # Both conditions must hold; a NaN LFC compares as False and is never flagged
    scored[label] = (scored['GEMINI'] >= gemini_cutoff) & (scored['LFC'] < lfc_threshold)

    return scored

In [None]:
# annotated_target_pairs = pd.merge(ito_annotated, gemini_ito_df[['genepair', 'DepMap_ID', 'GEMINI', 'LFC']], how='left', on=['genepair', 'DepMap_ID'])

# # ito
# ito_lfc_threshold = -0.51
# annotated_target_pairs = label_top_10(annotated_target_pairs, ito_lfc_threshold, 'SL_new')

# print('ito')
# display(annotated_target_pairs[:3])
# display(annotated_target_pairs.SL_new.value_counts())
# print('')

#os.chdir('/Users/narod/Documents/GitHub/context_specific_SL_prediction/1_feature_calculation/annotations/output_files')

#annotated_target_pairs.to_csv('./ito_pairs/ito_pairs_all.csv', index=False)

In [38]:
# LFC cut-off applied to the first dataset (Ito) when assigning the 'SL_new' label
ito_lfc_threshold = -0.51
# label_top_10 returns a new frame without NaN-GEMINI rows; store it back in the list
annotated_target_pairs[0] = label_top_10(annotated_target_pairs[0], ito_lfc_threshold, 'SL_new')

In [31]:
# LFC cut-offs used for the 'SL_new' label, one per screen
ito_lfc_threshold = -0.51
parrish_lfc_threshold = -0.63
kln_lfc_threshold = -1.03

# (position in annotated_target_pairs, printed heading, LFC cut-off)
_screen_settings = [
    (0, 'ito', ito_lfc_threshold),
    (1, 'parrish', parrish_lfc_threshold),
    (2, 'Kln', kln_lfc_threshold),
]

# Label each screen in turn and show the first three rows of the result
for _position, _heading, _lfc_cutoff in _screen_settings:
    annotated_target_pairs[_position] = label_top_10(
        annotated_target_pairs[_position], _lfc_cutoff, 'SL_new'
    )

    print(_heading)
    display(annotated_target_pairs[_position][:3])
    print('')

ito


Unnamed: 0,genepair,A1,A2,A1_entrez,A2_entrez,DepMap_ID,cell_line,Gemini_FDR,raw_LFC,SL,...,A2_rank,zA2_rank,max_ranked_A1A2,min_ranked_A1A2,z_max_ranked_A1A2,z_min_ranked_A1A2,prediction_score,GEMINI,LFC,SL_new
0,A3GALT2_ABO,A3GALT2,ABO,127550,28,ACH-000022,PATU8988S_PANCREAS,0.998944,0.088856,False,...,,,5108.0,5108.0,-0.759628,-0.759628,0.012559,0.118768,0.088856,False
1,A3GALT2_ABO,A3GALT2,ABO,127550,28,ACH-000307,PK1_PANCREAS,0.986587,0.201704,False,...,,,9125.0,9125.0,0.303431,0.303431,0.012559,0.132501,0.201704,False
2,A3GALT2_ABO,A3GALT2,ABO,127550,28,ACH-000632,HS944T_SKIN,1.0,0.069772,False,...,,,4063.0,4063.0,-1.036177,-1.036177,0.012559,0.024593,0.069772,False



parrish


Unnamed: 0,genepair,A1,A2,A1_entrez,A2_entrez,DepMap_ID,cell_line,GI_score,FDR,SL,...,A2_rank,zA2_rank,max_ranked_A1A2,min_ranked_A1A2,z_max_ranked_A1A2,z_min_ranked_A1A2,prediction_score,GEMINI,LFC,SL_new
0,A2M_PZP,A2M,PZP,2,5858,ACH-000779,PC9_LUNG,0.264313,0.138809,False,...,12682.0,0.65429,16883.0,12682.0,1.674342,0.65429,0.041682,-0.249934,0.333503,False
1,A2M_PZP,A2M,PZP,2,5858,ACH-001086,HELA_CERVIX,-0.15432,0.424612,False,...,16473.0,1.71615,16473.0,10395.0,1.71615,-0.314143,0.041682,-0.038689,-0.115678,False
2,AADACL3_AADACL4,AADACL3,AADACL4,126767,343066,ACH-000779,PC9_LUNG,-0.000281,0.992873,False,...,17756.0,1.523027,17756.0,16788.0,1.523027,1.267634,0.062883,0.022368,-0.106713,False



Kln


Unnamed: 0,GENE_COMBINATION,domain_combination,genepair,A1,A2,A1_entrez,A2_entrez,cell_line,DepMap_ID,GEMINI,...,A1_rank,zA1_rank,A2_rank,zA2_rank,max_ranked_A1A2,min_ranked_A1A2,z_max_ranked_A1A2,z_min_ranked_A1A2,prediction_score,SL_new
0,AAK1:Kinase_domain;BMP2K:Kinase_domain,Kinase_domain_Kinase_domain,AAK1_BMP2K,AAK1,BMP2K,22848,55589,HEL,ACH-000004,0.218665,...,15404.0,3.163858,10522.0,0.250715,15404.0,10522.0,3.163858,0.250715,0.054249,False
1,AAK1:Kinase_domain;BMP2K:Kinase_domain,Kinase_domain_Kinase_domain,AAK1_BMP2K,AAK1,BMP2K,22848,55589,T3M4,ACH-000085,0.205641,...,8578.0,0.800919,10295.0,0.190424,10295.0,8578.0,0.800919,0.190424,0.054249,False
2,AAK1:Kinase_domain;BMP2K:Kinase_domain,Kinase_domain_Kinase_domain,AAK1_BMP2K,AAK1,BMP2K,22848,55589,HPAFII,ACH-000094,0.044486,...,9466.0,1.108315,11125.0,0.410871,11125.0,9466.0,1.108315,0.410871,0.054249,False





In [41]:
# Number of rows labelled True / False in 'SL_new' for the first dataset (Ito)
print('ito')
display(annotated_target_pairs[0]['SL_new'].value_counts())
print('')

ito


SL_new
False    48138
True      1018
Name: count, dtype: int64




In [32]:
# Number of rows labelled True / False in 'SL_new', per screen
for _position, _heading in enumerate(['ito', 'parrish', 'Kln']):
    print(_heading)
    display(annotated_target_pairs[_position]['SL_new'].value_counts())
    print('')

ito


SL_new
False    48138
True      1018
Name: count, dtype: int64


parrish


SL_new
False    1654
True      141
Name: count, dtype: int64


Kln


SL_new
False    50878
True      2009
Name: count, dtype: int64




In [42]:
# NOTE(review): os.chdir changes the working directory for every cell run afterwards,
# and this absolute path is specific to one user's machine; consider writing to a
# location derived from BASE_DIR instead.
os.chdir('/Users/narod/Documents/GitHub/context_specific_SL_prediction/1_feature_calculation/annotations/output_files')

# Save the first dataset (Ito) to the new working directory, without the index column
annotated_target_pairs[0].to_csv('ito_pairs_annotated.csv', index=False)

In [33]:
# File-name prefixes, paired positionally with annotated_target_pairs,
# so the order here must match the order of the datasets in that list
names = ['ito', 'parrish', 'klingbeil']

# NOTE(review): absolute, user-specific output directory; presumably it must already
# exist, since nothing in this cell creates it. Consider deriving it from BASE_DIR.
folder_path = '/Users/narod/Documents/GitHub/context_specific_SL_prediction/1_feature_calculation/annotations/output_files/NEW'
#folder_path = '/Users/narod/Documents/GitHub/context_specific_SL_prediction/1_feature_calculation/annotations/output_files'
#folder_path = '/Users/narod/Library/CloudStorage/GoogleDrive-narod.kebabci@ucdconnect.ie/My Drive/SL PRED/input'

# Ensure the length of names matches the number of datasets you have
assert len(names) == len(annotated_target_pairs), "The number of names and datasets do not match."

# Write each dataset to '<name>_pairs_annotated.csv' inside folder_path
for name, dataset in zip(names, annotated_target_pairs):
    filename = os.path.join(folder_path, f'{name}_pairs_annotated.csv')
    dataset.to_csv(filename, index=False)  # Saves the DataFrame to CSV without the index column