### Ito et al. Screen

This notebook preprocesses CRISPR screen data from *Ito et al., 2021*.  
It loads the log-fold change (LFC) and false discovery rate (FDR) scores from the paper’s supplementary tables, cleans and formats them, and prepares the dataset for downstream feature annotation and for training the Random Forest classifier.

**Inputs:**  
- CSV files containing LFC and FDR scores from the *Ito et al.* supplementary materials.

**Outputs:**  
- A cleaned and merged dataset saved as a CSV file, ready for feature annotation and model training.


In [1]:
# import modules
import os
import pandas as pd
import numpy as np
from natsort import natsorted

In [2]:
# set the base directory for the project (two levels above the working directory)
cwd = os.getcwd()
BASE_DIR = os.path.abspath(os.path.join(cwd, "..", ".."))


def get_data_path(folders, fname):
    """Build a normalised path to a file inside the repository.

    Parameters
    ----------
    folders : iterable of str
        Sub-directories below ``BASE_DIR``, outermost first.
    fname : str
        File name inside the innermost folder.

    Returns
    -------
    str
        ``BASE_DIR/<folders...>/<fname>`` passed through ``os.path.normpath``.
    """
    return os.path.normpath(os.path.join(BASE_DIR, *folders, fname))


# inputs: supplementary tables of the screen
file_path_lfc = get_data_path(['input', 'CRISPR_screens'], 'ito_table4LFC.csv')
file_path_fdr = get_data_path(['input', 'CRISPR_screens'], 'ito_table9FDR.csv')

# inputs: gene symbol lookup table
file_path_genenames = get_data_path(['input', 'other'], 'approved_and_previous_symbols.csv')

# inputs: cell line annotation
file_path_sample_info = get_data_path(['input', 'DepMap22Q4'], 'sample_info.csv')

# output: processed screen written at the end of the notebook
file_path_processed_ito_df = get_data_path(['output', 'processed_CRISPR_screens'], 'processed_ito_df.csv')

In [3]:
# load the LFC and FDR score tables taken from the paper's supplementary material
ito_lfc, ito_fdr = (
    pd.read_csv(table_path) for table_path in (file_path_lfc, file_path_fdr)
)

In [4]:
# every distinct gene symbol that occurs on either side of a pair, in sorted order
ito_genes = pd.Series(
    pd.concat([ito_lfc["A1"], ito_lfc["A2"]]).sort_values().unique(),
    name="Genes",
)
print(len(ito_genes))

3285


In [5]:
# load the gene symbol lookup table
id_map = pd.read_csv(file_path_genenames)

# symbol -> Entrez ID lookups, one keyed by approved and one by previous symbols
approved_sym_to_entrez_id = {
    symbol: entrez
    for symbol, entrez in zip(id_map['Approved symbol'], id_map['entrez_id'])
}
prev_sym_to_entrez_id = {
    symbol: entrez
    for symbol, entrez in zip(id_map['Previous symbol'], id_map['entrez_id'])
}

In [6]:
# look every screen gene up as an approved symbol first, then as a previous symbol
ids_from_approved = ito_genes.map(approved_sym_to_entrez_id)
ids_from_previous = ito_genes.map(prev_sym_to_entrez_id)

# genes without any Entrez ID are dropped; the remaining IDs are stored as integers
ito_genes_df = (
    ito_genes.to_frame()
    .assign(entrez_id=ids_from_approved.fillna(ids_from_previous))
    .dropna(subset=['entrez_id'])
    .reset_index(drop=True)
    .astype({'entrez_id': int})
)

In [7]:
# screen gene symbol -> Entrez ID, restricted to the genes that could be mapped
ito_genes_df_dict = {
    gene: entrez
    for gene, entrez in zip(ito_genes_df['Genes'], ito_genes_df['entrez_id'])
}

### Map Gene Symbols to Entrez IDs

**Process FDR Data:**
- Map original gene symbols to Entrez IDs using the combined mapping dictionary
- Convert Entrez IDs back to approved gene symbols for consistency
- Remove rows where gene symbols cannot be mapped to Entrez IDs

In [8]:
# keep the symbols as published under org_* so A1/A2 can hold the approved symbols
ito_fdr = ito_fdr.rename(columns={'A1': 'org_A1', 'A2':'org_A2'})

In [9]:
# Translate the original symbols to Entrez IDs; symbols without a match become NA
fdr_a1_entrez = ito_fdr['org_A1'].map(ito_genes_df_dict)
fdr_a2_entrez = ito_fdr['org_A2'].map(ito_genes_df_dict)

# place the ID columns at the very front of the table
ito_fdr.insert(0, 'A1_entrez', fdr_a1_entrez)
ito_fdr.insert(1, 'A2_entrez', fdr_a2_entrez)

In [10]:
# reverse lookup: Entrez ID -> approved symbol
entrez_id_to_approved_sym = {
    entrez: symbol
    for entrez, symbol in zip(id_map['entrez_id'], id_map['Approved symbol'])
}

# add the approved symbols right after the two Entrez ID columns
fdr_a1_symbol = ito_fdr['A1_entrez'].map(entrez_id_to_approved_sym)
fdr_a2_symbol = ito_fdr['A2_entrez'].map(entrez_id_to_approved_sym)
ito_fdr.insert(2, 'A1', fdr_a1_symbol)
ito_fdr.insert(3, 'A2', fdr_a2_symbol)

In [11]:
# show the rows whose first / second gene could not be mapped to an Entrez ID
print('# check the NA values in A1_entrez & A2_entrez')
for entrez_col in ('A1_entrez', 'A2_entrez'):
    display(ito_fdr.loc[ito_fdr[entrez_col].isna()])

# check the NA values in A1_entrez & A2_entrez


Unnamed: 0,A1_entrez,A2_entrez,A1,A2,org_A1,org_A2,A549_LUNG,GI1_CENTRAL_NERVOUS_SYSTEM,HS936T_SKIN,HS944T_SKIN,HSC5_SKIN,IPC298_SKIN,MEL202_UVEA,MELJUSO_SKIN,MEWO_SKIN,PATU8988S_PANCREAS,PK1_PANCREAS
268,,445815.0,,PALM2AKAP2,AKAP2,PALM2-AKAP2,0.956402,0.999586,0.997729,1.0,0.801651,1.0,0.999989,0.86235,0.995681,0.976099,1.0


Unnamed: 0,A1_entrez,A2_entrez,A1,A2,org_A1,org_A2,A549_LUNG,GI1_CENTRAL_NERVOUS_SYSTEM,HS936T_SKIN,HS944T_SKIN,HSC5_SKIN,IPC298_SKIN,MEL202_UVEA,MELJUSO_SKIN,MEWO_SKIN,PATU8988S_PANCREAS,PK1_PANCREAS
979,875.0,,CBS,,CBS,CBSL,0.978955,0.999586,0.997729,1.0,0.920903,1.0,0.999989,0.999995,0.995681,0.821062,1.0


In [12]:
# Manually assign the Entrez ID of CBSL: the symbol lookup leaves it unmapped, but
# CBS_CBSL is in the list of paralog pairs and must be kept.
# Rows are selected by the original symbol instead of a hard-coded row label, so the
# fix still hits the right rows if the input order changes. Assigning through `.loc`
# with a label that does not exist would silently append a new row instead.
is_cbsl = ito_fdr['org_A2'] == 'CBSL'
ito_fdr.loc[is_cbsl, 'A2_entrez'] = 102724560
ito_fdr.loc[is_cbsl, 'A2'] = 'CBSL'

In [13]:
# keep only pairs where both genes have an Entrez ID, then order them by symbol
ito_fdr = (
    ito_fdr
    .dropna(subset=['A1_entrez', 'A2_entrez'], how='any')
    .sort_values(by=['A1', 'A2'])
    .reset_index(drop=True)
)

# both tables displayed below should now be empty
print('# check the NA values in A1_entrez & A2_entrez')
for entrez_col in ('A1_entrez', 'A2_entrez'):
    display(ito_fdr.loc[ito_fdr[entrez_col].isna()])

# check the NA values in A1_entrez & A2_entrez


Unnamed: 0,A1_entrez,A2_entrez,A1,A2,org_A1,org_A2,A549_LUNG,GI1_CENTRAL_NERVOUS_SYSTEM,HS936T_SKIN,HS944T_SKIN,HSC5_SKIN,IPC298_SKIN,MEL202_UVEA,MELJUSO_SKIN,MEWO_SKIN,PATU8988S_PANCREAS,PK1_PANCREAS


Unnamed: 0,A1_entrez,A2_entrez,A1,A2,org_A1,org_A2,A549_LUNG,GI1_CENTRAL_NERVOUS_SYSTEM,HS936T_SKIN,HS944T_SKIN,HSC5_SKIN,IPC298_SKIN,MEL202_UVEA,MELJUSO_SKIN,MEWO_SKIN,PATU8988S_PANCREAS,PK1_PANCREAS


### Process LFC Data

**Apply the same gene symbol mapping to the LFC data:**

In [16]:
# keep the symbols as published under org_* so A1/A2 can hold the approved symbols
ito_lfc = ito_lfc.rename(columns={'A1': 'org_A1', 'A2':'org_A2'})

In [17]:
# Translate the original symbols to Entrez IDs; symbols without a match become NA
lfc_a1_entrez = ito_lfc['org_A1'].map(ito_genes_df_dict)
lfc_a2_entrez = ito_lfc['org_A2'].map(ito_genes_df_dict)
ito_lfc.insert(0, 'A1_entrez', lfc_a1_entrez)
ito_lfc.insert(1, 'A2_entrez', lfc_a2_entrez)

# Translate the Entrez IDs back to approved gene symbols
lfc_a1_symbol = ito_lfc['A1_entrez'].map(entrez_id_to_approved_sym)
lfc_a2_symbol = ito_lfc['A2_entrez'].map(entrez_id_to_approved_sym)
ito_lfc.insert(2, 'A1', lfc_a1_symbol)
ito_lfc.insert(3, 'A2', lfc_a2_symbol)

In [18]:
# show the rows whose first / second gene could not be mapped to an Entrez ID
print('# check the NA values in A1_entrez & A2_entrez')
for entrez_col in ('A1_entrez', 'A2_entrez'):
    display(ito_lfc.loc[ito_lfc[entrez_col].isna()])

# check the NA values in A1_entrez & A2_entrez


Unnamed: 0,A1_entrez,A2_entrez,A1,A2,org_A1,org_A2,A549_LUNG,GI1_CENTRAL_NERVOUS_SYSTEM,HS936T_SKIN,HS944T_SKIN,HSC5_SKIN,IPC298_SKIN,MEL202_UVEA,MELJUSO_SKIN,MEWO_SKIN,PATU8988S_PANCREAS,PK1_PANCREAS
3552,,445815.0,,PALM2AKAP2,AKAP2,PALM2-AKAP2,0.109867,0.182215,0.240199,0.400299,-0.142461,0.269418,0.249554,-0.067029,-0.032087,-0.065899,0.09491


Unnamed: 0,A1_entrez,A2_entrez,A1,A2,org_A1,org_A2,A549_LUNG,GI1_CENTRAL_NERVOUS_SYSTEM,HS936T_SKIN,HS944T_SKIN,HSC5_SKIN,IPC298_SKIN,MEL202_UVEA,MELJUSO_SKIN,MEWO_SKIN,PATU8988S_PANCREAS,PK1_PANCREAS
160,17.0,,AAVS1,,AAVS1,AKAP2,-0.003264,0.290882,0.378924,0.307161,0.139761,0.551639,0.163764,-0.000885,0.149869,0.30075,-0.048981
504,17.0,,AAVS1,,AAVS1,CBSL,0.35614,0.605041,0.523035,0.362285,0.426894,0.49858,0.311326,0.287057,0.286092,0.339218,0.343742
4274,875.0,,CBS,,CBS,CBSL,0.442627,0.232521,0.460553,0.195316,0.161555,0.299819,0.186392,0.339201,0.169406,0.025634,0.421703


In [19]:
# Manually assign the Entrez ID of CBSL: the symbol lookup leaves it unmapped, but
# CBS_CBSL is in the list of paralog pairs and must be kept.
# Rows are selected by the original symbol instead of hard-coded row labels, so every
# CBSL row is fixed regardless of its position in the input table. Assigning through
# `.loc` with a label that does not exist would silently append a new row instead.
is_cbsl = ito_lfc['org_A2'] == 'CBSL'
ito_lfc.loc[is_cbsl, 'A2_entrez'] = 102724560
ito_lfc.loc[is_cbsl, 'A2'] = 'CBSL'

In [20]:
# keep only pairs where both genes have an Entrez ID, then order them by symbol
ito_lfc = (
    ito_lfc
    .dropna(subset=['A1_entrez', 'A2_entrez'], how='any')
    .sort_values(by=['A1', 'A2'])
    .reset_index(drop=True)
)

# both tables displayed below should now be empty
print('# check the NA values in A1_entrez & A2_entrez')
for entrez_col in ('A1_entrez', 'A2_entrez'):
    display(ito_lfc.loc[ito_lfc[entrez_col].isna()])

# check the NA values in A1_entrez & A2_entrez


Unnamed: 0,A1_entrez,A2_entrez,A1,A2,org_A1,org_A2,A549_LUNG,GI1_CENTRAL_NERVOUS_SYSTEM,HS936T_SKIN,HS944T_SKIN,HSC5_SKIN,IPC298_SKIN,MEL202_UVEA,MELJUSO_SKIN,MEWO_SKIN,PATU8988S_PANCREAS,PK1_PANCREAS


Unnamed: 0,A1_entrez,A2_entrez,A1,A2,org_A1,org_A2,A549_LUNG,GI1_CENTRAL_NERVOUS_SYSTEM,HS936T_SKIN,HS944T_SKIN,HSC5_SKIN,IPC298_SKIN,MEL202_UVEA,MELJUSO_SKIN,MEWO_SKIN,PATU8988S_PANCREAS,PK1_PANCREAS


**Melt the data from wide to long format and create sorted gene pairs:**

In [21]:
# wide -> long: one row per (gene pair, cell line); the first six columns identify
# the pair, every remaining column holds the FDR of one cell line
fdr_id_cols = ['A1', 'A2', 'A1_entrez', 'A2_entrez', 'org_A1', 'org_A2']
ito_fdr_melt = ito_fdr.melt(
    id_vars=fdr_id_cols,
    value_vars=ito_fdr.columns[6:],
    var_name='cell_line',
    value_name='FDR',
)

In [22]:
# wide -> long: one row per (gene pair, cell line); the first six columns identify
# the pair, every remaining column holds the LFC of one cell line
lfc_id_cols = ['A1', 'A2', 'A1_entrez', 'A2_entrez', 'org_A1', 'org_A2']
ito_lfc_melt = ito_lfc.melt(
    id_vars=lfc_id_cols,
    value_vars=ito_lfc.columns[6:],
    var_name='cell_line',
    value_name='LFC',
)

In [23]:
# build the pair key: both symbols in natural-sort order, joined by an underscore
fdr_pair_keys = [
    '_'.join(natsorted([gene_a, gene_b]))
    for gene_a, gene_b in zip(ito_fdr_melt.A1, ito_fdr_melt.A2)
]

ito_fdr_melt.insert(0, 'genepair', fdr_pair_keys, allow_duplicates=True)

In [24]:
# build the pair key: both symbols in natural-sort order, joined by an underscore
lfc_pair_keys = [
    '_'.join(natsorted([gene_a, gene_b]))
    for gene_a, gene_b in zip(ito_lfc_melt.A1, ito_lfc_melt.A2)
]

ito_lfc_melt.insert(0, 'genepair', lfc_pair_keys, allow_duplicates=True)

In [36]:
# attach the LFC to every FDR row; no `on` is given, so pandas joins on all columns
# the two long tables have in common
ito_pairs_df = (
    ito_fdr_melt
    .merge(ito_lfc_melt, how='left')
    .sort_values(by=['genepair', 'cell_line'])
    .reset_index(drop=True)
)
print(f'number of unique genepairs: {ito_pairs_df["genepair"].nunique()}')
ito_pairs_df.head()

number of unique genepairs: 4858


Unnamed: 0,genepair,A1,A2,A1_entrez,A2_entrez,org_A1,org_A2,cell_line,FDR,LFC
0,A3GALT2_ABO,A3GALT2,ABO,127550.0,28.0,A3GALT2,ABO,A549_LUNG,0.977988,0.379455
1,A3GALT2_ABO,A3GALT2,ABO,127550.0,28.0,A3GALT2,ABO,GI1_CENTRAL_NERVOUS_SYSTEM,0.999586,-0.077118
2,A3GALT2_ABO,A3GALT2,ABO,127550.0,28.0,A3GALT2,ABO,HS936T_SKIN,0.943776,0.020272
3,A3GALT2_ABO,A3GALT2,ABO,127550.0,28.0,A3GALT2,ABO,HS944T_SKIN,1.0,0.069772
4,A3GALT2_ABO,A3GALT2,ABO,127550.0,28.0,A3GALT2,ABO,HSC5_SKIN,0.748567,0.182706


### Final Data Processing

**Sort gene pairs consistently and add synthetic lethality labels:**

In [37]:
def sort_entrez_ids(row):
    """Return a row's gene pair in natural-sort order with the Entrez IDs kept aligned.

    Parameters
    ----------
    row : mapping-like (e.g. one DataFrame row)
        Must provide the keys 'A1', 'A2', 'A1_entrez' and 'A2_entrez'.

    Returns
    -------
    pd.Series
        ``[first_gene, second_gene, first_gene_entrez, second_gene_entrez]``.
    """
    first_gene, second_gene = natsorted([row['A1'], row['A2']])

    # if sorting left A1 in front the IDs stay as they are, otherwise they are swapped
    if first_gene == row['A1']:
        entrez_pair = [row['A1_entrez'], row['A2_entrez']]
    else:
        entrez_pair = [row['A2_entrez'], row['A1_entrez']]

    return pd.Series([first_gene, second_gene] + entrez_pair)

# Run the per-row sorting on a copy so ito_pairs_df itself stays untouched
sorted_cols = ['A1_sorted', 'A2_sorted', 'A1_entrez_sorted', 'A2_entrez_sorted']
df = ito_pairs_df.copy()
df[sorted_cols] = df.apply(sort_entrez_ids, axis=1)

# Replace the unsorted columns by their sorted counterparts
ito_pairs_df_new = (
    df
    .drop(columns=['A1', 'A2', 'A1_entrez', 'A2_entrez'])
    .rename(columns={
        'A1_sorted': 'A1',
        'A2_sorted': 'A2',
        'A1_entrez_sorted': 'A1_entrez',
        'A2_entrez_sorted': 'A2_entrez',
    })
)

In [38]:
# preview the result; the re-ordered gene and Entrez ID columns now sit at the end
ito_pairs_df_new.head()

Unnamed: 0,genepair,org_A1,org_A2,cell_line,FDR,LFC,A1,A2,A1_entrez,A2_entrez
0,A3GALT2_ABO,A3GALT2,ABO,A549_LUNG,0.977988,0.379455,A3GALT2,ABO,127550.0,28.0
1,A3GALT2_ABO,A3GALT2,ABO,GI1_CENTRAL_NERVOUS_SYSTEM,0.999586,-0.077118,A3GALT2,ABO,127550.0,28.0
2,A3GALT2_ABO,A3GALT2,ABO,HS936T_SKIN,0.943776,0.020272,A3GALT2,ABO,127550.0,28.0
3,A3GALT2_ABO,A3GALT2,ABO,HS944T_SKIN,1.0,0.069772,A3GALT2,ABO,127550.0,28.0
4,A3GALT2_ABO,A3GALT2,ABO,HSC5_SKIN,0.748567,0.182706,A3GALT2,ABO,127550.0,28.0


In [41]:
# a pair is labelled synthetic lethal (SL) in a cell line when its FDR is below 0.05
# (FDR cut off is from Ito et al paper)
SL_condition = ito_pairs_df_new['FDR'].lt(0.05)
ito_pairs_df_new['SL'] = SL_condition

In [42]:
# number of rows (gene pair x cell line) per SL label
ito_pairs_df_new['SL'].value_counts()

SL
False    52261
True      1177
Name: count, dtype: int64

In [None]:
# in case you need to add ensembl id to your dataset
# NOTE(review): the snippet below is disabled by wrapping it in a string literal.
# It cannot be enabled as-is: `hgnc` is not defined in any cell shown in this notebook,
# and the Ensembl column passed to dropna ('Ensembl gene ID') differs from the one used
# to build the lookup ('Ensembl ID(supplied by Ensembl)') - confirm the real column name.

""" id_map_notna = hgnc.dropna(subset=['Ensembl gene ID', 'NCBI Gene ID(supplied by NCBI)'], how='any')

entrez_id_to_ensembl = dict(zip(id_map_notna['NCBI Gene ID(supplied by NCBI)'], id_map_notna['Ensembl ID(supplied by Ensembl)']))

ito_pairs_df = ito_pairs_df.assign(
    A1_ensembl = ito_pairs_df['A1_entrez'].map(entrez_id_to_ensembl),
    A2_ensembl = ito_pairs_df['A2_entrez'].map(entrez_id_to_ensembl))

ito_pairs_df.loc[ito_pairs_df['A2_ensembl'].isna(), 'A2'].unique() """

### Add Cell Line Mapping

**Map cell line names to DepMap IDs:**

In [43]:
# cell line annotation table; show the first two rows
sample_info = pd.read_csv(file_path_sample_info)
sample_info.head(2)

Unnamed: 0,DepMap_ID,cell_line_name,stripped_cell_line_name,CCLE_Name,alias,COSMICID,sex,source,RRID,WTSI_Master_Cell_ID,...,primary_disease,Subtype,age,Sanger_Model_ID,depmap_public_comments,lineage,lineage_subtype,lineage_sub_subtype,lineage_molecular_subtype,culture_type
0,ACH-000001,NIH:OVCAR-3,NIHOVCAR3,NIHOVCAR3_OVARY,OVCAR3,905933.0,Female,ATCC,CVCL_0465,2201.0,...,Ovarian Cancer,"Adenocarcinoma, high grade serous",60.0,SIDM00105,,ovary,ovary_adenocarcinoma,high_grade_serous,,Adherent
1,ACH-000002,HL-60,HL60,HL60_HAEMATOPOIETIC_AND_LYMPHOID_TISSUE,,905938.0,Female,ATCC,CVCL_0002,55.0,...,Leukemia,"Acute Myelogenous Leukemia (AML), M3 (Promyelo...",35.0,SIDM00829,,blood,AML,M3,,Suspension


In [44]:
# CCLE name -> DepMap ID lookup built from the annotation table
CCLE_name_to_DepMapID = dict(zip(sample_info['CCLE_Name'], sample_info['DepMap_ID']))

# the screen's cell_line values are looked up as CCLE names; unmatched names give NA
depmap_ids = ito_pairs_df_new["cell_line"].map(CCLE_name_to_DepMapID)
ito_pairs_df_new.insert(7, "DepMap_ID", depmap_ids)

### Save Processed Data

**Final formatting and export:**

In [45]:
# give the score columns their final names
ito_pairs_df_new = ito_pairs_df_new.rename(columns={'FDR': 'Gemini_FDR', 'LFC': 'raw_LFC'})

# fix the column order of the exported table
ito_pairs_df_new = ito_pairs_df_new[['genepair', 'A1', 'A2', 'A1_entrez', 'A2_entrez', 'DepMap_ID', 'cell_line', 'Gemini_FDR', 'raw_LFC', 'SL', 'org_A1', 'org_A2']]

# Sort by pair AND cell line: sorting on 'genepair' alone uses pandas' default
# (non-stable) algorithm, so the order of the cell lines within a pair would not be
# guaranteed and the exported file could differ between runs.
ito_pairs_df_new = ito_pairs_df_new.sort_values(by=['genepair', 'cell_line'], ascending=True).reset_index(drop=True)

In [46]:
# preview the first three rows of the final table
ito_pairs_df_new.head(3)

Unnamed: 0,genepair,A1,A2,A1_entrez,A2_entrez,DepMap_ID,cell_line,Gemini_FDR,raw_LFC,SL,org_A1,org_A2
0,A3GALT2_ABO,A3GALT2,ABO,127550.0,28.0,ACH-000681,A549_LUNG,0.977988,0.379455,False,A3GALT2,ABO
1,A3GALT2_ABO,A3GALT2,ABO,127550.0,28.0,ACH-000756,GI1_CENTRAL_NERVOUS_SYSTEM,0.999586,-0.077118,False,A3GALT2,ABO
2,A3GALT2_ABO,A3GALT2,ABO,127550.0,28.0,ACH-000801,HS936T_SKIN,0.943776,0.020272,False,A3GALT2,ABO


In [50]:
# make sure the output folder exists (to_csv does not create missing directories),
# then write the processed screen without the row index
os.makedirs(os.path.dirname(file_path_processed_ito_df), exist_ok=True)
ito_pairs_df_new.to_csv(file_path_processed_ito_df, index = False)