### Ito et al Screen

This notebook preprocesses CRISPR screen data from *Ito et al., 2021*.  
It loads log-fold change (LFC) and false discovery rate (FDR) scores from the paper’s supplementary tables, cleans and formats them, and prepares the dataset for downstream feature annotation and training the Random Forest classifier.

**Inputs:**  
- CSV files containing LFC and FDR scores from the *Ito et al.* supplementary materials.

**Outputs:**  
- A cleaned and merged dataset saved as a CSV file, ready for feature annotation and model training.


In [10]:
# import modules
import os
import pandas as pd
import numpy as np
from natsort import natsorted

In [11]:
# set the base directory for the project
# NOTE: BASE_DIR is resolved two directory levels above the current working
# directory, so the notebook has to be launched from its own folder.
cwd = os.getcwd()
BASE_DIR = os.path.abspath(os.path.join(cwd, "..", ".."))


# build paths inside the repo
def get_data_path(folders, fname):
    """Return the normalized path of a file located below BASE_DIR.

    Parameters
    ----------
    folders : iterable of str
        Directory names, in order, leading from BASE_DIR to the file.
    fname : str
        File name appended after the last folder.

    Returns
    -------
    str
        ``BASE_DIR/<folders...>/<fname>`` passed through ``os.path.normpath``.
    """
    return os.path.normpath(os.path.join(BASE_DIR, *folders, fname))


file_path_lfc = get_data_path(['input', 'CRISPR_screens'], 'ito_table4LFC.csv')
file_path_fdr = get_data_path(['input', 'CRISPR_screens'], 'ito_table9FDR.csv')

file_path_genenames = get_data_path(['input', 'other'], 'genenames.txt')

file_path_sample_info = get_data_path(['input', 'DepMap22Q4'], 'sample_info.csv')

file_path_processed_ito_df = get_data_path(['output', 'processed_CRISPR_screens'], 'processed_ito_df.csv')

In [12]:
# Load the LFC and FDR score tables taken from the paper's supplementary material
ito_lfc, ito_fdr = (
    pd.read_csv(table_path) for table_path in (file_path_lfc, file_path_fdr)
)

In [13]:
# Collect every distinct gene symbol that occurs in either member column of the FDR table
unique_symbols = pd.concat([ito_fdr["A1"], ito_fdr["A2"]]).sort_values().unique()
ito_genes = pd.Series(unique_symbols, name="Genes")
print(len(ito_genes))

3217


In [14]:
# read the genenames table for mapping (every column is read as text)
hgnc = pd.read_table(file_path_genenames, dtype = "str")

# Keep only the identifier columns used for symbol mapping. An explicit copy is
# taken so that later column assignments modify `id_map` itself instead of a
# slice of `hgnc` (which triggers pandas' SettingWithCopyWarning).
id_map = hgnc[['HGNC ID', 'Approved symbol', 'NCBI Gene ID(supplied by NCBI)', 'Ensembl ID(supplied by Ensembl)', 'Previous symbols']].copy()

# Helper that turns a comma-separated "Previous symbols" entry into a list
def process_value(value):
    """Split a comma-separated string into a list of its parts.

    Anything that is not a string, the literal string 'NA', and strings
    without a comma are returned unchanged. The parts are not stripped of
    surrounding whitespace.
    """
    should_split = isinstance(value, str) and value != 'NA' and ',' in value
    return value.split(',') if should_split else value
    
# Split multi-symbol entries into lists, then give every previous symbol its own row
id_map['Previous symbols'] = id_map['Previous symbols'].map(process_value)
id_map = id_map.explode('Previous symbols', ignore_index=True)

# The split parts can still carry the whitespace that followed each comma
id_map['Previous symbols'] = id_map['Previous symbols'].str.strip()
id_map.head()

Unnamed: 0,HGNC ID,Approved symbol,NCBI Gene ID(supplied by NCBI),Ensembl ID(supplied by Ensembl),Previous symbols
0,HGNC:5,A1BG,1,ENSG00000121410,
1,HGNC:37133,A1BG-AS1,503538,ENSG00000268895,NCRNA00181
2,HGNC:37133,A1BG-AS1,503538,ENSG00000268895,A1BGAS
3,HGNC:37133,A1BG-AS1,503538,ENSG00000268895,A1BG-AS
4,HGNC:24086,A1CF,29974,ENSG00000148584,


In [15]:
# Screen genes whose symbol is a currently approved HGNC symbol
is_approved_symbol = ito_genes.isin(id_map['Approved symbol'])
ito_approved_symbols = ito_genes[is_approved_symbol]
print(f'Ito screen has {len(ito_approved_symbols)} Approved HGNC symbols / {len(ito_genes)} total genes')

Ito screen has 3159 Approved HGNC symbols / 3217 total genes


In [16]:
# Restrict the HGNC table to rows that carry an NCBI (Entrez) gene id
has_entrez_id = id_map['NCBI Gene ID(supplied by NCBI)'].notna()
id_map_notna = id_map.loc[has_entrez_id]

# approved symbol -> Entrez id lookup
approved_sym_to_entrez_id = dict(zip(id_map_notna['Approved symbol'], id_map_notna['NCBI Gene ID(supplied by NCBI)']))

# One row per approved screen gene, annotated with its Entrez id (NaN when unknown)
ito_approved_symbols_df = (
    ito_approved_symbols
    .rename('Approved symbol')
    .to_frame()
    .reset_index(drop=True)
)
ito_approved_symbols_df['entrez_id'] = ito_approved_symbols_df['Approved symbol'].map(approved_sym_to_entrez_id)

print(f'Ito screen has {ito_approved_symbols_df.shape[0]} Approved HGNC symbols')
ito_approved_symbols_df.head()

Ito screen has 3159 Approved HGNC symbols


Unnamed: 0,Approved symbol,entrez_id
0,A3GALT2,127550
1,AADAC,13
2,AADACL2,344752
3,AADACL3,126767
4,AADACL4,343066


In [17]:
# Approved symbols for which no Entrez id was found.
# manually checked AKAP2 & CBSL, they don't have entrez id on HGNC portal
ito_approved_symbols_df[ito_approved_symbols_df['entrez_id'].isna()]

Unnamed: 0,Approved symbol,entrez_id
144,AKAP2,
473,CBSL,


In [18]:
# Keep only the approved symbols that could be mapped to an Entrez id
ito_approved_symbols_df = ito_approved_symbols_df.loc[ito_approved_symbols_df['entrez_id'].notna(), ]
# message typo fixed ("entrezd id" -> "entrez ids")
print(f'Ito screen has {ito_approved_symbols_df.shape[0]} entrez ids for approved symbols')
ito_approved_symbols_df.head()

Ito screen has 3157 entrezd id for approved symbols


Unnamed: 0,Approved symbol,entrez_id
0,A3GALT2,127550
1,AADAC,13
2,AADACL2,344752
3,AADACL3,126767
4,AADACL4,343066


### Find out missing genes

In [19]:
# Screen genes whose symbol is not a currently approved HGNC symbol
not_approved = ~ito_genes.isin(id_map['Approved symbol'])
missing_ito_genes = ito_genes[not_approved]
print('# of missing genes:', missing_ito_genes.shape[0])

# of missing genes: 58


In [20]:
# Look the unapproved screen symbols up among HGNC "Previous symbols" and attach
# the matching approved symbol and Entrez id (inner join on the previous symbol)
ito_previous_symbols = missing_ito_genes[missing_ito_genes.isin(id_map['Previous symbols'])]
previous_symbol_lookup = id_map[['Approved symbol', 'Previous symbols', 'NCBI Gene ID(supplied by NCBI)']]
ito_previous_symbols_df = (
    ito_previous_symbols
    .to_frame(name='Previous symbols')
    .merge(previous_symbol_lookup, on='Previous symbols')
    .rename(columns={'NCBI Gene ID(supplied by NCBI)':'entrez_id'})
)

print('# of unique genes in ito screen w Previous HGNC symbol:', ito_previous_symbols_df['Previous symbols'].nunique())
ito_previous_symbols_df.head()

# of unique genes in ito screen w Previous HGNC symbol: 58


Unnamed: 0,Previous symbols,Approved symbol,entrez_id
0,ACPP,ACP3,55
1,ADSS,ADSS2,159
2,ADSSL1,ADSS1,122622
3,AES,TLE5,166
4,ALG1L,ALG1L1P,200810


In [21]:
# Previous-symbol matches that still lack an Entrez id (empty in the recorded output)
ito_previous_symbols_df[ito_previous_symbols_df['entrez_id'].isna()]

Unnamed: 0,Previous symbols,Approved symbol,entrez_id


In [22]:
# Stack the directly approved genes and the genes rescued via a previous symbol,
# ordered by approved symbol
ito_symbols_df = (
    pd.concat([ito_approved_symbols_df, ito_previous_symbols_df], axis=0, ignore_index=True)
    .sort_values(by=['Approved symbol'])
    .reset_index(drop=True)
)

print(f'Ito screen has {ito_symbols_df.shape[0]} entrez ids for Approved HGNC symbols / {len(ito_genes)} total genes')
lacks_approved_symbol = ito_symbols_df['Approved symbol'].isna()
display(ito_symbols_df.loc[lacks_approved_symbol])
display(ito_symbols_df.head())

Ito screen has 3215 entrez ids for Approved HGNC symbols / 3217 total genes


Unnamed: 0,Approved symbol,entrez_id,Previous symbols


Unnamed: 0,Approved symbol,entrez_id,Previous symbols
0,A3GALT2,127550,
1,AADAC,13,
2,AADACL2,344752,
3,AADACL3,126767,
4,AADACL4,343066,


### Remove/Add updated symbols

- Use the `prev_to_approved_sym` dictionary to replace outdated gene symbols with their approved symbols
- Add the Entrez ID of each approved symbol
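- Remove gene symbols that are present in `non_protein_coding_genes` (note: no such list is used in the cells below; they drop the gene pairs in which a gene has no Entrez ID)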

In [23]:
# Keep the symbols exactly as published under org_A1 / org_A2 (renamed in place)
ito_fdr.rename(columns=dict(A1='org_A1', A2='org_A2'), inplace=True)

In [24]:
# previous symbol -> approved symbol lookup for the genes rescued via a previous symbol
prev_to_approved_sym = dict(zip(ito_previous_symbols_df['Previous symbols'], ito_previous_symbols_df['Approved symbol']))

# Add A1 / A2 as the two leading columns: the published symbols with every
# previous symbol swapped for its approved symbol
for position, gene_col in enumerate(['A1', 'A2']):
    ito_fdr.insert(position, gene_col, ito_fdr[f'org_{gene_col}'].replace(prev_to_approved_sym))

In [25]:
# approved symbol -> Entrez id, now built from the combined symbol table
# (this rebinds the name that an earlier cell filled from the HGNC table)
approved_sym_to_entrez_id = dict(zip(ito_symbols_df['Approved symbol'], ito_symbols_df['entrez_id']))

# Insert A1_entrez / A2_entrez directly after the A1 / A2 columns
for position, gene_col in enumerate(['A1', 'A2'], start=2):
    ito_fdr.insert(position, f'{gene_col}_entrez', ito_fdr[gene_col].map(approved_sym_to_entrez_id))

In [26]:
# Show the FDR rows in which either gene could not be mapped to an Entrez id.
# The message now names the columns that are actually inspected
# (there are no A1_new / A2_new columns in this table).
print('# check the NA values in A1_entrez & A2_entrez')
display(ito_fdr.loc[ito_fdr['A1_entrez'].isna(), ])
display(ito_fdr.loc[ito_fdr['A2_entrez'].isna(), ])

# check the NA values in A1_new & A2_new


Unnamed: 0,A1,A2,A1_entrez,A2_entrez,org_A1,org_A2,A549_LUNG,GI1_CENTRAL_NERVOUS_SYSTEM,HS936T_SKIN,HS944T_SKIN,HSC5_SKIN,IPC298_SKIN,MEL202_UVEA,MELJUSO_SKIN,MEWO_SKIN,PATU8988S_PANCREAS,PK1_PANCREAS
268,AKAP2,PALM2AKAP2,,445815,AKAP2,PALM2-AKAP2,0.956402,0.999586,0.997729,1.0,0.801651,1.0,0.999989,0.86235,0.995681,0.976099,1.0


Unnamed: 0,A1,A2,A1_entrez,A2_entrez,org_A1,org_A2,A549_LUNG,GI1_CENTRAL_NERVOUS_SYSTEM,HS936T_SKIN,HS944T_SKIN,HSC5_SKIN,IPC298_SKIN,MEL202_UVEA,MELJUSO_SKIN,MEWO_SKIN,PATU8988S_PANCREAS,PK1_PANCREAS
979,CBS,CBSL,875,,CBS,CBSL,0.978955,0.999586,0.997729,1.0,0.920903,1.0,0.999989,0.999995,0.995681,0.821062,1.0


In [27]:
# Drop every gene pair in which at least one gene has no Entrez id
ito_fdr = ito_fdr.dropna(subset=['A1_entrez', 'A2_entrez'], how='any')
ito_fdr = ito_fdr.reset_index(drop=True)

# Sanity check: both selections below are expected to be empty now.
# The message names the columns that are actually inspected.
print('# check the NA values in A1_entrez & A2_entrez')
display(ito_fdr.loc[ito_fdr['A1_entrez'].isna(), ])
display(ito_fdr.loc[ito_fdr['A2_entrez'].isna(), ])

# check the NA values in A1_new & A2_new


Unnamed: 0,A1,A2,A1_entrez,A2_entrez,org_A1,org_A2,A549_LUNG,GI1_CENTRAL_NERVOUS_SYSTEM,HS936T_SKIN,HS944T_SKIN,HSC5_SKIN,IPC298_SKIN,MEL202_UVEA,MELJUSO_SKIN,MEWO_SKIN,PATU8988S_PANCREAS,PK1_PANCREAS


Unnamed: 0,A1,A2,A1_entrez,A2_entrez,org_A1,org_A2,A549_LUNG,GI1_CENTRAL_NERVOUS_SYSTEM,HS936T_SKIN,HS944T_SKIN,HSC5_SKIN,IPC298_SKIN,MEL202_UVEA,MELJUSO_SKIN,MEWO_SKIN,PATU8988S_PANCREAS,PK1_PANCREAS


In [28]:
# Keep the symbols exactly as published under org_A1 / org_A2 (renamed in place)
ito_lfc.rename(columns=dict(A1='org_A1', A2='org_A2'), inplace=True)

In [29]:
# Add A1 / A2 as the two leading columns: the published symbols with every
# previous symbol swapped for its approved symbol
for position, gene_col in enumerate(['A1', 'A2']):
    ito_lfc.insert(position, gene_col, ito_lfc[f'org_{gene_col}'].replace(prev_to_approved_sym))

In [30]:
# Insert A1_entrez / A2_entrez directly after the A1 / A2 columns
for position, gene_col in enumerate(['A1', 'A2'], start=2):
    ito_lfc.insert(position, f'{gene_col}_entrez', ito_lfc[gene_col].map(approved_sym_to_entrez_id))

In [31]:
# Preview the annotated LFC table (first five rows)
ito_lfc.head(5)

Unnamed: 0,A1,A2,A1_entrez,A2_entrez,org_A1,org_A2,A549_LUNG,GI1_CENTRAL_NERVOUS_SYSTEM,HS936T_SKIN,HS944T_SKIN,HSC5_SKIN,IPC298_SKIN,MEL202_UVEA,MELJUSO_SKIN,MEWO_SKIN,PATU8988S_PANCREAS,PK1_PANCREAS
0,A3GALT2,AAVS1,127550,,A3GALT2,AAVS1,0.166162,0.295353,0.39818,0.29188,0.35181,0.229996,0.351409,0.343378,0.216834,0.280086,0.445479
1,A3GALT2,ABO,127550,28.0,A3GALT2,ABO,0.379455,-0.077118,0.020272,0.069772,0.182706,-0.026881,0.274489,0.283725,0.129078,0.088856,0.201704
2,A3GALT2,GBGT1,127550,26301.0,A3GALT2,GBGT1,0.210534,-0.119984,-0.171315,-0.087374,-0.259095,-0.297514,0.221781,0.123985,0.039808,-0.04647,-0.077573
3,A3GALT2,GLT6D1,127550,360203.0,A3GALT2,GLT6D1,0.099146,-0.134601,-0.009195,0.001207,-0.249748,-0.102567,0.288113,0.187406,0.015766,0.022604,-0.022443
4,AADAC,AADACL2,13,344752.0,AADAC,AADACL2,-0.011274,0.116829,-0.093075,0.021654,0.022936,0.267473,0.115528,-0.090458,-0.097705,0.053806,0.060962


In [32]:
# Drop every gene pair in which at least one gene has no Entrez id
ito_lfc = ito_lfc.dropna(subset=['A1_entrez', 'A2_entrez'], how='any')
ito_lfc = ito_lfc.reset_index(drop=True)

# Sanity check: both selections below are expected to be empty now.
# The message names the columns that are actually inspected.
print('# check the NA values in A1_entrez & A2_entrez')
display(ito_lfc.loc[ito_lfc['A1_entrez'].isna(), ])
display(ito_lfc.loc[ito_lfc['A2_entrez'].isna(), ])

# check the NA values in A1_new & A2_new


Unnamed: 0,A1,A2,A1_entrez,A2_entrez,org_A1,org_A2,A549_LUNG,GI1_CENTRAL_NERVOUS_SYSTEM,HS936T_SKIN,HS944T_SKIN,HSC5_SKIN,IPC298_SKIN,MEL202_UVEA,MELJUSO_SKIN,MEWO_SKIN,PATU8988S_PANCREAS,PK1_PANCREAS


Unnamed: 0,A1,A2,A1_entrez,A2_entrez,org_A1,org_A2,A549_LUNG,GI1_CENTRAL_NERVOUS_SYSTEM,HS936T_SKIN,HS944T_SKIN,HSC5_SKIN,IPC298_SKIN,MEL202_UVEA,MELJUSO_SKIN,MEWO_SKIN,PATU8988S_PANCREAS,PK1_PANCREAS


In [33]:
# Wide -> long: one row per (gene pair, cell line) holding the FDR value.
# Every column after the six identifier columns is treated as a cell line.
fdr_id_cols = ['A1', 'A2', 'A1_entrez', 'A2_entrez', 'org_A1', 'org_A2']
ito_fdr_melt = ito_fdr.melt(
    id_vars=fdr_id_cols,
    value_vars=ito_fdr.columns[len(fdr_id_cols):],
    var_name='cell_line',
    value_name='FDR',
)

In [34]:
# Wide -> long: one row per (gene pair, cell line) holding the LFC value.
# Every column after the six identifier columns is treated as a cell line.
lfc_id_cols = ['A1', 'A2', 'A1_entrez', 'A2_entrez', 'org_A1', 'org_A2']
ito_lfc_melt = ito_lfc.melt(
    id_vars=lfc_id_cols,
    value_vars=ito_lfc.columns[len(lfc_id_cols):],
    var_name='cell_line',
    value_name='LFC',
)

In [None]:
# natural sort genepairs
# Build an order-independent pair label "<gene>_<gene>" by natural-sorting the
# two symbols of every row.
fdr_genepair_labels = [
    '_'.join(natsorted([gene_a, gene_b]))
    for gene_a, gene_b in zip(ito_fdr_melt['A1'], ito_fdr_melt['A2'])
]

# Re-running this cell used to append a second 'genepair' column silently
# (allow_duplicates=True); drop any existing one so the cell is idempotent.
if 'genepair' in ito_fdr_melt.columns:
    ito_fdr_melt = ito_fdr_melt.drop(columns='genepair')
ito_fdr_melt.insert(0, 'genepair', fdr_genepair_labels)

In [27]:
# Build an order-independent pair label "<gene>_<gene>" by natural-sorting the
# two symbols of every row.
lfc_genepair_labels = [
    '_'.join(natsorted([gene_a, gene_b]))
    for gene_a, gene_b in zip(ito_lfc_melt['A1'], ito_lfc_melt['A2'])
]

# Re-running this cell used to append a second 'genepair' column silently
# (allow_duplicates=True); drop any existing one so the cell is idempotent.
if 'genepair' in ito_lfc_melt.columns:
    ito_lfc_melt = ito_lfc_melt.drop(columns='genepair')
ito_lfc_melt.insert(0, 'genepair', lfc_genepair_labels)

In [28]:
# Attach the LFC value to every FDR row. The join keys are spelled out instead
# of relying on "all shared column names", so an unexpected shared column cannot
# silently become part of the key.
pair_merge_keys = ['genepair', 'A1', 'A2', 'A1_entrez', 'A2_entrez', 'org_A1', 'org_A2', 'cell_line']
ito_pairs_df = pd.merge(ito_fdr_melt, ito_lfc_melt, how='left', on=pair_merge_keys)
ito_pairs_df.head()

Unnamed: 0,genepair,A1,A2,A1_entrez,A2_entrez,org_A1,org_A2,cell_line,FDR,LFC
0,A3GALT2_ABO,A3GALT2,ABO,127550,28,A3GALT2,ABO,A549_LUNG,0.977988,0.379455
1,A3GALT2_GBGT1,A3GALT2,GBGT1,127550,26301,A3GALT2,GBGT1,A549_LUNG,0.96892,0.210534
2,A3GALT2_GLT6D1,A3GALT2,GLT6D1,127550,360203,A3GALT2,GLT6D1,A549_LUNG,0.919924,0.099146
3,AADAC_AADACL2,AADAC,AADACL2,13,344752,AADAC,AADACL2,A549_LUNG,0.627623,-0.011274
4,AADAC_AADACL3,AADAC,AADACL3,13,126767,AADAC,AADACL3,A549_LUNG,0.743075,0.018253


In [None]:
# Row-wise helper: put a gene pair and its Entrez IDs into natural-sort order
def sort_entrez_ids(row):
    """Return the row's two gene symbols in natural-sort order with matching Entrez IDs.

    `row` must provide 'A1', 'A2', 'A1_entrez' and 'A2_entrez'. The result is a
    4-element pd.Series: first symbol, second symbol, Entrez ID of the first
    symbol, Entrez ID of the second symbol.
    """
    first_gene, second_gene = natsorted([row['A1'], row['A2']])

    # The Entrez IDs keep their order when A1 already comes first, otherwise they swap
    if first_gene == row['A1']:
        entrez_pair = [row['A1_entrez'], row['A2_entrez']]
    else:
        entrez_pair = [row['A2_entrez'], row['A1_entrez']]
    return pd.Series([first_gene, second_gene, *entrez_pair])

# Compute the sorted symbols / Entrez IDs for every row
sorted_gene_cols = ['A1', 'A2', 'A1_entrez', 'A2_entrez']
sorted_part = ito_pairs_df.apply(sort_entrez_ids, axis=1)
sorted_part.columns = sorted_gene_cols

# Swap the unsorted columns for the sorted ones; the sorted columns end up as
# the last four columns of the new frame
ito_pairs_df_new = pd.concat(
    [ito_pairs_df.drop(columns=sorted_gene_cols), sorted_part],
    axis=1,
)

In [30]:
# Preview the pair table after sorting (first five rows)
ito_pairs_df_new.head(5)

Unnamed: 0,genepair,org_A1,org_A2,cell_line,FDR,LFC,A1,A2,A1_entrez,A2_entrez
0,A3GALT2_ABO,A3GALT2,ABO,A549_LUNG,0.977988,0.379455,A3GALT2,ABO,127550,28
1,A3GALT2_GBGT1,A3GALT2,GBGT1,A549_LUNG,0.96892,0.210534,A3GALT2,GBGT1,127550,26301
2,A3GALT2_GLT6D1,A3GALT2,GLT6D1,A549_LUNG,0.919924,0.099146,A3GALT2,GLT6D1,127550,360203
3,AADAC_AADACL2,AADAC,AADACL2,A549_LUNG,0.627623,-0.011274,AADAC,AADACL2,13,344752
4,AADAC_AADACL3,AADAC,AADACL3,A549_LUNG,0.743075,0.018253,AADAC,AADACL3,13,126767


In [33]:
# FDR cut off is from Ito et al paper
FDR_CUTOFF = 0.05

# A pair is labelled SL in a cell line when its FDR is strictly below the cutoff.
# NOTE(review): a missing (NaN) FDR compares as False, so pairs without an FDR
# value are labelled SL=False — confirm this is the intended treatment.
ito_pairs_df_new['SL'] = ito_pairs_df_new['FDR'].lt(FDR_CUTOFF)

In [34]:
# Class balance of the SL label
ito_pairs_df_new.SL.value_counts()

SL
False    52250
True      1177
Name: count, dtype: int64

In [None]:
# in case you need to add ensembl id to your dataset
# The snippet below is intentionally disabled: it is wrapped in a string literal,
# so running this cell only evaluates the string and executes none of the code in it.
# NOTE(review): the snippet reads and reassigns `ito_pairs_df`, not
# `ito_pairs_df_new` — confirm which frame should be annotated before re-enabling it.

""" id_map_notna = hgnc.dropna(subset=['Ensembl gene ID', 'NCBI Gene ID(supplied by NCBI)'], how='any')

entrez_id_to_ensembl = dict(zip(id_map_notna['NCBI Gene ID(supplied by NCBI)'], id_map_notna['Ensembl ID(supplied by Ensembl)']))

ito_pairs_df = ito_pairs_df.assign(
    A1_ensembl = ito_pairs_df['A1_entrez'].map(entrez_id_to_ensembl),
    A2_ensembl = ito_pairs_df['A2_entrez'].map(entrez_id_to_ensembl))

ito_pairs_df.loc[ito_pairs_df['A2_ensembl'].isna(), 'A2'].unique() """

In [None]:
# Cell-line annotation table (file located in the DepMap22Q4 input folder)
sample_info = pd.read_csv(file_path_sample_info)
sample_info.head(2)

Unnamed: 0,DepMap_ID,cell_line_name,stripped_cell_line_name,CCLE_Name,alias,COSMICID,sex,source,RRID,WTSI_Master_Cell_ID,...,primary_disease,Subtype,age,Sanger_Model_ID,depmap_public_comments,lineage,lineage_subtype,lineage_sub_subtype,lineage_molecular_subtype,culture_type
0,ACH-000001,NIH:OVCAR-3,NIHOVCAR3,NIHOVCAR3_OVARY,OVCAR3,905933.0,Female,ATCC,CVCL_0465,2201.0,...,Ovarian Cancer,"Adenocarcinoma, high grade serous",60.0,SIDM00105,,ovary,ovary_adenocarcinoma,high_grade_serous,,Adherent
1,ACH-000002,HL-60,HL60,HL60_HAEMATOPOIETIC_AND_LYMPHOID_TISSUE,,905938.0,Female,ATCC,CVCL_0002,55.0,...,Leukemia,"Acute Myelogenous Leukemia (AML), M3 (Promyelo...",35.0,SIDM00829,,blood,AML,M3,,Suspension


In [None]:
# CCLE name -> DepMap ID lookup, used to annotate each row's cell line
CCLE_name_to_DepMapID = dict(zip(sample_info['CCLE_Name'], sample_info['DepMap_ID']))
depmap_ids = ito_pairs_df_new['cell_line'].map(CCLE_name_to_DepMapID)
ito_pairs_df_new.insert(7, 'DepMap_ID', depmap_ids)

In [38]:
# Final column names and order, rows sorted by gene pair
final_columns = ['genepair', 'A1', 'A2', 'A1_entrez', 'A2_entrez', 'DepMap_ID', 'cell_line', 'Gemini_FDR', 'raw_LFC', 'SL', 'org_A1', 'org_A2']
ito_pairs_df_new = (
    ito_pairs_df_new
    .rename(columns={'FDR': 'Gemini_FDR', 'LFC': 'raw_LFC'})
    .loc[:, final_columns]
    .sort_values(by=['genepair'], ascending=True)
    .reset_index(drop=True)
)

In [39]:
# Preview the first three rows of the final table
ito_pairs_df_new.head(3)

Unnamed: 0,genepair,A1,A2,A1_entrez,A2_entrez,DepMap_ID,cell_line,Gemini_FDR,raw_LFC,SL,org_A1,org_A2
0,A3GALT2_ABO,A3GALT2,ABO,127550,28,ACH-000681,A549_LUNG,0.977988,0.379455,False,A3GALT2,ABO
1,A3GALT2_ABO,A3GALT2,ABO,127550,28,ACH-000987,MEWO_SKIN,0.995681,0.129078,False,A3GALT2,ABO
2,A3GALT2_ABO,A3GALT2,ABO,127550,28,ACH-000022,PATU8988S_PANCREAS,0.998944,0.088856,False,A3GALT2,ABO


In [40]:
# Rows for which no FDR value is available
ito_pairs_df_new[ito_pairs_df_new['Gemini_FDR'].isna()]

Unnamed: 0,genepair,A1,A2,A1_entrez,A2_entrez,DepMap_ID,cell_line,Gemini_FDR,raw_LFC,SL,org_A1,org_A2
133,ABCF1_ABCF3,ABCF1,ABCF3,23,55324,ACH-000307,PK1_PANCREAS,,-1.141264,False,ABCF1,ABCF3
137,ABCF1_ABCF3,ABCF1,ABCF3,23,55324,ACH-000801,HS936T_SKIN,,-1.347850,False,ABCF1,ABCF3
337,ACAA2_ACAT2,ACAA2,ACAT2,10449,39,ACH-000915,IPC298_SKIN,,-1.230908,False,ACAA2,ACAT2
354,ACACA_ACACB,ACACA,ACACB,31,32,ACH-000881,MELJUSO_SKIN,,-1.027091,False,ACACA,ACACB
355,ACACA_ACACB,ACACA,ACACB,31,32,ACH-000681,A549_LUNG,,-1.200284,False,ACACA,ACACB
...,...,...,...,...,...,...,...,...,...,...,...,...
52880,WNK1_WNK2,WNK1,WNK2,65125,65268,ACH-000307,PK1_PANCREAS,,-0.870859,False,WNK1,WNK2
52892,WNK1_WNK3,WNK1,WNK3,65125,65267,ACH-000307,PK1_PANCREAS,,-1.181387,False,WNK1,WNK3
52905,WNK1_WNK4,WNK1,WNK4,65125,65266,ACH-000307,PK1_PANCREAS,,-0.949150,False,WNK1,WNK4
53007,YTHDF1_YTHDF2,YTHDF1,YTHDF2,54915,51441,ACH-000915,IPC298_SKIN,,-1.310589,False,YTHDF1,YTHDF2


In [41]:
# Number of missing values per column
ito_pairs_df_new.isnull().sum()

genepair         0
A1               0
A2               0
A1_entrez        0
A2_entrez        0
DepMap_ID        0
cell_line        0
Gemini_FDR    1288
raw_LFC          0
SL               0
org_A1           0
org_A2           0
dtype: int64

In [42]:
# Number of distinct gene pairs in the final table
ito_pairs_df_new['genepair'].nunique()

4857

In [43]:
# Write the processed table. The output folder is created first because
# `to_csv` fails when the target directory does not exist (e.g. fresh checkout).
os.makedirs(os.path.dirname(file_path_processed_ito_df), exist_ok=True)
ito_pairs_df_new.to_csv(file_path_processed_ito_df, index = False)