## Parrish *et al.* screen

This notebook preprocesses CRISPR screen data from *Parrish et al., 2021* 

**Table S4** https://ars.els-cdn.com/content/image/1-s2.0-S2211124721010354-mmc5.xlsx

**paper** https://doi.org/10.1016/j.celrep.2021.109597

The scaled and normalized LFC for each pgRNA was termed a CRISPR score (CS). Target-level CRISPR scores were calculated by taking the mean across pgRNAs with the same single KO or DKO paralog target. Final CRISPR scores were computed by taking the mean across the three biological replicates for each screen.

In [1]:
# import modules
import os
import pandas as pd
import numpy as np
from natsort import natsorted

In [2]:
# set the base directory for the project
cwd = os.getcwd()
BASE_DIR = os.path.abspath(os.path.join(cwd, "..", ".."))

# build paths inside the repo
get_data_path = lambda folders, fname: os.path.normpath(
    os.path.join(BASE_DIR, *folders, fname)
)

file_path_gi_and_fdr = get_data_path(['input', 'CRISPR_screens'], 'mmc5.xlsx')

file_path_genenames = get_data_path(['input', 'other'], 'approved_and_previous_symbols.csv')

file_path_sample_info = get_data_path(['input', 'DepMap22Q4'], 'sample_info.csv')

file_path_processed_parrish_df = get_data_path(['output', 'processed_CRISPR_screens'], 'processed_parrish_df.csv')

In [3]:
#read the gene pair, GI and FDR scores from the supplementary tables in the paper

parrish_gi_and_fdr = pd.read_excel(file_path_gi_and_fdr, skiprows=1)

In [4]:
parrish_gi_and_fdr[:3]

Unnamed: 0,paralog_pair,target1,target2,GI_flag,PC9_GI_flag,PC9_GI_score_rank,PC9_GI_score,PC9_GI_fdr,HeLa_GI_flag,HeLa_GI_score_rank,HeLa_GI_score,HeLa_GI_fdr,PC9_DKO_expected_CS,PC9_DKO_observed_CS,HeLa_DKO_expected_CS,HeLa_DKO_observed_CS,same_chr,same_chr_dist,proximity
0,CCNL2|CCNL1,CCNL2,CCNL1,synthetic_lethal,SL_in_PC9,1,-2.313788,0.000165,SL_in_HeLa,10,-1.261084,0.026168,0.375079,-1.987383,-0.099315,-1.332104,False,,diff_chr
1,CDK6|CDK4,CDK6,CDK4,synthetic_lethal,SL_in_PC9,2,-1.537568,0.00034,neither_in_HeLa,210,-0.251834,0.307389,-2.309117,-3.408344,-0.451694,-0.632285,False,,diff_chr
2,GSK3B|GSK3A,GSK3B,GSK3A,synthetic_lethal,SL_in_PC9,3,-1.429767,0.001618,SL_in_HeLa,4,-1.65958,0.012451,-0.343343,-1.691436,0.649198,-1.073316,False,,diff_chr


In [5]:
parrish_gi_and_fdr = parrish_gi_and_fdr.rename(columns={'target1':'org_A1', 'target2':'org_A2'})
parrish_gi_and_fdr['paralog_pair'] = parrish_gi_and_fdr['paralog_pair'].str.replace('|', '_')
parrish_gi_and_fdr[:3]

Unnamed: 0,paralog_pair,org_A1,org_A2,GI_flag,PC9_GI_flag,PC9_GI_score_rank,PC9_GI_score,PC9_GI_fdr,HeLa_GI_flag,HeLa_GI_score_rank,HeLa_GI_score,HeLa_GI_fdr,PC9_DKO_expected_CS,PC9_DKO_observed_CS,HeLa_DKO_expected_CS,HeLa_DKO_observed_CS,same_chr,same_chr_dist,proximity
0,CCNL2_CCNL1,CCNL2,CCNL1,synthetic_lethal,SL_in_PC9,1,-2.313788,0.000165,SL_in_HeLa,10,-1.261084,0.026168,0.375079,-1.987383,-0.099315,-1.332104,False,,diff_chr
1,CDK6_CDK4,CDK6,CDK4,synthetic_lethal,SL_in_PC9,2,-1.537568,0.00034,neither_in_HeLa,210,-0.251834,0.307389,-2.309117,-3.408344,-0.451694,-0.632285,False,,diff_chr
2,GSK3B_GSK3A,GSK3B,GSK3A,synthetic_lethal,SL_in_PC9,3,-1.429767,0.001618,SL_in_HeLa,4,-1.65958,0.012451,-0.343343,-1.691436,0.649198,-1.073316,False,,diff_chr


### Map Gene Symbols to Entrez IDs

**Process FDR Data:**
- Map original gene symbols to Entrez IDs using the combined mapping dictionary
- Convert Entrez IDs back to approved gene symbols for consistency
- Remove rows where gene symbols cannot be mapped to Entrez IDs

In [6]:
# read the gene names mapping file
id_map = pd.read_csv(file_path_genenames)

# create dictionaries to map gene symbols to Entrez IDs
approved_sym_to_entrez_id = dict(zip(id_map['Approved symbol'], id_map['entrez_id']))

# create dictionaries to map previous gene symbols to Entrez IDs
id_map_cleaned = id_map.dropna(axis=0, how='any', subset=['Previous symbol', 'entrez_id']).reset_index(drop=True)
prev_sym_to_entrez_id = dict(zip(id_map_cleaned['Previous symbol'], id_map_cleaned['entrez_id']))

In [7]:
entrez_id_to_approved_sym = dict(zip(id_map['entrez_id'], id_map['Approved symbol']))

In [8]:
parrish_gi_and_fdr = parrish_gi_and_fdr.assign(
    A1_entrez = parrish_gi_and_fdr['org_A1'].map(approved_sym_to_entrez_id),
    A2_entrez = parrish_gi_and_fdr['org_A2'].map(approved_sym_to_entrez_id)
)

parrish_gi_and_fdr['A1_entrez'] = parrish_gi_and_fdr['A1_entrez'].fillna(parrish_gi_and_fdr['org_A1'].map(prev_sym_to_entrez_id))
parrish_gi_and_fdr['A2_entrez'] = parrish_gi_and_fdr['A2_entrez'].fillna(parrish_gi_and_fdr['org_A2'].map(prev_sym_to_entrez_id))

In [9]:
parrish_gi_and_fdr[['A1_entrez', 'A2_entrez']].isna().sum()

A1_entrez    0
A2_entrez    0
dtype: int64

In [10]:
parrish_gi_and_fdr.insert(1, 'A1', parrish_gi_and_fdr['A1_entrez'].map(entrez_id_to_approved_sym))
parrish_gi_and_fdr.insert(2, 'A2', parrish_gi_and_fdr['A2_entrez'].map(entrez_id_to_approved_sym))
parrish_gi_and_fdr = parrish_gi_and_fdr[['paralog_pair', 'A1', 'A2', 'A1_entrez', 'A2_entrez', 'GI_flag',
       'PC9_GI_flag', 'PC9_GI_score_rank', 'PC9_GI_score', 'PC9_GI_fdr',
       'HeLa_GI_flag', 'HeLa_GI_score_rank', 'HeLa_GI_score', 'HeLa_GI_fdr',
       'PC9_DKO_expected_CS', 'PC9_DKO_observed_CS', 'HeLa_DKO_expected_CS',
       'HeLa_DKO_observed_CS', 'same_chr', 'same_chr_dist', 'proximity', 'org_A1', 'org_A2']]

In [11]:
list_c = [[x, y] for x, y in zip(parrish_gi_and_fdr.A1, parrish_gi_and_fdr.A2)]

genepairs = []
for pair in list_c:
    sorted_pair = natsorted(pair)
    genepairs.append(sorted_pair)

m = []
for i in range(0 , len(genepairs)):
    a = '_'.join(genepairs[i])
    m.append(a)

parrish_gi_and_fdr.insert(0, 'genepair', m, True)

In [12]:
# Call pair SL on a cell line specific basis
# Synthetic lethal paralogs were defined as those with a GI score <  0.5 and FDR < 0.1

parrish_gi_and_fdr_labeled = parrish_gi_and_fdr.assign(
    PC9_SL = (parrish_gi_and_fdr.PC9_GI_score < -0.5) & (parrish_gi_and_fdr.PC9_GI_fdr < 0.1),
    HeLa_SL = (parrish_gi_and_fdr.HeLa_GI_score < -0.5) & (parrish_gi_and_fdr.HeLa_GI_fdr < 0.1))
parrish_gi_and_fdr_labeled['n_SL'] = parrish_gi_and_fdr_labeled[['PC9_SL','HeLa_SL']].sum(axis=1)

# Check that this matches with annotations
assert(parrish_gi_and_fdr_labeled[parrish_gi_and_fdr_labeled.PC9_GI_flag=='SL_in_PC9'].shape[0] == sum(parrish_gi_and_fdr_labeled.PC9_SL))
assert(parrish_gi_and_fdr_labeled[parrish_gi_and_fdr_labeled.HeLa_GI_flag=='SL_in_HeLa'].shape[0] == sum(parrish_gi_and_fdr_labeled.HeLa_SL))
print('N pairs with SL flag:', parrish_gi_and_fdr_labeled[parrish_gi_and_fdr_labeled.GI_flag=='synthetic_lethal'].shape[0])

N pairs with SL flag: 122


In [13]:
parrish_gi_and_fdr_labeled.loc[parrish_gi_and_fdr_labeled['genepair'] == 'ADSS1_ADSS2']
#parrish_gi_and_fdr_labeled[:3]

Unnamed: 0,genepair,paralog_pair,A1,A2,A1_entrez,A2_entrez,GI_flag,PC9_GI_flag,PC9_GI_score_rank,PC9_GI_score,...,HeLa_DKO_expected_CS,HeLa_DKO_observed_CS,same_chr,same_chr_dist,proximity,org_A1,org_A2,PC9_SL,HeLa_SL,n_SL
250,ADSS1_ADSS2,ADSS_ADSSL1,ADSS2,ADSS1,159.0,122622.0,neither,neither_in_PC9,251,-0.241829,...,-1.051845,-1.275076,False,,diff_chr,ADSS,ADSSL1,False,False,0


In [14]:
# Function to sort each pair of gene symbols and their Entrez IDs
def sort_gene_pairs(row):
    # Sort the genes alphabetically and determine new order
    sorted_genes = natsorted([row['A1'], row['A2']])
    
    # Match the sorted genes to the original ones and rearrange Entrez IDs accordingly
    if sorted_genes[0] == row['A1']:
        return pd.Series([sorted_genes[0], sorted_genes[1], row['A1_entrez'], row['A2_entrez']])
    else:
        return pd.Series([sorted_genes[0], sorted_genes[1], row['A2_entrez'], row['A1_entrez']])

# Apply the sorting to each row - FIX: Use the correct dataframe
parrish_gi_and_fdr_labeled[['A1_sorted', 'A2_sorted', 'A1_entrez_sorted', 'A2_entrez_sorted']] = parrish_gi_and_fdr_labeled.apply(sort_gene_pairs, axis=1)

# Drop the old columns and rename the new ones
parrish_gi_and_fdr_sorted = parrish_gi_and_fdr_labeled.drop(columns=['A1', 'A2', 'A1_entrez', 'A2_entrez']).copy()
parrish_gi_and_fdr_sorted = parrish_gi_and_fdr_sorted.rename(columns={
    'A1_sorted': 'A1',
    'A2_sorted': 'A2',
    'A1_entrez_sorted': 'A1_entrez',
    'A2_entrez_sorted': 'A2_entrez'
})

In [15]:
parrish_gi_and_fdr_sorted.loc[parrish_gi_and_fdr_sorted['genepair'] == 'CDK4_CDK6']

Unnamed: 0,genepair,paralog_pair,GI_flag,PC9_GI_flag,PC9_GI_score_rank,PC9_GI_score,PC9_GI_fdr,HeLa_GI_flag,HeLa_GI_score_rank,HeLa_GI_score,...,proximity,org_A1,org_A2,PC9_SL,HeLa_SL,n_SL,A1,A2,A1_entrez,A2_entrez
1,CDK4_CDK6,CDK6_CDK4,synthetic_lethal,SL_in_PC9,2,-1.537568,0.00034,neither_in_HeLa,210,-0.251834,...,diff_chr,CDK6,CDK4,True,False,1,CDK4,CDK6,1019.0,1021.0


## Format the dataframe of interest

In [16]:
parrish_gi_and_fdr_sorted = parrish_gi_and_fdr_sorted[[
    'genepair', 'A1', 'A2', 'A1_entrez', 'A2_entrez', 'paralog_pair', 'org_A1', 'org_A2',
    'PC9_GI_score', 'PC9_GI_fdr', 'HeLa_GI_score', 'HeLa_GI_fdr',
    'PC9_SL', 'HeLa_SL'
]]

In [17]:
parrish_gi_and_fdr_melted = pd.melt(parrish_gi_and_fdr_sorted,
                                    id_vars=['genepair', 'A1', 'A2', 'A1_entrez', 'A2_entrez', 'paralog_pair', 'org_A1', 'org_A2',
                                             'PC9_GI_score', 'PC9_GI_fdr', 'HeLa_GI_score', 'HeLa_GI_fdr'],
                                    value_vars=['PC9_SL', 'HeLa_SL'],
                                    var_name='metric',
                                    value_name='value')

In [18]:
parrish_gi_and_fdr_melted = parrish_gi_and_fdr_melted.sort_values(by='genepair', ascending=True).reset_index(drop=True)
parrish_gi_and_fdr_melted = parrish_gi_and_fdr_melted.rename(columns={'metric':'cell_line', 'value':'SL'})
parrish_gi_and_fdr_melted['cell_line'] = parrish_gi_and_fdr_melted['cell_line'].str.replace('_SL', '')

In [19]:
parrish_gi_and_fdr_melted['cell_line'] = parrish_gi_and_fdr_melted['cell_line'].str.replace('PC9', 'PC9_LUNG')
parrish_gi_and_fdr_melted['cell_line'] = parrish_gi_and_fdr_melted['cell_line'].str.replace('HeLa', 'HELA_CERVIX')

In [20]:
parrish_gi_and_fdr_melted.loc[parrish_gi_and_fdr_melted['genepair'] == 'ADSS1_ADSS2']

Unnamed: 0,genepair,A1,A2,A1_entrez,A2_entrez,paralog_pair,org_A1,org_A2,PC9_GI_score,PC9_GI_fdr,HeLa_GI_score,HeLa_GI_fdr,cell_line,SL
52,ADSS1_ADSS2,ADSS1,ADSS2,122622.0,159.0,ADSS_ADSSL1,ADSS,ADSSL1,-0.241829,0.480277,-0.36762,0.104952,HELA_CERVIX,False
53,ADSS1_ADSS2,ADSS1,ADSS2,122622.0,159.0,ADSS_ADSSL1,ADSS,ADSSL1,-0.241829,0.480277,-0.36762,0.104952,PC9_LUNG,False


In [21]:
ID_to_cell_name = dict({'PC9_LUNG': 'ACH-000779', 'HELA_CERVIX': 'ACH-001086'}) 
parrish_gi_and_fdr_melted.insert(13, "DepMap_ID", parrish_gi_and_fdr_melted['cell_line'].map(ID_to_cell_name))

In [22]:
parrish_gi_and_fdr_melted[:4]

Unnamed: 0,genepair,A1,A2,A1_entrez,A2_entrez,paralog_pair,org_A1,org_A2,PC9_GI_score,PC9_GI_fdr,HeLa_GI_score,HeLa_GI_fdr,cell_line,DepMap_ID,SL
0,A2M_PZP,A2M,PZP,2.0,5858.0,A2M_PZP,A2M,PZP,0.264313,0.138809,-0.15432,0.424612,PC9_LUNG,ACH-000779,False
1,A2M_PZP,A2M,PZP,2.0,5858.0,A2M_PZP,A2M,PZP,0.264313,0.138809,-0.15432,0.424612,HELA_CERVIX,ACH-001086,False
2,AADACL3_AADACL4,AADACL3,AADACL4,126767.0,343066.0,AADACL3_AADACL4,AADACL3,AADACL4,-0.000281,0.992873,0.120862,0.433194,HELA_CERVIX,ACH-001086,False
3,AADACL3_AADACL4,AADACL3,AADACL4,126767.0,343066.0,AADACL3_AADACL4,AADACL3,AADACL4,-0.000281,0.992873,0.120862,0.433194,PC9_LUNG,ACH-000779,False


In [23]:
parrish_pairs = parrish_gi_and_fdr_melted[['genepair', 'A1', 'A2', 'A1_entrez', 'A2_entrez', 
                               'PC9_GI_score', 'PC9_GI_fdr', 'HeLa_GI_score', 'HeLa_GI_fdr',
                               'DepMap_ID', 'cell_line', 'SL']]

In [24]:
parrish_pairs[:3]

Unnamed: 0,genepair,A1,A2,A1_entrez,A2_entrez,PC9_GI_score,PC9_GI_fdr,HeLa_GI_score,HeLa_GI_fdr,DepMap_ID,cell_line,SL
0,A2M_PZP,A2M,PZP,2.0,5858.0,0.264313,0.138809,-0.15432,0.424612,ACH-000779,PC9_LUNG,False
1,A2M_PZP,A2M,PZP,2.0,5858.0,0.264313,0.138809,-0.15432,0.424612,ACH-001086,HELA_CERVIX,False
2,AADACL3_AADACL4,AADACL3,AADACL4,126767.0,343066.0,-0.000281,0.992873,0.120862,0.433194,ACH-001086,HELA_CERVIX,False


In [24]:
parrish_pairs.to_csv(file_path_processed_parrish_df, index=False)