### DepMap 22Q4 Data Preprocessing

This notebook processes data from the DepMap 22Q4 release to prepare standardized datasets for synthetic lethality prediction. It handles gene expression, copy number, gene essentiality, mutations, and sample information data.

**Inputs:**
- DepMap 22Q4 gene expression data (TPM log+1 transformed)
- DepMap 22Q4 copy number data
- DepMap 22Q4 CRISPR gene effect scores
- DepMap 22Q4 mutations
- DepMap 22Q4 common essential genes
- Gene symbol mapping file for standardization
- Sample information metadata

**Outputs:**
- Standardized gene expression data (raw and z-score normalized)
- Copy number data 
- Gene essentiality scores (raw and z-score normalized)
- Filtered mutation data with damaging variants
- Common essential genes list
- All datasets mapped to consistent Entrez IDs

### Setup and File Paths

**Import required libraries and set up file paths:**

In [1]:
# import modules
import os
import re
import pandas as pd
import numpy as np

In [2]:
cwd = os.getcwd()
BASE_DIR = os.path.abspath(os.path.join(cwd, "..", ".."))

# build paths inside the repo
get_data_path = lambda folders, fname: os.path.normpath(
    os.path.join(BASE_DIR, *folders, fname)
)

file_path_gene_expression = get_data_path(['data', 'input', 'DepMap22Q4'], 'OmicsExpressionProteinCodingGenesTPMLogp1.csv')
file_path_gene_essentiality = get_data_path(['data','input', 'DepMap22Q4'], 'CRISPRGeneEffect.csv')
file_path_copy_number = get_data_path(['data', 'input', 'DepMap22Q4'], 'OmicsCNGene.csv')
file_path_mutation = get_data_path(['data', 'input', 'DepMap22Q4'], 'OmicsSomaticMutations.csv')
file_common_essential_genes = get_data_path(['data', 'input', 'DepMap22Q4'], 'CRISPRInferredCommonEssentials.csv')

file_path_genenames = get_data_path(['data', 'input', 'other'], 'approved_and_previous_symbols.csv')

file_path_sample_info = get_data_path(['data', 'input', 'DepMap22Q4'], 'sample_info.csv')

### Process Gene Expression Data

In [3]:
CCLE_expression_raw = pd.read_csv(file_path_gene_expression, index_col=0)

In [4]:
CCLE_expression_raw[:3]

Unnamed: 0,TSPAN6 (7105),TNMD (64102),DPM1 (8813),SCYL3 (57147),C1orf112 (55732),FGR (2268),CFH (3075),FUCA2 (2519),GCLC (2729),NFYA (4800),...,H3C2 (8358),H3C3 (8352),AC098582.1 (8916),DUS4L-BCAP29 (115253422),C8orf44-SGK3 (100533105),ELOA3B (728929),NPBWR1 (2831),ELOA3D (100506888),ELOA3 (162699),CDR1 (1038)
ACH-001113,4.331992,0.0,7.36466,2.792855,4.471187,0.028569,1.226509,3.044394,6.500005,4.739848,...,2.689299,0.189034,0.201634,2.130931,0.555816,0.0,0.275007,0.0,0.0,0.0
ACH-001289,4.567424,0.584963,7.106641,2.543496,3.50462,0.0,0.189034,3.813525,4.221877,3.481557,...,1.286881,1.049631,0.321928,1.464668,0.632268,0.0,0.014355,0.0,0.0,0.0
ACH-001339,3.15056,0.0,7.379118,2.333424,4.228049,0.056584,1.31034,6.687201,3.682573,3.273516,...,0.594549,1.097611,0.831877,2.946731,0.475085,0.0,0.084064,0.0,0.0,0.042644


In [5]:
get_entrez_id = lambda x : re.search(r'\((.*?)\)', x).group(1) #regex to fetch the entrez_id in the column
CCLE_expression = CCLE_expression_raw.rename(columns=get_entrez_id)
print('Number of genes:', CCLE_expression.shape[1], ', num of cell lines:', CCLE_expression.shape[0])

Number of genes: 19193 , num of cell lines: 1408


In [6]:
CCLE_expression[:2]

Unnamed: 0,7105,64102,8813,57147,55732,2268,3075,2519,2729,4800,...,8358,8352,8916,115253422,100533105,728929,2831,100506888,162699,1038
ACH-001113,4.331992,0.0,7.36466,2.792855,4.471187,0.028569,1.226509,3.044394,6.500005,4.739848,...,2.689299,0.189034,0.201634,2.130931,0.555816,0.0,0.275007,0.0,0.0,0.0
ACH-001289,4.567424,0.584963,7.106641,2.543496,3.50462,0.0,0.189034,3.813525,4.221877,3.481557,...,1.286881,1.049631,0.321928,1.464668,0.632268,0.0,0.014355,0.0,0.0,0.0


### Process Copy Number Data

In [7]:
CCLE_gene_cn_raw = pd.read_csv(file_path_copy_number, index_col=0)

In [8]:
CCLE_gene_cn_raw[:3]

Unnamed: 0,DDX11L1 (84771),WASH7P (653635),MIR6859-1 (102466751),MIR1302-2 (100302278),FAM138A (645520),OR4F5 (79501),WASH9P (102723897),MIR6859-2 (102465909),OR4F29 (729759),AL669831.3 (101928626),...,BPY2C (442868),TTTY4C (474150),CSPG4P1Y (114758),CDY1 (9085),TTTY3 (114760),SNORD38B (94163),SCARNA4 (677771),SNORA50A (677830),SNORD3D (780854),POLR2J3 (548644)
ACH-000759,1.675758,1.675758,1.675758,1.675758,1.675758,1.675758,1.675758,1.675758,1.675758,1.285546,...,,,,,,,,,,
ACH-000681,0.775472,0.775472,0.775472,0.775472,0.775472,0.775472,0.775472,0.775472,0.775472,0.775472,...,,,,,,,,,,
ACH-000769,2.988233,2.988233,2.988233,2.988233,2.988233,2.988233,0.831376,0.831376,0.831376,0.831376,...,,,,,,,,,,


In [9]:
CCLE_gene_cn = CCLE_gene_cn_raw.rename(columns=get_entrez_id)
print('Number of genes:', CCLE_gene_cn.shape[1], ', num of cell lines:', CCLE_gene_cn.shape[0])

Number of genes: 25368 , num of cell lines: 1775


In [10]:
CCLE_gene_cn[:2]

Unnamed: 0,84771,653635,102466751,100302278,645520,79501,102723897,102465909,729759,101928626,...,442868,474150,114758,9085,114760,94163,677771,677830,780854,548644
ACH-000759,1.675758,1.675758,1.675758,1.675758,1.675758,1.675758,1.675758,1.675758,1.675758,1.285546,...,,,,,,,,,,
ACH-000681,0.775472,0.775472,0.775472,0.775472,0.775472,0.775472,0.775472,0.775472,0.775472,0.775472,...,,,,,,,,,,


### Process Gene Essentiality Data

In [11]:
CRISPR_gene_effect_raw = pd.read_csv(file_path_gene_essentiality, index_col=0)

In [12]:
CRISPR_gene_effect_raw[:3]

Unnamed: 0,A1BG (1),A1CF (29974),A2M (2),A2ML1 (144568),A3GALT2 (127550),A4GALT (53947),A4GNT (51146),AAAS (8086),AACS (65985),AADAC (13),...,ZWILCH (55055),ZWINT (11130),ZXDA (7789),ZXDB (158586),ZXDC (79364),ZYG11A (440590),ZYG11B (79699),ZYX (7791),ZZEF1 (23140),ZZZ3 (26009)
ACH-000004,0.014633,-0.032777,-0.151299,-0.071388,0.046511,-0.16285,0.290698,-0.240991,0.17671,0.159418,...,-0.188857,-0.389649,0.112266,-0.002883,0.155729,0.077283,-0.294451,0.143978,0.197069,-0.003338
ACH-000005,-0.261566,0.174833,0.106526,0.135635,-0.076753,-0.27864,0.239279,-0.325967,-0.116848,0.022227,...,-0.195492,-0.360578,-0.126277,-0.059287,0.080543,-0.161894,-0.07023,-0.006275,0.002458,0.014259
ACH-000007,-0.028717,-0.117017,0.030971,0.083795,0.032668,-0.035709,0.012355,-0.192436,-0.077174,0.164877,...,-0.200402,-0.382707,0.006843,0.199553,0.064425,-0.031683,-0.291406,-0.065945,-0.260946,-0.329018


In [13]:
CRISPR_gene_effect = CRISPR_gene_effect_raw.rename(columns=get_entrez_id)
print('Number of genes:', CRISPR_gene_effect.shape[1], ', num of cell lines:', CRISPR_gene_effect.shape[0])

Number of genes: 17453 , num of cell lines: 1078


In [14]:
CRISPR_gene_effect[:2]

Unnamed: 0,1,29974,2,144568,127550,53947,51146,8086,65985,13,...,55055,11130,7789,158586,79364,440590,79699,7791,23140,26009
ACH-000004,0.014633,-0.032777,-0.151299,-0.071388,0.046511,-0.16285,0.290698,-0.240991,0.17671,0.159418,...,-0.188857,-0.389649,0.112266,-0.002883,0.155729,0.077283,-0.294451,0.143978,0.197069,-0.003338
ACH-000005,-0.261566,0.174833,0.106526,0.135635,-0.076753,-0.27864,0.239279,-0.325967,-0.116848,0.022227,...,-0.195492,-0.360578,-0.126277,-0.059287,0.080543,-0.161894,-0.07023,-0.006275,0.002458,0.014259


### Process Mutation Data

In [15]:
CCLE_mutations_raw = pd.read_csv(file_path_mutation, low_memory=False)

In [16]:
CCLE_mutations_raw[:3]

Unnamed: 0,Chrom,Pos,Ref,Alt,AF,RefCount,AltCount,GT,PS,VariantType,...,RevelScore,Funseq2Score,PharmgkbID,DidaID,DidaName,GwasDisease,GwasPmID,GTexGene,DepMap_ID,EntrezGeneID
0,chr1,1242864,GC,CT,0.31,19,8,0/1,,DNP,...,,,,,,,,,ACH-000839,388581.0
1,chr1,10647969,A,G,0.4,29,19,0|1,10647969.0,SNP,...,0.234,3.0,,,,,,,ACH-000839,54897.0
2,chr1,10648097,T,G,0.349,21,10,0/1,,SNP,...,,0.0,,,,,,,ACH-000839,54897.0


In [17]:
# Filter rows where CCLEDeleterious or LikelyLoF is 'Y'
CCLE_mutations = CCLE_mutations_raw[['HugoSymbol','VariantType', 'VariantInfo', 'CCLEDeleterious', 'LikelyLoF', 'DannScore', 'DepMap_ID', 'EntrezGeneID']]
CCLE_mutations = CCLE_mutations.dropna(subset=['EntrezGeneID'])
CCLE_mutations['EntrezGeneID'] = CCLE_mutations['EntrezGeneID'].astype(int)

In [18]:
CCLE_mutations['Damaging'] = np.where(
    (CCLE_mutations['CCLEDeleterious'] == 'Y') | (CCLE_mutations['LikelyLoF'] == 'Y'),
    1,
    0
)

In [19]:
CCLE_mutations[:3]

Unnamed: 0,HugoSymbol,VariantType,VariantInfo,CCLEDeleterious,LikelyLoF,DannScore,DepMap_ID,EntrezGeneID,Damaging
0,C1QTNF12,DNP,MISSENSE,,,,ACH-000839,388581,0
1,CASZ1,SNP,MISSENSE,,,0.731644,ACH-000839,54897,0
2,CASZ1,SNP,SILENT,,,0.711158,ACH-000839,54897,0


In [20]:
#CCLE_mutations.loc[CCLE_mutations.symbol == 'MUC12']

### Load Common Essential Genes

In [21]:
common_essentials_raw = pd.read_csv(file_common_essential_genes, index_col=0)

In [22]:
common_essentials = common_essentials_raw.rename(index=get_entrez_id)
print('Number of genes:', common_essentials.index.nunique())

Number of genes: 1855


### Load Sample Information

**Load cell line metadata:**
- DepMap_ID: Static primary key assigned by DepMap to each cell line
- CCLE_Name: Previous naming system using stripped cell line name + lineage

In [23]:
sample_info = pd.read_csv(file_path_sample_info, index_col=0, usecols= ['DepMap_ID', 'CCLE_Name'])

In [24]:
print('num of cell lines with Gene Exp TPM value:', CCLE_expression.shape[0])
print('num of cell lines:', sample_info.shape[0])
sample_info[:2]

num of cell lines with Gene Exp TPM value: 1408
num of cell lines: 1829


Unnamed: 0_level_0,CCLE_Name
DepMap_ID,Unnamed: 1_level_1
ACH-000001,NIHOVCAR3_OVARY
ACH-000002,HL60_HAEMATOPOIETIC_AND_LYMPHOID_TISSUE


### Map Genes to Standard Identifiers

In [25]:
# read the gene names mapping file
id_map = pd.read_csv(file_path_genenames)
id_map['entrez_id'] = id_map['entrez_id'].astype(str)

In [26]:
id_map.head()

Unnamed: 0,Approved symbol,Previous symbol,entrez_id,ensembl_id,UniProt ID(supplied by UniProt)
0,A1BG,,1,ENSG00000121410,P04217
1,A1BG-AS1,NCRNA00181,503538,ENSG00000268895,
2,A1BG-AS1,A1BGAS,503538,ENSG00000268895,
3,A1BG-AS1,A1BG-AS,503538,ENSG00000268895,
4,A1CF,,29974,ENSG00000148584,Q9NQ94


In [27]:
# Filter gene expression data for genes with valid mappings
mapped_CCLE_expression = CCLE_expression.loc[:,CCLE_expression.columns.isin(id_map["entrez_id"])]
unmapped_CCLE_expression = CCLE_expression.loc[:,~CCLE_expression.columns.isin(id_map["entrez_id"])]
print(len(mapped_CCLE_expression.columns), "genes in CCLE_expression file are present in HGNC database with the same entrez_ID")
print(len(unmapped_CCLE_expression.columns), "genes are missing.")
mapped_CCLE_expression[:3]

19171 genes in CCLE_expression file are present in HGNC database with the same entrez_ID
22 genes are missing.


Unnamed: 0,7105,64102,8813,57147,55732,2268,3075,2519,2729,4800,...,8358,8352,8916,115253422,100533105,728929,2831,100506888,162699,1038
ACH-001113,4.331992,0.0,7.36466,2.792855,4.471187,0.028569,1.226509,3.044394,6.500005,4.739848,...,2.689299,0.189034,0.201634,2.130931,0.555816,0.0,0.275007,0.0,0.0,0.0
ACH-001289,4.567424,0.584963,7.106641,2.543496,3.50462,0.0,0.189034,3.813525,4.221877,3.481557,...,1.286881,1.049631,0.321928,1.464668,0.632268,0.0,0.014355,0.0,0.0,0.0
ACH-001339,3.15056,0.0,7.379118,2.333424,4.228049,0.056584,1.31034,6.687201,3.682573,3.273516,...,0.594549,1.097611,0.831877,2.946731,0.475085,0.0,0.084064,0.0,0.0,0.042644


In [28]:
# Filter copy number data for genes with valid mappings
mapped_CCLE_gene_cn = CCLE_gene_cn.loc[:,CCLE_gene_cn.columns.isin(id_map["entrez_id"])]
unmapped_CCLE_gene_cn = CCLE_gene_cn.loc[:,~CCLE_gene_cn.columns.isin(id_map["entrez_id"])]
print(len(mapped_CCLE_gene_cn.columns), "genes in CCLE_gene_cn file are present in HGNC database with the same entrez_ID")
print(len(unmapped_CCLE_gene_cn.columns), "genes are missing.")
mapped_CCLE_gene_cn[:3]

24733 genes in CCLE_gene_cn file are present in HGNC database with the same entrez_ID
635 genes are missing.


Unnamed: 0,84771,653635,102466751,100302278,645520,79501,102465909,729759,81399,105378580,...,442868,474150,114758,9085,114760,94163,677771,677830,780854,548644
ACH-000759,1.675758,1.675758,1.675758,1.675758,1.675758,1.675758,1.675758,1.675758,1.675758,0.589654,...,,,,,,,,,,
ACH-000681,0.775472,0.775472,0.775472,0.775472,0.775472,0.775472,0.775472,0.775472,0.775472,0.775472,...,,,,,,,,,,
ACH-000769,2.988233,2.988233,2.988233,2.988233,2.988233,2.988233,0.831376,0.831376,0.831376,0.831376,...,,,,,,,,,,


In [29]:
# Filter gene essentiality data for genes with valid mappings
mapped_CRISPR_gene_effect = CRISPR_gene_effect.loc[:,CRISPR_gene_effect.columns.isin(id_map["entrez_id"])]
unmapped_CRISPR_gene_effect = CRISPR_gene_effect.loc[:,~CRISPR_gene_effect.columns.isin(id_map["entrez_id"])]
print(len(mapped_CRISPR_gene_effect.columns), "genes in CRISPR_gene_effect file are present in HGNC database with the same entrez_ID")
print(len(unmapped_CRISPR_gene_effect.columns), "genes are missing.")
mapped_CRISPR_gene_effect[:3]

17453 genes in CRISPR_gene_effect file are present in HGNC database with the same entrez_ID
0 genes are missing.


Unnamed: 0,1,29974,2,144568,127550,53947,51146,8086,65985,13,...,55055,11130,7789,158586,79364,440590,79699,7791,23140,26009
ACH-000004,0.014633,-0.032777,-0.151299,-0.071388,0.046511,-0.16285,0.290698,-0.240991,0.17671,0.159418,...,-0.188857,-0.389649,0.112266,-0.002883,0.155729,0.077283,-0.294451,0.143978,0.197069,-0.003338
ACH-000005,-0.261566,0.174833,0.106526,0.135635,-0.076753,-0.27864,0.239279,-0.325967,-0.116848,0.022227,...,-0.195492,-0.360578,-0.126277,-0.059287,0.080543,-0.161894,-0.07023,-0.006275,0.002458,0.014259
ACH-000007,-0.028717,-0.117017,0.030971,0.083795,0.032668,-0.035709,0.012355,-0.192436,-0.077174,0.164877,...,-0.200402,-0.382707,0.006843,0.199553,0.064425,-0.031683,-0.291406,-0.065945,-0.260946,-0.329018


In [30]:
# Filter mutation data for genes with valid mappings
CCLE_mutations = CCLE_mutations.rename(columns={'EntrezGeneID': 'entrez_id'})
CCLE_mutations['entrez_id'] = CCLE_mutations['entrez_id'].astype(str)

filtered_CCLE_mutations = CCLE_mutations.loc[CCLE_mutations['entrez_id'].isin(id_map['entrez_id']),:]

print('Number of cell lines:', len(np.unique(filtered_CCLE_mutations.DepMap_ID)))
print('Number of genes:', len(np.unique(filtered_CCLE_mutations.entrez_id)))

filtered_CCLE_mutations.sort_values(by=['entrez_id']).reset_index(drop=True)[:4]

Number of cell lines: 1702
Number of genes: 19613


Unnamed: 0,HugoSymbol,VariantType,VariantInfo,CCLEDeleterious,LikelyLoF,DannScore,DepMap_ID,entrez_id,Damaging
0,A1BG,SNP,MISSENSE,,,0.69924,ACH-000281,1,0
1,A1BG,SNP,SILENT,,,0.597227,ACH-000876,1,0
2,A1BG,SNP,SILENT,,Y,0.969591,ACH-002166,1,1
3,A1BG,SNP,MISSENSE,,,0.478322,ACH-000865,1,0


In [31]:
# check the entrez_id of common essentials 
mapped_common_essentials = common_essentials.loc[common_essentials.index.isin(id_map['entrez_id'])]
len(mapped_common_essentials.index) == len(common_essentials.index)

True

### Handle Duplicate Genes

**Check for and handle duplicate Entrez IDs in expression data:**
- Some genes may appear multiple times due to different symbols mapping to same Entrez ID

In [32]:
# check the uniqueness of CCLE expression dataset
f = mapped_CCLE_expression.columns.nunique()
print(f, "genes out of", len(mapped_CCLE_expression.columns), "are unique")
print(len(mapped_CCLE_expression.columns)-f, "genes are duplicated")

19154 genes out of 19171 are unique
17 genes are duplicated


In [33]:
duplicated_CCLE_expression = mapped_CCLE_expression.loc[:,mapped_CCLE_expression.columns.duplicated()] #duplicated genes
cleared_mapped_CCLE_expression = mapped_CCLE_expression.loc[:,~mapped_CCLE_expression.columns.isin(duplicated_CCLE_expression.columns)] #drop duplicated genes

In [34]:
duplicates = id_map.loc[id_map["entrez_id"].isin(duplicated_CCLE_expression.columns), ["Approved symbol", "entrez_id"]]
duplicates["Format"] = duplicates["Approved symbol"] + " " + "(" + duplicates["entrez_id"] + ")"
duplicates.head()

Unnamed: 0,Approved symbol,entrez_id,Format
2881,BOLA2,552900,BOLA2 (552900)
5095,CHN2,1124,CHN2 (1124)
6037,CRHR1,1394,CRHR1 (1394)
12681,HERC3,8916,HERC3 (8916)
15956,KYAT1,883,KYAT1 (883)


In [35]:
mapped_duplicates = CCLE_expression_raw.loc[:,CCLE_expression_raw.columns.isin(duplicates.Format)]
mapped_duplicates = mapped_duplicates.rename(columns=get_entrez_id)

In [36]:
# concat two dataset as df
expression_data = pd.concat([cleared_mapped_CCLE_expression, mapped_duplicates], axis=1)
print("Is dataset cleared from duplicates?", expression_data.columns.is_unique)
print("Number of unique genes in the dataset:", len(np.unique(expression_data.columns)))

Is dataset cleared from duplicates? True
Number of unique genes in the dataset: 19154


In [37]:
# check the uniqueness of CCLE gene copy number dataset
g = np.unique(mapped_CCLE_gene_cn.columns)
print(len(g), "genes out of", len(mapped_CCLE_gene_cn.columns), "are unique")
print(len(mapped_CCLE_gene_cn.columns)-len(g), "genes are duplicated")

24703 genes out of 24733 are unique
30 genes are duplicated


In [38]:
duplicated_gene_cn = mapped_CCLE_gene_cn.loc[:,mapped_CCLE_gene_cn.columns.duplicated()] #duplicated genes
cleared_gene_cn = mapped_CCLE_gene_cn.loc[:,~mapped_CCLE_gene_cn.columns.isin(duplicated_gene_cn.columns)] #drop duplicated genes

In [39]:
duplicate_gene_cn = id_map.loc[id_map["entrez_id"].isin(duplicated_CCLE_expression.columns), ["Approved symbol", "entrez_id"]]
duplicate_gene_cn["Format"] = duplicate_gene_cn["Approved symbol"] + " " + "(" + duplicate_gene_cn["entrez_id"] + ")"
duplicate_gene_cn.head()

Unnamed: 0,Approved symbol,entrez_id,Format
2881,BOLA2,552900,BOLA2 (552900)
5095,CHN2,1124,CHN2 (1124)
6037,CRHR1,1394,CRHR1 (1394)
12681,HERC3,8916,HERC3 (8916)
15956,KYAT1,883,KYAT1 (883)


In [40]:
mapped_duplicates_gene_cn = CCLE_gene_cn_raw.loc[:,CCLE_gene_cn_raw.columns.isin(duplicate_gene_cn.Format)]
mapped_duplicates_gene_cn = mapped_duplicates_gene_cn.rename(columns=get_entrez_id)

In [41]:
# concat two dataset as df
copy_number_data = pd.concat([cleared_gene_cn, mapped_duplicates_gene_cn], axis=1)
print("Is dataset cleared from duplicates?", copy_number_data.columns.is_unique)
print("Number of unique genes in the dataset:", len(np.unique(copy_number_data.columns)))

Is dataset cleared from duplicates? True
Number of unique genes in the dataset: 24690


In [42]:
# check the uniqueness of CRISPR gene effect
f = mapped_CRISPR_gene_effect.columns.nunique()
print(f, "genes out of", len(mapped_CRISPR_gene_effect.columns), "are unique")
print(len(mapped_CRISPR_gene_effect.columns)-f, "genes are duplicated")

17453 genes out of 17453 are unique
0 genes are duplicated


### Calculate Z-Score Normalization

**Normalize gene expression data:**
- Z-score normalization enables comparison across genes with different expression ranges

In [43]:
# calculate z-score
zexpression_data = expression_data.apply(lambda x: ((x-x.mean())/x.std(ddof=0)))

**Normalize gene essentiality data:**

In [44]:
# calculate z-score
zCRISPR_gene_effect = mapped_CRISPR_gene_effect.apply(lambda x: ((x-x.mean())/x.std(ddof=0)))

### Add Cell Line Names

**Map DepMap IDs to cell line names for all datasets:**

In [45]:
# add CCLE_Name as a new column by mapping
ID_to_cell_name = dict(zip(sample_info.index, sample_info.CCLE_Name))

In [46]:
expression_data.insert(0, "cell_name", expression_data.index.map(ID_to_cell_name))
expression_data[:3]

Unnamed: 0,cell_name,7105,64102,8813,57147,55732,2268,3075,2519,2729,...,8916,29970,255027,883,221468,55486,552900,220074,440519,284391
ACH-001113,LC1SQSF_LUNG,4.331992,0.0,7.36466,2.792855,4.471187,0.028569,1.226509,3.044394,6.500005,...,1.367371,1.201634,0.0,1.659925,0.286881,6.397974,4.149747,2.831877,2.153805,0.0
ACH-001289,COGAR359_SOFT_TISSUE,4.567424,0.584963,7.106641,2.543496,3.50462,0.0,0.189034,3.813525,4.221877,...,3.053111,6.17971,0.748461,3.019702,1.807355,5.778734,3.134221,3.689299,1.669027,1.485427
ACH-001339,COLO794_SKIN,3.15056,0.0,7.379118,2.333424,4.228049,0.056584,1.31034,6.687201,3.682573,...,2.596935,1.0,0.0,1.815575,0.62293,6.20418,3.363171,1.895303,0.15056,1.269033


In [47]:
zexpression_data.insert(0, "cell_name", zexpression_data.index.map(ID_to_cell_name))
zexpression_data[:3]

Unnamed: 0,cell_name,7105,64102,8813,57147,55732,2268,3075,2519,2729,...,8916,29970,255027,883,221468,55486,552900,220074,440519,284391
ACH-001113,LC1SQSF_LUNG,0.588734,-0.202156,1.341424,0.782413,1.015674,-0.333763,-0.419278,-1.154559,1.615601,...,-1.225467,-0.14346,-0.692331,-1.157672,-0.770213,1.149857,1.662524,-0.122769,0.97997,-1.078924
ACH-001289,COGAR359_SOFT_TISSUE,0.731947,1.493293,0.94251,0.325524,-0.214189,-0.356637,-0.882359,-0.73089,-0.360896,...,0.631138,3.312879,0.182974,0.446167,1.850917,0.02758,0.242377,1.015032,0.405184,0.672085
ACH-001339,COLO794_SKIN,-0.129934,-0.202156,1.363777,-0.059381,0.706304,-0.311333,-0.38186,0.852051,-0.828794,...,0.128724,-0.283457,-0.692331,-0.974084,-0.190901,0.798635,0.562549,-1.365604,-1.395212,0.417002


In [48]:
# add CCLE_Name as a new column by mapping
copy_number_data.insert(0, "cell_name", copy_number_data.index.map(ID_to_cell_name))
copy_number_data[:3]

Unnamed: 0,cell_name,84771,653635,102466751,100302278,645520,79501,102465909,729759,81399,...,440519,29970,55486,8916,8622,221468,1124,6013,11046,883
ACH-000759,MDAMB175VII_BREAST,1.675758,1.675758,1.675758,1.675758,1.675758,1.675758,1.675758,1.675758,1.675758,...,1.022433,1.080296,1.067112,1.018016,1.016425,0.847287,1.286626,0.896685,0.853836,1.160203
ACH-000681,A549_LUNG,0.775472,0.775472,0.775472,0.775472,0.775472,0.775472,0.775472,0.775472,0.775472,...,1.174576,1.113181,0.812579,0.839096,1.097764,0.79787,1.115526,1.37356,0.811848,1.034136
ACH-000769,LK2_LUNG,2.988233,2.988233,2.988233,2.988233,2.988233,2.988233,0.831376,0.831376,0.831376,...,0.80364,1.37548,1.37548,0.884088,0.875004,0.848061,1.155236,1.176019,1.152539,1.120476


In [49]:
# add CCLE_Name as a new column by mapping
mapped_CRISPR_gene_effect.insert(0, "cell_name", mapped_CRISPR_gene_effect.index.map(ID_to_cell_name))
mapped_CRISPR_gene_effect[:3]

Unnamed: 0,cell_name,1,29974,2,144568,127550,53947,51146,8086,65985,...,55055,11130,7789,158586,79364,440590,79699,7791,23140,26009
ACH-000004,HEL_HAEMATOPOIETIC_AND_LYMPHOID_TISSUE,0.014633,-0.032777,-0.151299,-0.071388,0.046511,-0.16285,0.290698,-0.240991,0.17671,...,-0.188857,-0.389649,0.112266,-0.002883,0.155729,0.077283,-0.294451,0.143978,0.197069,-0.003338
ACH-000005,HEL9217_HAEMATOPOIETIC_AND_LYMPHOID_TISSUE,-0.261566,0.174833,0.106526,0.135635,-0.076753,-0.27864,0.239279,-0.325967,-0.116848,...,-0.195492,-0.360578,-0.126277,-0.059287,0.080543,-0.161894,-0.07023,-0.006275,0.002458,0.014259
ACH-000007,LS513_LARGE_INTESTINE,-0.028717,-0.117017,0.030971,0.083795,0.032668,-0.035709,0.012355,-0.192436,-0.077174,...,-0.200402,-0.382707,0.006843,0.199553,0.064425,-0.031683,-0.291406,-0.065945,-0.260946,-0.329018


In [50]:
zCRISPR_gene_effect.insert(0, "cell_name", zCRISPR_gene_effect.index.map(ID_to_cell_name))
zCRISPR_gene_effect[:3]

Unnamed: 0,cell_name,1,29974,2,144568,127550,53947,51146,8086,65985,...,55055,11130,7789,158586,79364,440590,79699,7791,23140,26009
ACH-000004,HEL_HAEMATOPOIETIC_AND_LYMPHOID_TISSUE,0.57599,0.134966,-1.846143,-1.416198,0.995485,-0.954077,2.368073,0.1377,1.605523,...,-0.114486,0.495168,1.185707,-0.371118,1.466923,1.176361,-1.232019,1.494891,2.65038,1.856408
ACH-000005,HEL9217_HAEMATOPOIETIC_AND_LYMPHOID_TISSUE,-2.213047,1.492787,0.998212,0.570649,0.045985,-2.069485,1.892161,-0.33478,-1.093151,...,-0.152088,0.588743,-1.177411,-0.836542,0.824541,-1.331145,0.64192,0.174599,1.005407,1.968815
ACH-000007,LS513_LARGE_INTESTINE,0.138239,-0.415983,0.164679,0.073134,0.888849,0.270677,-0.208148,0.407672,-0.728426,...,-0.179913,0.517514,0.141338,1.299305,0.686829,0.03397,-1.206567,-0.349732,-1.221051,-0.224039


### Save Processed Data

**Export all processed datasets:**

In [None]:
output_path = get_data_path(['data', 'output', 'processed_DepMap22Q4'], '')

expression_data.to_csv(os.path.join(output_path, 'expression_data.csv'))
zexpression_data.to_csv(os.path.join(output_path, 'zexpression_data.csv'))
mapped_CRISPR_gene_effect.to_csv(os.path.join(output_path, 'gene_effect_data.csv'))
zCRISPR_gene_effect.to_csv(os.path.join(output_path, 'zgene_effect_data.csv'))
copy_number_data.to_csv(os.path.join(output_path, 'copy_number_data.csv'))
filtered_CCLE_mutations.to_csv(os.path.join(output_path, 'mutation_data.csv'), index=False)
mapped_common_essentials.to_csv(os.path.join(output_path, 'common_essentials.csv'))