In [1]:
# import modules
import os
import re
import pandas as pd
import numpy as np

In [2]:
cwd = os.getcwd()
BASE_DIR = os.path.abspath(os.path.join(cwd, "..", ".."))

# build paths inside the repo
get_data_path = lambda folders, fname: os.path.normpath(
    os.path.join(BASE_DIR, *folders, fname)
)

# breast cancer cell lines
paralog_pairs_path = get_data_path(['data','input', 'other'], 'processed_DeKegel_TableS8.csv')
mutation_data_path = get_data_path(['data', 'input', 'DepMap22Q4'], 'OmicsSomaticMutations.csv')
model_info_path = get_data_path(['data','input', 'DepMap22Q4'], 'Model.csv')

In [3]:
model_info_df = pd.read_csv(model_info_path, low_memory=False)
model_info_df[:3]

Unnamed: 0,ModelID,PatientID,CellLineName,StrippedCellLineName,DepmapModelType,OncotreeLineage,OncotreePrimaryDisease,OncotreeSubtype,OncotreeCode,LegacyMolecularSubtype,...,TissueOrigin,CCLEName,CatalogNumber,PlateCoating,ModelDerivationMaterial,PublicComments,WTSIMasterCellID,SangerModelID,COSMICID,LegacySubSubtype
0,ACH-000001,PT-gj46wT,NIH:OVCAR-3,NIHOVCAR3,HGSOC,Ovary/Fallopian Tube,Ovarian Epithelial Tumor,High-Grade Serous Ovarian Cancer,HGSOC,,...,,NIHOVCAR3_OVARY,HTB-71,,,,2201.0,SIDM00105,905933.0,high_grade_serous
1,ACH-000002,PT-5qa3uk,HL-60,HL60,AML,Myeloid,Acute Myeloid Leukemia,Acute Myeloid Leukemia,AML,,...,,HL60_HAEMATOPOIETIC_AND_LYMPHOID_TISSUE,CCL-240,,,,55.0,SIDM00829,905938.0,M3
2,ACH-000003,PT-puKIyc,CACO2,CACO2,COAD,Bowel,Colorectal Adenocarcinoma,Colon Adenocarcinoma,COAD,,...,,CACO2_LARGE_INTESTINE,HTB-37,,,,,SIDM00891,,


In [4]:
breast_cancer_cell_lines = model_info_df.loc[model_info_df['OncotreeLineage'] == 'Breast', 'ModelID']
print(f'# of breast cancer cell lines:{len(breast_cancer_cell_lines)}')

# of breast cancer cell lines:94


In [5]:
paralog_pairs = pd.read_csv(paralog_pairs_path)

paralog_pairs_breast = paralog_pairs.loc[paralog_pairs.index.repeat(len(breast_cancer_cell_lines))].reset_index(drop=True)
breast_cancer_cell_lines_df = pd.concat([breast_cancer_cell_lines] * paralog_pairs['genepair'].nunique(), ignore_index=True)
paralog_pairs_breast.insert(paralog_pairs_breast.shape[1], 'DepMap_ID', breast_cancer_cell_lines_df)
paralog_pairs_breast.insert(paralog_pairs_breast.shape[1], 'cell_line', 
                                      paralog_pairs_breast['DepMap_ID'].map(dict(zip(model_info_df['ModelID'], model_info_df['StrippedCellLineName']))))
paralog_pairs_breast.insert(paralog_pairs_breast.shape[1], 'cancer_type', 'Breast')
display(paralog_pairs_breast.head())

Unnamed: 0,prediction_rank,prediction_percentile,old_genepair,genepair,A1,A2,A1_entrez,A2_entrez,A1_ensembl,A2_ensembl,...,gtex_max_mean_expr,A1_entrez_new,A2_entrez_new,A1_new,A2_new,A1_ensembl_new,A2_ensembl_new,DepMap_ID,cell_line,cancer_type
0,1,0.1,SMARCA2_SMARCA4,SMARCA2_SMARCA4,SMARCA2,SMARCA4,6595,6597,ENSG00000080503,ENSG00000127616,...,34.302868,6595.0,6597.0,SMARCA2,SMARCA4,ENSG00000080503,ENSG00000127616,ACH-000017,SKBR3,Breast
1,1,0.1,SMARCA2_SMARCA4,SMARCA2_SMARCA4,SMARCA2,SMARCA4,6595,6597,ENSG00000080503,ENSG00000127616,...,34.302868,6595.0,6597.0,SMARCA2,SMARCA4,ENSG00000080503,ENSG00000127616,ACH-000019,MCF7,Breast
2,1,0.1,SMARCA2_SMARCA4,SMARCA2_SMARCA4,SMARCA2,SMARCA4,6595,6597,ENSG00000080503,ENSG00000127616,...,34.302868,6595.0,6597.0,SMARCA2,SMARCA4,ENSG00000080503,ENSG00000127616,ACH-000028,KPL1,Breast
3,1,0.1,SMARCA2_SMARCA4,SMARCA2_SMARCA4,SMARCA2,SMARCA4,6595,6597,ENSG00000080503,ENSG00000127616,...,34.302868,6595.0,6597.0,SMARCA2,SMARCA4,ENSG00000080503,ENSG00000127616,ACH-000044,MDAMB134VI,Breast
4,1,0.1,SMARCA2_SMARCA4,SMARCA2_SMARCA4,SMARCA2,SMARCA4,6595,6597,ENSG00000080503,ENSG00000127616,...,34.302868,6595.0,6597.0,SMARCA2,SMARCA4,ENSG00000080503,ENSG00000127616,ACH-000097,ZR751,Breast


In [6]:
# Filter mutations for the cell lines of interest
CCLE_mutations_raw = pd.read_csv(mutation_data_path, low_memory=False)
mutated_cell_lines = CCLE_mutations_raw[['HugoSymbol','EntrezGeneID', 'VariantInfo', 'LikelyLoF', 'CCLEDeleterious', 'DepMap_ID']]
mutated_cell_lines = mutated_cell_lines.rename(columns={'HugoSymbol':'symbol', 'EntrezGeneID':'entrez_id'})

breast_mutated_cell_lines = mutated_cell_lines.loc[mutated_cell_lines['DepMap_ID'].isin(paralog_pairs_breast['DepMap_ID']),]
display(breast_mutated_cell_lines.head())

Unnamed: 0,symbol,entrez_id,VariantInfo,LikelyLoF,CCLEDeleterious,DepMap_ID
15886,PRAMEF4,400735.0,MISSENSE,,,ACH-000117
15887,PRAMEF18,391003.0,MISSENSE,,,ACH-000117
15888,PRAMEF18,391003.0,MISSENSE,,,ACH-000117
15889,PRAMEF18,391003.0,SILENT,,,ACH-000117
15890,PRAMEF18,391003.0,SILENT,,,ACH-000117


In [7]:
breast_mutated_cell_lines_df = breast_mutated_cell_lines.loc[(breast_mutated_cell_lines['symbol'] == 'RB1') | (breast_mutated_cell_lines['symbol'] == 'PTEN') | (breast_mutated_cell_lines['symbol'] == 'PIK3CA'),]
print(f"# of breast cell lines with RB1 mutation: {breast_mutated_cell_lines_df.loc[breast_mutated_cell_lines_df.symbol == 'RB1', ].shape[0]}")
print(f"# of breast cell lines with PIK3CA mutation: {breast_mutated_cell_lines_df.loc[breast_mutated_cell_lines_df.symbol == 'PIK3CA', ].shape[0]}")
print(f"# of breast cell lines with PTEN mutation: {breast_mutated_cell_lines_df.loc[breast_mutated_cell_lines_df.symbol == 'PTEN', ].shape[0]}")
breast_mutated_cell_lines_df.head()

# of breast cell lines with RB1 mutation: 6
# of breast cell lines with PIK3CA mutation: 27
# of breast cell lines with PTEN mutation: 14


Unnamed: 0,symbol,entrez_id,VariantInfo,LikelyLoF,CCLEDeleterious,DepMap_ID
15951,PIK3CA,5290.0,MISSENSE,,,ACH-000117
21847,PIK3CA,5290.0,MISSENSE,,,ACH-001393
27001,PIK3CA,5290.0,MISSENSE,,,ACH-000856
27257,PTEN,5728.0,FRAME_SHIFT_DEL,Y,Y,ACH-000856
27258,PTEN,5728.0,FRAME_SHIFT_INS,Y,Y,ACH-000856


In [8]:
def check_mutation_status(row):
    if row['symbol'] == 'RB1' and pd.notna(row['VariantInfo']):
         return 'RB1 mutated'
    elif row['symbol'] == 'BRAF' and pd.notna(row['VariantInfo']):
         return 'BRAF mutated'
    elif row['symbol'] == 'NRAS' and pd.notna(row['VariantInfo']):
         return 'NRAS mutated'
    elif row['symbol'] == 'KRAS' and pd.notna(row['VariantInfo']):
         return 'KRAS mutated'
    elif row['symbol'] == 'PIK3CA' and pd.notna(row['VariantInfo']):
          return 'PIK3CA mutated'
    elif row['symbol'] == 'PTEN' and pd.notna(row['VariantInfo']):
         return 'PTEN mutated'
    else:
         return 'No mutation'

In [9]:
breast_mutated_cell_lines_df = breast_mutated_cell_lines_df.copy()
breast_mutated_cell_lines_df['mutation_status'] = breast_mutated_cell_lines_df.apply(check_mutation_status, axis=1)
breast_mutated_cell_lines_df.sort_values(by=['DepMap_ID'], inplace=True)

In [10]:
breast_mutated_cell_lines_df

Unnamed: 0,symbol,entrez_id,VariantInfo,LikelyLoF,CCLEDeleterious,DepMap_ID,mutation_status
264757,PIK3CA,5290.0,MISSENSE,,,ACH-000019,PIK3CA mutated
1104279,PIK3CA,5290.0,MISSENSE,,,ACH-000028,PIK3CA mutated
69081,PTEN,5728.0,MISSENSE,,,ACH-000097,PTEN mutated
15951,PIK3CA,5290.0,MISSENSE,,,ACH-000117,PIK3CA mutated
345794,PIK3CA,5290.0,MISSENSE,,,ACH-000147,PIK3CA mutated
338982,PIK3CA,5290.0,MISSENSE,,,ACH-000276,PIK3CA mutated
1164218,PTEN,5728.0,FRAME_SHIFT_DEL,Y,Y,ACH-000288,PTEN mutated
294055,PIK3CA,5290.0,MISSENSE,,,ACH-000330,PIK3CA mutated
141976,RB1,5925.0,MISSENSE,,,ACH-000536,RB1 mutated
141701,PIK3CA,5290.0,MISSENSE,,,ACH-000536,PIK3CA mutated


In [11]:
breast_mutated_cell_lines_df = breast_mutated_cell_lines_df.drop_duplicates(subset=['DepMap_ID'], keep='first')
breast_mutated_cell_lines_df.loc[breast_mutated_cell_lines_df['DepMap_ID'] == 'ACH-000536', 'mutation_status'] = 'RB1 mutated, PIK3CA mutated'
breast_mutated_cell_lines_df.loc[breast_mutated_cell_lines_df['DepMap_ID'] == 'ACH-000668', 'mutation_status'] = 'RB1 mutated, PTEN mutated'
breast_mutated_cell_lines_df.loc[breast_mutated_cell_lines_df['DepMap_ID'] == 'ACH-000856', 'mutation_status'] = 'PTEN mutated, PIK3CA mutated'
breast_mutated_cell_lines_df.loc[breast_mutated_cell_lines_df['DepMap_ID'] == 'ACH-000902', 'mutation_status'] = 'PTEN mutated, PIK3CA mutated'
breast_mutated_cell_lines_df.loc[breast_mutated_cell_lines_df['DepMap_ID'] == 'ACH-000910', 'mutation_status'] = 'PTEN mutated, PIK3CA mutated'
breast_mutated_cell_lines_df.loc[breast_mutated_cell_lines_df['DepMap_ID'] == 'ACH-001391', 'mutation_status'] = 'RB1 mutated, PIK3CA mutated'
dict_breast_mut_cell_line = dict(zip(breast_mutated_cell_lines_df['DepMap_ID'], breast_mutated_cell_lines_df['mutation_status']))

In [12]:
paralog_pairs_breast.insert(paralog_pairs_breast.shape[1], 'mutation_status', 
                            paralog_pairs_breast['DepMap_ID'].map(dict_breast_mut_cell_line))
paralog_pairs_breast['mutation_status'] = paralog_pairs_breast['mutation_status'].fillna('No mutation')

In [13]:
# label cell lines based on HER2 status
her2_neg = model_info_df.loc[(model_info_df['LegacySubSubtype'] == 'ERneg_HER2neg') |
                             (model_info_df['LegacySubSubtype'] == 'ERpos_HER2neg'),]

her2_pos = model_info_df.loc[(model_info_df['LegacySubSubtype'] == 'ERneg_HER2pos') |
                             (model_info_df['LegacySubSubtype'] == 'ERpos_HER2pos'),]

her2_neg_cell = her2_neg['ModelID'].unique()
her2_pos_cell = her2_pos['ModelID'].unique()   

# Define a function to label cells
def label_cell(cell_name):
    if cell_name in her2_neg_cell:
        return 'her2_neg'
    elif cell_name in her2_pos_cell:
        return 'her2_pos'
    else:
        return 'unknown'
    
paralog_pairs_breast['her_label'] = paralog_pairs_breast['DepMap_ID'].apply(label_cell)
paralog_pairs_breast['her_label'].value_counts()

her_label
her2_neg    1722456
unknown     1026144
her2_pos     696312
Name: count, dtype: int64

In [14]:
# label breast cancer cell lines based on RB1 deficiency status

# reference paper 
# https://www.nature.com/articles/s41388-018-0368-z

rb1_defects_dict = {"ACH-000258":"RB1 defective", 
                    "ACH-000849":"RB1 defective",
                    "ACH-000643":"RB1 defective",
                    "ACH-000288":"RB1 defective",
                    "ACH-000902":"RB1 defective",
                    "ACH-000223":"RB1 defective",
                    "ACH-000573":"RB1 defective",
                    "ACH-000857":"RB1 defective",
                    "ACH-001390":"RB1 defective",
                    "ACH-000111":"RB1 defective",
                    "ACH-002324":"RB1 defective",
                    "ACH-000768":"wt",
                    "ACH-000374":"wt",
                    "ACH-000148":"wt",
                    "ACH-000276":"wt",
                    "ACH-000536":"wt",
                    "ACH-000624":"wt",
                    "ACH-000910":"wt",
                    "ACH-000699":"wt",
                    "ACH-002328":"wt",
                    "ACH-002329":"wt",
                    "ACH-000212":"wt",
                    "ACH-002321":"wt",
                    "ACH-001819":"wt",
                    "ACH-000196":"wt",
                    "ACH-002322":"wt",
                    "ACH-002323":"wt",
                    "ACH-001388":"wt",
                    "ACH-001396":"wt",
                    "ACH-000668":"wt",
                    "ACH-001391":"wt",
                    "ACH-001825":"wt",
                    "ACH-002330":"wt",
                    "ACH-001827":"wt",
                    "ACH-001389":"wt",
                    "ACH-000856":"wt",
                    "ACH-001392":"wt",
                    "ACH-002319":"wt",
                    "ACH-002326":"wt",
                    "ACH-000621":"wt",
                    "ACH-001394":"wt"}

# mb157 - defective is missing

# Add a default label for missing entries
def label_rb1_status_with_dict(cell_name):
    # Use the dictionary to get the RB1 status, default to 'unknown' if not found
    return rb1_defects_dict.get(cell_name, 'unknown')

# Apply the function to your dataframe
paralog_pairs_breast_cancer_df = paralog_pairs_breast.copy()
paralog_pairs_breast_cancer_df['rb1_defects'] = paralog_pairs_breast_cancer_df['DepMap_ID'].apply(label_rb1_status_with_dict)
paralog_pairs_breast_cancer_df['rb1_defects'].value_counts()

rb1_defects
unknown          1978992
wt               1062792
RB1 defective     403128
Name: count, dtype: int64

In [15]:
print(paralog_pairs_breast_cancer_df.genepair.nunique())
display(paralog_pairs_breast_cancer_df['mutation_status'].value_counts())

36648


mutation_status
No mutation                     2198880
PIK3CA mutated                   659664
PTEN mutated                     293184
PTEN mutated, PIK3CA mutated     109944
RB1 mutated, PIK3CA mutated       73296
RB1 mutated                       73296
RB1 mutated, PTEN mutated         36648
Name: count, dtype: int64

In [16]:
paralog_pairs_breast_cancer_df[:3]

Unnamed: 0,prediction_rank,prediction_percentile,old_genepair,genepair,A1,A2,A1_entrez,A2_entrez,A1_ensembl,A2_ensembl,...,A1_new,A2_new,A1_ensembl_new,A2_ensembl_new,DepMap_ID,cell_line,cancer_type,mutation_status,her_label,rb1_defects
0,1,0.1,SMARCA2_SMARCA4,SMARCA2_SMARCA4,SMARCA2,SMARCA4,6595,6597,ENSG00000080503,ENSG00000127616,...,SMARCA2,SMARCA4,ENSG00000080503,ENSG00000127616,ACH-000017,SKBR3,Breast,No mutation,her2_pos,unknown
1,1,0.1,SMARCA2_SMARCA4,SMARCA2_SMARCA4,SMARCA2,SMARCA4,6595,6597,ENSG00000080503,ENSG00000127616,...,SMARCA2,SMARCA4,ENSG00000080503,ENSG00000127616,ACH-000019,MCF7,Breast,PIK3CA mutated,her2_neg,unknown
2,1,0.1,SMARCA2_SMARCA4,SMARCA2_SMARCA4,SMARCA2,SMARCA4,6595,6597,ENSG00000080503,ENSG00000127616,...,SMARCA2,SMARCA4,ENSG00000080503,ENSG00000127616,ACH-000028,KPL1,Breast,PIK3CA mutated,her2_neg,unknown


In [None]:
paralog_pairs_breast_cancer_df.to_csv(get_data_path(['data', 'output', 'breast_cancer'], 'paralog_pairs_breast_cancer_df.csv'), index=False)