# Notebook for preparing the Cell line feature data

### 1. Read the pathways and genes for PID c2 curated pathways

In [2]:
import json
with open("processed_data/pathway-interaction-database-pathways-and-genes.json", "r") as f:
    pid_pathways = json.load(f)
genes = []
for gene in pid_pathways.values():
    genes.extend(gene)

genes = set(genes)

### 2. Read the cell line data, and filter out only the GDSC2 cell lines

In [3]:
import pandas as pd
cell_line_mapping = pd.read_csv("data/Cell_listMon Jan 13 01_52_56 2025.csv")
cell_line_mapping.columns=cell_line_mapping.columns.str.strip()
cell_line_mapping = cell_line_mapping[cell_line_mapping.Datasets.eq("GDSC2")].drop_duplicates(["Model ID", "COSMIC ID"])
cell_line_mapping

Unnamed: 0,Cell line Name,Model ID,COSMIC ID,TCGA Classfication,Tissue,Tissue sub-type,Datasets,number of drugs
1,22RV1,SIDM00499,924100,PRAD,urogenital_system,prostate,GDSC2,282
3,23132-87,SIDM00980,910924,STAD,digestive_system,stomach,GDSC2,281
5,42-MG-BA,SIDM00982,687561,GBM,nervous_system,glioma,GDSC2,281
7,451Lu,SIDM01240,1287706,SKCM,skin,melanoma,GDSC2,180
9,5637,SIDM00807,687452,BLCA,urogenital_system,bladder,GDSC2,279
...,...,...,...,...,...,...,...,...
1930,YT,SIDM00410,946358,ALL,blood,lymphoid_neoplasm_other,GDSC2,167
1932,ZR-75-30,SIDM00971,909907,BRCA,breast,breast,GDSC2,286
1934,huH-1,SIDM00586,1298146,LIHC,digestive_system,liver,GDSC2,281
1936,no-10,SIDM00574,908452,LGG,nervous_system,glioma,GDSC2,180


### 3. Read the GDSC2 drug data response data and remove the cell lines for which drug response is missing

In [4]:
drug = pd.read_csv("processed_data-old/gdsc2_drug_ic50_feature.csv", index_col=0).set_index(["COSMIC_ID", "DRUG_ID"])
drug

Unnamed: 0_level_0,Unnamed: 1_level_0,LN_IC50,bit_0,bit_1,bit_2,bit_3,bit_4,bit_5,bit_6,bit_7,bit_8,...,bit_246,bit_247,bit_248,bit_249,bit_250,bit_251,bit_252,bit_253,bit_254,bit_255
COSMIC_ID,DRUG_ID,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
683667,1003,-1.463887,0,0,0,0,0,0,0,0,0,...,0,0,1,0,1,0,0,0,0,0
684052,1003,-4.869455,0,0,0,0,0,0,0,0,0,...,0,0,1,0,1,0,0,0,0,0
684057,1003,-3.360586,0,0,0,0,0,0,0,0,0,...,0,0,1,0,1,0,0,0,0,0
684059,1003,-5.044940,0,0,0,0,0,0,0,0,0,...,0,0,1,0,1,0,0,0,0,0
684062,1003,-3.741991,0,0,0,0,0,0,0,0,0,...,0,0,1,0,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1659928,2359,5.409627,0,1,0,0,1,0,1,0,0,...,0,0,1,0,0,1,0,0,0,1
1660034,2359,5.035265,0,1,0,0,1,0,1,0,0,...,0,0,1,0,0,1,0,0,0,1
1660035,2359,6.119660,0,1,0,0,1,0,1,0,0,...,0,0,1,0,0,1,0,0,0,1
1674021,2359,6.135335,0,1,0,0,1,0,1,0,0,...,0,0,1,0,0,1,0,0,0,1


In [5]:
cell_line_mapping = cell_line_mapping[cell_line_mapping["COSMIC ID"].isin(drug.index.get_level_values(level=0).unique())]
cell_line_mapping

Unnamed: 0,Cell line Name,Model ID,COSMIC ID,TCGA Classfication,Tissue,Tissue sub-type,Datasets,number of drugs
1,22RV1,SIDM00499,924100,PRAD,urogenital_system,prostate,GDSC2,282
3,23132-87,SIDM00980,910924,STAD,digestive_system,stomach,GDSC2,281
5,42-MG-BA,SIDM00982,687561,GBM,nervous_system,glioma,GDSC2,281
7,451Lu,SIDM01240,1287706,SKCM,skin,melanoma,GDSC2,180
9,5637,SIDM00807,687452,BLCA,urogenital_system,bladder,GDSC2,279
...,...,...,...,...,...,...,...,...
1930,YT,SIDM00410,946358,ALL,blood,lymphoid_neoplasm_other,GDSC2,167
1932,ZR-75-30,SIDM00971,909907,BRCA,breast,breast,GDSC2,286
1934,huH-1,SIDM00586,1298146,LIHC,digestive_system,liver,GDSC2,281
1936,no-10,SIDM00574,908452,LGG,nervous_system,glioma,GDSC2,180


### 4. Read the RNA-seq data, and keep only the Sanger source, pathway genes, filtered cell lines, and apply log transformation on tpm values

In [7]:
import pandas as pd

rna = pd.read_csv("data-old/rnaseq_all_20220624/rnaseq_all_data_20220624.csv")
rna

Unnamed: 0,dataset_id,id,model_id,gene_id,read_count,fpkm,tpm,data_source,dataset_name,model_name,gene_symbol
0,22,133594790,SIDM01313,SIDG03516,919,5.69,14.41,Sanger,Sanger Organoid RNASeq,HCM-SANG-0285-C18,CASP10
1,22,133630300,SIDM01313,SIDG21420,90,0.25,0.64,Sanger,Sanger Organoid RNASeq,HCM-SANG-0285-C18,NBPF10
2,22,133630301,SIDM01313,SIDG31799,1,0.15,0.39,Sanger,Sanger Organoid RNASeq,HCM-SANG-0285-C18,RPL17P51
3,22,133630302,SIDM01313,SIDG25351,0,0.00,0.00,Sanger,Sanger Organoid RNASeq,HCM-SANG-0285-C18,PPATP2
4,22,133630303,SIDM01313,SIDG19863,88,1.31,3.32,Sanger,Sanger Organoid RNASeq,HCM-SANG-0285-C18,MMP28
...,...,...,...,...,...,...,...,...,...,...,...
53348464,15,94861057,SIDM00410,SIDG00040,5250,7.20,26.88,Sanger,Sanger & Broad Cell Lines RNASeq,YT,AASDH
53348465,15,94861058,SIDM00410,SIDG00041,14828,15.30,57.12,Sanger,Sanger & Broad Cell Lines RNASeq,YT,AASDHPPT
53348466,15,94861059,SIDM00410,SIDG00042,689,0.96,3.58,Sanger,Sanger & Broad Cell Lines RNASeq,YT,AASS
53348467,15,94861060,SIDM00410,SIDG00043,2,0.00,0.00,Sanger,Sanger & Broad Cell Lines RNASeq,YT,AATBC


In [6]:
import numpy as np 

filtered_rna = (
    rna.loc[(rna["dataset_name"].eq('Sanger & Broad Cell Lines RNASeq'))
    & (rna["data_source"].str.lower().str.strip()=="sanger")
    & (rna["gene_symbol"].isin(genes))]
)
filtered_rna_matrix = ( 
    filtered_rna.pivot(index=["model_id"], columns="gene_symbol", values="tpm")
    .apply(lambda tpm: np.log2(tpm+1))
    .merge(cell_line_mapping[["Model ID", "COSMIC ID"]], 
                              left_index=True, 
                              right_on="Model ID")
    .set_index("COSMIC ID")
    .drop(columns="Model ID")
    .sort_index(level=[0, 1])
)

filtered_rna_matrix.to_csv("processed_data/gdsc_sanger_pid_gene_expression.csv")
filtered_rna_matrix

Unnamed: 0_level_0,A2M,AATF,ABCA1,ABCA3,ABCB1,ABCC1,ABCC8,ABCG2,ABI1,ABI2,...,ZFYVE28,ZFYVE9,ZMIZ1,ZMIZ2,ZNF14,ZNF274,ZNF318,ZNF385A,ZNFX1,ZYX
COSMIC ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
683667,0.985500,5.511911,3.001802,6.301222,0.214125,3.028569,0.286881,0.176323,5.021480,4.265287,...,1.298658,4.192194,5.343052,5.365623,2.435629,3.353323,4.608809,4.221877,4.550901,6.972118
684057,4.787641,5.693487,1.851999,5.195348,0.056584,6.371907,0.056584,0.575312,5.972922,5.410070,...,1.232661,6.139347,4.969933,5.736064,4.708739,4.285402,5.404631,6.170726,5.175924,6.979339
684059,3.959770,5.727648,2.819668,4.549669,0.000000,6.173127,0.000000,0.056584,6.375387,4.743623,...,1.150560,5.805292,5.018367,6.306517,3.235727,4.093391,5.332708,5.511911,5.649328,9.576163
684062,0.111031,5.959538,0.443607,3.771886,0.310340,7.054848,0.056584,0.056584,6.273516,5.510012,...,2.592158,5.976822,4.221104,6.206526,3.750607,3.513491,5.081936,5.562242,4.615887,7.507478
687457,0.000000,6.324091,0.286881,2.695994,0.495695,6.880073,0.201634,3.583760,5.720005,4.053980,...,1.500802,5.667892,6.420718,5.986411,3.206331,3.090853,4.738768,7.316508,5.848498,6.916118
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1659929,0.000000,4.767125,3.568032,4.778734,3.016140,4.705425,0.150560,0.000000,5.988457,2.773996,...,2.482848,3.370164,4.742545,6.061344,2.684819,2.839960,4.145677,0.722466,4.755956,5.938286
1660034,0.000000,5.848247,0.176323,1.269033,3.072106,5.320485,0.000000,2.675816,6.156032,3.158660,...,0.773996,4.013462,2.871844,5.559186,0.000000,2.885574,4.165108,5.710118,4.583760,7.139756
1660036,0.000000,6.139756,0.250962,3.601697,0.042644,5.720005,0.084064,1.130931,5.651626,2.403268,...,1.769772,4.391630,4.842476,6.130313,3.121015,4.007196,4.894818,6.056800,4.442280,8.149137
1674021,0.000000,5.718636,1.646163,4.034744,0.389567,5.713971,0.042644,2.648465,5.822985,3.750607,...,2.295723,4.678635,5.049631,6.235536,3.058316,2.403268,5.307793,4.951868,5.738768,7.791489


### 5. Read the mutation data, keep the sanger source, pathway genes, and filtered cell lines, and convert to binary (0,1)

In [7]:
mutation = pd.read_csv("data/mutations_all_20230202.csv")

filtered_mutation = (
    mutation[mutation["source"].str.lower().eq("sanger")
    & mutation["gene_symbol"].isin(genes)]
).drop_duplicates(["model_id", "gene_symbol"])

filtered_mutation_matrix = (
    pd.crosstab(filtered_mutation["model_id"], filtered_mutation["gene_symbol"])
    .merge(cell_line_mapping[["Model ID", "COSMIC ID"]], left_index=True, right_on="Model ID")
    .set_index("COSMIC ID")
    .drop(columns="Model ID")
    .sort_index(level=[0, 1])
)
filtered_mutation_matrix.to_csv("processed_data/gdsc_sanger_pid_mutation.csv")
filtered_mutation_matrix

Unnamed: 0_level_0,A2M,AATF,ABCA1,ABCA3,ABCB1,ABCC1,ABCC8,ABCG2,ABI1,ABI2,...,ZFYVE28,ZFYVE9,ZMIZ1,ZMIZ2,ZNF14,ZNF274,ZNF318,ZNF385A,ZNFX1,ZYX
COSMIC ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
683667,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
684052,0,0,0,0,0,1,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
684057,0,0,0,0,0,1,0,0,0,0,...,0,1,0,0,0,0,0,0,1,0
684059,0,1,0,1,0,1,0,0,0,1,...,0,0,0,0,0,0,1,0,0,0
684062,1,0,1,1,1,0,0,0,0,1,...,0,1,0,0,0,0,1,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1660034,0,0,1,1,1,1,0,0,1,0,...,0,1,1,0,0,0,1,0,0,0
1660035,1,1,1,1,1,0,0,0,1,0,...,1,1,1,1,0,0,0,0,1,0
1660036,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,0
1674021,1,0,1,1,1,0,1,1,1,0,...,0,1,1,1,1,1,1,0,1,1


### 6. Read CNV data for Sanger source, pathway genes, and filtered cell lines.
Apply GISTIC2 threshold:
- Deletion: -2
- Loss: -1
- Neutral: 0
- Gain: 1
- Amplification: 2

In [7]:
cnv = pd.read_csv("data/WES_pureCN_CNV_genes_20221213 2/WES_pureCN_CNV_genes_20221213.csv")
cnv

  cnv = pd.read_csv("data/WES_pureCN_CNV_genes_20221213 2/WES_pureCN_CNV_genes_20221213.csv")


Unnamed: 0,model_name,model_id,symbol,cancer_driver,gene_id,chr_name,chr_start,chr_end,total_copy_number,minor_copy_number,...,seg_mean,gene_mean,num_targets,focal,breakpoints,num_snps,gatk_mean_log2_copy_ratio,comment,source,data_type
0,MEC-1,SIDM00001,DYNLRB2,False,SIDG07193,chr16,80540800,80550835,2.0,1.0,...,0.097875,0.801251,4.0,False,0.0,129.0,-0.063697,POOR GOF (57.6%);NOISY LOG-RATIO;NOISY SEGMENT...,Broad,WES
1,MEC-1,SIDM00001,DONSON,False,SIDG06885,chr21,33575355,33588918,2.0,1.0,...,0.277914,0.003501,13.0,False,0.0,295.0,0.109463,POOR GOF (57.6%);NOISY LOG-RATIO;NOISY SEGMENT...,Broad,WES
2,MEC-1,SIDM00001,DNAJC16,False,SIDG06761,chr1,15528860,15568454,2.0,1.0,...,0.097875,-0.016074,14.0,False,0.0,697.0,-0.075368,POOR GOF (57.6%);NOISY LOG-RATIO;NOISY SEGMENT...,Broad,WES
3,MEC-1,SIDM00001,DGKD,False,SIDG06497,chr2,233387980,233469737,2.0,1.0,...,0.057769,0.142169,29.0,False,0.0,103.0,-0.147871,POOR GOF (57.6%);NOISY LOG-RATIO;NOISY SEGMENT...,Broad,WES
4,MEC-1,SIDM00001,DEFB124,False,SIDG06329,chr20,31465229,31473290,2.0,1.0,...,0.101180,0.205278,2.0,False,0.0,133.0,-0.070772,POOR GOF (57.6%);NOISY LOG-RATIO;NOISY SEGMENT...,Broad,WES
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24488099,EA-hy926,SIDM01986,AIF1,False,SIDG00678,chr6,31615053,31617177,2.0,1.0,...,-0.543072,-0.426866,6.0,False,0.0,1337.0,-0.521534,POOR GOF (75%),Sanger,WES
24488100,EA-hy926,SIDM01986,AIFM3,False,SIDG00683,chr22,20967668,20981308,3.0,1.0,...,-0.183680,-0.114270,17.0,False,0.0,1033.0,-0.196088,POOR GOF (75%),Sanger,WES
24488101,EA-hy926,SIDM01986,AIG1,False,SIDG00684,chr6,143060677,143339953,3.0,1.0,...,-0.158810,-0.181585,6.0,False,0.0,1503.0,-0.125547,POOR GOF (75%),Sanger,WES
24488102,EA-hy926,SIDM01986,AIM2,False,SIDG00687,chr1,159062415,159073776,3.0,1.0,...,-0.183680,-0.169099,5.0,False,0.0,3532.0,-0.158518,POOR GOF (75%),Sanger,WES


In [9]:
filtered_cnv_matrix = (
    cnv[(cnv["source"].str.lower().str.strip()=="sanger") & (cnv["symbol"].isin(genes))]
    .pivot(columns=["symbol"], index="model_id", values="cn_category")
    .replace({"Deletion": -2, "Loss": -1, "Neutral": 0, "Gain": 1,  "Amplification": 2})
    .merge(cell_line_mapping[["Model ID", "COSMIC ID"]], left_index=True, right_on="Model ID")
    .set_index("COSMIC ID")
    .drop(columns="Model ID")
    .sort_index(level=[0, 1])
)

# filtered_cnv_matrix = filtered_cnv_matrix.loc[:,filtered_cnv_matrix.notna().all()]
# filtered_cnv_matrix.to_csv("processed_data/gdsc_sanger_pid_cnv.csv")
filtered_cnv_matrix

Unnamed: 0_level_0,A2M,AATF,ABCA1,ABCA3,ABCB1,ABCC1,ABCC8,ABCG2,ABI1,ABI2,...,ZFYVE28,ZFYVE9,ZMIZ1,ZMIZ2,ZNF14,ZNF274,ZNF318,ZNF385A,ZNFX1,ZYX
COSMIC ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
683667,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
684052,0,-1,0,0,0,0,-1,-1,0,0,...,0,0,0,0,0,0,0,0,0,0
684057,0,0,-1,0,0,0,-1,0,-1,0,...,0,0,-1,0,0,0,0,0,0,0
684059,0,0,0,0,-1,0,-1,-1,0,0,...,0,0,-1,0,0,0,0,0,0,2
684062,0,0,0,0,0,0,0,0,-1,0,...,1,0,-1,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1660034,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1660035,0,0,0,0,0,0,-1,-1,0,0,...,-1,-1,0,2,-1,-1,-1,0,1,0
1660036,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1674021,1,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,1,0


### 7. Keep only the common cell lines across the data

In [25]:
common_genes = list(genes.intersection(filtered_cnv_matrix.columns))
common_cell_lines = list(set(filtered_rna_matrix.index).intersection(filtered_mutation_matrix.index, 
                                                                     drug.index.get_level_values(level=0).unique()))
filtered_cnv_matrix=filtered_cnv_matrix.loc[filtered_cnv_matrix.index.isin(common_cell_lines),  common_genes]
filtered_cnv_matrix

Unnamed: 0_level_0,ADCY3,MCM3,RET,COL4A1,ESR2,CLSPN,ARHGEF6,CD72,LMNB1,TACC3,...,NME1,GUCA1B,NCOA2,IKZF1,PTGS2,DNMT1,TERT,CTBP1,DLGAP5,ARFIP2
COSMIC ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
683667,0,0,0,-1,-1,0,1.0,0,0,0,...,0,0,1,0,0,0,0,0,-1,0
684057,0,0,-1,0,-1,0,0.0,0,-1,0,...,0,0,1,0,0,0,-1,0,-1,-1
684059,0,0,0,-1,-1,0,0.0,0,0,0,...,0,0,0,0,0,0,0,0,-1,-1
684062,0,0,-1,0,-1,0,0.0,-1,-1,1,...,0,0,0,0,1,0,0,1,-1,0
687457,0,0,0,0,0,0,0.0,-1,0,0,...,1,0,0,1,0,-1,0,0,0,-1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1659929,0,0,0,1,0,0,0.0,0,0,-1,...,0,0,0,1,1,0,0,-1,0,0
1660034,0,0,0,0,0,0,0.0,0,0,0,...,0,0,0,0,2,0,0,0,0,0
1660036,0,0,0,0,1,0,0.0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
1674021,0,0,0,0,0,0,0.0,0,0,0,...,0,0,0,1,0,0,0,0,0,0


In [36]:
filtered_cnv_matrix = filtered_cnv_matrix.loc[:, filtered_cnv_matrix.notna().all()]
filtered_cnv_matrix.to_csv("processed_data/gdsc_sanger_pid_cnv.csv")
filtered_cnv_matrix

Unnamed: 0_level_0,ADCY3,MCM3,RET,COL4A1,ESR2,CLSPN,CD72,LMNB1,TACC3,PINX1,...,NME1,GUCA1B,NCOA2,IKZF1,PTGS2,DNMT1,TERT,CTBP1,DLGAP5,ARFIP2
COSMIC ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
683667,0,0,0,-1,-1,0,0,0,0,1,...,0,0,1,0,0,0,0,0,-1,0
684057,0,0,-1,0,-1,0,0,-1,0,0,...,0,0,1,0,0,0,-1,0,-1,-1
684059,0,0,0,-1,-1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,-1,-1
684062,0,0,-1,0,-1,0,-1,-1,1,0,...,0,0,0,0,1,0,0,1,-1,0
687457,0,0,0,0,0,0,-1,0,0,0,...,1,0,0,1,0,-1,0,0,0,-1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1659929,0,0,0,1,0,0,0,0,-1,-1,...,0,0,0,1,1,0,0,-1,0,0
1660034,0,0,0,0,0,0,0,0,0,-1,...,0,0,0,0,2,0,0,0,0,0
1660036,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
1674021,0,0,0,0,0,0,0,0,0,-1,...,0,0,0,1,0,0,0,0,0,0


In [None]:
# SGA

In [6]:
import pandas as pd

meth = pd.read_csv("data/GSE68379_Matrix.processed.txt", sep="\t").set_index("Row.names")
meth

Unnamed: 0_level_0,697_AVG.Beta,5637_AVG.Beta,201T_AVG.Beta,22RV1_AVG.Beta,23132-87_AVG.Beta,42-MG-BA_AVG.Beta,451LU_AVG.Beta,639-V_AVG.Beta,647-V_AVG.Beta,769-P_AVG.Beta,...,WM35_Detection.PVal,WM793B_Detection.PVal,WSU-DLCL2_Detection.PVal,WSU-NHL_Detection.PVal,YAPC_Detection.PVal,YH-13_Detection.PVal,YKG-1_Detection.PVal,YMB-1-E_Detection.PVal,YT_Detection.PVal,ZR-75-30_Detection.PVal
Row.names,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
cg00000029,0.932624,0.083843,0.165861,0.474541,0.602180,0.868057,0.077183,0.880555,0.142020,0.944973,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
cg00000108,0.957445,0.951069,0.877158,0.892458,0.850957,0.947157,0.920661,0.948498,0.926183,0.945954,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
cg00000109,0.940921,0.896189,0.930599,0.893547,0.636248,0.956781,0.903080,0.919102,0.892319,0.824355,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
cg00000165,0.605907,0.871076,0.927126,0.929684,0.247847,0.639991,0.845964,0.928224,0.923477,0.752018,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
cg00000236,0.923124,0.912586,0.924128,0.915189,0.705308,0.946950,0.904795,0.912532,0.886618,0.897428,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ch.X.97129969R,0.003185,0.002225,0.003068,0.004725,0.000000,0.000000,0.014648,0.002644,0.010674,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
ch.X.97133160R,0.084363,0.085214,0.037856,0.075484,0.057862,0.004221,0.077491,0.086416,0.074184,0.006611,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
ch.X.97651759F,0.017804,0.033283,0.012950,0.039007,0.027410,0.001103,0.029381,0.030095,0.034745,0.019336,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
ch.X.97737721F,0.068601,0.099142,0.085089,0.095121,0.076480,0.083535,0.073061,0.096925,0.103987,0.056279,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [7]:
meth = meth[meth.columns[meth.columns.isin(cell_line_mapping["Cell line Name"].astype(str).apply(lambda x: f"{x}_AVG.Beta"))]]
meth.columns = meth.columns.str.rsplit("_").str[0]
meth = meth.T.merge(cell_line_mapping[["Cell line Name", "COSMIC ID"]], 
             left_index=True, 
             right_on="Cell line Name").drop(columns=["Cell line Name"]).set_index("COSMIC ID")
meth

Unnamed: 0_level_0,cg00000029,cg00000108,cg00000109,cg00000165,cg00000236,cg00000289,cg00000292,cg00000321,cg00000363,cg00000622,...,ch.X.93511680F,ch.X.938089F,ch.X.94051109R,ch.X.94260649R,ch.X.967194F,ch.X.97129969R,ch.X.97133160R,ch.X.97651759F,ch.X.97737721F,ch.X.98007042R
COSMIC ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
906800,0.932624,0.957445,0.940921,0.605907,0.923124,0.801539,0.919870,0.781122,0.902605,0.016276,...,0.068155,0.013701,0.026792,0.000000,0.181964,0.003185,0.084363,0.017804,0.068601,0.021967
687452,0.083843,0.951069,0.896189,0.871076,0.912586,0.708303,0.833588,0.796317,0.118050,0.030790,...,0.068440,0.027901,0.024921,0.057790,0.223385,0.002225,0.085214,0.033283,0.099142,0.086238
924100,0.474541,0.892458,0.893547,0.929684,0.915189,0.643394,0.900730,0.126305,0.864758,0.016086,...,0.072081,0.013403,0.024808,0.053465,0.236406,0.004725,0.075484,0.039007,0.095121,0.061598
910924,0.602180,0.850957,0.636248,0.247847,0.705308,0.571740,0.420353,0.455155,0.635447,0.003560,...,0.066002,0.041129,0.019055,0.077608,0.148007,0.000000,0.057862,0.027410,0.076480,0.080878
687561,0.868057,0.947157,0.956781,0.639991,0.946950,0.729727,0.743970,0.773927,0.925026,0.018058,...,0.055563,0.000000,0.003958,0.000000,0.142726,0.000000,0.004221,0.001103,0.083535,0.000407
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
909905,0.155335,0.690755,0.903297,0.856450,0.769804,0.491451,0.605751,0.824993,0.233814,0.013590,...,0.064068,0.025352,0.023649,0.036447,0.098432,0.043824,0.048529,0.017651,0.094203,0.026130
687592,0.661915,0.901116,0.855727,0.690125,0.899756,0.714082,0.904636,0.855955,0.394032,0.013210,...,0.066738,0.024176,0.019734,0.017009,0.111271,0.002285,0.053751,0.028191,0.083133,0.072359
1303911,0.085776,0.929182,0.915536,0.773326,0.918321,0.744969,0.892948,0.821035,0.868872,0.011616,...,0.057371,0.024669,0.015738,0.036917,0.165817,0.000000,0.045260,0.040193,0.086328,0.051170
946358,0.912337,0.958475,0.825600,0.726782,0.921308,0.727773,0.901747,0.832462,0.908935,0.013104,...,0.056594,0.018776,0.018207,0.000000,0.192028,0.018809,0.000000,0.026001,0.075740,0.060762


In [38]:
meth.T.merge(cell_line_mapping[["Cell line Name", "COSMIC ID"]], 
             left_index=True, 
             right_on="Cell line Name").drop(columns=["Cell line Name"]).set_index("COSMIC ID")

Unnamed: 0_level_0,cg00000029,cg00000108,cg00000109,cg00000165,cg00000236,cg00000289,cg00000292,cg00000321,cg00000363,cg00000622
COSMIC ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
906800,0.932624,0.957445,0.940921,0.605907,0.923124,0.801539,0.919870,0.781122,0.902605,0.016276
687452,0.083843,0.951069,0.896189,0.871076,0.912586,0.708303,0.833588,0.796317,0.118050,0.030790
924100,0.474541,0.892458,0.893547,0.929684,0.915189,0.643394,0.900730,0.126305,0.864758,0.016086
910924,0.602180,0.850957,0.636248,0.247847,0.705308,0.571740,0.420353,0.455155,0.635447,0.003560
687561,0.868057,0.947157,0.956781,0.639991,0.946950,0.729727,0.743970,0.773927,0.925026,0.018058
...,...,...,...,...,...,...,...,...,...,...
909905,0.155335,0.690755,0.903297,0.856450,0.769804,0.491451,0.605751,0.824993,0.233814,0.013590
687592,0.661915,0.901116,0.855727,0.690125,0.899756,0.714082,0.904636,0.855955,0.394032,0.013210
1303911,0.085776,0.929182,0.915536,0.773326,0.918321,0.744969,0.892948,0.821035,0.868872,0.011616
946358,0.912337,0.958475,0.825600,0.726782,0.921308,0.727773,0.901747,0.832462,0.908935,0.013104
