### Preprocessing the GDSC drug data
This notebook generates Morgan fingerprints for the GDSC2 drugs and creates a CSV file that combines the cell-line drug response with those Morgan fingerprints.

In [1]:
import pandas as pd

In [5]:
# Load the GDSC2 drug annotation table (drug id, name, synonyms, pathway,
# targets and PubChem id) and display it.
gdsc_2 = pd.read_csv(filepath_or_buffer="data/gdsc_drug_data/gdsc_2_drugs.csv")
gdsc_2

Unnamed: 0,drug_id,drug_name,synonyms,pathway_name,targets,pubchem
0,1259,Talazoparib,"BMN-673, BMN 973",Genome integrity,"PARP1, PARP2",44819241
1,1372,Trametinib,"GSK1120212, Mekinist",ERK MAPK signaling,"MEK1, MEK2",11707110
2,1559,Luminespib,"AUY922, VER-52296,NVP-AUY922, AUY",Protein stability and degradation,HSP90,10096043
3,1615,CZC24832,GTPL6653,PI3K/MTOR signaling,PI3Kgamma,42623951
4,1620,PFI3,"PFI-3, PFI 3, AOB2221",Chromatin other,"Polybromo 1, SMARCA4, SMARCA2",78243717
...,...,...,...,...,...,...
292,2107,LJI308,-,PI3K/MTOR signaling,"RSK2, RSK1, RSK3",118704762
293,2156,5-azacytidine,-,Other,DNA methyltransferases,-
294,2362,THR-103,WIMM synthesis,PI3K/MTOR signaling,Mutant RAS,
295,1030,KU-55933,KU55933,Genome integrity,ATM,5278396


In [3]:
# Keep only drugs whose PubChem entry is a single all-digit CID string.
# Non-string values (e.g. NaN for an empty field) and placeholders such as "-"
# are dropped. `str.isdigit()` is used instead of `all(c.isdigit() ...)` because
# the latter is vacuously True for an empty string, which would let an empty
# CID through to the PubChem lookup.
has_numeric_cid = gdsc_2["pubchem"].apply(
    lambda cid: isinstance(cid, str) and cid.isdigit()
)

# Index by drug_id (sorted) so later per-drug features can be aligned on it.
filtered = gdsc_2[has_numeric_cid].set_index("drug_id").sort_index()
filtered

Unnamed: 0_level_0,drug_name,synonyms,pathway_name,targets,pubchem
drug_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1003,Camptothecin,"Camptothecine, (+)-Camptothecin",DNA replication,TOP1,24360
1004,Vinblastine,Velban,Mitosis,Microtubule destabiliser,6710780
1005,Cisplatin,"cis-Diammineplatinum(II) dichloride, Platinol,...",DNA replication,DNA crosslinker,84691
1006,Cytarabine,"Ara-Cytidine, Arabinosyl Cytosine, U-19920",Other,Antimetabolite,6253
1007,Docetaxel,"RP-56976, Taxotere",Mitosis,Microtubule stabiliser,148124
...,...,...,...,...,...
2173,PFI-1,-,Chromatin other,BRD4,71271629
2174,IOX2,"IOX-2, IOX 2, AK176060",Other,EGLN1,54685215
2175,CHIR-99021,"CT 99021, CHIR99021, CHIR 99021",WNT signaling,"GSK3A, GSK3B",9956119
2177,SGC0946,-,Chromatin histone methylation,DOT1L,56962337


In [6]:
# Sanity check: show any retained drugs that have no target annotation.
missing_target_mask = filtered["targets"].isnull()
filtered.loc[missing_target_mask]

Unnamed: 0_level_0,drug_name,synonyms,pathway_name,targets,pubchem
drug_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1


In [5]:
import pubchempy as pcp
from rdkit import Chem
from rdkit.Chem import AllChem


# Morgan fingerprint settings. FP_N_BITS drives both the fingerprint length
# and the number of bit_* columns, so the two can never drift apart.
FP_RADIUS = 2
FP_N_BITS = 256

drug_bits = []
for drug_id, pubchem_id in filtered["pubchem"].items():
    # One PubChem lookup per drug; this is a network call.
    compound = pcp.Compound.from_cid(pubchem_id)
    smiles = compound.isomeric_smiles
    # MolFromSmiles returns None (instead of raising) when the SMILES cannot
    # be parsed; fail here with the offending drug rather than deep inside
    # the fingerprint call.
    molecule = Chem.MolFromSmiles(smiles) if smiles else None
    if molecule is None:
        raise ValueError(
            f"Could not build a molecule for drug_id={drug_id} "
            f"(PubChem CID {pubchem_id}, SMILES={smiles!r})"
        )
    fingerprint = AllChem.GetMorganFingerprintAsBitVect(
        molecule, radius=FP_RADIUS, nBits=FP_N_BITS
    )
    drug_bits.append(list(fingerprint))

# One row per drug (indexed by drug_id), one 0/1 column per fingerprint bit.
drug_feature = pd.DataFrame(
    drug_bits,
    columns=[f"bit_{i}" for i in range(FP_N_BITS)],
    index=filtered.index,
)
drug_feature

Unnamed: 0_level_0,bit_0,bit_1,bit_2,bit_3,bit_4,bit_5,bit_6,bit_7,bit_8,bit_9,...,bit_246,bit_247,bit_248,bit_249,bit_250,bit_251,bit_252,bit_253,bit_254,bit_255
drug_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1003,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,1,0,0,0,0,0
1004,1,0,0,0,0,0,0,0,1,1,...,0,1,0,1,0,1,0,0,0,0
1005,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1006,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,1,0,0,0,0
1007,0,1,0,0,0,0,0,0,0,0,...,0,0,1,1,1,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2173,1,0,0,0,0,0,0,0,0,0,...,1,1,0,1,0,0,0,0,0,0
2174,0,0,0,0,0,1,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
2175,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2177,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0


In [47]:
# Persist the per-drug fingerprint table; the drug_id index is written as the
# first CSV column.
drug_feature.to_csv(path_or_buf="processed_data/gdsc2_drug_bits.csv")

In [3]:
# Load the GDSC2 fitted dose-response table (one row per cell line / drug
# measurement, including LN_IC50) and display it.
gdsc2_ic50 = pd.read_excel(io="data/gdsc_drug_data/GDSC2_fitted_dose_response_27Oct23.xlsx")
gdsc2_ic50

Unnamed: 0,DATASET,NLME_RESULT_ID,NLME_CURVE_ID,COSMIC_ID,CELL_LINE_NAME,SANGER_MODEL_ID,TCGA_DESC,DRUG_ID,DRUG_NAME,PUTATIVE_TARGET,PATHWAY_NAME,COMPANY_ID,WEBRELEASE,MIN_CONC,MAX_CONC,LN_IC50,AUC,RMSE,Z_SCORE
0,GDSC2,343,15946310,683667,PFSK-1,SIDM01132,MB,1003,Camptothecin,TOP1,DNA replication,1046,Y,0.000100,0.1,-1.463887,0.930220,0.089052,0.433123
1,GDSC2,343,15946548,684052,A673,SIDM00848,UNCLASSIFIED,1003,Camptothecin,TOP1,DNA replication,1046,Y,0.000100,0.1,-4.869455,0.614970,0.111351,-1.421100
2,GDSC2,343,15946830,684057,ES5,SIDM00263,UNCLASSIFIED,1003,Camptothecin,TOP1,DNA replication,1046,Y,0.000100,0.1,-3.360586,0.791072,0.142855,-0.599569
3,GDSC2,343,15947087,684059,ES7,SIDM00269,UNCLASSIFIED,1003,Camptothecin,TOP1,DNA replication,1046,Y,0.000100,0.1,-5.044940,0.592660,0.135539,-1.516647
4,GDSC2,343,15947369,684062,EW-11,SIDM00203,UNCLASSIFIED,1003,Camptothecin,TOP1,DNA replication,1046,Y,0.000100,0.1,-3.741991,0.734047,0.128059,-0.807232
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
242031,GDSC2,343,16188242,1659928,SNU-175,SIDM00216,COREAD,2499,N-acetyl cysteine,Metabolism,Metabolism,1101,Y,2.001054,2000.0,10.127082,0.976746,0.074498,0.156872
242032,GDSC2,343,16188695,1660034,SNU-407,SIDM00214,COREAD,2499,N-acetyl cysteine,Metabolism,Metabolism,1101,Y,2.001054,2000.0,8.576377,0.913378,0.057821,-1.626959
242033,GDSC2,343,16188953,1660035,SNU-61,SIDM00194,COREAD,2499,N-acetyl cysteine,Metabolism,Metabolism,1101,Y,2.001054,2000.0,10.519636,0.975001,0.058090,0.608442
242034,GDSC2,343,16189493,1674021,SNU-C5,SIDM00498,COREAD,2499,N-acetyl cysteine,Metabolism,Metabolism,1101,Y,2.001054,2000.0,10.694579,0.969969,0.101013,0.809684


In [12]:
# Number of distinct drugs screened on each cell line, least-covered first.
# The bare groupby object on its own only displays an object repr; the
# aggregation and sort are required to produce the per-cell-line counts.
drugs_per_cell_line = (
    gdsc2_ic50.groupby("COSMIC_ID")["DRUG_ID"].nunique().sort_values()
)
drugs_per_cell_line

COSMIC_ID
971774      12
1290906     14
1297439     99
1240170     99
1659823    148
          ... 
906793     294
909776     294
905939     295
753608     295
905962     295
Name: DRUG_ID, Length: 969, dtype: int64

In [12]:
# Number of dose-response rows recorded for each drug.
rows_per_drug = gdsc2_ic50.groupby(by="DRUG_ID").size()
rows_per_drug

DRUG_ID
1003    968
1004    741
1005    760
1006    743
1007    967
       ... 
2362    731
2438    732
2439    732
2498    735
2499    735
Length: 295, dtype: int64

In [10]:
# Summary statistics for the numeric columns of the dose-response table.
ic50_summary = gdsc2_ic50.describe()
ic50_summary

Unnamed: 0,NLME_RESULT_ID,NLME_CURVE_ID,COSMIC_ID,DRUG_ID,COMPANY_ID,MIN_CONC,MAX_CONC,LN_IC50,AUC,RMSE,Z_SCORE
count,242036.0,242036.0,242036.0,242036.0,242036.0,242036.0,242036.0,242036.0,242036.0,242036.0,242036.0
mean,343.0,16068060.0,992105.9,1594.042444,1042.966604,0.023143,23.462279,2.817079,0.882592,0.082779,7.312962e-10
std,0.0,70287.49,220981.9,398.740714,16.911327,0.158738,158.62281,2.762229,0.146998,0.042695,0.9993925
min,343.0,15946310.0,683667.0,1003.0,1001.0,1e-05,0.01,-8.747724,0.006282,0.003274,-8.254501
25%,343.0,16007190.0,906805.0,1149.0,1043.0,0.003002,3.0,1.508018,0.849449,0.051107,-0.6568485
50%,343.0,16068070.0,909720.0,1631.0,1046.0,0.010005,10.0,3.236731,0.944196,0.076083,0.01058
75%,343.0,16128930.0,1240144.0,1912.0,1046.0,0.010005,10.0,4.70011,0.974934,0.106105,0.6560362
max,343.0,16189780.0,1789883.0,2499.0,1101.0,2.001054,2000.0,13.820189,0.998904,0.299984,7.978776


In [12]:
# Report how many distinct cell lines and drugs the response table covers.
summary_template = """
Unique Cell Lines: {}
Unique Drugs: {}
"""
n_cell_lines = gdsc2_ic50["COSMIC_ID"].nunique()
n_drugs = gdsc2_ic50["DRUG_ID"].nunique()
print(summary_template.format(n_cell_lines, n_drugs))


Unique Cell Lines: 969
Unique Drugs: 295



In [14]:
# Attach each drug's fingerprint bits to every (cell line, drug) response row.
# - how="inner": response rows whose DRUG_ID has no fingerprint in
#   drug_feature are dropped (this is also pandas' default, made explicit).
# - validate="many_to_one": raise if drug_feature ever contains a duplicated
#   drug id, which would otherwise silently multiply response rows.
merged = gdsc2_ic50[["COSMIC_ID", "LN_IC50", "DRUG_ID"]].merge(
    drug_feature,
    how="inner",
    left_on="DRUG_ID",
    right_index=True,
    validate="many_to_one",
)
merged.to_csv("processed_data/gdsc2_drug_ic50_feature.csv")
merged

Unnamed: 0,COSMIC_ID,LN_IC50,DRUG_ID,bit_0,bit_1,bit_2,bit_3,bit_4,bit_5,bit_6,...,bit_246,bit_247,bit_248,bit_249,bit_250,bit_251,bit_252,bit_253,bit_254,bit_255
0,683667,-1.463887,1003,0,0,0,0,0,0,0,...,0,0,1,0,1,0,0,0,0,0
1,684052,-4.869455,1003,0,0,0,0,0,0,0,...,0,0,1,0,1,0,0,0,0,0
2,684057,-3.360586,1003,0,0,0,0,0,0,0,...,0,0,1,0,1,0,0,0,0,0
3,684059,-5.044940,1003,0,0,0,0,0,0,0,...,0,0,1,0,1,0,0,0,0,0
4,684062,-3.741991,1003,0,0,0,0,0,0,0,...,0,0,1,0,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
236904,1659928,5.409627,2359,0,1,0,0,1,0,1,...,0,0,1,0,0,1,0,0,0,1
236905,1660034,5.035265,2359,0,1,0,0,1,0,1,...,0,0,1,0,0,1,0,0,0,1
236906,1660035,6.119660,2359,0,1,0,0,1,0,1,...,0,0,1,0,0,1,0,0,0,1
236907,1674021,6.135335,2359,0,1,0,0,1,0,1,...,0,0,1,0,0,1,0,0,0,1
