# Drug data

In [1]:
import pandas as pd
import numpy as np

import rdkit
from rdkit import Chem
from rdkit.Chem import AllChem

In [2]:
# Relative path of the folder that the raw input CSVs are read from.
data_dir = '../../2023-2/DCC/'

In [3]:
# The drug table is stored as <data_dir><dataset>.csv.
dataset = 'DrugMAP_approved_smallmolecule_drug'
drugs_df = pd.read_csv(f'{data_dir}{dataset}.csv')

In [4]:
# Row count first, then a preview of the leading rows.
print(drugs_df.shape[0])
drugs_df.head(n=5)

1963


Unnamed: 0,DrugMAPID,DrugName,DrugMAPSMILES,rdkit_canonical_smiles
0,DMMHNU2,(S)-(+)-Dimethindene maleate,C[C@H](C1=CC=CC=N1)C2=C(CC3=CC=CC=C32)CCN(C)C....,CC(C1=C(CCN(C)C)Cc2ccccc21)c1ccccn1.O=C(O)C=CC...
1,DMIAHVU,2-deoxyglucose,C(C=O)[C@H]([C@@H]([C@@H](CO)O)O)O,O=CCC(O)C(O)C(O)CO
2,DMVZO01,99mTc-sestamibi,CC(C)(C[N+]#[C-])OC.CC(C)(C[N+]#[C-])OC.CC(C)(...,[C-]#[N+]CC(C)(C)OC.[C-]#[N+]CC(C)(C)OC.[C-]#[...
3,DMMN36E,Abacavir,C1CC1NC2=C3C(=NC(=N2)N)N(C=N3)[C@@H]4C[C@@H](C...,Nc1nc(NC2CC2)c2ncn(C3C=CC(CO)C3)c2n1
4,DM2RX0I,Abametapir,CC1=CN=C(C=C1)C2=NC=C(C=C2)C,Cc1ccc(-c2ccc(C)cn2)nc1


In [5]:
# Keep only the drugs whose canonical SMILES RDKit can parse.
# Every retained drug is a positive example (label 1).
smis, labels, drug_ids = [], [], []
failed = 0

for drug_id, smi in zip(drugs_df['DrugMAPID'], drugs_df['rdkit_canonical_smiles']):
    mol = Chem.MolFromSmiles(smi)
    if not mol:
        # Unparseable entry: echo it and count it, but do not keep it.
        print(smi)
        failed += 1
        continue
    smis.append(smi)
    labels.append(1)
    drug_ids.append(drug_id)

failed



0

In [7]:
# Sanity check: kept SMILES, kept labels, parse failures, input rows.
(len(smis), len(labels), failed, drugs_df.shape[0])

(1963, 1963, 0, 1963)

# Chemical Fingerprint

In [8]:
def rdkit_fingerprint(smi, radius=2, nbits=1024):
    """Return the Morgan bit-vector fingerprint of a SMILES string as a list.

    Args:
        smi: SMILES string of the molecule to fingerprint.
        radius: Morgan radius handed to RDKit (default 2).
        nbits: Number of bits in the fingerprint (default 1024).

    Returns:
        The fingerprint bits as a Python list (one entry per bit).

    Raises:
        ValueError: If RDKit cannot parse ``smi`` into a molecule.
    """
    mol = Chem.MolFromSmiles(smi)
    if mol is None:
        # MolFromSmiles signals a parse failure by returning None; fail here with
        # the offending SMILES instead of an opaque RDKit argument error below.
        raise ValueError(f'Could not parse SMILES: {smi!r}')
    # NOTE(review): recent RDKit releases deprecate this call in favour of
    # rdFingerprintGenerator.GetMorganGenerator - confirm against the installed version.
    fp = AllChem.GetMorganFingerprintAsBitVect(mol, radius, nBits=nbits)
    return fp.ToList()

In [9]:
# Fingerprint every retained SMILES and stack the bit lists into one array (one row per molecule).
fps = np.array([rdkit_fingerprint(s) for s in smis])
fps.shape



(1963, 1024)

In [10]:
# Drug feature table: fingerprint bits as columns, plus the label, indexed by Drug_ID.
drug_fp_df = (
    pd.DataFrame(fps)
    .assign(label=labels, Drug_ID=drug_ids)
    .set_index('Drug_ID')
)
drug_fp_df

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,1015,1016,1017,1018,1019,1020,1021,1022,1023,label
Drug_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
DMMHNU2,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
DMIAHVU,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
DMVZO01,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
DMMN36E,0,0,0,0,0,0,0,0,1,1,...,0,0,0,0,1,0,0,0,0,1
DM2RX0I,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
DM0DTF7,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
DMPI6Z0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,1
DMRMCXW,0,1,0,0,1,0,1,1,0,0,...,0,0,0,0,1,0,0,0,0,1
DMF3VXA,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,1


# Compound data

In [11]:
# The compound table is stored as <data_dir><dataset>.csv.
dataset = 'ZINC_compound'
compound_df = pd.read_csv(f'{data_dir}{dataset}.csv')

In [12]:
# Row count first, then a preview of the leading rows.
print(compound_df.shape[0])
compound_df.head(n=5)

1963


Unnamed: 0,ZINCID,ZINCSMILES,rdkit_canonical_smiles
0,1797335877,O=C(N[C@@H]1C[C@@H]2CC[C@H](C1)N2C(=O)c1ccc2cc...,O=C(NC1CC2CCC(C1)N2C(=O)c1ccc2cc[nH]c2c1)c1cc(...
1,2185232905,[N-]=[N+]=NCC1CCN(C(=O)C[C@@H]2C[C@@H]2C(F)(F)...,[N-]=[N+]=NCC1CCN(C(=O)CC2CC2C(F)(F)F)CC1
2,797719646,CCN(CC)c1ccc(CNC(=O)N(C2CCC2)[C@H](C)C(=O)O)cc1F,CCN(CC)c1ccc(CNC(=O)N(C2CCC2)C(C)C(=O)O)cc1F
3,734652015,CCN1CCN(C(=O)c2ccc(NC(=O)NCc3ccc(N(C)C)cc3C)cc...,CCN1CCN(C(=O)c2ccc(NC(=O)NCc3ccc(N(C)C)cc3C)cc...
4,1946955248,C[C@@H](NCc1ccc(O)c(O)c1)c1ccon1,CC(NCc1ccc(O)c(O)c1)c1ccon1


In [13]:
# Keep only the ZINC compounds whose canonical SMILES RDKit can parse.
# Every retained compound is a negative example (label 0).
# The ZINC IDs are collected under the same `drug_ids` name used for the drug set.
smis, labels, drug_ids = [], [], []
failed = 0

for drug_id, smi in zip(compound_df['ZINCID'], compound_df['rdkit_canonical_smiles']):
    mol = Chem.MolFromSmiles(smi)
    if not mol:
        # Unparseable entry: echo it and count it, but do not keep it.
        print(smi)
        failed += 1
        continue
    smis.append(smi)
    labels.append(0)
    drug_ids.append(drug_id)

failed

0

In [15]:
# Sanity check: kept SMILES, kept labels, parse failures, input rows.
(len(smis), len(labels), failed, compound_df.shape[0])

(1963, 1963, 0, 1963)

In [16]:
# Fingerprint every retained SMILES and stack the bit lists into one array (one row per molecule).
fps = np.array([rdkit_fingerprint(s) for s in smis])
fps.shape

(1963, 1024)

In [17]:
# Compound feature table: fingerprint bits as columns, plus the label, indexed by Drug_ID
# (the index values here are the ZINC IDs gathered in `drug_ids`).
comp_fp_df = (
    pd.DataFrame(fps)
    .assign(label=labels, Drug_ID=drug_ids)
    .set_index('Drug_ID')
)
comp_fp_df

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,1015,1016,1017,1018,1019,1020,1021,1022,1023,label
Drug_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1797335877,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,1,0,0,0,0,0
2185232905,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
797719646,0,1,0,0,1,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
734652015,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1946955248,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1338589002,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
756709061,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
88737312,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2042440049,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [19]:
# Stack the drug rows (label 1) on top of the compound rows (label 0) into one table.
whole_fp_df = pd.concat([drug_fp_df, comp_fp_df], axis=0)

In [34]:
from sklearn.model_selection import train_test_split

# Two-stage stratified split: hold out 10% as the test set, then take 1/9 of the
# remainder as the validation set. Both stages use the same fixed seed.
train_valid, test = train_test_split(
    whole_fp_df,
    test_size=0.1,
    stratify=whole_fp_df['label'],
    random_state=42,
)
train, valid = train_test_split(
    train_valid,
    test_size=1/9,
    stratify=train_valid['label'],
    random_state=42,
)


In [36]:
# Check the stratification: the mean of `label` should be about the same in every split.
tuple(split['label'].mean() for split in (train, valid, test))

(0.5, 0.5012722646310432, 0.49872773536895676)

In [40]:
# Number of rows in the train, validation and test splits.
tuple(map(len, (train, valid, test)))

(3140, 393, 393)

In [41]:
# Save the training split; the Drug_ID index is written as the first CSV column.
train.to_csv('../../2023-2/processed_data/ECFP/DCC_train_ECFP_R2B1024.csv', index=True)

In [42]:
# Save the validation split; the Drug_ID index is written as the first CSV column.
valid.to_csv('../../2023-2/processed_data/ECFP/DCC_valid_ECFP_R2B1024.csv', index=True)

In [43]:
# Save the test split; the Drug_ID index is written as the first CSV column.
test.to_csv('../../2023-2/processed_data/ECFP/DCC_test_ECFP_R2B1024.csv', index=True)