# TDC data

In [1]:
import pandas as pd
import numpy as np

import rdkit
from rdkit import Chem
from rdkit.Chem import AllChem

In [14]:
# Folder holding the TDC CSV files. The trailing slash matters: the file name
# is appended to this string directly when the dataset is read.
data_dir = '../../2023-2/TDC/'

In [126]:
# Name of the TDC split to process; it is also reused in the output file name.
dataset = 'Solubility_AqSolDB_test'
# Use the first CSV column as the row index.
tdc_df = pd.read_csv(f'{data_dir}{dataset}.csv', index_col=0)

In [127]:
# Number of rows first, then a preview of the first few records.
print(tdc_df.shape[0])
tdc_df.head()

1996


Unnamed: 0,Drug_ID,Drug,Y
0,"2-ethyl-9,10-dihydroanthracene-9,10-dione",CCc1ccc2c(c1)C(=O)c1ccccc1C2=O,-5.984242
1,Digoxin,CC1OC(OC2C(O)CC(OC3C(O)CC(OC4CCC5(C)C(CCC6C5CC...,-4.16
2,diphenic acid,O=C(O)c1ccccc1-c1ccccc1C(=O)O,-2.2839
3,pentachlorobutadiene,ClC(Cl)=CC(Cl)=C(Cl)Cl,-4.23
4,trimethyl phosphate,COP(=O)(OC)OC,0.5526


In [128]:
# Keep only the rows whose SMILES RDKit can parse. The three lists stay
# aligned row by row; unparseable SMILES are printed and counted in `failed`.
smis, labels, drug_ids = [], [], []
failed = 0

for drug_id, smi, label in zip(tdc_df.Drug_ID, tdc_df.Drug, tdc_df.Y):
    mol = Chem.MolFromSmiles(smi)
    if mol is None:
        # Parse failure: report the offending SMILES and skip the row.
        print(smi)
        failed += 1
        continue
    smis.append(smi)
    labels.append(label)
    drug_ids.append(drug_id)

failed



0

In [129]:
# Every input row must be accounted for: it was either kept or counted as
# failed. Assert it so a Restart & Run All stops here if the bookkeeping is off,
# instead of relying on someone eyeballing the tuple below.
assert len(smis) == len(labels) == len(drug_ids), 'parallel lists are out of sync'
assert len(smis) + failed == len(tdc_df), 'kept + failed rows do not add up'
len(smis), len(labels), failed, len(tdc_df)

(1996, 1996, 0, 1996)

# Chemical Fingerprint

In [130]:
def rdkit_fingerprint(smi, radius=2, nbits=1024):
    """Return the Morgan fingerprint of a SMILES string as a list of bits.

    Parameters
    ----------
    smi : str
        SMILES string of the molecule.
    radius : int, default 2
        Radius passed to RDKit's Morgan fingerprint routine.
    nbits : int, default 1024
        Size of the bit vector requested from RDKit (``nBits``).

    Returns
    -------
    list
        The bit vector converted with ``ToList()``.

    Raises
    ------
    ValueError
        If RDKit cannot parse ``smi``.
    """
    mol = Chem.MolFromSmiles(smi)
    if mol is None:
        # MolFromSmiles reports a parse failure by returning None. Passing
        # that on to the fingerprint call fails with an opaque argument
        # error, so raise something that names the offending input instead.
        raise ValueError(f'RDKit could not parse SMILES: {smi!r}')
    fp = AllChem.GetMorganFingerprintAsBitVect(mol, radius, nBits=nbits)
    return fp.ToList()

In [131]:
# One fingerprint per parsed SMILES, stacked into a NumPy array in one step so
# `fps` is never bound to a plain list.
fps = np.array(list(map(rdkit_fingerprint, smis)))
fps.shape



(1996, 1024)

In [132]:
# Fingerprint bits become the integer-named columns, followed by `label`;
# rows are keyed by Drug_ID. Built as one chain so re-running the cell always
# starts from `fps` rather than mutating an existing frame.
fp_df = (
    pd.DataFrame(fps)
    .assign(label=labels, Drug_ID=drug_ids)
    .set_index('Drug_ID')
)
fp_df

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,1015,1016,1017,1018,1019,1020,1021,1022,1023,label
Drug_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
"2-ethyl-9,10-dihydroanthracene-9,10-dione",0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,-5.984242
Digoxin,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,-4.160000
diphenic acid,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,-2.283900
pentachlorobutadiene,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,-4.230000
trimethyl phosphate,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0.552600
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4-vinylpyridine,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,-0.557900
"Benzoic acid, 2-hydroxy-5-[[4-[[4-[[8-hydroxy-7-[[4-[(8-hydroxy-3,6-disulfo-1-naphthalenyl)azo]-2-methoxy-5-methylphenyl]azo]-3,6-disulfo-1-naphthalenyl]amino]-6-(phenylamino)-1,3,5-triazin-2-yl]amino]phenyl]azo]-, pentasodium salt",1,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,1,0,0,0,-2.483388
"(2E)-1-(2,6,6-trimethylcyclohex-3-en-1-yl)but-2-en-1-one",0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,-3.396367
1-bromo-2-ethylbenzene,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,-3.670200


In [133]:
# Persist the fingerprint table. The index is written on purpose: it carries
# the Drug_ID of each row.
fp_df.to_csv(
    f'../../2023-2/processed_data/ECFP/{dataset}_ECFP_R2B1024.csv',
    index=True,
)