# Create the csv files containing CID, SMILES notations and labels

In [1]:
import pandas as pd 

# Source data: PubChem BioAssay AID 686978
# https://pubchem.ncbi.nlm.nih.gov/bioassay/686978
df = pd.read_csv('AID_686978_data.csv')

# Lift the column limit so wide frames are displayed without truncation
pd.options.display.max_columns = None

# Report the dimensions, then preview the first rows
print('Shape of the data frame: ', df.shape)

df.head()

Shape of the data frame:  (424003, 48)


Unnamed: 0,PUBCHEM_RESULT_TAG,PUBCHEM_SID,PUBCHEM_CID,PUBCHEM_EXT_DATASOURCE_SMILES,PUBCHEM_ACTIVITY_OUTCOME,PUBCHEM_ACTIVITY_SCORE,PUBCHEM_ACTIVITY_URL,PUBCHEM_ASSAYDATA_COMMENT,Phenotype,Potency,Efficacy,Analysis Comment,Activity_Score,Curve_Description,Fit_LogAC50,Fit_HillSlope,Fit_R2,Fit_InfiniteActivity,Fit_ZeroActivity,Fit_CurveClass,Excluded_Points,Max_Response,Activity at 0.0000295000 uM,Activity at 0.0000590000 uM,Activity at 0.0001503265 uM,Activity at 0.0002712146 uM,Activity at 0.0005895491 uM,Activity at 0.00117 uM,Activity at 0.00179 uM,Activity at 0.00299 uM,Activity at 0.00672 uM,Activity at 0.014 uM,Activity at 0.026 uM,Activity at 0.040 uM,Activity at 0.074 uM,Activity at 0.167 uM,Activity at 0.363 uM,Activity at 0.628 uM,Activity at 0.975 uM,Activity at 1.849 uM,Activity at 4.119 uM,Activity at 9.037 uM,Activity at 15.83 uM,Activity at 21.08 uM,Activity at 46.23 uM,Activity at 92.54 uM,Activity at 165.6 uM,Compound QC;;;;
0,1,109967258,50897788.0,CNCC1=NC2=C(C=C(C=C2)Cl)C(=N1)C3=CC=CN3,Active,42,http://assay.nih.gov/htsws/rest/display/dd-tdp...,,Inhibitor,14.1254,136.547,,42.0,Partial curve; high efficacy,-4.85,1.01,0.9948,-134.488,2.0597,-2.1,0 0 0 0 0,-106.311,,,,,,,,,,2.0597,,,,,0.5007,,,-17.4373,,-48.5138,,,-106.311,,,QC'd by AA Pharmaceuticals;;;
1,2,144206324,65628.0,CN1C2=C(C=C(C=C2)N(CCCl)CCCl)N=C1CCCC(=O)O,Active,41,http://assay.nih.gov/htsws/rest/display/dd-tdp...,,Inhibitor,18.8375,82.1468,,41.0,Partial curve; high efficacy,-4.725,1.9887,0.9829,-82.6097,-0.4629,-2.1,0 0 0 0 0 0 0 0,-70.9545,,,,,-3.2819,,,1.1641,,5.8554,,,-1.8326,,-4.9853,,,0.0798,,-16.3921,,,-70.9545,,,QC'd by ACC;;;
2,3,144206325,14708.0,C[C@]12CC[C@H]3[C@H]([C@@H]1CC[C@]2(C)O)CC[C@@...,Inconclusive,10,http://assay.nih.gov/htsws/rest/display/dd-tdp...,,Inhibitor,21.136,39.2903,,10.0,Single point of activity,-4.675,2.7868,0.9357,-43.4549,-4.1646,-3.0,0 0 0 0 0 0 0 0,-39.227,,,,,-4.14,,,-5.7369,,0.1681,,,-8.2838,,-6.9258,,,1.0188,,-7.8647,,,-39.227,,,QC'd by ACC;;;;
3,4,144206326,3085168.0,CC1=NC=C(C(=N1)N)CN(C=O)/C(=C(\CCO)/SS/C(=C(/C...,Inactive,0,http://assay.nih.gov/htsws/rest/display/dd-tdp...,,Inactive,,,,0.0,,,4.9549,0.6177,2.0768,-13.6113,4.0,0 0 0 0 0 0 0 1,-5.7844,,,,,-11.9232,,,-14.5799,,8.2225,,,4.9756,,4.8996,,,-10.1081,,3.3805,,,-5.7844,,,QC'd by ACC;;;;
4,5,144206327,2449.0,CC1=C(C(=O)C(=C(C1=O)C)C(CCCCCC(=O)O)C2=CC=CC=...,Inconclusive,10,http://assay.nih.gov/htsws/rest/display/dd-tdp...,,Inhibitor,26.6086,77.9682,,10.0,Single point of activity,-4.575,3.5117,0.9609,-82.1111,-4.1429,-3.0,0 0 0 0 0 0 0 0,-72.3569,,,,,-10.6429,,,-2.5661,,-11.587,,,-4.3838,,1.2149,,,1.7567,,-5.8883,,,-72.3569,,,QC'd by ACC;;;;


In [2]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 424003 entries, 0 to 424002
Data columns (total 48 columns):
 #   Column                         Non-Null Count   Dtype  
---  ------                         --------------   -----  
 0   PUBCHEM_RESULT_TAG             424003 non-null  int64  
 1   PUBCHEM_SID                    424003 non-null  int64  
 2   PUBCHEM_CID                    423975 non-null  float64
 3   PUBCHEM_EXT_DATASOURCE_SMILES  423975 non-null  object 
 4   PUBCHEM_ACTIVITY_OUTCOME       424003 non-null  object 
 5   PUBCHEM_ACTIVITY_SCORE         424003 non-null  int64  
 6   PUBCHEM_ACTIVITY_URL           424003 non-null  object 
 7   PUBCHEM_ASSAYDATA_COMMENT      0 non-null       float64
 8   Phenotype                      424003 non-null  object 
 9   Potency                        175189 non-null  float64
 10  Efficacy                       175189 non-null  float64
 11  Analysis Comment               0 non-null       float64
 12  Activity_Score                

In [3]:
# Remove rows that are exact duplicates across all columns.
# drop_duplicates() returns a new frame and leaves the original untouched,
# so the result has to be assigned back for the removal to take effect.
df = df.drop_duplicates()
df.shape  # an unchanged row count means there were no fully duplicated rows

(424003, 48)

In [4]:
# Keep only the rows whose CID occurs exactly once.
# keep=False flags every member of a repeated CID group, so none of them survives.
cid_repeated = df.duplicated(subset=['PUBCHEM_CID'], keep=False)
df = df[~cid_repeated]
df.shape

(410564, 48)

In [5]:
# Discard the rows that have no CID
df = df.dropna(subset=['PUBCHEM_CID'])
df.shape

(410564, 48)

In [6]:
# Convert the CID column from float to 64-bit integer.
# astype() with a column->dtype mapping returns a new frame; this avoids writing
# a column into a frame obtained by filtering, which can trigger pandas'
# SettingWithCopyWarning.
df = df.astype({'PUBCHEM_CID': 'int64'})
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 410564 entries, 0 to 424002
Data columns (total 48 columns):
 #   Column                         Non-Null Count   Dtype  
---  ------                         --------------   -----  
 0   PUBCHEM_RESULT_TAG             410564 non-null  int64  
 1   PUBCHEM_SID                    410564 non-null  int64  
 2   PUBCHEM_CID                    410564 non-null  int64  
 3   PUBCHEM_EXT_DATASOURCE_SMILES  410564 non-null  object 
 4   PUBCHEM_ACTIVITY_OUTCOME       410564 non-null  object 
 5   PUBCHEM_ACTIVITY_SCORE         410564 non-null  int64  
 6   PUBCHEM_ACTIVITY_URL           410564 non-null  object 
 7   PUBCHEM_ASSAYDATA_COMMENT      0 non-null       float64
 8   Phenotype                      410564 non-null  object 
 9   Potency                        168779 non-null  float64
 10  Efficacy                       168779 non-null  float64
 11  Analysis Comment               0 non-null       float64
 12  Activity_Score                 4097

In [7]:
# Narrow the frame to the compound id, the SMILES string and the assay outcome
df_target = df.loc[:, ['PUBCHEM_CID',
                       'PUBCHEM_EXT_DATASOURCE_SMILES',
                       'PUBCHEM_ACTIVITY_OUTCOME']]
df_target.head()

Unnamed: 0,PUBCHEM_CID,PUBCHEM_EXT_DATASOURCE_SMILES,PUBCHEM_ACTIVITY_OUTCOME
0,50897788,CNCC1=NC2=C(C=C(C=C2)Cl)C(=N1)C3=CC=CN3,Active
1,65628,CN1C2=C(C=C(C=C2)N(CCCl)CCCl)N=C1CCCC(=O)O,Active
2,14708,C[C@]12CC[C@H]3[C@H]([C@@H]1CC[C@]2(C)O)CC[C@@...,Inconclusive
3,3085168,CC1=NC=C(C(=N1)N)CN(C=O)/C(=C(\CCO)/SS/C(=C(/C...,Inactive
4,2449,CC1=C(C(=O)C(=C(C1=O)C)C(CCCCCC(=O)O)C2=CC=CC=...,Inconclusive


In [8]:
# Tally how many rows fall into each PUBCHEM_ACTIVITY_OUTCOME class
df_target.PUBCHEM_ACTIVITY_OUTCOME.value_counts()

PUBCHEM_ACTIVITY_OUTCOME
Inactive        236226
Inconclusive    112867
Active           61471
Name: count, dtype: int64

In [9]:
# Encode the assay outcome as an integer label:
#   1 = Active, 0 = Inactive, 2 = Inconclusive
# (three codes are produced here, not a binary label).
df_target = df_target.copy()  # independent copy, so the column can be overwritten safely

# Outcome name -> integer label
target = {'Active': 1,
          'Inactive': 0,
          'Inconclusive': 2}

# Fail loudly on an outcome that has no label; Series.map() alone would
# silently turn it into NaN.
unmapped = set(df_target['PUBCHEM_ACTIVITY_OUTCOME'].unique()) - set(target)
if unmapped:
    raise KeyError(f'Unexpected PUBCHEM_ACTIVITY_OUTCOME value(s): {unmapped}')

# Vectorised replacement of the outcome names by their integer labels
df_target['PUBCHEM_ACTIVITY_OUTCOME'] = df_target['PUBCHEM_ACTIVITY_OUTCOME'].map(target)

# Rename the PubChem columns to the short names 'CID', 'SMILES' and 'target'
df_target = df_target.rename(columns={'PUBCHEM_CID': 'CID',
                                      'PUBCHEM_EXT_DATASOURCE_SMILES': 'SMILES',
                                      'PUBCHEM_ACTIVITY_OUTCOME': 'target'})

# List the distinct labels present in the 'target' column
df_target['target'].unique()

array([1, 2, 0], dtype=int64)

In [10]:
# Subset containing only the rows labelled 1
df_label1 = df_target.loc[df_target['target'].eq(1)]

In [11]:
# Subset containing only the rows labelled 0
df_label0 = df_target.loc[df_target['target'].eq(0)]

In [12]:
# Stack the label-0 and label-1 subsets, shuffle all rows and renumber the index.
# frac=1 samples the whole frame; random_state=1 makes the shuffle reproducible.
df = (
    pd.concat([df_label0, df_label1])
    .sample(frac=1, random_state=1)
    .reset_index(drop=True)
)

# Report the dimensions, then preview the first rows
print('Shape of the data frame: ', df.shape)

df.head()

Shape of the data frame:  (297697, 3)


Unnamed: 0,CID,SMILES,target
0,7779314,C1CCCN(CC1)C(=O)C2=CC=C(C=C2)S(=O)(=O)NC3=CC=C...,0
1,973113,CC1CCN(CC1)CCNC(=S)NC2=CC=C(C=C2)OC,0
2,2351911,C1=CC(=CC=C1C(=O)NC2=CN=C3C=CC(=CN3C2=O)C(=O)O)F,0
3,16008588,CC1=CC(=C2C(=NN(C2=N1)C3=CC=CC=C3Cl)C)C(=O)NC4...,1
4,1245119,CC1=CC=C(C=C1)CN2CCN(CC2)C(=O)C3CCC3,0


In [13]:
# Save the shuffled, labelled data set; the row index is written as the first column
df.to_csv('TDP1_full_dataset_for_CID_list.csv', index=True)

[<a href="#content">Back to top</a>]

## Create csv files <a name="3"></a>

In [14]:
# Part 1: read df1.csv (first column is the index) and inner-join it onto the
# labelled compounds on CID + SMILES.
df_1 = pd.read_csv('df1.csv', index_col=[0])
df1 = df.merge(df_1, how='inner', on=['CID', 'SMILES'])
print(df1.shape)
df1.head()

(49999, 4)


Unnamed: 0,CID,SMILES,target,UPAC
0,7779314,C1CCCN(CC1)C(=O)C2=CC=C(C=C2)S(=O)(=O)NC3=CC=C...,0,4-(azepane-1-carbonyl)-N-(2-fluorophenyl)benze...
1,973113,CC1CCN(CC1)CCNC(=S)NC2=CC=C(C=C2)OC,0,1-(4-methoxyphenyl)-3-[2-(4-methylpiperidin-1-...
2,2351911,C1=CC(=CC=C1C(=O)NC2=CN=C3C=CC(=CN3C2=O)C(=O)O)F,0,"3-[(4-fluorobenzoyl)amino]-4-oxopyrido[1,2-a]p..."
3,16008588,CC1=CC(=C2C(=NN(C2=N1)C3=CC=CC=C3Cl)C)C(=O)NC4...,1,"1-(2-chlorophenyl)-N-(3,5-dimethoxyphenyl)-3,6..."
4,1245119,CC1=CC=C(C=C1)CN2CCN(CC2)C(=O)C3CCC3,0,cyclobutyl-[4-[(4-methylphenyl)methyl]piperazi...


In [15]:
# Keep only the columns needed downstream: the name and the label
df1 = df1.loc[:, ['UPAC', 'target']]
print(df1.shape)
df1.head()

(49999, 2)


Unnamed: 0,UPAC,target
0,4-(azepane-1-carbonyl)-N-(2-fluorophenyl)benze...,0
1,1-(4-methoxyphenyl)-3-[2-(4-methylpiperidin-1-...,0
2,"3-[(4-fluorobenzoyl)amino]-4-oxopyrido[1,2-a]p...",0
3,"1-(2-chlorophenyl)-N-(3,5-dimethoxyphenyl)-3,6...",1
4,cyclobutyl-[4-[(4-methylphenyl)methyl]piperazi...,0


In [16]:
# Write part 1 (UPAC, target) to disk; the row index is written as the first column
df1.to_csv('df1_targets.csv', index=True)

In [17]:
# Part 2: read df2.csv (first column is the index) and inner-join it onto the
# labelled compounds on CID + SMILES.
df_2 = pd.read_csv('df2.csv', index_col=[0])
df2 = df.merge(df_2, how='inner', on=['CID', 'SMILES'])
print(df2.shape)
df2.head()

(49997, 4)


Unnamed: 0,CID,SMILES,target,UPAC
0,16011674,CC1CN(C2=CC=CC=C2S1)C(=O)NC3=CC=C(C=C3)C(=O)NC...,1,N-[4-[2-(2-methoxyphenyl)ethylcarbamoyl]phenyl...
1,23724056,CCOC1=CC=C(C=C1)CN2CCN(CC2CCO)C3CCC3,0,2-[4-cyclobutyl-1-[(4-ethoxyphenyl)methyl]pipe...
2,1339266,CC1=CC=CC=C1N(CC(=O)NC2=CC=CC=C2C(=O)O)S(=O)(=O)C,0,2-[[2-(2-methyl-N-methylsulfonylanilino)acetyl...
3,6870031,C1=CC(=CC(=C1)O)/C=N/NC2=CC=NC(=O)N2,0,6-[(2E)-2-[(3-hydroxyphenyl)methylidene]hydraz...
4,3646028,C1=CC=C(C=C1)C2=NC3=C(C(=N2)NCC(=O)O)OC4=CC=CC...,0,"2-[(2-phenyl-[1]benzofuro[3,2-d]pyrimidin-4-yl..."


In [18]:
# Keep only the columns needed downstream: the name and the label
df2 = df2.loc[:, ['UPAC', 'target']]
print(df2.shape)
df2.head()

(49997, 2)


Unnamed: 0,UPAC,target
0,N-[4-[2-(2-methoxyphenyl)ethylcarbamoyl]phenyl...,1
1,2-[4-cyclobutyl-1-[(4-ethoxyphenyl)methyl]pipe...,0
2,2-[[2-(2-methyl-N-methylsulfonylanilino)acetyl...,0
3,6-[(2E)-2-[(3-hydroxyphenyl)methylidene]hydraz...,0
4,"2-[(2-phenyl-[1]benzofuro[3,2-d]pyrimidin-4-yl...",0


In [19]:
# Write part 2 (UPAC, target) to disk; the row index is written as the first column
df2.to_csv('df2_targets.csv', index=True)

In [20]:
# Part 3: read df3.csv (first column is the index) and inner-join it onto the
# labelled compounds on CID + SMILES.
df_3 = pd.read_csv('df3.csv', index_col=[0])
df3 = df.merge(df_3, how='inner', on=['CID', 'SMILES'])
print(df3.shape)
df3.head()

(49997, 4)


Unnamed: 0,CID,SMILES,target,UPAC
0,3547487,C1CN(CCN1CC(=O)NC2=C(C=C(C=C2)F)F)C3=CC=CC=N3,0,"N-(2,4-difluorophenyl)-2-(4-pyridin-2-ylpipera..."
1,16024523,CC1=CC=C(C=C1)C(=O)C2=CN(C3=C(C2=O)C=CC(=N3)C)...,0,N-benzyl-2-[7-methyl-3-(4-methylbenzoyl)-4-oxo...
2,645742,CC1=CC(=NC(=N1)N2CCC(CC2)C(=O)NC3=CC4=C(C=C3)O...,1,"N-(1,3-benzodioxol-5-yl)-1-(4,6-dimethylpyrimi..."
3,2373834,C1CC1C(=O)NC(=S)NNC(=O)COC2=CC=C(C=C2)C#N,1,N-[[[2-(4-cyanophenoxy)acetyl]amino]carbamothi...
4,24980920,CC(=O)C(CC1=CC=CC=C1)NC(=O)COC(=O)CCN2C(=O)C3C...,0,[2-oxo-2-[(3-oxo-1-phenylbutan-2-yl)amino]ethy...


In [21]:
# Keep only the columns needed downstream: the name and the label
df3 = df3.loc[:, ['UPAC', 'target']]
print(df3.shape)
df3.head()

(49997, 2)


Unnamed: 0,UPAC,target
0,"N-(2,4-difluorophenyl)-2-(4-pyridin-2-ylpipera...",0
1,N-benzyl-2-[7-methyl-3-(4-methylbenzoyl)-4-oxo...,0
2,"N-(1,3-benzodioxol-5-yl)-1-(4,6-dimethylpyrimi...",1
3,N-[[[2-(4-cyanophenoxy)acetyl]amino]carbamothi...,1
4,[2-oxo-2-[(3-oxo-1-phenylbutan-2-yl)amino]ethy...,0


In [22]:
# Write part 3 (UPAC, target) to disk; the row index is written as the first column
df3.to_csv('df3_targets.csv', index=True)

In [23]:
# Part 4: read df4.csv (first column is the index) and inner-join it onto the
# labelled compounds on CID + SMILES.
df_4 = pd.read_csv('df4.csv', index_col=[0])
df4 = df.merge(df_4, how='inner', on=['CID', 'SMILES'])
print(df4.shape)
df4.head()

(49995, 4)


Unnamed: 0,CID,SMILES,target,UPAC
0,3226990,CC(=O)NC(CC1=CC(=C(C=C1)OC)OC)C(=O)N2CCN(CC2)C...,0,"4-[2-acetamido-3-(3,4-dimethoxyphenyl)propanoy..."
1,2612694,C1=CC=C(C(=C1)C2=NN=C(O2)CSC3=NN=C4N3C5=CC=CC=...,0,"2-(2-bromophenyl)-5-([1,2,4]triazolo[3,4-b][1,..."
2,16239899,CCN(CC)S(=O)(=O)C1=CC(=C(C=C1)N2CCOCC2)NC(=O)C...,1,N-[5-(diethylsulfamoyl)-2-morpholin-4-ylphenyl...
3,2362189,C1CCC(=CC1)CCNC(=O)COC(=O)C2=CC=CC=C2NC(=O)C3=...,0,[2-[2-(cyclohexen-1-yl)ethylamino]-2-oxoethyl]...
4,661391,C1CC1C(=O)C2=C(C3=CC=CC=C3O2)NC(=O)CSCCC(=O)O,0,3-[2-[[2-(cyclopropanecarbonyl)-1-benzofuran-3...


In [24]:
# Keep only the columns needed downstream: the name and the label
df4 = df4.loc[:, ['UPAC', 'target']]
print(df4.shape)
df4.head()

(49995, 2)


Unnamed: 0,UPAC,target
0,"4-[2-acetamido-3-(3,4-dimethoxyphenyl)propanoy...",0
1,"2-(2-bromophenyl)-5-([1,2,4]triazolo[3,4-b][1,...",0
2,N-[5-(diethylsulfamoyl)-2-morpholin-4-ylphenyl...,1
3,[2-[2-(cyclohexen-1-yl)ethylamino]-2-oxoethyl]...,0
4,3-[2-[[2-(cyclopropanecarbonyl)-1-benzofuran-3...,0


In [25]:
# Write part 4 (UPAC, target) to disk; the row index is written as the first column
df4.to_csv('df4_targets.csv', index=True)

In [26]:
# Part 5: read df5.csv (first column is the index) and inner-join it onto the
# labelled compounds on CID + SMILES.
df_5 = pd.read_csv('df5.csv', index_col=[0])
df5 = df.merge(df_5, how='inner', on=['CID', 'SMILES'])
print(df5.shape)
df5.head()

(49997, 4)


Unnamed: 0,CID,SMILES,target,UPAC
0,2904821,CCOC(=O)C1CCCN(C1)C(=O)C2=C(ON=C2C3=CC=CC=C3Cl)C,0,"ethyl 1-[3-(2-chlorophenyl)-5-methyl-1,2-oxazo..."
1,135502031,C1=CC=C(C=C1)C2=NC3=C(C(=O)NC=N3)NC2=O,0,"7-phenyl-3,5-dihydropteridine-4,6-dione"
2,663728,CN1C2=CC=CC=C2N(C1=O)CC(=O)NCC3=CN=CC=C3,0,2-(3-methyl-2-oxobenzimidazol-1-yl)-N-(pyridin...
3,22330687,CC1=CC(=C(C=C1)OC)NC(=O)C2=C(C3=C(N=CN=C3S2)N4...,1,N-(2-methoxy-5-methylphenyl)-5-methyl-4-(4-pyr...
4,15945254,CC1=CC(=CC=C1)[N+]2=C(C=C(C3=C2NC(=O)NC3=O)C)C...,0,"5,7-dimethyl-8-(3-methylphenyl)-1H-pyrido[2,3-..."


In [27]:
# Keep only the columns needed downstream: the name and the label
df5 = df5.loc[:, ['UPAC', 'target']]
print(df5.shape)
df5.head()

(49997, 2)


Unnamed: 0,UPAC,target
0,"ethyl 1-[3-(2-chlorophenyl)-5-methyl-1,2-oxazo...",0
1,"7-phenyl-3,5-dihydropteridine-4,6-dione",0
2,2-(3-methyl-2-oxobenzimidazol-1-yl)-N-(pyridin...,0
3,N-(2-methoxy-5-methylphenyl)-5-methyl-4-(4-pyr...,1
4,"5,7-dimethyl-8-(3-methylphenyl)-1H-pyrido[2,3-...",0


In [28]:
# Write part 5 (UPAC, target) to disk; the row index is written as the first column
df5.to_csv('df5_targets.csv', index=True)

In [29]:
# Part 6: read df6.csv (first column is the index) and inner-join it onto the
# labelled compounds on CID + SMILES.
df_6 = pd.read_csv('df6.csv', index_col=[0])
df6 = df.merge(df_6, how='inner', on=['CID', 'SMILES'])
print(df6.shape)
df6.head()

(47628, 4)


Unnamed: 0,CID,SMILES,target,UPAC
0,743014,C1CN(C2=CC=CC=C21)C(=O)CCCC(=O)N3CCC4=CC=CC=C43,0,"1,5-bis(2,3-dihydroindol-1-yl)pentane-1,5-dione"
1,135545532,CC1=C(C(=O)NC(=N1)N)CC(=O)OCCCl,0,2-chloroethyl 2-(2-amino-4-methyl-6-oxo-1H-pyr...
2,25163357,C1=CC=C(C=C1)C(=O)NC(=O)COC(=O)C2=C(C=C(C=C2)O)O,0,"(2-benzamido-2-oxoethyl) 2,4-dihydroxybenzoate"
3,20904678,CCC(C)N1C(=O)C2=C(C=CS2)N3C1=NN=C3CCCC(=O)N4CC...,0,"1-[4-(8-butan-2-yl-7-oxo-5-thia-1,8,10,11-tetr..."
4,8487516,C1=CC(=CC=C1CN2C=CN=C2SCC(=O)NC3=C(C=C(C=C3)F)F)F,0,"N-(2,4-difluorophenyl)-2-[1-[(4-fluorophenyl)m..."


In [30]:
# Keep only the columns needed downstream: the name and the label
df6 = df6.loc[:, ['UPAC', 'target']]
print(df6.shape)
df6.head()

(47628, 2)


Unnamed: 0,UPAC,target
0,"1,5-bis(2,3-dihydroindol-1-yl)pentane-1,5-dione",0
1,2-chloroethyl 2-(2-amino-4-methyl-6-oxo-1H-pyr...,0
2,"(2-benzamido-2-oxoethyl) 2,4-dihydroxybenzoate",0
3,"1-[4-(8-butan-2-yl-7-oxo-5-thia-1,8,10,11-tetr...",0
4,"N-(2,4-difluorophenyl)-2-[1-[(4-fluorophenyl)m...",0


In [31]:
# Write part 6 (UPAC, target) to disk; the row index is written as the first column
df6.to_csv('df6_targets.csv', index=True)

[<a href="#content">Back to top</a>]