# Transform IUPAC names into numerical data 

In [1]:
# Upload the json file with iupac names 
import pandas as pd
import json

with open('dataset_with_iupacs_II.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

In [2]:
# Convert json file to a data frame 
# df_pubChemComputed_target_0 = pd.DataFrame.from_dict(pd.json_normalize(data), orient='columns')
df_cids = pd.DataFrame.from_dict(pd.json_normalize(data))

# Display the data frame
print('Shape of the data frame: ', df_cids.shape)
df_cids.head()

Shape of the data frame:  (101876, 38)


Unnamed: 0,cid,cmpdname,cmpdsynonym,mw,mf,polararea,complexity,xlogp,heavycnt,hbonddonor,...,gpidcnt,gpfamilycnt,annothits,annothitcnt,aids,cidcdate,sidsrcname,depcatg,meshheadings,annotation
0,1294294,N-[5-benzoyl-2-(4-morpholinyl)phenyl]-2-furamide,[N-[5-benzoyl-2-(4-morpholinyl)phenyl]-2-furam...,376.4,C22H20N2O4,71.8,543.0,3.2,28,1,...,0,0,"[Biological Test Results, Classification, Lite...",5,"[360, 361, 368, 373, 374, 375, 411, 422, 425, ...",2005-07-10,"[ABI Chem, Chem-Space.com Database, ChEMBL, Ch...","[Chemical Vendors, Curation Efforts, Governmen...",,
1,807804,N-(5-chloropyridin-2-yl)-2-(4-fluorophenyl)ace...,[N-(5-chloropyridin-2-yl)-2-(4-fluorophenyl)ac...,264.68,C13H10ClFN2O,42.0,282.0,2.7,18,1,...,1,1,"[Biological Test Results, Chemical and Physica...",6,"[1490, 1511, 1527, 1529, 1530, 1531, 1554, 155...",2005-07-08,"[ABI Chem, AKos Consulting & Solutions, ASINEX...","[Chemical Vendors, Curation Efforts, Governmen...",,
2,671673,2-[(1-phenyl-1H-tetrazol-5-yl)sulfanyl]acetohy...,"[MLS000712782, CHEMBL1580689, HMS2774F15, STK3...",250.28,C9H10N6OS,124.0,260.0,0.7,17,2,...,0,0,"[Biological Test Results, Chemical and Physica...",6,"[485, 781, 782, 798, 800, 873, 880, 940, 1021,...",2005-07-07,"[ABI Chem, AKos Consulting & Solutions, Ambint...","[Chemical Vendors, Curation Efforts, Governmen...",,
3,16017316,N'-{2-[4-(2-Fluorophenyl)piperazin-1-YL]-2-(py...,[N'-{2-[4-(2-FLUOROPHENYL)PIPERAZIN-1-YL]-2-(P...,489.6,C28H32FN5O2,77.6,719.0,4.1,36,2,...,0,0,"[Biological Test Results, Classification, Lite...",4,"[445, 902, 903, 904, 924, 1460, 1463, 1468, 14...",2007-04-02,"[AKos Consulting & Solutions, Aurora Fine Chem...","[Chemical Vendors, Curation Efforts, Governmen...",,
4,2161711,6-oxo-N-[2-(4-sulfamoylphenyl)ethyl]-1H-pyridi...,"[MLS001176200, HMS2904D23, AKOS007992980, SMR0...",321.35,C14H15N3O4S,127.0,596.0,-0.3,22,3,...,0,0,"[Biological Test Results, Chemical and Physica...",4,"[1490, 1511, 1527, 1529, 1530, 1531, 1554, 155...",2005-07-14,"[ABI Chem, AKos Consulting & Solutions, Ambint...","[Chemical Vendors, Governmental Organizations,...",,


In [3]:
# In order to avoid the warning 'A value is trying to be set on a copy of a slice from a DataFrame'
data = data.copy()

# Select the necessary features 
df_cids = df_cids[['cid', 'iupacname']]
df_cids.shape

(101876, 2)

In [4]:
df_cids = df_cids.drop_duplicates(subset='cid', keep=False)
df_cids.shape

(101876, 2)

In [5]:
df_cids = df_cids[df_cids['iupacname'].notna()]
# df = df.dropna()
df_cids['iupacname'] = df_cids['iupacname'].astype('string') 
df_cids.dtypes

cid                  object
iupacname    string[python]
dtype: object

In [6]:
# Rename column 'cid'
df_cids = df_cids.rename(index=str, columns={'cid':'CID', 
                                             'iupacname':'UPAC'})
df_cids.head()

Unnamed: 0,CID,UPAC
0,1294294,N-(5-benzoyl-2-morpholin-4-ylphenyl)furan-2-ca...
1,807804,N-(5-chloropyridin-2-yl)-2-(4-fluorophenyl)ace...
2,671673,2-(1-phenyltetrazol-5-yl)sulfanylacetohydrazide
3,16017316,N-[2-[4-(2-fluorophenyl)piperazin-1-yl]-2-pyri...
4,2161711,6-oxo-N-[2-(4-sulfamoylphenyl)ethyl]-1H-pyridi...


In [7]:
# Obtain listo fo UPAC formulas provided in PubChem
UPAC_list = df_cids['UPAC'].to_list()

for item in UPAC_list[:5]:  # https://theprogrammingexpert.com/python-print-first-10-items-in-list/
    print(item)

N-(5-benzoyl-2-morpholin-4-ylphenyl)furan-2-carboxamide
N-(5-chloropyridin-2-yl)-2-(4-fluorophenyl)acetamide
2-(1-phenyltetrazol-5-yl)sulfanylacetohydrazide
N-[2-[4-(2-fluorophenyl)piperazin-1-yl]-2-pyridin-3-ylethyl]-N'-(2,4,6-trimethylphenyl)oxamide
6-oxo-N-[2-(4-sulfamoylphenyl)ethyl]-1H-pyridine-3-carboxamide


In [8]:
from string import digits

# Remove elements from UPAC formulas
dataset = []
for i in UPAC_list:
    # Replace simbols with a coma
    new_list = i.translate(str.maketrans({'[': ' , ', ']': ' , ', 
                                         '(': ' , ', ')' : ' , ',
                                         '-': ' , ', ';' : ' , '}))
    
    new_str = new_list.translate(str.maketrans('', '', digits))
    dataset.append(new_str)
dataset

for item in dataset[:5]:  # https://theprogrammingexpert.com/python-print-first-10-items-in-list/
    print(item)

N ,  ,  , benzoyl ,  , morpholin ,  , ylphenyl , furan ,  , carboxamide
N ,  ,  , chloropyridin ,  , yl ,  ,  ,  ,  , fluorophenyl , acetamide
 ,  ,  , phenyltetrazol ,  , yl , sulfanylacetohydrazide
N ,  ,  ,  ,  ,  ,  , fluorophenyl , piperazin ,  , yl ,  ,  , pyridin ,  , ylethyl ,  , N' ,  , ,, , trimethylphenyl , oxamide
 , oxo , N ,  ,  ,  ,  , sulfamoylphenyl , ethyl ,  , H , pyridine ,  , carboxamide


In [9]:
# Convert the strings into list of strings
new_strings = []
for i in dataset:
    parsed_string = i.split(",")
    new_strings.append(parsed_string)
new_strings

for item in new_strings[:5]:  # https://theprogrammingexpert.com/python-print-first-10-items-in-list/
    print(item)

['N ', '  ', '  ', ' benzoyl ', '  ', ' morpholin ', '  ', ' ylphenyl ', ' furan ', '  ', ' carboxamide']
['N ', '  ', '  ', ' chloropyridin ', '  ', ' yl ', '  ', '  ', '  ', '  ', ' fluorophenyl ', ' acetamide']
[' ', '  ', '  ', ' phenyltetrazol ', '  ', ' yl ', ' sulfanylacetohydrazide']
['N ', '  ', '  ', '  ', '  ', '  ', '  ', ' fluorophenyl ', ' piperazin ', '  ', ' yl ', '  ', '  ', ' pyridin ', '  ', ' ylethyl ', '  ', " N' ", '  ', ' ', '', ' ', ' trimethylphenyl ', ' oxamide']
[' ', ' oxo ', ' N ', '  ', '  ', '  ', '  ', ' sulfamoylphenyl ', ' ethyl ', '  ', ' H ', ' pyridine ', '  ', ' carboxamide']


In [10]:
from functools import reduce
# Convert list of lists in one list 
new_list = reduce(lambda a, b: a + b, new_strings)

for item in new_list[:5]:  # https://theprogrammingexpert.com/python-print-first-10-items-in-list/
    print(item)

N 
  
  
 benzoyl 
  


In [11]:
# # Remove empty strings
# new_list = list(filter(None, new_list))
# new_list

In [12]:
new_list = [s.replace(' ', '') for s in new_list]
for item in new_list[:5]:  # https://theprogrammingexpert.com/python-print-first-10-items-in-list/
    print(item)

N


benzoyl



In [13]:
# Print the muber of left strings after reduction
print (len (new_list))

1764167


In [14]:
# remove duplicates
from collections import OrderedDict

new_list = list(OrderedDict.fromkeys(new_list)) 

for item in new_list[:5]:  # https://theprogrammingexpert.com/python-print-first-10-items-in-list/
    print(item)

N

benzoyl
morpholin
ylphenyl


In [15]:
# Remove one letter strings
new_list_columns = [x for x in new_list if len(x) >= 4]

for item in new_list_columns[:5]:   # https://theprogrammingexpert.com/python-print-first-10-items-in-list/
    print(item)

benzoyl
morpholin
ylphenyl
furan
carboxamide


In [16]:
print (len (new_list_columns))

5962


In [17]:
new_list_columns.sort()
new_list_columns

['acenaphthylene',
 'acenaphthyleno',
 'acetaldehyde',
 'acetamide',
 'acetamido',
 'acetamidoacetyl',
 'acetamidoadamantane',
 'acetamidoanilino',
 'acetamidobenzenesulfonate',
 'acetamidobenzoate',
 'acetamidobenzoyl',
 'acetamidobutanedioicacid',
 'acetamidocarbamothioyl',
 'acetamidoethyl',
 'acetamidomethyl',
 'acetamidophenoxy',
 'acetamidophenyl',
 'acetamidopiperidin',
 'acetamidopropanoate',
 'acetamidopyridin',
 'acetate',
 'aceticacid',
 'acetohydrazide',
 'acetonitrile',
 'acetyl',
 'acetylanilino',
 'acetylbenzimidazol',
 'acetylcarbamothioylamino',
 'acetylchromen',
 'acetylhydrazinyl',
 'acetylhydrazinylidene',
 'acetylindol',
 'acetylnaphthalen',
 'acetyloxy',
 'acetyloxybenzoyl',
 'acetyloxyethyl',
 'acetyloxymethoxy',
 'acetyloxymethyl',
 'acetyloxyoct',
 'acetyloxyphenyl',
 'acetyloxyprop',
 'acetyloxypropylamino',
 'acetyloxyquinolin',
 'acetylphenanthren',
 'acetylphenoxy',
 'acetylphenyl',
 'acetylpiperazin',
 'acetylpiperazine',
 'acetylpiperidin',
 'acetylpyridi

In [18]:
import numpy as np

dict_func_groups = dict.fromkeys(new_list_columns, 0)
dict_func_groups

{'acenaphthylene': 0,
 'acenaphthyleno': 0,
 'acetaldehyde': 0,
 'acetamide': 0,
 'acetamido': 0,
 'acetamidoacetyl': 0,
 'acetamidoadamantane': 0,
 'acetamidoanilino': 0,
 'acetamidobenzenesulfonate': 0,
 'acetamidobenzoate': 0,
 'acetamidobenzoyl': 0,
 'acetamidobutanedioicacid': 0,
 'acetamidocarbamothioyl': 0,
 'acetamidoethyl': 0,
 'acetamidomethyl': 0,
 'acetamidophenoxy': 0,
 'acetamidophenyl': 0,
 'acetamidopiperidin': 0,
 'acetamidopropanoate': 0,
 'acetamidopyridin': 0,
 'acetate': 0,
 'aceticacid': 0,
 'acetohydrazide': 0,
 'acetonitrile': 0,
 'acetyl': 0,
 'acetylanilino': 0,
 'acetylbenzimidazol': 0,
 'acetylcarbamothioylamino': 0,
 'acetylchromen': 0,
 'acetylhydrazinyl': 0,
 'acetylhydrazinylidene': 0,
 'acetylindol': 0,
 'acetylnaphthalen': 0,
 'acetyloxy': 0,
 'acetyloxybenzoyl': 0,
 'acetyloxyethyl': 0,
 'acetyloxymethoxy': 0,
 'acetyloxymethyl': 0,
 'acetyloxyoct': 0,
 'acetyloxyphenyl': 0,
 'acetyloxyprop': 0,
 'acetyloxypropylamino': 0,
 'acetyloxyquinolin': 0,
 'a

In [19]:
# # check if a key is in dict
# 'acetamide' in dict_func_groups.keys()

In [20]:
# https://stackoverflow.com/questions/28056171/how-to-build-and-fill-pandas-dataframe-from-for-loop
import copy
import re
dataset = []
data = []
# new_strings = []

list_formulas = df_cids['UPAC'].to_list()
for formula in list_formulas:
    new_list = re.findall(r'\b\w{4,}\b', formula)
    for group in new_list:
        # working
        new_dict = copy.deepcopy(dict_func_groups)
        for ii in new_list:
            if ii in new_dict.keys():
                new_dict[ii] +=1
                    
    id = formula
    # Combine all new features in an array
    # arr =[id, new_dict.values()]
    dataset.append(new_dict.values())

In [21]:
df_id = pd.DataFrame(list_formulas)

df_id  = df_id .rename(index=str, columns={0:'UPAC'})

# Create an "index" column with the corresponding index values
df_id['index'] = df_id.index

# Convert the index values into strings
df_id['index'] = df_id['index'].astype(str)

# Display df_similarity
print('Shape of df_id: ', df_id.shape)
df_id.head() 

Shape of df_id:  (101858, 2)


Unnamed: 0,UPAC,index
0,N-(5-benzoyl-2-morpholin-4-ylphenyl)furan-2-ca...,0
1,N-(5-chloropyridin-2-yl)-2-(4-fluorophenyl)ace...,1
2,2-(1-phenyltetrazol-5-yl)sulfanylacetohydrazide,2
3,N-[2-[4-(2-fluorophenyl)piperazin-1-yl]-2-pyri...,3
4,6-oxo-N-[2-(4-sulfamoylphenyl)ethyl]-1H-pyridi...,4


In [22]:
df_func_groups = pd.DataFrame.from_dict(dataset,
                                        orient='columns',
                                        dtype=None, columns=None) 


df_func_groups.columns = new_list_columns
df_func_groups.head()

Unnamed: 0,acenaphthylene,acenaphthyleno,acetaldehyde,acetamide,acetamido,acetamidoacetyl,acetamidoadamantane,acetamidoanilino,acetamidobenzenesulfonate,acetamidobenzoate,...,ynylquinolin,ynylspiro,ynylsulfanyl,ynylsulfanylpyridine,ynylsulfanylpyrido,ynylsulfanylpyrimidin,ynylsulfanyltetrazole,ynylthieno,yohimban,zinc
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [23]:
# Create an "index" column with the corresponding index values
df_func_groups['index'] = df_func_groups.index

# Convert the index values into strings
df_func_groups['index'] = df_func_groups['index'].astype(str)

# Display df_similarity
print('Shape of df_func_groups: ', df_func_groups.shape)
df_func_groups.head() 

Shape of df_func_groups:  (101858, 5963)


Unnamed: 0,acenaphthylene,acenaphthyleno,acetaldehyde,acetamide,acetamido,acetamidoacetyl,acetamidoadamantane,acetamidoanilino,acetamidobenzenesulfonate,acetamidobenzoate,...,ynylspiro,ynylsulfanyl,ynylsulfanylpyridine,ynylsulfanylpyrido,ynylsulfanylpyrimidin,ynylsulfanyltetrazole,ynylthieno,yohimban,zinc,index
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,2
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,3
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,4


In [24]:
# Add the new data frame df_similarity to the already existing df based on "index"
df = pd.merge(df_id,df_func_groups, on=['index'])

# Display df_similarity
print('Shape of df: ', df.shape)
df.head()

Shape of df:  (101858, 5964)


Unnamed: 0,UPAC,index,acenaphthylene,acenaphthyleno,acetaldehyde,acetamide,acetamido,acetamidoacetyl,acetamidoadamantane,acetamidoanilino,...,ynylquinolin,ynylspiro,ynylsulfanyl,ynylsulfanylpyridine,ynylsulfanylpyrido,ynylsulfanylpyrimidin,ynylsulfanyltetrazole,ynylthieno,yohimban,zinc
0,N-(5-benzoyl-2-morpholin-4-ylphenyl)furan-2-ca...,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,N-(5-chloropyridin-2-yl)-2-(4-fluorophenyl)ace...,1,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,2-(1-phenyltetrazol-5-yl)sulfanylacetohydrazide,2,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,N-[2-[4-(2-fluorophenyl)piperazin-1-yl]-2-pyri...,3,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,6-oxo-N-[2-(4-sulfamoylphenyl)ethyl]-1H-pyridi...,4,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [25]:
df.drop('index', axis=1, inplace=True)
df.head()

Unnamed: 0,UPAC,acenaphthylene,acenaphthyleno,acetaldehyde,acetamide,acetamido,acetamidoacetyl,acetamidoadamantane,acetamidoanilino,acetamidobenzenesulfonate,...,ynylquinolin,ynylspiro,ynylsulfanyl,ynylsulfanylpyridine,ynylsulfanylpyrido,ynylsulfanylpyrimidin,ynylsulfanyltetrazole,ynylthieno,yohimban,zinc
0,N-(5-benzoyl-2-morpholin-4-ylphenyl)furan-2-ca...,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,N-(5-chloropyridin-2-yl)-2-(4-fluorophenyl)ace...,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,2-(1-phenyltetrazol-5-yl)sulfanylacetohydrazide,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,N-[2-[4-(2-fluorophenyl)piperazin-1-yl]-2-pyri...,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,6-oxo-N-[2-(4-sulfamoylphenyl)ethyl]-1H-pyridi...,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [26]:
# Merge the data frame with the targets 
df = pd.merge(df, df_cids, on='UPAC')

# Display the data frame
print('Shape of df: ', df.shape)
df.head()

Shape of df:  (101860, 5964)


Unnamed: 0,UPAC,acenaphthylene,acenaphthyleno,acetaldehyde,acetamide,acetamido,acetamidoacetyl,acetamidoadamantane,acetamidoanilino,acetamidobenzenesulfonate,...,ynylspiro,ynylsulfanyl,ynylsulfanylpyridine,ynylsulfanylpyrido,ynylsulfanylpyrimidin,ynylsulfanyltetrazole,ynylthieno,yohimban,zinc,CID
0,N-(5-benzoyl-2-morpholin-4-ylphenyl)furan-2-ca...,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1294294
1,N-(5-chloropyridin-2-yl)-2-(4-fluorophenyl)ace...,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,807804
2,2-(1-phenyltetrazol-5-yl)sulfanylacetohydrazide,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,671673
3,N-[2-[4-(2-fluorophenyl)piperazin-1-yl]-2-pyri...,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,16017316
4,6-oxo-N-[2-(4-sulfamoylphenyl)ethyl]-1H-pyridi...,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,2161711


In [27]:
df_targets = pd.read_csv('CIDs_targets_TDP1_II.csv', index_col=[0])
df_targets.head()

Unnamed: 0,CID,SID,target
0,1294294,4256765,1
1,807804,47201366,0
2,671673,24808112,0
3,16017316,24395212,1
4,2161711,49680337,0


In [28]:
print("Shape: ", df_targets.shape)

Shape:  (101876, 3)


In [29]:
df_targets.info()

<class 'pandas.core.frame.DataFrame'>
Index: 101876 entries, 0 to 101875
Data columns (total 3 columns):
 #   Column  Non-Null Count   Dtype
---  ------  --------------   -----
 0   CID     101876 non-null  int64
 1   SID     101876 non-null  int64
 2   target  101876 non-null  int64
dtypes: int64(3)
memory usage: 3.1 MB


In [30]:
# Converting object data type to numeric data type
df['CID'] = pd.to_numeric(df['CID'])
df['CID'].dtypes

dtype('int64')

In [31]:
df_IUPAC_targets = pd.merge( df_targets, df, on='CID')

In [32]:
df_IUPAC_targets.head(2)

Unnamed: 0,CID,SID,target,UPAC,acenaphthylene,acenaphthyleno,acetaldehyde,acetamide,acetamido,acetamidoacetyl,...,ynylquinolin,ynylspiro,ynylsulfanyl,ynylsulfanylpyridine,ynylsulfanylpyrido,ynylsulfanylpyrimidin,ynylsulfanyltetrazole,ynylthieno,yohimban,zinc
0,1294294,4256765,1,N-(5-benzoyl-2-morpholin-4-ylphenyl)furan-2-ca...,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,807804,47201366,0,N-(5-chloropyridin-2-yl)-2-(4-fluorophenyl)ace...,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0


In [33]:
# Drop the CID and UPAC columns by name
df_IUPAC_targets = df_IUPAC_targets.drop(columns=['CID', 'SID', 'UPAC']) 

In [34]:
df_IUPAC_targets.shape

(101860, 5963)

In [35]:
df_IUPAC_targets.to_csv('data_IUPACs_II.csv')