# Transform IUPAC names into numerical data 

In [1]:
# Upload the json file with iupac names 
import pandas as pd
import json

with open('dataset_with_iupacs.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

In [2]:
# Convert json file to a data frame 
# df_pubChemComputed_target_0 = pd.DataFrame.from_dict(pd.json_normalize(data), orient='columns')
df_cids = pd.DataFrame.from_dict(pd.json_normalize(data))

# Display the data frame
print('Shape of the data frame: ', df_cids.shape)
df_cids.head()

Shape of the data frame:  (63796, 38)


Unnamed: 0,cid,cmpdname,cmpdsynonym,mw,mf,polararea,complexity,heavycnt,hbonddonor,hbondacc,...,gpfamilycnt,annothits,annothitcnt,aids,cidcdate,sidsrcname,depcatg,xlogp,annotation,meshheadings
0,17139068,"(9,10-Dihydro-9,10-ethanoanthracen-11-ylmethyl...","[6275-73-6, (9,10-dihydro-9,10-ethanoanthracen...",271.8,C17H18ClN,26.0,289.0,19,2,1,...,0,"[Biological Test Results, Chemical and Physica...",6,"[330, 1490, 1511, 1527, 1529, 1530, 1531, 1554...",2007-11-13,"[AA BLOCKS, Activate Scientific, AKos Consulti...","[Chemical Vendors, Curation Efforts, Governmen...",,,
1,16296802,"1-[(1,3-Dioxaindan-5-yl)carbamoyl]ethyl 6-chlo...","[877043-14-6, MLS002162625, CHEMBL1456787, HMS...",348.74,C16H13ClN2O5,86.8,477.0,24,1,6,...,0,"[Biological Test Results, Chemical and Physica...",5,"[1987, 1996, 2016, 2023, 2025, 2029, 2052, 205...",2007-07-30,"[AKos Consulting & Solutions, Ambinter, Aurora...","[Chemical Vendors, Curation Efforts, Governmen...",2.8,,
2,711442,N-[3-cyano-4-(2-furyl)-6-isobutyl-2-pyridinyl]...,"[MLS000061170, N-[3-cyano-4-(2-furyl)-6-isobut...",283.32,C16H17N3O2,78.9,415.0,21,1,4,...,0,"[Biological Test Results, Classification, Lite...",5,"[360, 361, 368, 373, 374, 411, 422, 425, 429, ...",2005-07-08,"[ABI Chem, AKos Consulting & Solutions, ASINEX...","[Chemical Vendors, Curation Efforts, Governmen...",3.0,,
3,4969037,"2-(N-(3,4-dichlorophenyl)sulfonyl-4-methoxyani...","[MLS002161524, CHEMBL1725134, HMS3023D12, AKOS...",507.4,C19H20Cl2N2O6S2,127.0,834.0,31,1,7,...,0,"[Biological Test Results, Classification, Lite...",4,"[2101, 2380, 2517, 2520, 2521, 2524, 2540, 254...",2005-09-17,"[ABI Chem, AKos Consulting & Solutions, Aurora...","[Chemical Vendors, Curation Efforts, Governmen...",2.8,,
4,2453689,[2-oxo-2-(1H-pyrrol-2-yl)ethyl] benzoate,"[[2-oxo-2-(1H-pyrrol-2-yl)ethyl] benzoate, 572...",229.23,C13H11NO3,59.2,284.0,17,1,3,...,0,"[Biological Test Results, Chemical and Physica...",6,"[485, 631, 731, 757, 758, 759, 760, 761, 764, ...",2005-07-15,"[ABI Chem, AKos Consulting & Solutions, Aurora...","[Chemical Vendors, Curation Efforts, Governmen...",2.3,,


In [3]:
# In order to avoid the warning 'A value is trying to be set on a copy of a slice from a DataFrame'
data = data.copy()

# Select the necessary features 
df_cids = df_cids[['cid', 'iupacname']]
df_cids.shape

(63796, 2)

In [4]:
df_cids = df_cids.drop_duplicates(subset='cid', keep=False)
df_cids.shape

(63796, 2)

In [5]:
df_cids = df_cids[df_cids['iupacname'].notna()]
# df = df.dropna()
df_cids['iupacname'] = df_cids['iupacname'].astype('string') 
df_cids.dtypes

cid                  object
iupacname    string[python]
dtype: object

In [6]:
# Rename column 'cid'
df_cids = df_cids.rename(index=str, columns={'cid':'CID', 
                                             'iupacname':'UPAC'})
df_cids.head()

Unnamed: 0,CID,UPAC
0,17139068,"15-tetracyclo[6.6.2.02,7.09,14]hexadeca-2,4,6,..."
1,16296802,"[1-(1,3-benzodioxol-5-ylamino)-1-oxopropan-2-y..."
2,711442,N-[3-cyano-4-(furan-2-yl)-6-(2-methylpropyl)py...
3,4969037,"2-(N-(3,4-dichlorophenyl)sulfonyl-4-methoxyani..."
4,2453689,[2-oxo-2-(1H-pyrrol-2-yl)ethyl] benzoate


In [7]:
# Obtain listo fo UPAC formulas provided in PubChem
UPAC_list = df_cids['UPAC'].to_list()

for item in UPAC_list[:5]:  # https://theprogrammingexpert.com/python-print-first-10-items-in-list/
    print(item)

15-tetracyclo[6.6.2.02,7.09,14]hexadeca-2,4,6,9,11,13-hexaenylmethanamine;hydrochloride
[1-(1,3-benzodioxol-5-ylamino)-1-oxopropan-2-yl] 6-chloropyridine-3-carboxylate
N-[3-cyano-4-(furan-2-yl)-6-(2-methylpropyl)pyridin-2-yl]acetamide
2-(N-(3,4-dichlorophenyl)sulfonyl-4-methoxyanilino)-N-(1,1-dioxothiolan-3-yl)acetamide
[2-oxo-2-(1H-pyrrol-2-yl)ethyl] benzoate


In [8]:
from string import digits

# Remove elements from UPAC formulas
dataset = []
for i in UPAC_list:
    # Replace simbols with a coma
    new_list = i.translate(str.maketrans({'[': ' , ', ']': ' , ', 
                                         '(': ' , ', ')' : ' , ',
                                         '-': ' , ', ';' : ' , '}))
    
    new_str = new_list.translate(str.maketrans('', '', digits))
    dataset.append(new_str)
dataset

for item in dataset[:5]:  # https://theprogrammingexpert.com/python-print-first-10-items-in-list/
    print(item)

 , tetracyclo , ...,., , hexadeca , ,,,,, , hexaenylmethanamine , hydrochloride
 ,  ,  , , , benzodioxol ,  , ylamino ,  ,  , oxopropan ,  , yl ,   , chloropyridine ,  , carboxylate
N ,  ,  , cyano ,  ,  , furan ,  , yl ,  ,  ,  ,  , methylpropyl , pyridin ,  , yl , acetamide
 ,  , N ,  , , , dichlorophenyl , sulfonyl ,  , methoxyanilino ,  , N ,  , , , dioxothiolan ,  , yl , acetamide
 ,  , oxo ,  ,  , H , pyrrol ,  , yl , ethyl ,  benzoate


In [9]:
# Convert the strings into list of strings
new_strings = []
for i in dataset:
    parsed_string = i.split(",")
    new_strings.append(parsed_string)
new_strings

for item in new_strings[:5]:  # https://theprogrammingexpert.com/python-print-first-10-items-in-list/
    print(item)

[' ', ' tetracyclo ', ' ...', '.', ' ', ' hexadeca ', ' ', '', '', '', '', ' ', ' hexaenylmethanamine ', ' hydrochloride']
[' ', '  ', '  ', ' ', ' ', ' benzodioxol ', '  ', ' ylamino ', '  ', '  ', ' oxopropan ', '  ', ' yl ', '   ', ' chloropyridine ', '  ', ' carboxylate']
['N ', '  ', '  ', ' cyano ', '  ', '  ', ' furan ', '  ', ' yl ', '  ', '  ', '  ', '  ', ' methylpropyl ', ' pyridin ', '  ', ' yl ', ' acetamide']
[' ', '  ', ' N ', '  ', ' ', ' ', ' dichlorophenyl ', ' sulfonyl ', '  ', ' methoxyanilino ', '  ', ' N ', '  ', ' ', ' ', ' dioxothiolan ', '  ', ' yl ', ' acetamide']
[' ', '  ', ' oxo ', '  ', '  ', ' H ', ' pyrrol ', '  ', ' yl ', ' ethyl ', '  benzoate']


In [10]:
from functools import reduce
# Convert list of lists in one list 
new_list = reduce(lambda a, b: a + b, new_strings)

for item in new_list[:5]:  # https://theprogrammingexpert.com/python-print-first-10-items-in-list/
    print(item)

 
 tetracyclo 
 ...
.
 


In [11]:
# # Remove empty strings
# new_list = list(filter(None, new_list))
# new_list

In [12]:
new_list = [s.replace(' ', '') for s in new_list]
for item in new_list[:5]:  # https://theprogrammingexpert.com/python-print-first-10-items-in-list/
    print(item)


tetracyclo
...
.



In [13]:
# Print the muber of left strings after reduction
print (len (new_list))

1006648


In [14]:
# remove duplicates
from collections import OrderedDict

new_list = list(OrderedDict.fromkeys(new_list)) 

for item in new_list[:5]:  # https://theprogrammingexpert.com/python-print-first-10-items-in-list/
    print(item)


tetracyclo
...
.
hexadeca


In [15]:
# Remove one letter strings
new_list_columns = [x for x in new_list if len(x) >= 4]

for item in new_list_columns[:5]:   # https://theprogrammingexpert.com/python-print-first-10-items-in-list/
    print(item)

tetracyclo
hexadeca
hexaenylmethanamine
hydrochloride
benzodioxol


In [16]:
print (len (new_list_columns))

4926


In [17]:
new_list_columns.sort()
new_list_columns

['acetaldehyde',
 'acetamide',
 'acetamido',
 'acetamidoacetyl',
 'acetamidoanilino',
 'acetamidobenzenesulfonate',
 'acetamidobenzoate',
 'acetamidobenzoicacid',
 'acetamidobenzoyl',
 'acetamidobutanedioicacid',
 'acetamidocarbamothioyl',
 'acetamidoethyl',
 'acetamidomethyl',
 'acetamidophenoxy',
 'acetamidophenyl',
 'acetamidopiperidin',
 'acetamidopropanoate',
 'acetamidopyridin',
 'acetamidopyridine',
 'acetate',
 'aceticacid',
 'acetohydrazide',
 'acetonitrile',
 'acetyl',
 'acetylanilino',
 'acetylbenzimidazol',
 'acetylcarbamothioylamino',
 'acetylhydrazinyl',
 'acetylhydrazinylidene',
 'acetylindol',
 'acetyloxy',
 'acetyloxybenzoyl',
 'acetyloxyethyl',
 'acetyloxymethyl',
 'acetyloxyphenyl',
 'acetyloxyprop',
 'acetylphenoxy',
 'acetylphenyl',
 'acetylpiperazin',
 'acetylpiperazine',
 'acetylpiperidin',
 'acetylpyridin',
 'acetylsulfamoyl',
 'acetylthiophen',
 'acridin',
 'acridine',
 'adamantan',
 'adamantane',
 'adamantyl',
 'adamantylamino',
 'adamantylcarbamothioyl',
 'ad

In [18]:
import numpy as np

dict_func_groups = dict.fromkeys(new_list_columns, 0)
dict_func_groups

{'acetaldehyde': 0,
 'acetamide': 0,
 'acetamido': 0,
 'acetamidoacetyl': 0,
 'acetamidoanilino': 0,
 'acetamidobenzenesulfonate': 0,
 'acetamidobenzoate': 0,
 'acetamidobenzoicacid': 0,
 'acetamidobenzoyl': 0,
 'acetamidobutanedioicacid': 0,
 'acetamidocarbamothioyl': 0,
 'acetamidoethyl': 0,
 'acetamidomethyl': 0,
 'acetamidophenoxy': 0,
 'acetamidophenyl': 0,
 'acetamidopiperidin': 0,
 'acetamidopropanoate': 0,
 'acetamidopyridin': 0,
 'acetamidopyridine': 0,
 'acetate': 0,
 'aceticacid': 0,
 'acetohydrazide': 0,
 'acetonitrile': 0,
 'acetyl': 0,
 'acetylanilino': 0,
 'acetylbenzimidazol': 0,
 'acetylcarbamothioylamino': 0,
 'acetylhydrazinyl': 0,
 'acetylhydrazinylidene': 0,
 'acetylindol': 0,
 'acetyloxy': 0,
 'acetyloxybenzoyl': 0,
 'acetyloxyethyl': 0,
 'acetyloxymethyl': 0,
 'acetyloxyphenyl': 0,
 'acetyloxyprop': 0,
 'acetylphenoxy': 0,
 'acetylphenyl': 0,
 'acetylpiperazin': 0,
 'acetylpiperazine': 0,
 'acetylpiperidin': 0,
 'acetylpyridin': 0,
 'acetylsulfamoyl': 0,
 'acetyl

In [19]:
# # check if a key is in dict
# 'acetamide' in dict_func_groups.keys()

In [20]:
# https://stackoverflow.com/questions/28056171/how-to-build-and-fill-pandas-dataframe-from-for-loop
import copy
import re
dataset = []
data = []
# new_strings = []

list_formulas = df_cids['UPAC'].to_list()
for formula in list_formulas:
    new_list = re.findall(r'\b\w{4,}\b', formula)
    for group in new_list:
        # working
        new_dict = copy.deepcopy(dict_func_groups)
        for ii in new_list:
            if ii in new_dict.keys():
                new_dict[ii] +=1
                    
    id = formula
    # Combine all new features in an array
    # arr =[id, new_dict.values()]
    dataset.append(new_dict.values())

In [21]:
df_id = pd.DataFrame(list_formulas)

df_id  = df_id .rename(index=str, columns={0:'UPAC'})

# Create an "index" column with the corresponding index values
df_id['index'] = df_id.index

# Convert the index values into strings
df_id['index'] = df_id['index'].astype(str)

# Display df_similarity
print('Shape of df_id: ', df_id.shape)
df_id.head() 

Shape of df_id:  (63788, 2)


Unnamed: 0,UPAC,index
0,"15-tetracyclo[6.6.2.02,7.09,14]hexadeca-2,4,6,...",0
1,"[1-(1,3-benzodioxol-5-ylamino)-1-oxopropan-2-y...",1
2,N-[3-cyano-4-(furan-2-yl)-6-(2-methylpropyl)py...,2
3,"2-(N-(3,4-dichlorophenyl)sulfonyl-4-methoxyani...",3
4,[2-oxo-2-(1H-pyrrol-2-yl)ethyl] benzoate,4


In [22]:
df_func_groups = pd.DataFrame.from_dict(dataset,
                                        orient='columns',
                                        dtype=None, columns=None) 


df_func_groups.columns = new_list_columns
df_func_groups.head()

Unnamed: 0,acetaldehyde,acetamide,acetamido,acetamidoacetyl,acetamidoanilino,acetamidobenzenesulfonate,acetamidobenzoate,acetamidobenzoicacid,acetamidobenzoyl,acetamidobutanedioicacid,...,ynylpyridine,ynylpyrido,ynylquinolin,ynylspiro,ynylsulfanyl,ynylsulfanylpyridine,ynylsulfanylpyrido,ynylsulfanyltetrazole,ynylthiophene,zinc
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [23]:
# Create an "index" column with the corresponding index values
df_func_groups['index'] = df_func_groups.index

# Convert the index values into strings
df_func_groups['index'] = df_func_groups['index'].astype(str)

# Display df_similarity
print('Shape of df_func_groups: ', df_func_groups.shape)
df_func_groups.head() 

Shape of df_func_groups:  (63788, 4927)


Unnamed: 0,acetaldehyde,acetamide,acetamido,acetamidoacetyl,acetamidoanilino,acetamidobenzenesulfonate,acetamidobenzoate,acetamidobenzoicacid,acetamidobenzoyl,acetamidobutanedioicacid,...,ynylpyrido,ynylquinolin,ynylspiro,ynylsulfanyl,ynylsulfanylpyridine,ynylsulfanylpyrido,ynylsulfanyltetrazole,ynylthiophene,zinc,index
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,2
3,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,3
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,4


In [24]:
# Add the new data frame df_similarity to the already existing df based on "index"
df = pd.merge(df_id,df_func_groups, on=['index'])

# Display df_similarity
print('Shape of df: ', df.shape)
df.head()

Shape of df:  (63788, 4928)


Unnamed: 0,UPAC,index,acetaldehyde,acetamide,acetamido,acetamidoacetyl,acetamidoanilino,acetamidobenzenesulfonate,acetamidobenzoate,acetamidobenzoicacid,...,ynylpyridine,ynylpyrido,ynylquinolin,ynylspiro,ynylsulfanyl,ynylsulfanylpyridine,ynylsulfanylpyrido,ynylsulfanyltetrazole,ynylthiophene,zinc
0,"15-tetracyclo[6.6.2.02,7.09,14]hexadeca-2,4,6,...",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,"[1-(1,3-benzodioxol-5-ylamino)-1-oxopropan-2-y...",1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,N-[3-cyano-4-(furan-2-yl)-6-(2-methylpropyl)py...,2,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,"2-(N-(3,4-dichlorophenyl)sulfonyl-4-methoxyani...",3,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,[2-oxo-2-(1H-pyrrol-2-yl)ethyl] benzoate,4,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [25]:
df.drop('index', axis=1, inplace=True)
df.head()

Unnamed: 0,UPAC,acetaldehyde,acetamide,acetamido,acetamidoacetyl,acetamidoanilino,acetamidobenzenesulfonate,acetamidobenzoate,acetamidobenzoicacid,acetamidobenzoyl,...,ynylpyridine,ynylpyrido,ynylquinolin,ynylspiro,ynylsulfanyl,ynylsulfanylpyridine,ynylsulfanylpyrido,ynylsulfanyltetrazole,ynylthiophene,zinc
0,"15-tetracyclo[6.6.2.02,7.09,14]hexadeca-2,4,6,...",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,"[1-(1,3-benzodioxol-5-ylamino)-1-oxopropan-2-y...",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,N-[3-cyano-4-(furan-2-yl)-6-(2-methylpropyl)py...,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,"2-(N-(3,4-dichlorophenyl)sulfonyl-4-methoxyani...",0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,[2-oxo-2-(1H-pyrrol-2-yl)ethyl] benzoate,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [26]:
# Merge the data frame with the targets 
df = pd.merge(df, df_cids, on='UPAC')

# Display the data frame
print('Shape of df: ', df.shape)
df.head()

Shape of df:  (63792, 4928)


Unnamed: 0,UPAC,acetaldehyde,acetamide,acetamido,acetamidoacetyl,acetamidoanilino,acetamidobenzenesulfonate,acetamidobenzoate,acetamidobenzoicacid,acetamidobenzoyl,...,ynylpyrido,ynylquinolin,ynylspiro,ynylsulfanyl,ynylsulfanylpyridine,ynylsulfanylpyrido,ynylsulfanyltetrazole,ynylthiophene,zinc,CID
0,"15-tetracyclo[6.6.2.02,7.09,14]hexadeca-2,4,6,...",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,17139068
1,"[1-(1,3-benzodioxol-5-ylamino)-1-oxopropan-2-y...",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,16296802
2,N-[3-cyano-4-(furan-2-yl)-6-(2-methylpropyl)py...,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,711442
3,"2-(N-(3,4-dichlorophenyl)sulfonyl-4-methoxyani...",0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,4969037
4,[2-oxo-2-(1H-pyrrol-2-yl)ethyl] benzoate,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,2453689


In [28]:
df_targets = pd.read_csv('CIDs_targets_G9a.csv', index_col=[0])
df_targets.head()

Unnamed: 0,CID,SID,target
0,17139068,49676863,0
1,16296802,57267697,0
2,711442,4265447,1
3,4969037,57256093,1
4,2453689,24834248,0


In [29]:
print("Shape: ", df_targets.shape)

Shape:  (63796, 3)


In [30]:
df_targets.info()

<class 'pandas.core.frame.DataFrame'>
Index: 63796 entries, 0 to 63795
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype
---  ------  --------------  -----
 0   CID     63796 non-null  int64
 1   SID     63796 non-null  int64
 2   target  63796 non-null  int64
dtypes: int64(3)
memory usage: 1.9 MB


In [31]:
# Converting object data type to numeric data type
df['CID'] = pd.to_numeric(df['CID'])
df['CID'].dtypes

dtype('int64')

In [32]:
df_IUPAC_targets = pd.merge( df_targets, df, on='CID')

In [33]:
df_IUPAC_targets.head(2)

Unnamed: 0,CID,SID,target,UPAC,acetaldehyde,acetamide,acetamido,acetamidoacetyl,acetamidoanilino,acetamidobenzenesulfonate,...,ynylpyridine,ynylpyrido,ynylquinolin,ynylspiro,ynylsulfanyl,ynylsulfanylpyridine,ynylsulfanylpyrido,ynylsulfanyltetrazole,ynylthiophene,zinc
0,17139068,49676863,0,"15-tetracyclo[6.6.2.02,7.09,14]hexadeca-2,4,6,...",0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,16296802,57267697,0,"[1-(1,3-benzodioxol-5-ylamino)-1-oxopropan-2-y...",0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [34]:
# Drop the CID and UPAC columns by name
df_IUPAC_targets = df_IUPAC_targets.drop(columns=['CID', 'SID', 'UPAC']) 

In [35]:
df_IUPAC_targets.shape

(63792, 4927)

In [36]:
df_IUPAC_targets.to_csv('data_IUPACs.csv')