# Transform IUPAC names into numerical data 

In [1]:
# Upload the json file with iupac names 
import pandas as pd
import json

with open('dataset_with_iupacs.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

In [2]:
# Convert json file to a data frame 
# df_pubChemComputed_target_0 = pd.DataFrame.from_dict(pd.json_normalize(data), orient='columns')
df_cids = pd.DataFrame.from_dict(pd.json_normalize(data))

# Display the data frame
print('Shape of the data frame: ', df_cids.shape)
df_cids.head()

Shape of the data frame:  (45772, 38)


Unnamed: 0,cid,cmpdname,cmpdsynonym,mw,mf,polararea,complexity,xlogp,heavycnt,hbonddonor,...,gpidcnt,gpfamilycnt,annothits,annothitcnt,aids,cidcdate,sidsrcname,depcatg,annotation,meshheadings
0,5342432,(5E)-5-[(4-hydroxy-3-methoxy-5-nitrophenyl)met...,"[BIM-0029819.P001, SR-01000218658, SR-01000218...",399.4,C18H13N3O6S,157.0,700.0,3.3,28,2,...,0,0,"[Biological Test Results, Biological Test Resu...",2,"[372, 425, 521, 583, 1010, 1240, 1992, 1259310...",2005-07-15,"[ABI Chem, Aurora Fine Chemicals LLC, Burnham ...","[Chemical Vendors, Governmental Organizations,...",,
1,1531854,Dimethyl 4-(2-ethoxyphenyl)-1-(4-fluorobenzyl)...,"[STK404123, AKOS001573258, SR-01000587726, SR-...",425.4,C24H24FNO5,65.099,664.0,3.9,31,0,...,0,0,"[Biological Test Results, Classification]",2,"[1259310, 1259374, 1259422, 1508602]",2005-07-11,"[ABI Chem, AKos Consulting & Solutions, Chem-S...","[Chemical Vendors, Legacy Depositors, NIH Init...",,
2,780553,6-chloro-2-oxo-2H-chromene-3-carboxamide,"[6-chloro-2-oxo-2H-chromene-3-carboxamide, BAS...",223.61,C10H6ClNO3,69.4,340.0,2.2,15,1,...,2,2,"[Biological Test Results, Classification, Pate...",3,"[568, 583, 598, 602, 618, 619, 620, 629, 630, ...",2005-07-08,"[ABI Chem, AHH Chemical co.,ltd, AKos Consulti...","[Chemical Vendors, Curation Efforts, Governmen...",,
3,734064,Benzyl-p-acetoxybenzoate,"[benzyl-p-acetoxybenzoate, SCHEMBL9155739, AKO...",270.28,C16H14O4,52.6,325.0,3.7,20,0,...,8,6,"[Biological Test Results, Literature, Patents,...",4,"[1949, 1259310, 1259374, 1259422, 1508602]",2005-07-08,"[ABI Chem, AKos Consulting & Solutions, Ambint...","[Chemical Vendors, Curation Efforts, Governmen...",,
4,5052640,"2,3-Diphenyl-6-(trifluoromethyl)quinoxaline","[2,3-diphenyl-6-(trifluoromethyl)quinoxaline, ...",350.3,C21H13F3N2,25.8,455.0,5.4,26,0,...,0,0,"[Biological Test Results, Classification, Pate...",3,"[492967, 1224903, 1224905, 1259310, 1259374, 1...",2005-09-18,"[ABI Chem, AKos Consulting & Solutions, Bradne...","[Chemical Vendors, Governmental Organizations,...",,


In [3]:
# In order to avoid the warning 'A value is trying to be set on a copy of a slice from a DataFrame'
data = data.copy()

# Select the necessary features 
df_cids = df_cids[['cid', 'iupacname']]
df_cids.shape

(45772, 2)

In [4]:
df_cids = df_cids.drop_duplicates(subset='cid', keep=False)
df_cids.shape

(45772, 2)

In [5]:
df_cids = df_cids[df_cids['iupacname'].notna()]
# df = df.dropna()
df_cids['iupacname'] = df_cids['iupacname'].astype('string') 
df_cids.dtypes

cid                  object
iupacname    string[python]
dtype: object

In [6]:
# Rename column 'cid'
df_cids = df_cids.rename(index=str, columns={'cid':'CID', 
                                             'iupacname':'UPAC'})
df_cids.head()

Unnamed: 0,CID,UPAC
0,5342432,(5E)-5-[(4-hydroxy-3-methoxy-5-nitrophenyl)met...
1,1531854,dimethyl 4-(2-ethoxyphenyl)-1-[(4-fluorophenyl...
2,780553,6-chloro-2-oxochromene-3-carboxamide
3,734064,benzyl 4-acetyloxybenzoate
4,5052640,"2,3-diphenyl-6-(trifluoromethyl)quinoxaline"


In [7]:
# Obtain listo fo UPAC formulas provided in PubChem
UPAC_list = df_cids['UPAC'].to_list()

for item in UPAC_list[:5]:  # https://theprogrammingexpert.com/python-print-first-10-items-in-list/
    print(item)

(5E)-5-[(4-hydroxy-3-methoxy-5-nitrophenyl)methylidene]-1-phenyl-2-sulfanylidene-1,3-diazinane-4,6-dione
dimethyl 4-(2-ethoxyphenyl)-1-[(4-fluorophenyl)methyl]-4H-pyridine-3,5-dicarboxylate
6-chloro-2-oxochromene-3-carboxamide
benzyl 4-acetyloxybenzoate
2,3-diphenyl-6-(trifluoromethyl)quinoxaline


In [8]:
from string import digits

# Remove elements from UPAC formulas
dataset = []
for i in UPAC_list:
    # Replace simbols with a coma
    new_list = i.translate(str.maketrans({'[': ' , ', ']': ' , ', 
                                         '(': ' , ', ')' : ' , ',
                                         '-': ' , ', ';' : ' , '}))
    
    new_str = new_list.translate(str.maketrans('', '', digits))
    dataset.append(new_str)
dataset

for item in dataset[:5]:  # https://theprogrammingexpert.com/python-print-first-10-items-in-list/
    print(item)

 , E ,  ,  ,  ,  ,  , hydroxy ,  , methoxy ,  , nitrophenyl , methylidene ,  ,  , phenyl ,  , sulfanylidene , , , diazinane , , , dione
dimethyl  ,  ,  , ethoxyphenyl ,  ,  ,  ,  ,  , fluorophenyl , methyl ,  , H , pyridine , , , dicarboxylate
 , chloro ,  , oxochromene ,  , carboxamide
benzyl  , acetyloxybenzoate
, , diphenyl ,  ,  , trifluoromethyl , quinoxaline


In [9]:
# Convert the strings into list of strings
new_strings = []
for i in dataset:
    parsed_string = i.split(",")
    new_strings.append(parsed_string)
new_strings

for item in new_strings[:5]:  # https://theprogrammingexpert.com/python-print-first-10-items-in-list/
    print(item)

[' ', ' E ', '  ', '  ', '  ', '  ', '  ', ' hydroxy ', '  ', ' methoxy ', '  ', ' nitrophenyl ', ' methylidene ', '  ', '  ', ' phenyl ', '  ', ' sulfanylidene ', ' ', ' ', ' diazinane ', ' ', ' ', ' dione']
['dimethyl  ', '  ', '  ', ' ethoxyphenyl ', '  ', '  ', '  ', '  ', '  ', ' fluorophenyl ', ' methyl ', '  ', ' H ', ' pyridine ', ' ', ' ', ' dicarboxylate']
[' ', ' chloro ', '  ', ' oxochromene ', '  ', ' carboxamide']
['benzyl  ', ' acetyloxybenzoate']
['', ' ', ' diphenyl ', '  ', '  ', ' trifluoromethyl ', ' quinoxaline']


In [10]:
from functools import reduce
# Convert list of lists in one list 
new_list = reduce(lambda a, b: a + b, new_strings)

for item in new_list[:5]:  # https://theprogrammingexpert.com/python-print-first-10-items-in-list/
    print(item)

 
 E 
  
  
  


In [11]:
# # Remove empty strings
# new_list = list(filter(None, new_list))
# new_list

In [12]:
new_list = [s.replace(' ', '') for s in new_list]
for item in new_list[:5]:  # https://theprogrammingexpert.com/python-print-first-10-items-in-list/
    print(item)


E





In [13]:
# Print the muber of left strings after reduction
print (len (new_list))

810404


In [14]:
# remove duplicates
from collections import OrderedDict

new_list = list(OrderedDict.fromkeys(new_list)) 

for item in new_list[:5]:  # https://theprogrammingexpert.com/python-print-first-10-items-in-list/
    print(item)


E
hydroxy
methoxy
nitrophenyl


In [15]:
# Remove one letter strings
new_list_columns = [x for x in new_list if len(x) >= 4]

for item in new_list_columns[:5]:   # https://theprogrammingexpert.com/python-print-first-10-items-in-list/
    print(item)

hydroxy
methoxy
nitrophenyl
methylidene
phenyl


In [16]:
print (len (new_list_columns))

4322


In [17]:
new_list_columns.sort()
new_list_columns

['acenaphthyleno',
 'acetaldehyde',
 'acetamide',
 'acetamido',
 'acetamidoacetyl',
 'acetamidoadamantane',
 'acetamidoanilino',
 'acetamidobenzenesulfonate',
 'acetamidobenzoate',
 'acetamidobenzoyl',
 'acetamidoethyl',
 'acetamidoethylamino',
 'acetamidomethyl',
 'acetamidophenoxy',
 'acetamidophenyl',
 'acetamidopiperidin',
 'acetamidopropanoate',
 'acetamidopropanoyl',
 'acetamidopyrazol',
 'acetamidopyridin',
 'acetamidothiophen',
 'acetate',
 'aceticacid',
 'acetohydrazide',
 'acetonitrile',
 'acetyl',
 'acetylanilino',
 'acetylbenzimidazol',
 'acetylbenzoate',
 'acetylbenzoyl',
 'acetylcarbamothioylamino',
 'acetylimino',
 'acetylindol',
 'acetylnaphthalen',
 'acetyloxy',
 'acetyloxybenzoate',
 'acetyloxybenzoyl',
 'acetyloxyethyl',
 'acetyloxyimino',
 'acetyloxymethyl',
 'acetyloxyphenyl',
 'acetyloxypropyl',
 'acetyloxypropylamino',
 'acetylphenoxy',
 'acetylphenyl',
 'acetylpiperazin',
 'acetylpiperazine',
 'acetylpiperidin',
 'acetylpyrazole',
 'acetylpyridine',
 'acetylpyrr

In [18]:
import numpy as np

dict_func_groups = dict.fromkeys(new_list_columns, 0)
dict_func_groups

{'acenaphthyleno': 0,
 'acetaldehyde': 0,
 'acetamide': 0,
 'acetamido': 0,
 'acetamidoacetyl': 0,
 'acetamidoadamantane': 0,
 'acetamidoanilino': 0,
 'acetamidobenzenesulfonate': 0,
 'acetamidobenzoate': 0,
 'acetamidobenzoyl': 0,
 'acetamidoethyl': 0,
 'acetamidoethylamino': 0,
 'acetamidomethyl': 0,
 'acetamidophenoxy': 0,
 'acetamidophenyl': 0,
 'acetamidopiperidin': 0,
 'acetamidopropanoate': 0,
 'acetamidopropanoyl': 0,
 'acetamidopyrazol': 0,
 'acetamidopyridin': 0,
 'acetamidothiophen': 0,
 'acetate': 0,
 'aceticacid': 0,
 'acetohydrazide': 0,
 'acetonitrile': 0,
 'acetyl': 0,
 'acetylanilino': 0,
 'acetylbenzimidazol': 0,
 'acetylbenzoate': 0,
 'acetylbenzoyl': 0,
 'acetylcarbamothioylamino': 0,
 'acetylimino': 0,
 'acetylindol': 0,
 'acetylnaphthalen': 0,
 'acetyloxy': 0,
 'acetyloxybenzoate': 0,
 'acetyloxybenzoyl': 0,
 'acetyloxyethyl': 0,
 'acetyloxyimino': 0,
 'acetyloxymethyl': 0,
 'acetyloxyphenyl': 0,
 'acetyloxypropyl': 0,
 'acetyloxypropylamino': 0,
 'acetylphenoxy':

In [19]:
# # check if a key is in dict
# 'acetamide' in dict_func_groups.keys()

In [20]:
# https://stackoverflow.com/questions/28056171/how-to-build-and-fill-pandas-dataframe-from-for-loop
import copy
import re
dataset = []
data = []
# new_strings = []

list_formulas = df_cids['UPAC'].to_list()
for formula in list_formulas:
    new_list = re.findall(r'\b\w{4,}\b', formula)
    for group in new_list:
        # working
        new_dict = copy.deepcopy(dict_func_groups)
        for ii in new_list:
            if ii in new_dict.keys():
                new_dict[ii] +=1
                    
    id = formula
    # Combine all new features in an array
    # arr =[id, new_dict.values()]
    dataset.append(new_dict.values())

In [21]:
df_id = pd.DataFrame(list_formulas)

df_id  = df_id .rename(index=str, columns={0:'UPAC'})

# Create an "index" column with the corresponding index values
df_id['index'] = df_id.index

# Convert the index values into strings
df_id['index'] = df_id['index'].astype(str)

# Display df_similarity
print('Shape of df_id: ', df_id.shape)
df_id.head() 

Shape of df_id:  (45764, 2)


Unnamed: 0,UPAC,index
0,(5E)-5-[(4-hydroxy-3-methoxy-5-nitrophenyl)met...,0
1,dimethyl 4-(2-ethoxyphenyl)-1-[(4-fluorophenyl...,1
2,6-chloro-2-oxochromene-3-carboxamide,2
3,benzyl 4-acetyloxybenzoate,3
4,"2,3-diphenyl-6-(trifluoromethyl)quinoxaline",4


In [22]:
df_func_groups = pd.DataFrame.from_dict(dataset,
                                        orient='columns',
                                        dtype=None, columns=None) 


df_func_groups.columns = new_list_columns
df_func_groups.head()

Unnamed: 0,acenaphthyleno,acetaldehyde,acetamide,acetamido,acetamidoacetyl,acetamidoadamantane,acetamidoanilino,acetamidobenzenesulfonate,acetamidobenzoate,acetamidobenzoyl,...,ynylimidazo,ynylindol,ynylpiperidine,ynylpropanamide,ynylpyrido,ynylpyrimido,ynylsulfanyl,ynylsulfanylbenzimidazol,ynylthieno,zinc
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [23]:
# Create an "index" column with the corresponding index values
df_func_groups['index'] = df_func_groups.index

# Convert the index values into strings
df_func_groups['index'] = df_func_groups['index'].astype(str)

# Display df_similarity
print('Shape of df_func_groups: ', df_func_groups.shape)
df_func_groups.head() 

Shape of df_func_groups:  (45764, 4323)


Unnamed: 0,acenaphthyleno,acetaldehyde,acetamide,acetamido,acetamidoacetyl,acetamidoadamantane,acetamidoanilino,acetamidobenzenesulfonate,acetamidobenzoate,acetamidobenzoyl,...,ynylindol,ynylpiperidine,ynylpropanamide,ynylpyrido,ynylpyrimido,ynylsulfanyl,ynylsulfanylbenzimidazol,ynylthieno,zinc,index
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,2
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,3
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,4


In [24]:
# Add the new data frame df_similarity to the already existing df based on "index"
df = pd.merge(df_id,df_func_groups, on=['index'])

# Display df_similarity
print('Shape of df: ', df.shape)
df.head()

Shape of df:  (45764, 4324)


Unnamed: 0,UPAC,index,acenaphthyleno,acetaldehyde,acetamide,acetamido,acetamidoacetyl,acetamidoadamantane,acetamidoanilino,acetamidobenzenesulfonate,...,ynylimidazo,ynylindol,ynylpiperidine,ynylpropanamide,ynylpyrido,ynylpyrimido,ynylsulfanyl,ynylsulfanylbenzimidazol,ynylthieno,zinc
0,(5E)-5-[(4-hydroxy-3-methoxy-5-nitrophenyl)met...,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,dimethyl 4-(2-ethoxyphenyl)-1-[(4-fluorophenyl...,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,6-chloro-2-oxochromene-3-carboxamide,2,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,benzyl 4-acetyloxybenzoate,3,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,"2,3-diphenyl-6-(trifluoromethyl)quinoxaline",4,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [25]:
df.drop('index', axis=1, inplace=True)
df.head()

Unnamed: 0,UPAC,acenaphthyleno,acetaldehyde,acetamide,acetamido,acetamidoacetyl,acetamidoadamantane,acetamidoanilino,acetamidobenzenesulfonate,acetamidobenzoate,...,ynylimidazo,ynylindol,ynylpiperidine,ynylpropanamide,ynylpyrido,ynylpyrimido,ynylsulfanyl,ynylsulfanylbenzimidazol,ynylthieno,zinc
0,(5E)-5-[(4-hydroxy-3-methoxy-5-nitrophenyl)met...,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,dimethyl 4-(2-ethoxyphenyl)-1-[(4-fluorophenyl...,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,6-chloro-2-oxochromene-3-carboxamide,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,benzyl 4-acetyloxybenzoate,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,"2,3-diphenyl-6-(trifluoromethyl)quinoxaline",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [26]:
# Merge the data frame with the targets 
df = pd.merge(df, df_cids, on='UPAC')

# Display the data frame
print('Shape of df: ', df.shape)
df.head()

Shape of df:  (45766, 4324)


Unnamed: 0,UPAC,acenaphthyleno,acetaldehyde,acetamide,acetamido,acetamidoacetyl,acetamidoadamantane,acetamidoanilino,acetamidobenzenesulfonate,acetamidobenzoate,...,ynylindol,ynylpiperidine,ynylpropanamide,ynylpyrido,ynylpyrimido,ynylsulfanyl,ynylsulfanylbenzimidazol,ynylthieno,zinc,CID
0,(5E)-5-[(4-hydroxy-3-methoxy-5-nitrophenyl)met...,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,5342432
1,dimethyl 4-(2-ethoxyphenyl)-1-[(4-fluorophenyl...,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1531854
2,6-chloro-2-oxochromene-3-carboxamide,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,780553
3,benzyl 4-acetyloxybenzoate,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,734064
4,"2,3-diphenyl-6-(trifluoromethyl)quinoxaline",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,5052640


In [27]:
df_targets = pd.read_csv('CIDs_targets_GPR151.csv', index_col=[0])
df_targets.head()

Unnamed: 0,CID,SID,target
0,5342432,333096538,0
1,1531854,333461953,0
2,780553,333246479,0
3,734064,333159273,0
4,5052640,333429003,0


In [28]:
print("Shape: ", df_targets.shape)

Shape:  (45772, 3)


In [29]:
df_targets.info()

<class 'pandas.core.frame.DataFrame'>
Index: 45772 entries, 0 to 45771
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype
---  ------  --------------  -----
 0   CID     45772 non-null  int64
 1   SID     45772 non-null  int64
 2   target  45772 non-null  int64
dtypes: int64(3)
memory usage: 1.4 MB


In [30]:
# Converting object data type to numeric data type
df['CID'] = pd.to_numeric(df['CID'])
df['CID'].dtypes

dtype('int64')

In [31]:
df_IUPAC_targets = pd.merge( df_targets, df, on='CID')

In [32]:
df_IUPAC_targets.head(2)

Unnamed: 0,CID,SID,target,UPAC,acenaphthyleno,acetaldehyde,acetamide,acetamido,acetamidoacetyl,acetamidoadamantane,...,ynylimidazo,ynylindol,ynylpiperidine,ynylpropanamide,ynylpyrido,ynylpyrimido,ynylsulfanyl,ynylsulfanylbenzimidazol,ynylthieno,zinc
0,5342432,333096538,0,(5E)-5-[(4-hydroxy-3-methoxy-5-nitrophenyl)met...,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1531854,333461953,0,dimethyl 4-(2-ethoxyphenyl)-1-[(4-fluorophenyl...,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [33]:
# Drop the CID and UPAC columns by name
df_IUPAC_targets = df_IUPAC_targets.drop(columns=['CID', 'UPAC']) 

In [34]:
df_IUPAC_targets.shape

(45766, 4324)

In [35]:
df_IUPAC_targets.to_csv('data_IUPACs.csv')