In [1]:
import sys
from os.path import join as pjoin

import pandas as pd
from pandas import DataFrame
from sqlalchemy import create_engine, types
from sqlalchemy.sql import text

sys.path.append('..')
from camelid.run import CamelidEnv
from camelid.synutils import select_name

In [2]:
# Project workspace; `env` provides the data and results paths used in this notebook.
PROJECT_NAME = 'syn1701'
env = CamelidEnv(project=PROJECT_NAME)

# Tab-separated file of unresolved CASRN:CID mappings.
UNRES_IDS = pjoin(env.data_path, 'unresolved.tsv')

# NOTE: despite its name, `conn` is a SQLAlchemy Engine, not an open connection.
DB_URL = 'postgresql://akokai@localhost/cmlbase'
conn = create_engine(DB_URL)

2017-01-13 19:04:00,104 camelid INFO Project path: /opt/akokai/data/camelid/syn1701


# Retrieve & attempt to select best synonyms for unresolved CASRN:CID mappings

These are all 1:n CASRN:CID mappings where the *n* CIDs had different structural skeletons according to the InChI base layer.

In [6]:
# Load the unresolved mappings and give the three columns short, fixed names.
unres = pd.read_table(UNRES_IDS)
unres.columns = ['casrn', 'cid', 'name']

n_mappings = len(unres)
n_casrns = unres['casrn'].nunique()
print('{0} CASRN:CID mappings for {1} unique CASRNs.'.format(n_mappings, n_casrns))
unres.head(10)

1164 CASRN:CID mappings for 395 unique CASRNs.


Unnamed: 0,casrn,cid,name
0,10402-16-1,6433548,"(Z)-9-OCTADECENOIC ACID, COPPER SALT"
1,10402-16-1,22833310,"(Z)-9-OCTADECENOIC ACID, COPPER SALT"
2,104098-49-9,3083549,CADRE
3,1119-34-2,66250,L-(+)-ARGININE MONOHYDROCHLORIDE
4,117-92-0,5377942,"Quinolinium, 2-[2-[4-(dimethylamino)phenyl]eth..."
5,117-92-0,5702759,"Quinolinium, 2-[2-[4-(dimethylamino)phenyl]eth..."
6,1191-80-6,14495,mercury dioleate
7,1191-80-6,57346983,mercury dioleate
8,127-85-5,14700088,"Arsonic acid, (4-aminophenyl)-, monosodium salt"
9,127-85-5,23670523,"Arsonic acid, (4-aminophenyl)-, monosodium salt"


### Import the table of unresolved IDs to SQL database

In [7]:
# Store every column as text. The mapping is built from the columns themselves
# rather than from a hard-coded count of 3, so no column is silently left
# untyped if the table ever gains a column.
dtypes = {column: types.String for column in unres.columns}
# if_exists='replace' drops any existing 'unres' table first, so this cell can be re-run.
unres.to_sql('unres', conn, if_exists='replace', index=False, dtype=dtypes)

## Retrieve all PubChem synonyms matching unresolved CIDs

By means of a SQL query on the *very large* table of PubChem CID synonyms.

In [9]:
# Join the uploaded `unres` table against the PubChem synonyms table on CID.
cmd = text('''
select unres.casrn, unres.cid, unres.name, pubchem_synonyms.synonym
from unres, pubchem_synonyms
where unres.cid = pubchem_synonyms.cid;''')

# Run the query on an explicitly opened connection: Engine.execute() was
# deprecated in SQLAlchemy 1.4 and removed in 2.0. The `with` block also
# releases the connection as soon as the rows have been fetched.
with conn.connect() as db:
    rows = db.execute(cmd).fetchall()

syns = DataFrame(rows, columns=['casrn', 'cid', 'name', 'synonym'])
print(len(syns), 'synonyms in PubChem matching these CIDs')
syns.head(5)

31570 synonyms in PubChem matching these CIDs


Unnamed: 0,casrn,cid,name,synonym
0,12244-57-4,22318,"Butanedioic acid, mercapto-, monogold(1+) sodi...",62696-00-8
1,12244-57-4,22318,"Butanedioic acid, mercapto-, monogold(1+) sodi...",62695-99-2
2,12244-57-4,22318,"Butanedioic acid, mercapto-, monogold(1+) sodi...",554-42-7
3,12244-57-4,22318,"Butanedioic acid, mercapto-, monogold(1+) sodi...",3845-05-4
4,12244-57-4,22318,"Butanedioic acid, mercapto-, monogold(1+) sodi...",3829-87-6


## Discard non-human-readable synonyms

Keep only the PubChem synonyms that are likely to be names, as judged by `camelid.synutils.select_name()`; all other synonyms are discarded.

In [10]:
# select_name() is applied to every synonym and its result is used as a row mask
# (presumably True/False per synonym -- confirm against camelid.synutils).
is_name = syns.synonym.apply(select_name)

# `.ix` was removed in pandas 1.0; boolean-mask selection is done with `.loc`.
# The mask is kept in a local variable instead of a temporary 'isname' column,
# so no in-place drop is needed afterwards. `.copy()` makes the filtered frame
# independent, so adding columns to `syns` later does not trigger a
# SettingWithCopyWarning.
syns = syns.loc[is_name].copy()
print(len(syns), 'synonyms that might be names')
syns.head(5)

16308 synonyms that might be names


Unnamed: 0,casrn,cid,name,synonym
12,12244-57-4,22318,"Butanedioic acid, mercapto-, monogold(1+) sodi...","[(1,2-dicarboxyethyl)thio]gold disodium salt"
16,12244-57-4,22318,"Butanedioic acid, mercapto-, monogold(1+) sodi...",disodium thiomalato-S-aurate(I)
17,12244-57-4,22318,"Butanedioic acid, mercapto-, monogold(1+) sodi...",disodium thiomalato-S-gold(I)
18,12244-57-4,22318,"Butanedioic acid, mercapto-, monogold(1+) sodi...",Mercaptobutanedioic Acid Monogold(1+) Sodium Salt
20,12244-57-4,22318,"Butanedioic acid, mercapto-, monogold(1+) sodi...",sodium aurum(I) thiomalate


## Identify synonyms that appear to be consistent with the given name

One problem here is that we don't necessarily know which CID belongs to each CASRN, but we do know the name that (we think) belongs to that CASRN, and we know the synonyms associated with each CID. If we can find a match between our 'known' name and the synonyms for one CID (and perhaps not another), we might actually be making the correct CASRN:CID association as well as getting synonyms.

First, create a series that relates each unique CID to a list of its PubChem synonyms

In [11]:
# One entry per CID: the list of all synonyms currently held in `syns` for that CID.
listed = syns.groupby('cid').synonym.apply(list)
listed.head()

cid
10103751       [Copper(II) thiocyanate, Copper Dithiocyanate]
10121702                 [(Cymene)ruthenium dichloride dimer]
10176839    [acetic acid; (5R,6S)-6-amino-2,2-dimethyl-1,3...
10176840    [(5R)-2,2-Dimethyl-6beta-amino-1,3-dioxepane-5...
101932      [1,1'-DIETHYL-2,2'-CYANINE IODIDE,99%, Quinoli...
Name: synonym, dtype: object

Now we can check whether the given (CML) name is included in the PubChem synonyms for any CID. For example:

In [12]:
# Spot check: is the given name for CID 22318 among that CID's synonyms?
# CIDs are looked up as strings here.
example_cid = '22318'
example_name = 'Butanedioic acid, mercapto-, monogold(1+) sodium salt'
example_name in listed.loc[example_cid]

True

### Do this for each CID

In [13]:
def _name_in_cid_synonyms(row):
    """Return True if the row's given name exactly equals one of its CID's synonyms."""
    # Bracket access is required: `row.name` would return the row's index label,
    # not the value of the 'name' column.
    return row['name'] in listed.loc[row['cid']]


syns['name_consistent'] = syns.apply(_name_in_cid_synonyms, axis=1)
syns.head(5)

Unnamed: 0,casrn,cid,name,synonym,name_consistent
12,12244-57-4,22318,"Butanedioic acid, mercapto-, monogold(1+) sodi...","[(1,2-dicarboxyethyl)thio]gold disodium salt",True
16,12244-57-4,22318,"Butanedioic acid, mercapto-, monogold(1+) sodi...",disodium thiomalato-S-aurate(I),True
17,12244-57-4,22318,"Butanedioic acid, mercapto-, monogold(1+) sodi...",disodium thiomalato-S-gold(I),True
18,12244-57-4,22318,"Butanedioic acid, mercapto-, monogold(1+) sodi...",Mercaptobutanedioic Acid Monogold(1+) Sodium Salt,True
20,12244-57-4,22318,"Butanedioic acid, mercapto-, monogold(1+) sodi...",sodium aurum(I) thiomalate,True


In [14]:
# Number of synonym rows whose given name is / is not among their CID's synonyms.
syns['name_consistent'].value_counts()

False    12551
True      3757
Name: name_consistent, dtype: int64

In [15]:
# `syns` only contains rows that came back from the synonym query and passed the
# name filter, so this count is NOT the size of the input list: CASRNs with no
# name-like PubChem synonym are absent. The label says what is actually counted.
print(syns.casrn.nunique(), 'total CASRNs with at least one name-like synonym')
print(syns[syns.name_consistent].casrn.nunique(), 'total CASRNs with self-consistent names')

393 total CASRNs in input list
100 total CASRNs with self-consistent names


In [16]:
# Save the annotated synonym table to the project's results directory.
results_file = pjoin(env.results_path, 'unres2016_synonyms.xlsx')
syns.to_excel(results_file, index=False)