In [1]:
import os
import sys
sys.path.append('..')  # To locate camelid

from sqlalchemy import create_engine, types
from sqlalchemy.sql import text

from camelid.run import CamelidEnv

# Initialize database for linking chemical identifiers


## Environment and paths

**Make sure these correspond to the arrangement of your system.**

In [2]:
# We will generate output & logs, and look for other input files,
# in the directories defined by this camelid environment.
# See camelid docs for how CamelidEnv works.
env = CamelidEnv(project='cmlbase')

# File containing list of CASRNs + names to use as base library
CML_PATH = os.path.join(env.data_path, 'base_casrn.csv')

# Database connection. I use a separate database for ID wrangling operations.
# The connection URL may be overridden with the CMLBASE_DB_URL environment
# variable, so the notebook can run on another system without editing this
# cell. When the variable is unset, the original local default is used.
DB_URL = os.environ.get('CMLBASE_DB_URL',
                        'postgresql://akokai@localhost/cmlbase')
conn = create_engine(DB_URL)

2017-01-13 15:27:15,023 camelid INFO Project path: /opt/akokai/data/camelid/cmlbase


### Pre-processing of PubChem CID Synonyms

The `CID-Synonym-filtered` file has tab-separated values, but many of those values contain weird characters that SQL doesn't like. The fields are not quoted or escaped in any way. Therefore we have to do that ourselves. Run the `awk` script provided to create a new file `CID-Synonym-filtered-quoted`...

In [None]:
# !awk -f ../scripts/quotefields /PATH/TO/CID-Synonym-filtered > /PATH/TO/CID-Synonym-filtered-quoted

Now tell us where that file is:

In [3]:
# Location of the quoted PubChem synonyms file (the output of the awk step
# above). This path is substituted into the SQL COPY statement that loads
# the PubChem synonyms table later in this notebook.
# NOTE(review): COPY ... FROM 'file' is presumably read by the PostgreSQL
# server process, so the path must be readable by it -- confirm.
PUBCHEM_SYN_PATH = '/opt/akokai/data/PubChem/CID-Synonym-filtered-quoted'

## Create table of all CASRNs and names in base library

"Base library" refers to a dataset that you are trying to link with other datasets. Your own chemical database, for example.

In [None]:
# !psql cmlbase -c 'drop table cml;'

In [13]:
# Create the base-library table `cml` and bulk-load it from the CSV file at
# CML_PATH (comma-delimited, with a header row, double-quoted fields and
# backslash as the escape character).
# The COPY target must be the table created on the line before it; it
# previously pointed at `cid_syn`, which this statement does not create.
cmd = text('''
create table cml (casrn text, name text);
copy cml from '{}'
    with (format csv, header, delimiter ',',
          quote '"', escape '\\');'''.format(CML_PATH)
          )
res = conn.execute(cmd)
# File has 45417 lines including header.
print(res.rowcount, 'rows created in table cml (base library)')

45416 rows created in table cid_syn (PubChem synonyms)


Create table of CASRNs, Names, DTXSIDs from US EPA CompTox Chemistry Dashboard. ...

## Perform CASRN-CID mapping via PubChem identifier exchange

This operation is currently modeled in the notebook `CASRN-CID alignment workflow.ipynb`.
- **TODO:** Convert that notebook into a noninteractive script.
- For now, use the output of that notebook as the source for a database table.

*Note:* To convert the CSV of CASRNs/names into a text file of non-quoted CASRNs only, run this command:

`awk -F ',' '{print $1}' PATH/TO/base_casrn_name.csv | sed 's/"//g' > PATH/TO/base_casrn`

## Create table of *all* PubChem CIDs and synonyms

In [None]:
# !psql cmlbase -c 'drop table pubchem_synonyms;'

In [8]:
# Create the table of all PubChem CIDs and synonyms and bulk-load it from the
# pre-quoted, tab-separated file at PUBCHEM_SYN_PATH (header row, fields
# double-quoted, embedded quotes escaped by doubling).
# The COPY target must be the table created on the line before it; it
# previously pointed at `cid_syn`, while the table created and reported
# here is `pubchem_synonyms`.
cmd = text('''
create table pubchem_synonyms (cid text, synonym text);
copy pubchem_synonyms from '{}'
    with (format csv, header, delimiter '\t',
          quote '"', escape '"');'''.format(PUBCHEM_SYN_PATH)
          )
res = conn.execute(cmd)
# File has 142967517 lines including header.
print(res.rowcount, 'rows created in table pubchem_synonyms')

142967516 rows created in table pubchem_synonyms
