# Prototype queries: Compounds of [element], inorganic

Try a few different methods.

This is an experiment to determine what kind of query will give the most meaningful results. We are interested in:

- How many compounds are returned?
- What kinds of compounds? Do they match our idea of the definition of the group?

Therefore, for the purpose of this experiment, we only retrieve CIDs and output HTML summaries with graphics from PubChem.


## Setup

In [1]:
import os
import sys
import pandas as pd
from pandas import DataFrame

import rdkit
from rdkit import Chem, rdBase
from rdkit.Chem import AllChem, Draw, rdqueries, rdMolDescriptors
from rdkit.Chem.Draw import IPythonConsole

import sqlalchemy
from sqlalchemy import create_engine, Table, MetaData
from sqlalchemy.sql import select, text, and_, or_, not_

sys.path.append('../..')
from camelid.env import CamelidEnv
from camelid.cmgroup import CMGroup
from camelid.query import get_query_results, substructure_query, substruct_exclude_query
from camelid.hypertext import cids_to_html

In [2]:
env = CamelidEnv('test')  # For output file management

# Database connection & metadata
con = create_engine('postgresql://akokai@localhost/chmdata')
meta = MetaData(conn)
cpds = Table('cpds', meta, autoload=True)
# Remember molecule column, to help keep query-generating code concise:
mol = cpds.c.molecule

2017-03-17 17:20:11,659 camelid.env INFO Project path: /opt/akokai/data/camelid/test


  (attype, name))


### Set of elements of interest

In [3]:
elems_inorg = [
    'As',
    'Cd',
    'Pb',
    'Sb',
    'Ni',
    'Au',
    'Be',
    'Rh',
    'Se',
    'Sn',
    'V',
    'U',
]

### Store results...

In [4]:
cmgs = []

## SMARTS substructure [element], with SQL clause excluding [organics]

The question is **how to specify what "organic" patterns to exclude.** Try a number of different exclude patterns:

In [6]:
exclude_patterns = {
    'three_c': Chem.MolFromSmarts('[C,c].[C,c].[C,c]'),
    'two_c': Chem.MolFromSmarts('[C,c].[C,c]'),
    'ch_bonds': Chem.MolFromSmarts('[C!H0,c!H0]'),
    'carbon': Chem.MolFromSmarts('[C,c]')
}

### Execute SQL queries

In [3]:
def result_cids(df):
    cids = df['cid'].dropna()
    return cids

In [7]:
for pat in exclude_patterns.keys():

    for elem in elems_inorg:
        
        elem_smarts = '[{}]'.format(elem)
        query = substruct_exclude_query(elem_smarts, exclude_patterns[pat], mol, con)
        
        # Minimally use CMGroup to organize data

        id_ = elem + '_{}.html'.format(pat)
        sql_ = str(que.compile(compile_kwargs={'literal_binds': True}))
        params = {'cmg_id': id_,
          'structure_type': 'SMARTS',
          'name': '{0} compounds, inorganic (excluding {1})'.format(elem, pat)
         }
        cmg = CMGroup(params, env)

        # Do the query
        result = get_query_results(query)

        # Create summary of results...
        cids = result_cids(result)

        # Add results to our collection for JSON export
        results[elem][pat] = summ

        'notes' = '{0} results, {1} CIDs from: {2}'.format(len(result), summ['n_cids'])

        # Output HTML
        html_file = os.path.join(cmg.results_path, id_)
        cids_to_html(summ['cids'],
                     html_file,
                     title=params['name'],
                     notes=notes)

NameError: name 'cids' is not defined

## Single-clause SMARTS substructure

Don't know how to specify SMARTS for "contains this element and not *any* carbon *anywhere".

Instead experimenting with identifying "inorganic" forms of carbon (carbonate, CO, CN...).

In [None]:

def get_smarts_results(elem, pattern, mol, conn):
    smarts = pattern.format(elem)
    que = smarts_query(qmol, mol, [cpds])
    res = conn.execute(que)
    cids = [row[0] for row in res.fetchall() if row[0] is not None]
    ret = {'sql': str(que.compile(compile_kwargs={'literal_binds': True})),
           'rowcount': res.rowcount,
           'cids': cids,
           'n_cids': len(cids)}
    return ret



smarts_strings = {
    'inorg_c': '[{0};!$([{0}]-[C,c])].[CH0;!$(C~C[H])]'
}


for exp in smarts_strings.keys():
    for elem in elems_inorg:
        res = get_smarts_results(elem, smarts_strings[exp], mol, conn, exp)
        
        # Add results to our collection
        results[elem][exp] = res

        # Output HTML
        title = '{0} compounds, inorganic: SMARTS {1}'.format(elem, exp)
        notes = '{0} results, {1} CIDs: '.format(res['rowcount'], len(cids))
        notes += res['sql']
        html_file = os.path.join(env.results_path, elem + '_{}.html'.format(exp))
        cids_to_html(res['cids'], html_file, title=title, notes=notes)

## Dump all search results to JSON for further inspection

In [None]:
import json

with open(os.path.join(env.results_path, 'inorganic.json'), 'w') as fp:
    json.dump(results, fp)