# Prototype queries: Compounds of [element], inorganic

Try two different methods:

1. SMARTS
2. SQL using two SMILES queries linked with `and not`

This is an experiment to determine what kind of query will give the most meaningful results. We are interested in:

- How many compounds are returned?
- What kinds of compounds? Do they match our idea of the definition of the group?

Therefore, for the purpose of this experiment, we only retrieve CIDs and output HTML summaries with graphics from PubChem.


## Setup

In [None]:
import os
import sys
import pandas as pd
from pandas import DataFrame

from rdkit import Chem, rdBase
from rdkit.Chem import AllChem, Draw, rdqueries, rdMolDescriptors
from rdkit.Chem.Draw import IPythonConsole

import sqlalchemy
from sqlalchemy import create_engine, Table, MetaData
from sqlalchemy.sql import select, text, and_, or_, not_

sys.path.append('../..')  # to find camelid package
from camelid.env import CamelidEnv
from camelid.hypertext import cids_to_html

In [None]:
# Output file management: later cells write under env.results_path.
env = CamelidEnv('test')

# Database handle and reflected table definition.
# Note: despite its name, `conn` is the Engine returned by create_engine().
conn = create_engine('postgresql://akokai@localhost/chmdata')
meta = MetaData(bind=conn)
cpds = Table('cpds', meta, autoload=True)

# Keep a short alias for the molecule column so that the query-building
# code below stays concise.
mol = cpds.columns.molecule

## Set of elements of interest

In [None]:
# Element symbols for which "compounds of [element], inorganic" are queried.
elems_inorg = [
    'As', 'Cd', 'Pb', 'Sb', 'Ni', 'Au',
    'Be', 'Rh', 'Se', 'Sn', 'V', 'U',
]

## Store results of all queries for later analysis

In [None]:
# One independent (initially empty) dict per element; each query method
# later stores its list of CIDs under its own key.
results = {symbol: {} for symbol in elems_inorg}

# SMARTS way

## Define a SMARTS query constructor function

In [None]:
def smarts_query(elem, mol):
    """Build a SELECT of CIDs whose molecule matches a SMARTS pattern.

    Args:
        elem: The pattern string bound into the query and cast to ``qmol``.
            Despite the parameter name, callers in this notebook pass a
            complete SMARTS string rather than a bare element symbol.
        mol: Column expression placed on the left-hand side of ``@>``.

    Returns:
        A select construct over ``cpds.c.cid`` (``cpds`` is the
        notebook-level table object) with the ``@>`` condition applied.
        The query is only built here, not executed.
    """
    # Binding the value here is not strictly required -- it could be supplied
    # at execute time instead, as execute(que, s=elem) -- but doing so lets
    # the query be rendered as a complete string for documentation.
    pattern = text(':s ::qmol').bindparams(s=elem)
    return select([cpds.c.cid]).where(mol.op('@>')(pattern))

## Execute inorganics queries using SMARTS

In [None]:
for elem in elems_inorg:
    # Two dot-separated SMARTS terms: the element of interest, and an atom
    # that is neither aliphatic (C) nor aromatic (c) carbon.
    # NOTE(review): as written this appears to require the *presence* of some
    # non-carbon atom rather than the *absence* of carbon -- confirm that this
    # matches the intended definition of "inorganic".
    smarts = '[{}].[!C;!c]'.format(elem)
    que = smarts_query(smarts, mol)
    res = conn.execute(que)

    # Fetch once and count the fetched rows directly. The result's `rowcount`
    # is documented for UPDATE/DELETE; for SELECT its value depends on the
    # DBAPI driver, so it is not a reliable row count.
    rows = res.fetchall()
    cids = [row[0] for row in rows if row[0] is not None]

    # Add results to our collection...
    results[elem]['smarts'] = cids

    # Prepare HTML output: number of rows returned vs. number of non-null
    # CIDs, followed by the query text with its parameters rendered inline.
    title = '{} compounds, inorganic, SMARTS query'.format(elem)
    notes = '{0} results, {1} CIDs: '.format(len(rows), len(cids))
    notes += str(que.compile(compile_kwargs={'literal_binds': True}))
    html_file = os.path.join(env.results_path, elem + '_smarts.html')
    cids_to_html(cids, html_file, title=title, notes=notes)

# SQL way

Since this kind of query breaks the schema of "everything is a substructure search, just add SMILES/SMARTS", let's define a function to take care of the whole search operation.


## Define a function for specific kind of query

This will be a function specifically to **find all compounds containing an element, which do not also contain C.**

For simplicity, this function bundles all the I/O operations together with the query. This is not how it should actually be implemented.

In [None]:
def get_element_cpds_no_carbon(elem, mol, conn):
    """Query CIDs of compounds matching ``[elem]`` but not ``[C]``.

    Builds and executes a query of the form::

        SELECT cid FROM cpds WHERE mol @> '[elem]' AND NOT mol @> '[C]'

    and then hands the resulting CIDs, a title and a notes string (counts
    plus the rendered query) to ``cids_to_html`` together with the output
    path ``<env.results_path>/<elem>_sql.html``.

    Args:
        elem: Element symbol, e.g. ``'As'``. It is wrapped in square
            brackets to form the first pattern.
        mol: Column expression that the ``@>`` operator is applied to.
        conn: Object whose ``execute()`` method runs the query.

    Returns:
        List of the non-null first-column values (CIDs) of the result rows.

    Note:
        This prototype still depends on the notebook-level names ``cpds``
        (table selected from) and ``env`` (output location); only ``mol``
        and ``conn`` are passed in.
    """
    smiles = '[{}]'.format(elem)
    que = select([cpds.c.cid])
    que = que.where(and_(mol.op('@>')(smiles),
                         not_(mol.op('@>')('[C]'))))
    res = conn.execute(que)

    # Fetch once and count the fetched rows directly. The result's `rowcount`
    # is documented for UPDATE/DELETE; for SELECT its value depends on the
    # DBAPI driver, so it is not a reliable row count.
    rows = res.fetchall()
    cids = [row[0] for row in rows if row[0] is not None]

    title = '{} compounds, inorganic, SQL query'.format(elem)
    notes = '{0} CIDs, {1} results from:\n'.format(len(cids), len(rows))
    # Render the query with its parameter values inline, for documentation.
    notes += str(que.compile(compile_kwargs={'literal_binds': True}))
    html_file = os.path.join(env.results_path, elem + '_sql.html')
    cids_to_html(cids, html_file, title=title, notes=notes)
    return cids

## Execute SQL queries

In [None]:
# Run the element-without-carbon query for every element of interest and
# file each list of CIDs under the 'sql' key, alongside any other results
# already stored for that element.
for elem in elems_inorg:
    cids = get_element_cpds_no_carbon(elem, mol, conn)
    results[elem].update({'sql': cids})

# Dump all search results to JSON for further inspection

In [None]:
import json

# Serialise the whole results mapping (element -> query method -> CIDs)
# into a single JSON file in the results directory.
with open(os.path.join(env.results_path, 'inorganic.json'), mode='w') as fp:
    fp.write(json.dumps(results))