# Update Notebook

This notebook contains the routines for updating our database. 

In [1]:
from pyArango.connection import *
from rdkit import Chem as chem
from rdkit.Chem import rdChemReactions as rdr
from prebchemdb.schema import ReactionAnnotation, Reaction, Molecule, MoleculeCollection
from prebchemdb.schema import ReactantLink, ProductLink, MoleculeLinkCollection
from prebchemdb.schema import ReactionCollection
from prebchemdb.schema import ReactionAnnotationLinkCollection, ReactionAnnotationLink
import pandas as pd
import hashlib
import base64
import json


In [2]:
conn = Connection(
    username="cuevaszuviri", password="fenzym-donna0-mYhsod"
)
db = conn["pORD"]

In [3]:
db.AQLQuery(
    """
    FOR u IN mainstage
        REMOVE u IN mainstage
    """, batchSize=1000, rawResults=True
)

db.AQLQuery(
    """
    FOR u IN mainstage_links
        REMOVE u IN mainstage_links
    """, batchSize=1000, rawResults=True
)

<pyArango.query.AQLQuery at 0x7fc5fec21010>

In [4]:
def get_mol_identifier(mol: chem.rdchem.Mol):
    inchikey = chem.MolToInchiKey(mol)
    if inchikey == "":
        raise RuntimeError("couldn't generated identifier - {:s}".format(chem.MolToSmiles(mol)))
    if mol.GetNumHeavyAtoms() > 6:
        return 'mol-l-{:s}'.format(inchikey[:-2])
    else:
        return 'mol-s-{:s}'.format(inchikey)
    
def get_rxn_identifier(reactant_identifiers, product_identifiers):
    smiles_resorted = ','.join(sorted(reactant_identifiers)) + ','.join(sorted(product_identifiers))
    hasher = hashlib.sha1(smiles_resorted.encode('utf-8')).digest()[:12]
    new_key = 'rxn-' + base64.urlsafe_b64encode(hasher).decode('utf-8')
    return new_key

In [5]:
def process_molecules(molecule_list, molecules_keys, molecules, comments):
    for mol in molecule_list:
        try:
            identifier = get_mol_identifier(mol)
        except:
            comments += "umable to convert {:s} to inchikey".format(chem.MolToSmiles(mol))
            continue
        if identifier in molecule_list:
            continue
        else:
            molecules.molecules.append(
                Molecule(
                    _key=str(identifier),
                    smiles=chem.MolToSmiles(mol),
                    inchi=chem.MolToInchi(mol),
                    inchikey=chem.MolToInchiKey(mol),
                )
            )
            molecules_keys.append(identifier)

In [17]:
def exclude_by_source(reaction: ReactionAnnotation):
    print(reaction.source)
    if reaction.source in [
        "10.1016/j.icarus.2016.04.027",
        "db:kegg",
        "10.1002/2016je005078",
        "10.1051/0004-6361/201220686",
        "10.1051/0004-6361/201936697",
    ]:
        return False
    else:
        return True

In [14]:
molecules = MoleculeCollection(molecules=[])
molecules_keys = []
links = MoleculeLinkCollection(reactants=[], products=[])
reaction_annotation_links = ReactionAnnotationLinkCollection(links=[])
reactions = ReactionCollection(reactions=[])

failures = []

for annotation in filter(exclude_by_source, map(lambda x: ReactionAnnotation(**x.getStore()), db['staging'].fetchAll())):
    try:
        rxn = rdr.ReactionFromSmarts(annotation.smiles, useSmiles=True)
    except ValueError:
        failures.append(annotation)
        continue

    reactants = list(rxn.GetReactants())
    products = list(rxn.GetProducts())
    comments = ""
    process_molecules(reactants + products, molecules_keys, molecules, comments)

    reactant_identifiers = []
    reactant_links = dict()
    product_identifiers = []
    product_links = dict()

    for mol in reactants:
        try:
            identifier = get_mol_identifier(mol)
        except RuntimeError:
            continue
        reactant_identifiers.append(identifier)
        try:
            reactant_links[identifier] += 1
        except KeyError:
            reactant_links[identifier] = 1

    for mol in products:
        try:
            identifier = get_mol_identifier(mol)
        except RuntimeError:
            continue
        product_identifiers.append(identifier)
        try:
            product_links[identifier] += 1
        except KeyError:
            product_links[identifier] = 1

    reaction_key = get_rxn_identifier(reactant_identifiers, product_identifiers)

    reactions.reactions.append(Reaction(
        _key=reaction_key, source=annotation.source, 
        curated=annotation.curated, comments=annotation.comments + comments, curated_by=annotation.curated_by,
        smiles=annotation.smiles, crossref=annotation.crossref
    ))
    
    for link, n in reactant_links.items():
        links.reactants.append(ReactantLink(_from='mainstage/' + link, _to='mainstage/' + reaction_key, n=n))
    for link, n in product_links.items():
        links.products.append(ProductLink(_to='mainstage/' + link, _from='mainstage/' + reaction_key, n=n))


    
    reaction_annotation_links.links.append(ReactionAnnotationLink(_from='staging/' + annotation.key, _to='mainstage/' + reaction_key))
    

In [10]:
def process_links(link):
    link['_from'] = 'mainstage/' + link['_from']
    link['_to'] = 'mainstage/' + link['_to']
    return link

In [11]:
with open('/home/bcz/scratch/prebchem-upload/molecules.json', 'w') as f:
    json.dump(molecules.dict(by_alias=True)['molecules'], f, indent=4)

with open('/home/bcz/scratch/prebchem-upload/reactions.json', 'w') as f:
    json.dump(reactions.dict(by_alias=True)['reactions'], f, indent=4)

with open('/home/bcz/scratch/prebchem-upload/links.reactants-reaction.json', 'w') as f:
    json.dump(links.dict(by_alias=True)['reactants'], f, indent=4)

with open('/home/bcz/scratch/prebchem-upload/links.products-reaction.json', 'w') as f:
    json.dump(links.dict(by_alias=True)['products'], f, indent=4)

with open('/home/bcz/scratch/prebchem-upload/links.reaction-annotation.json', 'w') as f:
    json.dump(reaction_annotation_links.dict(by_alias=True)['links'], f, indent=4)

In [12]:
len(molecules_keys)

1072

In [13]:
failures

[ReactionAnnotation(source='10.1089/ast.2005.5.749', smiles='nan', agents=[], waste=[], conditions=['temperature=175C', 'pressure=0.9390565883665859atm'], crossref=[], comments='reviewed by D. cole', primary='CH3S-SCH3 + 3 H2S -> C2H4S5 + 4H2', attributes=[], curated=False, curated_by='', key='144802959'),
 ReactionAnnotation(source='10.1016/0016-7037(64)90147-4', smiles='nan', agents=[], waste=[], conditions=[], crossref=[], comments='reviewed by D. cole', primary='C9H11NO2 -> C8H11N + C7H9N', attributes=[], curated=False, curated_by='', key='144802979'),
 ReactionAnnotation(source='10.1016/0016-7037(64)90147-4', smiles='nan', agents=[], waste=[], conditions=[], crossref=[], comments='reviewed by D. cole', primary='C6H14N4O2 -> C5H9NO2', attributes=[], curated=False, curated_by='', key='144802981'),
 ReactionAnnotation(source='10.1016/0016-7037(93)90540-D', smiles='nan', agents=[], waste=[], conditions=[], crossref=[], comments='reviewed by D. cole', primary='H2NCHRCOOH* -> RCH2NH2 + 