In [None]:
import json

import numpy as np
import pandas as pd
import requests
import requests_cache
import tqdm.notebook as tqdm
from rdkit import Chem

In [None]:
# Install a requests cache named 'emp500_cache' so that repeated HTTP lookups
# made later in this notebook can be answered from the cache instead of
# hitting the remote service again.
requests_cache.install_cache('emp500_cache')

In [None]:
# Feature annotation table (tab-separated); later cells read and fill in its
# 'superclass', 'class' and 'subclass' columns.
annotations_path = '../data/emp500_gcms_feature_metadata_annotations.txt'
metadata = pd.read_csv(annotations_path, sep='\t')

In [None]:
def get_class(mol_type, mol_val):
    """Look up the ClassyFire classification of one molecule via GNPS.

    Parameters
    ----------
    mol_type : str
        Name of the query parameter identifying the structure encoding
        (the notebook uses 'inchi' and 'smiles').
    mol_val : str
        The structure string itself.

    Returns
    -------
    tuple or None
        ``(superclass, class, subclass)`` names, where an individual level is
        ``None`` if the service reports it as null. ``None`` is returned
        instead of a tuple when the request does not answer with HTTP 200,
        the body is not valid JSON, the JSON is not an object, or any of the
        three levels is absent from the response.
    """
    # Pass the identifier through `params` so requests percent-encodes it.
    # Structure strings routinely contain '#', '+', '=' and '/', which would
    # otherwise truncate or corrupt the query string.
    r = requests.get('https://gnps-structure.ucsd.edu/classyfire',
                     params={mol_type: mol_val})
    if r.status_code != 200:
        return None
    try:
        classyfire_json = r.json()
    except ValueError:
        # json.JSONDecodeError (and the simplejson variant requests may raise)
        # both derive from ValueError.
        return None
    if not isinstance(classyfire_json, dict):
        return None

    names = []
    for level in ('superclass', 'class', 'subclass'):
        if level not in classyfire_json:
            return None
        entry = classyfire_json[level]
        # A level may be present but null; keep it as None in the result.
        names.append(entry['name'] if entry is not None else None)
    return tuple(names)

In [None]:
# Rows that lack at least one of the three ClassyFire levels.
metadata_missing = metadata[
    metadata[['superclass', 'class', 'subclass']].isna().any(axis=1)]
# Keep only rows with at least one structure string to query with.
inchi_smiles = metadata_missing[['INCHI', 'Smiles']].dropna(how='all')

# Collected as (row index, superclass, class, subclass) tuples.
classes = []
for i, compound in tqdm.tqdm(inchi_smiles.iterrows(), total=len(inchi_smiles)):
    inchi, smiles = compound['INCHI'], compound['Smiles']
    # Missing values come back from pandas as NaN, not None, so test with
    # pd.notna to avoid sending a NaN to the service.
    result = get_class('inchi', inchi) if pd.notna(inchi) else None
    if result is None and pd.notna(smiles):
        # Fall back to SMILES. MolFromSmiles returns None for strings RDKit
        # cannot parse, in which case there is nothing to query.
        mol = Chem.MolFromSmiles(smiles)
        if mol is not None:
            # The second positional argument of MolToSmiles is RDKit's
            # isomericSmiles flag; False drops stereo/isotope information.
            result = get_class('smiles', Chem.MolToSmiles(mol, False))
    if result is not None:
        classes.append((i, *result))

In [None]:
# Turn the collected (row index, superclass, class, subclass) records into a
# table; 'id' holds the row label used to write the results back to metadata.
class_columns = ['id', 'superclass', 'class', 'subclass']
classes = pd.DataFrame(classes, columns=class_columns)

In [None]:
# Copy each retrieved classification level into the matching metadata rows.
# `.values` strips the index of `classes` so assignment is positional against
# the row labels listed in classes['id'].
for level in ('superclass', 'class', 'subclass'):
    metadata.loc[classes['id'], level] = classes[level].values

In [None]:
# Save the annotation table, with the classification columns updated by the
# preceding cells, as a new tab-separated file in the same data directory.
# NOTE(review): to_csv writes the DataFrame index as an extra leading column
# by default, while the input was read without an index column. Confirm this
# is intended; otherwise pass index=False.
metadata.to_csv('../data/emp500_gcms_feature_metadata_annotations_classyfire.txt',
                sep='\t')