In [None]:
import numpy as np
import pandas as pd
from rdkit import Chem

In [None]:
# Tab-separated feature metadata table. low_memory=False makes pandas parse
# the file in one pass instead of in chunks, which avoids mixed-dtype columns.
features = pd.read_csv(
    '../data/emp500_lcms_fbmn_feature_metadata_all.txt',
    sep='\t',
    low_memory=False,
)

In [None]:
# NP Atlas table, fetched as a tab-separated download from the versioned
# (np_atlas_2020_06) URL below. Requires network access.
npatlas = pd.read_csv(
    'https://www.npatlas.org/custom/versions/'
    'np_atlas_2020_06/NPAtlas_download.tsv',
    sep='\t',
)

In [None]:
def _inchi_to_inchikey(inchi):
    if pd.isnull(inchi) or not inchi.strip():
        return np.nan
    else:
        if not inchi.startswith('InChI='):
            inchi = f'InChI={inchi.strip()}'
        return Chem.inchi.InchiToInchiKey(inchi)

# Reduce every compound to the first hyphen-separated block of its InChIKey
# (no stereochemistry), which serves as the join key between the two tables.
features['inchikey_nostereo'] = (
    features['GNPS_INCHI']
    .apply(_inchi_to_inchikey)
    .str.split('-')
    .str.get(0)
)
npatlas['inchikey_nostereo'] = (
    npatlas['compound_inchikey']
    .str.split('-')
    .str.get(0)
)

# Collapse NP Atlas entries that share a key into a single row. Each kept
# column becomes a '|'-joined string of the group's values in row order;
# every value goes through str(), so a missing value appears as 'nan'.
npatlas = (
    npatlas
    .groupby('inchikey_nostereo')[[
        'npaid', 'compound_id', 'compound_names', 'origin_type',
        'genus', 'origin_species', 'mibig_ids', 'gnps_ids',
    ]]
    .agg(lambda values: '|'.join(map(str, values)))
    .reset_index()
)

In [None]:
# Left join on the stereo-free key: every feature row is kept, and features
# with no matching NP Atlas key get missing values in the NP Atlas columns.
merged = features.merge(npatlas, how='left', on='inchikey_nostereo')
merged.to_csv(
    '../data/emp500_lcms_fbmn_feature_metadata_all_npatlas.txt',
    sep='\t',
    index=False,
)