In [None]:
import collections
import functools
import http
import json
import logging
import os
import urllib
import urllib.error
import urllib.request

import matplotlib.pyplot as plt
import matplotlib.ticker as mticker
import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib_venn import venn3, venn3_circles
from rdkit import Chem
from tqdm.notebook import tqdm

In [None]:
# Matplotlib 3.6 renamed its bundled seaborn style sheets to
# 'seaborn-v0_8-*' and later releases removed the old names. Use the
# legacy name when this installation still provides it and the renamed
# one otherwise, so the notebook runs on both old and new Matplotlib.
plt.style.use([
    legacy_name if legacy_name in plt.style.available
    else legacy_name.replace('seaborn-', 'seaborn-v0_8-', 1)
    for legacy_name in ('seaborn-white', 'seaborn-paper')])
plt.rc('font', family='serif')
sns.set_palette('Set1')
sns.set_context('paper', font_scale=1.3)

# Register `Series.progress_apply`; progress bars are removed when done.
tqdm.pandas(leave=False)

# Log INFO and above, including the originating module and function.
logging.basicConfig(format='%(asctime)s [%(levelname)s/%(processName)s] '
                           '%(module)s.%(funcName)s : %(message)s',
                    level=logging.INFO)

In [None]:
@functools.lru_cache(maxsize=None)
def smiles_to_inchikey(smiles):
    """
    Convert a SMILES string to its InChIKey using RDKit.

    Results are memoized, so each distinct SMILES string is converted only
    once.

    Parameters
    ----------
    smiles : str
        The SMILES string to convert.

    Returns
    -------
    Optional[str]
        The InChIKey, or None if `MolFromSmiles` raised a `ValueError` or
        did not produce a molecule.
    """
    try:
        molecule = Chem.rdmolfiles.MolFromSmiles(smiles)
    except ValueError:
        return None
    if molecule is None:
        return None
    return Chem.rdinchi.MolToInchiKey(molecule)


@functools.lru_cache(None)
def inchikey_to_subclass(inchikey):
    """
    Look up the ClassyFire subclass of a compound via the GNPS web service.

    Results (including failed lookups) are memoized, so each distinct
    InChIKey triggers at most one HTTP request.

    Parameters
    ----------
    inchikey : Optional[str]
        The InChIKey of the compound. Missing values (None, NaN, empty
        string) are accepted and yield None.

    Returns
    -------
    Optional[str]
        The ChemOnt identifier of the compound's subclass, or None if the
        InChIKey is missing, the service answers with an HTTP error, or the
        response contains no subclass.
    """
    # Missing keys (e.g. from unparsable SMILES) would otherwise be formatted
    # into the URL as "None"/"nan" and cause a pointless remote request.
    if not isinstance(inchikey, str) or not inchikey:
        return None
    try:
        with urllib.request.urlopen(f'https://gnps-classyfire.ucsd.edu/'
                                    f'entities/{inchikey}.json') \
                as f_url:
            return json.loads(f_url.read())['subclass']['chemont_id']
    except (urllib.error.HTTPError, AttributeError, TypeError, KeyError):
        # HTTP error (e.g. unknown compound) or a response without a
        # (non-null) subclass entry.
        return None

In [None]:
data_dir = '../data/mshub_annotation'

# InChIKeys of the reference compounds listed in the Metabolomics Workbench
# export.
reference_csv = os.path.join(data_dir, 'Metabolomics Workbench ST001154.csv')
compounds = set(pd.read_csv(reference_csv)['InChiKey'])

# Subclasses of the reference compounds; failed lookups (None) are ignored.
subclasses = {inchikey_to_subclass(inchikey) for inchikey in compounds}
subclasses.discard(None)

In [None]:
# GNPS task identifier for each number of files that was processed.
task_ids = {
    5: '66af7b6e', 10: 'f40db6d9', 20: 'af8a292b', 30: '7fcb253c',
    40: '8a126cec', 50: '562d5808', 100: 'ad515651', 150: 'c757c34f',
    200: '307bcc2a', 250: 'd132c3e3', 300: '4f2de61a'}


def _read_library_matches(task_id):
    """
    Read the GNPS library search results of a single task.

    Rows with missing values are dropped and the matches are ordered by
    scan number, best cosine score (`MQScore`) first within each scan.
    """
    return (
        pd.read_csv(
            os.path.join(
                data_dir, f'MOLECULAR-LIBRARYSEARCH-GC-{task_id}-'
                f'view_all_annotations_DB-main.tsv'),
            sep='\t',
            usecols=['#Scan#', 'INCHI', 'Smiles', 'MQScore',
                     'SharedPeaks'],
            skipinitialspace=True)
        .dropna()
        .sort_values(['#Scan#', 'MQScore'], ascending=[True, False]))


# identifications[top][num_files] -> accepted identifications when the `top`
# best library matches per scan are considered.
identifications = collections.defaultdict(dict)
for num_files, task_id in task_ids.items():
    # Reading and sorting does not depend on `top`, so do it once per task.
    library_matches = _read_library_matches(task_id)
    for top in (1, 10):
        ids = library_matches.groupby('#Scan#').head(top)
        # Require a minimum cosine score of 0.7 and
        # more than 10 matched peaks.
        # Copy so the new columns are added to an independent frame rather
        # than to a slice of `library_matches`.
        ids = ids[(ids['MQScore'] > 0.7) & (ids['SharedPeaks'] > 10)].copy()
        # Get InChIKeys and subclasses for all identifications. Rows without
        # an InChIKey are dropped first so they are never sent to ClassyFire.
        ids['InChIKey'] = ids['Smiles'].apply(smiles_to_inchikey)
        ids = ids.dropna(subset=['InChIKey'])
        ids['subclass'] = \
            ids['InChIKey'].progress_apply(inchikey_to_subclass)
        # `dropna` returns a new frame; the result has to be kept.
        ids = ids.dropna(subset=['subclass'])

        identifications[top][num_files] = ids

In [None]:
# Fraction of the reference compounds (direct InChIKey match) and of their
# subclasses (subclass match) recovered for every `top` / number of files.
match_records = []
for top, num_files_ids in identifications.items():
    for nf, ids in num_files_ids.items():
        match_records.append({
            'Top': top, 'Number of files': nf, 'Type': 'Direct match',
            'Compounds identified':
                len(set(ids['InChIKey']) & compounds) / len(compounds)})
        match_records.append({
            'Top': top, 'Number of files': nf, 'Type': 'Subclass match',
            'Compounds identified':
                len(set(ids['subclass']) & subclasses) / len(subclasses)})

matched = pd.DataFrame(
    match_records,
    columns=['Top', 'Number of files', 'Type', 'Compounds identified'])

# `sharex` / `sharey` are keyword-only in current Matplotlib releases.
fig, axes = plt.subplots(1, 2, sharex=True, sharey=True, figsize=(12, 4))

for top, ax in zip((1, 10), axes):
    for match_label in ('Direct match', 'Subclass match'):
        matched_top = matched[(matched['Top'] == top) &
                              (matched['Type'] == match_label)]
        ax.plot(matched_top['Number of files'],
                matched_top['Compounds identified'],
                label=match_label, marker='o', linewidth=2, markersize=7)

    # Fractions are shown as percentages on a fixed 0-100% axis.
    ax.set_ylim(0, 1)
    ax.yaxis.set_major_formatter(mticker.PercentFormatter(1))

    ax.set_xlabel('Number of files')
    ax.set_title(f'Top = {top}')

fig.tight_layout()

axes[0].set_ylabel('Compounds identified')
# Anchor the legend of the right panel at its left edge, above the axes.
axes[1].legend(loc='upper center', bbox_to_anchor=(0, 1.35),
               title='Type', fontsize='medium', ncol=2)

sns.despine()

plt.savefig('mshub_annotation.png', dpi=300, bbox_inches='tight')
plt.show()
plt.close()

In [None]:
fig, ax = plt.subplots(figsize=(6, 4))

# Mean cosine score (line) with a band of one standard deviation around it,
# as a function of the number of files, for each `top` setting.
for top in (1, 10):
    file_counts = list(identifications[top].keys())
    cosine_scores = [ids_num_files['MQScore']
                     for ids_num_files in identifications[top].values()]
    score_mean = np.asarray([np.mean(scores) for scores in cosine_scores])
    score_std = np.asarray([np.std(scores) for scores in cosine_scores])
    lower_band, upper_band = score_mean - score_std, score_mean + score_std

    ax.plot(file_counts, score_mean, label=top,
            marker='o', linewidth=2, markersize=7)
    ax.fill_between(file_counts, lower_band, upper_band, alpha=0.25)

# The y-limits are left to autoscale.

ax.set_xlabel('Number of files')
ax.set_ylabel('Cosine score')

ax.legend(loc='lower right', title='Top')

sns.despine()

plt.savefig('mshub_annotation_cosine.png', dpi=300, bbox_inches='tight')
plt.show()
plt.close()

In [None]:
# Flush and close all logging handlers.
logging.shutdown()