In [None]:
import collections
import functools
import logging
import os

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from matplotlib_venn import venn2, venn2_circles
from rdkit import Chem

In [None]:
# Matplotlib 3.6 renamed its bundled seaborn style sheets to
# 'seaborn-v0_8-<name>' and later releases removed the old aliases, so use
# whichever spelling this installation actually provides.
plt.style.use([
    style if style in plt.style.available
    else style.replace('seaborn-', 'seaborn-v0_8-', 1)
    for style in ('seaborn-white', 'seaborn-paper')])
plt.rc('font', family='serif')
sns.set_palette('Set1')
sns.set_context('paper', font_scale=1.3)

# Log at INFO level with timestamp, level, process, and call-site information.
logging.basicConfig(format='%(asctime)s [%(levelname)s/%(processName)s] '
                           '%(module)s.%(funcName)s : %(message)s',
                    level=logging.INFO)

In [None]:
@functools.lru_cache(maxsize=None)
def smiles_to_inchikey(smiles):
    """
    Convert a SMILES string to its InChIKey.

    Results are memoized without a size limit, so each distinct SMILES string
    is only converted once.

    Parameters
    ----------
    smiles : str
        The SMILES string to convert.

    Returns
    -------
    Optional[str]
        The InChIKey RDKit generates for the molecule, or None if RDKit
        returned no molecule for the SMILES string or raised a ValueError
        while parsing it.
    """
    try:
        molecule = Chem.rdmolfiles.MolFromSmiles(smiles)
    except ValueError:
        return None
    if molecule is None:
        return None
    return Chem.rdinchi.MolToInchiKey(molecule)

In [None]:
# Directory holding the GNPS library search result files; the relative path is
# resolved against the notebook's working directory.
data_dir = '../data/public_commercial_library'

In [None]:
# GNPS task identifier of the library search for each library collection.
task_ids = {'Public libraries': '6a8bec14',
            'Commercial libraries': '65ece253'}
# Columns of the GNPS annotation table that are used below.
usecols = ['#Scan#', 'INCHI', 'Smiles', 'MQScore',
           'Balance_score(percentage)', 'SharedPeaks']

# identifications[top][library] -> DataFrame with the accepted identifications
# when considering the `top` best-scoring matches per scan.
identifications = collections.defaultdict(
    lambda: collections.defaultdict(dict))
for library, task_id in task_ids.items():
    # Read the GNPS task file once per library, drop incomplete rows, and
    # rank the matches of each scan by descending cosine score.
    ranked = (
        pd.read_csv(
            os.path.join(
                data_dir, f'MOLECULAR-LIBRARYSEARCH-GC-{task_id}-'
                f'view_all_annotations_DB-main.tsv'),
            sep='\t', usecols=usecols, skipinitialspace=True)
        .dropna()
        .sort_values(['#Scan#', 'MQScore'], ascending=[True, False]))
    for top in (1, 10):
        # Keep only the `top` best-scoring matches for each scan.
        ids = ranked.groupby('#Scan#').head(top)
        # Require a minimum cosine score of 0.8, minimum balance score of
        # 80%, and more than 10 matched peaks.
        ids = ids[(ids['MQScore'] > 0.8) &
                  (ids['Balance_score(percentage)'] > 80) &
                  (ids['SharedPeaks'] > 10)]
        # Get InChIKeys for all identifications and discard those whose
        # SMILES could not be converted. `assign` builds a new frame, which
        # avoids pandas' SettingWithCopyWarning on the filtered selection,
        # and the `dropna` result is kept this time.
        ids = (ids.assign(InChIKey=ids['Smiles'].apply(smiles_to_inchikey))
               .dropna(subset=['InChIKey']))

        identifications[top][library] = ids

In [None]:
# One square Venn diagram panel per `top` setting, laid out side by side.
height = 8
width = height
num_col = len(identifications)
# `squeeze=False` always returns an array of axes, so the loop below also
# works when there is only a single panel.
fig, axes = plt.subplots(1, num_col, figsize=(width * num_col, height),
                         squeeze=False)
axes = axes.ravel()

for ax, (top, top_ids) in zip(axes, identifications.items()):
    # Compare the libraries by the unique InChIKeys they identified. Missing
    # InChIKeys are dropped so that they are not counted as a shared compound.
    sets = [set(library_ids['InChIKey'].dropna())
            for library_ids in top_ids.values()]
    # Pass the labels as a tuple rather than a dict view.
    v = venn2(sets, set_labels=tuple(top_ids.keys()), ax=ax)
    venn2_circles(sets, linewidth=1.0, ax=ax)
    # A subset label can be None, so skip those when resizing the counts.
    for text in v.subset_labels:
        if text is not None:
            text.set_fontsize('large')

    ax.set_title(f'Top = {top}', fontsize='large')

plt.savefig('public_commercial_library.png', dpi=300, bbox_inches='tight')
plt.show()
plt.close()

In [None]:
# Flush and close all logging handlers now that the notebook has finished.
logging.shutdown()