In [None]:
import matplotlib.pyplot as plt
import matplotlib.ticker as mticker
import pandas as pd
import seaborn as sns

In [None]:
# Plot styling.
# Matplotlib 3.6 renamed its bundled seaborn style sheets from 'seaborn-*' to
# 'seaborn-v0_8-*' and 3.8 removed the old names, so resolve each requested
# style against what this Matplotlib installation actually ships.
style_sheets = [
    name if name in plt.style.available
    else name.replace('seaborn', 'seaborn-v0_8', 1)
    for name in ('seaborn-white', 'seaborn-paper')
]
plt.style.use(style_sheets)
plt.rc('font', family='serif')
sns.set_palette(['#9e0059', '#6da7de', '#ee266d', '#dee000', '#eb861e'])
sns.set_context('paper', font_scale=1.3)    # Single-column figure.

In [None]:
# Read the tab-separated suspect list and pull the molecular formula out of
# each compound name: the text after 'formula: ' up to (not including) the
# next ')'. Names that do not contain this pattern yield NaN.
suspects = pd.read_csv(
    '../../data/processed/suspect_list_batch_creation.tsv',
    sep='\t',
)
compound_names = suspects['COMPOUND_NAME']
suspect_formulas = compound_names.str.extract(
    r'formula: ([^)]+)', expand=False
)

In [None]:
# Count suspects with and without a usable molecular formula.
# A formula is usable only when one was extracted (not NaN) and it is not the
# literal placeholder 'unknown'. Missing values must not count as "found",
# otherwise the denominator of the suspect proportions is inflated.
has_formula = suspect_formulas.notna() & (suspect_formulas != 'unknown')
suspects_formula_found = has_formula.sum()
suspect_formula_not_found = len(suspect_formulas) - suspects_formula_found
print(f'Suspects with molecular formula found: {suspects_formula_found:,}')
print(f'Suspects with molecular formula NOT found: {suspect_formula_not_found:,}')

In [None]:
# Tally suspect formulas per elemental-composition class.
# Missing formulas give NaN matches, which ``sum`` skips.

# Only the characters C, H and digits.
suspect_contains_ch = suspect_formulas.str.match(r'^[CH0-9]+$').sum()
# C, H, O groups (each optionally followed by a count), in that order.
suspect_contains_cho = suspect_formulas.str.match(
    r'^(C[0-9]*)+(H[0-9]*)+(O[0-9]*)+$').sum()
# C, H, N, O groups, in that order.
suspect_contains_chno = suspect_formulas.str.match(
    r'^(C[0-9]*)+(H[0-9]*)+(N[0-9]*)+(O[0-9]*)+$').sum()
# 'P' / 'S' must not be followed by a lowercase letter; a plain substring
# test would also count two-letter symbols such as Pt, Pb, Si or Se as
# phosphorus / sulfur.
suspect_contains_p = suspect_formulas.str.contains(
    r'P(?![a-z])', na=False).sum()
suspect_contains_s = suspect_formulas.str.contains(
    r'S(?![a-z])', na=False).sum()

In [None]:
# Load the composition-filtered library and keep only the entries that belong
# to one of the default GNPS libraries listed below.
library_entries = pd.read_csv(
    '../../data/interim/composition_filtered_library.csv', low_memory=False)
default_libraries = [
    'BILELIB19', 'CASMI', 'DEREPLICATOR_IDENTIFIED_LIBRARY',
    'GNPS-COLLECTIONS-MISC', 'GNPS-COLLECTIONS-PESTICIDES-NEGATIVE',
    'GNPS-COLLECTIONS-PESTICIDES-POSITIVE', 'GNPS-EMBL-MCF',
    'GNPS-FAULKNERLEGACY', 'GNPS-LIBRARY', 'GNPS-NIH-CLINICALCOLLECTION1',
    'GNPS-NIH-CLINICALCOLLECTION2', 'GNPS-NIH-NATURALPRODUCTSLIBRARY',
    'GNPS-NIH-NATURALPRODUCTSLIBRARY_ROUND2_NEGATIVE',
    'GNPS-NIH-NATURALPRODUCTSLIBRARY_ROUND2_POSITIVE',
    'GNPS-NIH-SMALLMOLECULEPHARMACOLOGICALLYACTIVE', 'GNPS-NIST14-MATCHES',
    'GNPS-PRESTWICKPHYTOCHEM', 'GNPS-SELLECKCHEM-FDA-PART1',
    'GNPS-SELLECKCHEM-FDA-PART2', 'HMDB', 'LDB_NEGATIVE', 'LDB_POSITIVE',
    'MASSBANK', 'MASSBANKEU', 'MIADB', 'MMV_NEGATIVE', 'MMV_POSITIVE', 'MONA',
    'PNNL-LIPIDS-NEGATIVE', 'PNNL-LIPIDS-POSITIVE', 'RESPECT', 'SUMNER'
]
# Membership is upper-cased before the comparison, so the match against the
# (upper-case) names above is case-insensitive.
library_entries = library_entries[library_entries['library_membership']
                                  .str.upper().isin(default_libraries)]
# Entries without a formula are dropped.
library_formulas = library_entries['Formula_smiles'].dropna()
# n_library_chnops is computed in the next cell; the identical computation
# that used to end this cell was redundant.

In [None]:
# Number of library formulas written solely with the characters C, H, N, O,
# P, S and digits; used as the denominator of the library proportions.
# Raw string rather than an f-string: the pattern has no placeholders.
n_library_chnops = library_formulas.str.match(r'^[CHNOPS0-9]+$').sum()

In [None]:
# Tally library formulas per elemental-composition class.

# Only the characters C, H and digits.
library_contains_ch = library_formulas.str.match(r'^[CH0-9]+$').sum()
# C, H, O groups (each optionally followed by a count), in that order.
library_contains_cho = library_formulas.str.match(
    r'^(C[0-9]*)+(H[0-9]*)+(O[0-9]*)+$').sum()
# C, H, N, O groups, in that order.
library_contains_chno = library_formulas.str.match(
    r'^(C[0-9]*)+(H[0-9]*)+(N[0-9]*)+(O[0-9]*)+$').sum()
# 'P' / 'S' must not be followed by a lowercase letter; a plain substring
# test would also count two-letter symbols such as Pt, Pb, Si or Se as
# phosphorus / sulfur.
library_contains_p = library_formulas.str.contains(
    r'P(?![a-z])', na=False).sum()
library_contains_s = library_formulas.str.contains(
    r'S(?![a-z])', na=False).sum()

In [None]:
# Assemble one row per (library type, composition class). Despite its name,
# the 'count' column holds fractions: each tally divided by the matching
# denominator (suspects with a formula / CHNOPS-only library formulas).
formula_labels = ['CH', 'CHO', 'CHNO', 'P', 'S']
suspect_fractions = [
    tally / suspects_formula_found
    for tally in (suspect_contains_ch, suspect_contains_cho,
                  suspect_contains_chno, suspect_contains_p,
                  suspect_contains_s)
]
library_fractions = [
    tally / n_library_chnops
    for tally in (library_contains_ch, library_contains_cho,
                  library_contains_chno, library_contains_p,
                  library_contains_s)
]
formula_counts = pd.DataFrame({
    'count': suspect_fractions + library_fractions,
    'type': (['Suspect library'] * len(formula_labels)
             + ['Default GNPS libraries'] * len(formula_labels)),
    'formula': formula_labels * 2,
})
# Persist the plotted values next to the notebook.
formula_counts.to_parquet('fig1_composition.parquet', index=False)

In [None]:
# Grouped bar chart of the composition proportions, one bar group per
# composition class and one hue per library type.
width = 7
height = width / 1.618    # Width-to-height ratio of roughly the golden ratio.
fig, ax = plt.subplots(figsize=(width, height))

# Scale the fractions to percentages on a new frame so `formula_counts`
# itself stays untouched.
formula_counts_perc = formula_counts.assign(
    count=formula_counts['count'] * 100)
sns.barplot(data=formula_counts_perc, x='formula', y='count', hue='type',
            ax=ax)

# Values are already in the 0-100 range, hence xmax=100.
ax.yaxis.set_major_formatter(mticker.PercentFormatter(100))

ax.set_xlabel('')
ax.set_ylabel('Proportion of library')

# `title` is documented as str or None; None gives a legend without a title
# (False only worked because it is falsy).
ax.legend(title=None, loc='upper right')

sns.despine(ax=ax)

# Save and close this specific figure rather than pyplot's "current" one.
fig.savefig('composition.png', dpi=300, bbox_inches='tight')
plt.show()
plt.close(fig)