In [None]:
import os
import re

import numpy as np
import matplotlib.pyplot as plt
import matplotlib.ticker as mticker
import pandas as pd
import seaborn as sns

In [None]:
# Plot styling.
# Matplotlib 3.6 renamed the bundled seaborn style sheets from
# 'seaborn-<name>' to 'seaborn-v0_8-<name>' and later releases dropped the old
# names, so use whichever spelling this Matplotlib installation provides.
plt.style.use([
    f'seaborn-v0_8-{name}'
    if f'seaborn-v0_8-{name}' in plt.style.available
    else f'seaborn-{name}'
    for name in ('white', 'paper')])
plt.rc('font', family='serif')
sns.set_palette('Set1')
sns.set_context('paper', font_scale=1.3)    # Single-column figure.

### GFOP sample type metadata 

In [None]:
def get_sample_fermented_status(gfop_metadata, simple_complex=None):
    """
    Build a lookup table from sample file name to fermented status.

    Parameters
    ----------
    gfop_metadata : pd.DataFrame
        Metadata table with at least the columns 'filename' and 'fermented'
        (and 'simple_complex' when `simple_complex` is given).
    simple_complex : optional
        If not None, only rows whose 'simple_complex' value equals this
        argument are kept.

    Returns
    -------
    pd.DataFrame
        Single-column ('fermented') data frame indexed by 'filename'.
    """
    # Keep every row unless a specific 'simple_complex' value was requested.
    if simple_complex is None:
        rows = slice(None)
    else:
        rows = gfop_metadata['simple_complex'] == simple_complex
    status = gfop_metadata.loc[rows, ['filename', 'fermented']]
    return status.set_index('filename')

In [None]:
def get_network_group_fermented_count(
        gnps_network, fermented_status, groups_included):
    """
    Get the fermented status distribution of the files behind the network
    rows that occur exclusively in the given groups.

    A row of `gnps_network` is selected if its value is > 0 for every column
    in `groups_included` and == 0 for all other group columns among 'G1'
    through 'G6'. The '|'-separated file names in 'UniqueFileSources' of the
    selected rows are looked up in `fermented_status`; a file name is counted
    once for every selected row it occurs in.

    Parameters
    ----------
    gnps_network : pd.DataFrame
        Table with the columns 'G1' ... 'G6' and 'UniqueFileSources'.
    fermented_status : pd.DataFrame
        Fermented status indexed by file name (as returned by
        `get_sample_fermented_status`).
    groups_included : iterable of str
        Group column names in which the selected rows must be present.

    Returns
    -------
    pd.Series
        Fraction of file occurrences per fermented status value. File names
        that are absent from `fermented_status` are reported as 'unmatched'.
    """
    groups_included = list(groups_included)
    # Use lists (not sets) as column indexers: pandas >= 2.0 raises a
    # TypeError when a set is passed as an indexer.
    groups_excluded = [group for group in (f'G{i}' for i in range(1, 7))
                       if group not in groups_included]
    is_selected = (
        (gnps_network[groups_included] > 0).all(axis=1) &
        (gnps_network[groups_excluded] == 0).all(axis=1))
    filenames = (gnps_network.loc[is_selected, 'UniqueFileSources']
                 .str.split('|').explode())
    # Match the GNPS job results to the fermented status and get counts.
    fermented = fermented_status.reindex(filenames).fillna('unmatched')
    if isinstance(fermented, pd.DataFrame):
        # Only squeeze the column axis: a plain `squeeze()` would turn a
        # single matched file (1x1 frame) into a scalar without
        # `value_counts`.
        fermented = fermented.squeeze(axis=1)
    return fermented.value_counts() / len(fermented)

In [None]:
# Load the tab-separated GFOP metadata, drop the row with index label 0 (it is
# empty) and strip surrounding whitespace from all object (text) columns.
gfop_metadata = (
    pd.read_csv('../data/11442_foodomics_multiproject_metadata.txt', sep='\t')
    .drop(index=0)
    .apply(lambda col: col.str.strip() if col.dtype == 'object' else col)
)

### Compare fermented vs. non-fermented

In [None]:
# Fermented status lookup restricted to rows whose 'simple_complex' value is
# 'simple'.
sample_fermented_status = get_sample_fermented_status(
    gfop_metadata, simple_complex='simple')

In [None]:
# GNPS cluster tables (tab-separated) of the two network jobs that are
# compared below.
fermented_all = pd.read_table(
    '../data/37_Danone - all - split - FoodOmics 3500 FDR 0.1/'
    'METABOLOMICS-SNETS-V2-602ba94c-view_all_clusters_withID_beta-main.tsv')
fermented_strict = pd.read_table(
    '../data/38_Danone - ferment strict - nonferment strict - '
    'FoodOmics 3500 FDR 0.1/'
    'METABOLOMICS-SNETS-V2-0aaa25a8-view_all_clusters_withID_beta-main.tsv')

In [None]:
# One column per (network, group combination), holding the fraction of file
# occurrences per fermented status. Column order: 'All G1G4', 'All G2G4',
# 'Strict G1G4', 'Strict G2G4'.
fermented = pd.DataFrame({
    f'{network_name} {"".join(groups)}': get_network_group_fermented_count(
        network, sample_fermented_status, groups)
    for network_name, network in (('All', fermented_all),
                                  ('Strict', fermented_strict))
    for groups in (['G1', 'G4'], ['G2', 'G4'])})
# Fix the row order. `reindex` (instead of `.loc`) avoids a KeyError when a
# status never occurs in any network; a status that is absent from a network
# has a fraction of 0 rather than NaN.
fermented = fermented.reindex(['yes', 'no', 'unmatched']).fillna(0)

In [None]:
# Show the table of fermented status fractions (rows: status, columns: network
# and group combination) that is plotted in the next cell.
fermented

In [None]:
# Figure size: 7 wide, with the height derived from a 1.618 (approximately
# the golden ratio) aspect ratio.
width = 7
height = width / 1.618
fig, ax = plt.subplots(figsize=(width, height))

# Grouped bar chart: one group per fermented status, one bar per column.
fermented.plot.bar(ax=ax)

# The values are fractions in [0, 1]; show them as percentages and keep the
# category labels horizontal.
ax.yaxis.set_major_formatter(mticker.PercentFormatter(xmax=1))
ax.tick_params(axis='x', rotation=0)
ax.set(xlabel='Fermented status', ylabel='Observed foods')
ax.legend(loc='upper left')

sns.despine(fig=fig)

fig.savefig('fermented.png', dpi=300, bbox_inches='tight')
plt.show()
plt.close(fig)