In [None]:
import itertools

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Plot styling.
# Matplotlib 3.6 renamed its bundled seaborn style sheets to
# 'seaborn-v0_8-<name>' and later releases dropped the old 'seaborn-<name>'
# aliases, so use whichever name the installed version provides.
plot_styles = [
    f'seaborn-v0_8-{name}' if f'seaborn-v0_8-{name}' in plt.style.available
    else f'seaborn-{name}'
    for name in ('white', 'paper')]
plt.style.use(plot_styles)
plt.rc('font', family='serif')
sns.set_palette('Set1')
sns.set_context('paper', font_scale=1.3)    # Single-column figure.

### GFOP sample type metadata 

In [None]:
def get_sample_types(gfop_metadata, simple_complex=None):
    """
    Extract the sample type annotations of the GFOP files.

    Parameters
    ----------
    gfop_metadata : pd.DataFrame
        Metadata table with a 'filename' column, the columns
        'sample_type_group1' to 'sample_type_group6', and (when filtering) a
        'simple_complex' column.
    simple_complex : optional
        If not None, only rows whose 'simple_complex' value equals this
        argument are retained.

    Returns
    -------
    pd.DataFrame
        The six sample type group columns, indexed by 'filename'.
    """
    group_columns = ['sample_type_group%d' % level for level in range(1, 7)]
    selected = gfop_metadata
    if simple_complex is not None:
        is_requested = selected['simple_complex'] == simple_complex
        selected = selected[is_requested]
    return selected[['filename'] + group_columns].set_index('filename')

In [None]:
def get_network_group_sample_type_counts(gnps_network, sample_types,
                                         groups_included):
    """
    Count the sample types of the files behind group-specific network rows.

    A row of `gnps_network` is selected when every column in
    `groups_included` is strictly positive and every other column among
    'G1' to 'G6' is exactly zero. The '|'-separated file names in the
    'UniqueFileSources' column of the selected rows are collected, each
    unique file is looked up in `sample_types`, and the sample types are
    tallied.

    Parameters
    ----------
    gnps_network : pd.DataFrame
        Table with the columns 'G1' to 'G6' and 'UniqueFileSources'.
    sample_types : pd.Series
        Sample type per file, indexed by file name.
    groups_included : iterable of str
        Names of the group columns that must all be non-zero.

    Returns
    -------
    pd.Series
        Number of unique files per sample type. Files that are absent from
        `sample_types` are not counted.
    """
    groups_included = list(groups_included)
    # Build a list rather than a set: pandas 2.0 no longer accepts a set as
    # an indexer (deprecated since 1.5), so `df[set]` raises a TypeError.
    groups_excluded = [group for group in (f'G{i}' for i in range(1, 7))
                       if group not in groups_included]
    is_selected = ((gnps_network[groups_included] > 0).all(axis=1) &
                   (gnps_network[groups_excluded] == 0).all(axis=1))
    filenames = (gnps_network.loc[is_selected, 'UniqueFileSources']
                 .str.split('|').explode().unique())
    # Unknown file names become NaN on reindex and are dropped by
    # value_counts.
    return sample_types.reindex(filenames).value_counts()

In [None]:
gfop_metadata = (
    pd.read_csv('../data/11442_foodomics_multiproject_metadata.txt',
                sep='\t')
    # The first row is empty.
    .drop(index=0)
    # Strip leading and trailing whitespace from the object (text) columns;
    # all other columns are passed through untouched.
    .apply(lambda column: (column.str.strip()
                           if column.dtype == 'object' else column)))

### CIAO wrong vs correct SSF

In [None]:
# Cluster tables of the two GNPS networks (tab-separated, which is the default
# separator of `pd.read_table`).
gnps_network_wrong_ssf = pd.read_table(
    '../data/6_25_CIAO Fecal - plasma - ONR food - Foodomics 3500 (no ONR food) FDR 0.01/'
    'METABOLOMICS-SNETS-V2-981c9a7d-view_all_clusters_withID_beta-main.tsv')
gnps_network_correct_ssf = pd.read_table(
    '../data/5_24_CIAO Fecal - plasma - food - Foodomics 3500 (no CIAO food) FDR 0.01/'
    'METABOLOMICS-SNETS-V2-8895b6e3-view_all_clusters_withID_beta-main.tsv')

In [None]:
# Fifth-level sample type of every 'simple' GFOP file, indexed by file name.
sample_types_simple = get_sample_types(
    gfop_metadata, 'simple')['sample_type_group5']

# Sample type counts for the network rows that are non-zero in G2, G3 and G4
# and zero in the other groups, computed for both networks.
sample_counts_wrong_ssf = get_network_group_sample_type_counts(
    gnps_network_wrong_ssf, sample_types_simple, ['G2', 'G3', 'G4'])
sample_counts_wrong_ssf = sample_counts_wrong_ssf.rename(
    'CIAO, plasma, wrong SSF')

sample_counts_correct_ssf = get_network_group_sample_type_counts(
    gnps_network_correct_ssf, sample_types_simple, ['G2', 'G3', 'G4'])
sample_counts_correct_ssf = sample_counts_correct_ssf.rename(
    'CIAO, plasma, correct SSF')

In [None]:
# Put both count series side by side, aligned on the (sorted) sample type.
sample_counts = pd.concat(
    [sample_counts_wrong_ssf, sample_counts_correct_ssf],
    axis=1, sort=True)
# Keep only the sample types that have a count in both series; the alignment
# introduces NaN (and hence floats), so convert back to integers afterwards.
sample_counts = sample_counts.dropna().astype(int)

In [None]:
# Figure size: the height follows from the width via the golden ratio
# (~1.618).
width = 7
height = width / 1.618
fig, ax = plt.subplots(figsize=(width, height))

# Grouped bar chart of the 15 sample types with the highest count in the
# wrong-SSF network, showing both networks next to each other.
(sample_counts.sort_values('CIAO, plasma, wrong SSF', ascending=False)
 .head(15).plot.bar(ax=ax))

ax.set_xlabel('Food type')
ax.set_ylabel('Number of occurrences')

sns.despine()

# Save before showing, and through the explicit figure handle so the output
# does not depend on which figure pyplot considers current.
fig.savefig('ciao_wrong_correct_ssf.png', dpi=300, bbox_inches='tight')
plt.show()
plt.close(fig)