In [None]:
import os
import re

import numpy as np
import matplotlib.pyplot as plt
import matplotlib.ticker as mticker
import pandas as pd
import seaborn as sns

In [None]:
# Plot styling.
# Matplotlib 3.6 renamed its bundled seaborn styles to 'seaborn-v0_8-*' and
# later releases dropped the old names, so use whichever name is installed.
plt.style.use([
    style if style in plt.style.available
    else style.replace('seaborn', 'seaborn-v0_8')
    for style in ('seaborn-white', 'seaborn-paper')])
plt.rc('font', family='serif')
sns.set_palette('tab20')
sns.set_context('paper', font_scale=1.3)    # Single-column figure.

## Functions

In [None]:
def get_sample_types(gfop_metadata, sample_type, simple_complex=None):
    """Return one metadata column as a frame indexed by filename.

    Parameters
    ----------
    gfop_metadata : pd.DataFrame
        Metadata table with a 'filename' column, the requested
        `sample_type` column and, if filtering, a 'simple_complex' column.
    sample_type : str
        Name of the metadata column to extract.
    simple_complex : optional
        If given, only rows whose 'simple_complex' value equals it are kept.

    Returns
    -------
    pd.DataFrame
        Single-column frame (`sample_type`) indexed by 'filename'.
    """
    selected = gfop_metadata
    if simple_complex is not None:
        keep = selected['simple_complex'] == simple_complex
        selected = selected[keep]
    columns = ['filename', sample_type]
    return selected[columns].set_index('filename')

In [None]:
def get_network_group_sample_type_counts(
        gnps_network, sample_types, groups_included):
    """Count sample types of the files behind group-specific network rows.

    Rows of `gnps_network` are selected when every column in
    `groups_included` is > 0 and every other 'G1'..'G6' column equals 0.
    The pipe-separated 'UniqueFileSources' of the selected rows are then
    looked up in `sample_types`.

    Parameters
    ----------
    gnps_network : pd.DataFrame
        Table with columns 'G1'..'G6' and 'UniqueFileSources'.
    sample_types : pd.DataFrame
        Single-column frame indexed by filename (see `get_sample_types`).
    groups_included : list of str
        Group columns that must all be non-zero.

    Returns
    -------
    pd.Series
        Number of file occurrences per sample type; filenames absent from
        `sample_types` are counted as 'unmatched'.
    """
    groups_included = list(groups_included)
    all_groups = {f'G{i}' for i in range(1, 7)}
    # pandas >= 2.0 rejects a set as a column indexer, so use a list.
    groups_excluded = sorted(all_groups - set(groups_included))
    df_selected = gnps_network[
        (gnps_network[groups_included] > 0).all(axis=1) &
        (gnps_network[groups_excluded] == 0).all(axis=1)]
    filenames = df_selected['UniqueFileSources'].str.split('|').explode()
    # Match the GNPS job results to the sample types and get counts.
    # Squeeze only the column axis: a bare squeeze() turns a single matched
    # file (1x1 frame) into a scalar, which has no value_counts().
    return (sample_types.reindex(filenames).fillna('unmatched')
            .squeeze(axis='columns').value_counts())

In [None]:
# Leading run of underscore-separated folder numbers, e.g. '12_13_name'
# captures '12_13'.
re_prefix = re.compile(r'^(\d+(?:_\d+)*)_.*$')

def get_dataset_sample_type_counts(data_dir, datasets, sample_types,
                                   folder_nums=None):
    """Collect per-study sample type counts from the GNPS result folders.

    Every sub-directory of `data_dir` whose name starts with one or more
    underscore-separated folder numbers is inspected. For each of those
    numbers that is selected, the first (alphabetically) file starting with
    'METABOLOMICS-SNETS-V2-' is read as a tab-separated network table and
    passed to `get_network_group_sample_type_counts`.

    Parameters
    ----------
    data_dir : str
        Directory containing the numbered dataset folders.
    datasets : pd.DataFrame
        Study overview indexed by folder number, with columns 'Group_human',
        'study_acronym', 'sample_specific' and 'dataset_short'.
    sample_types : pd.DataFrame
        Single-column frame indexed by filename (see `get_sample_types`).
    folder_nums : iterable of int, optional
        Folder numbers to include. Defaults to the notebook-level
        `folder_num_included` set.

    Returns
    -------
    pd.DataFrame
        One row per study (index named 'study'), ordered by
        'sample_specific', 'dataset_short' and study acronym; missing
        values are filled with 0.

    Raises
    ------
    ValueError
        If no dataset folder yields any counts.
    """
    if folder_nums is None:
        # Backward compatible: fall back to the notebook-level selection.
        folder_nums = folder_num_included
    folder_nums = set(folder_nums)

    counts = []
    # Sorted listings keep the chosen file and processing order reproducible.
    for dataset in sorted(os.listdir(data_dir)):
        dataset_dir = os.path.join(data_dir, dataset)
        prefix = re_prefix.match(dataset)
        if prefix is None or not os.path.isdir(dataset_dir):
            continue
        nums = folder_nums & {int(n) for n in prefix.group(1).split('_')}
        if not nums:
            continue
        network_file = next(
            (filename for filename in sorted(os.listdir(dataset_dir))
             if filename.startswith('METABOLOMICS-SNETS-V2-')), None)
        if network_file is None:
            continue
        # Read the network table once, even if the folder covers several
        # folder numbers.
        gnps_network = pd.read_csv(
            os.path.join(dataset_dir, network_file), sep='\t')
        base_groups = (['G3', 'G4']
                       if (gnps_network['G3'] > 0).any()
                       else ['G4'])
        for n in sorted(nums):
            groups = base_groups + [datasets.loc[n, 'Group_human']]
            counts.append(
                get_network_group_sample_type_counts(
                    gnps_network, sample_types, groups)
                .rename(datasets.loc[n, 'study_acronym']))
    if not counts:
        raise ValueError(
            f'No GNPS network files found for the selected folders in '
            f'{data_dir!r}')
    counts = pd.concat(counts, axis=1).fillna(0).astype(int).T
    # `axis` must be passed by keyword: pandas >= 2.0 made every argument of
    # sort_values except `by` keyword-only.
    study_order = (datasets.dropna(subset=['study_acronym'])
                   .set_index('study_acronym')
                   .reindex(counts.index).reset_index()
                   .sort_values(['sample_specific', 'dataset_short', 'index'],
                                axis='index'))
    return counts.reindex(study_order['index'].rename('study'))

In [None]:
def plot_stacked_bar(df, filename):
    """Draw `df` as a stacked bar chart, save it to `filename` and show it.

    Each row of `df` becomes one bar and each column one stacked segment.
    The figure is saved at 300 dpi with a tight bounding box and closed
    afterwards.
    """
    golden_ratio = 1.618
    fig_width = 14
    fig, ax = plt.subplots(figsize=(fig_width, fig_width / golden_ratio))

    df.plot(kind='bar', stacked=True, ax=ax)

    def with_thousands_separator(value, pos):
        # Tick labels as integers with ',' as thousands separator.
        return format(int(value), ',')

    ax.yaxis.set_major_formatter(
        mticker.FuncFormatter(with_thousands_separator))

    ax.set_xlabel('Study')
    ax.set_ylabel('Number of MS/MS spectra')

    # Legend goes to the right of the axes so it does not cover the bars.
    ax.legend(loc='center left', bbox_to_anchor=(1.05, 0.5), ncol=2)

    sns.despine()

    plt.savefig(filename, dpi=300, bbox_inches='tight')
    plt.show()
    plt.close()

## GFOP sample type metadata 

In [None]:
gfop_metadata = (
    pd.read_csv('../data/11442_foodomics_multiproject_metadata.txt',
                sep='\t')
    # First row is empty.
    .drop(index=0)
    # Remove leading and trailing whitespace from the text columns.
    .apply(lambda col: col.str.strip() if col.dtype == 'object' else col))

## Datasets included

In [None]:
# Folders 2-33 and 40, without the excluded folders 4, 6, 23 and 25.
folder_num_included = (set(range(2, 34)) | {40}) - {4, 6, 23, 25}
data_dir = '../data'

In [None]:
# Study overview, restricted to rows that have both a folder and a study
# number, indexed by the (integer) folder number.
datasets = (
    pd.read_csv('../data/foods_study_overview.csv')
    .dropna(subset=['folder_num', 'study_num'])
    .astype({'folder_num': int, 'study_num': int})
    .set_index('folder_num'))

In [None]:
# Sample type descriptors, sorted by their 'order_num'.
sample_type_hierarchy = pd.read_csv('../data/sample_type_hierarchy.csv')
sample_type_hierarchy = (sample_type_hierarchy
                         .set_index('descriptor')
                         .sort_values('order_num'))

## Food types overview

In [None]:
sample_types = get_sample_types(gfop_metadata, 'sample_type_group4')
food_counts = get_dataset_sample_type_counts(
    data_dir, datasets, sample_types)
food_counts = food_counts.dropna(axis=0, how='all').fillna(0).astype(int)
# Order the columns by the hierarchy's 'order_num'; descriptors carry a
# '_4' suffix there, which is appended for the lookup and stripped again.
food_order = (sample_type_hierarchy['order_num']
              .reindex(food_counts.columns + '_4')
              .sort_values().index.str[:-2])
food_counts = food_counts[food_order]
plot_stacked_bar(food_counts, 'overview_food_types.png')

## Fermented status

In [None]:
fermented_status = get_sample_types(gfop_metadata, 'fermented')
fermented_counts = get_dataset_sample_type_counts(
    data_dir, datasets, fermented_status)
# Use descriptive labels and keep only these three categories, in order.
fermented_counts = fermented_counts.rename(
    columns={'yes': 'fermented', 'no': 'not fermented'})
fermented_counts = fermented_counts[
    ['fermented', 'not fermented', 'not entered']]
plot_stacked_bar(fermented_counts, 'overview_fermented.png')

## Beverage types

In [None]:
beverage_types = get_sample_types(gfop_metadata, 'sample_type_groupB3')
beverage_counts = get_dataset_sample_type_counts(
    data_dir, datasets, beverage_types)
# Leave the non-informative categories out of the plot.
beverage_counts = beverage_counts.drop(
    columns=['not applicable', 'unmatched'])
plot_stacked_bar(beverage_counts, 'overview_beverage.png')

## Botanical families

In [None]:
botanical_families = get_sample_types(gfop_metadata, 'botanical_family')
botanical_counts = get_dataset_sample_type_counts(
    data_dir, datasets, botanical_families)
# Leave the non-informative categories out of the plot.
botanical_counts = botanical_counts.drop(
    columns=['not collected', 'not entered', 'not applicable', 'unmatched'])
plot_stacked_bar(botanical_counts, 'overview_botanical.png')

## Analysis 2269

In [None]:
analysis_2269_types = get_sample_types(gfop_metadata, 'analysis_2269')
analysis_2269_counts = get_dataset_sample_type_counts(
    data_dir, datasets, analysis_2269_types)
# Files without a metadata match are not plotted.
analysis_2269_counts = analysis_2269_counts.drop(columns=['unmatched'])
plot_stacked_bar(analysis_2269_counts, 'overview_2269.png')

## Cheese types

In [None]:
cheese_types = get_sample_types(gfop_metadata, 'cheese_sample_type')
cheese_counts = get_dataset_sample_type_counts(
    data_dir, datasets, cheese_types)
# Leave the non-informative categories out of the plot.
cheese_counts = cheese_counts.drop(
    columns=['not applicable', 'not entered', 'unmatched'])
plot_stacked_bar(cheese_counts, 'overview_cheese.png')

## Main food description

In [None]:
main_types = get_sample_types(gfop_metadata, 'Main.food.description')
main_counts = get_dataset_sample_type_counts(
    data_dir, datasets, main_types)
# Leave the non-informative categories out of the plot.
main_counts = main_counts.drop(columns=['not entered', 'unmatched'])
plot_stacked_bar(main_counts, 'overview_main_food_description.png')

## Processing method

In [None]:
processed_types = get_sample_types(gfop_metadata, 'processed_method_jps')
processed_counts = get_dataset_sample_type_counts(
    data_dir, datasets, processed_types)
# Leave the non-informative categories out of the plot.
processed_counts = processed_counts.drop(
    columns=['not applicable', 'not entered', 'unmatched'])
plot_stacked_bar(processed_counts, 'overview_processed.png')

## Raw processing method

In [None]:
raw_processed_types = get_sample_types(gfop_metadata, 'raw_processed')
raw_processed_counts = get_dataset_sample_type_counts(
    data_dir, datasets, raw_processed_types)
# Leave the non-informative categories out of the plot.
raw_processed_counts = raw_processed_counts.drop(
    columns=['not entered', 'not applicable', 'unmatched'])
plot_stacked_bar(raw_processed_counts, 'overview_raw_processed.png')

## Number of foods per sample type

In [None]:
# Attention! This overwrites the previous definition of
# `get_network_group_sample_type_counts`.
def get_network_group_sample_type_counts(
        gnps_network, sample_types, groups_included):
    """Count the distinct sample types matched by each selected network row.

    Rows of `gnps_network` are selected when every column in
    `groups_included` is > 0 and every other 'G1'..'G6' column equals 0.
    For each selected row, the pipe-separated 'UniqueFileSources' are looked
    up in `sample_types` and the number of distinct 'sample_type_group4'
    values among the matched files is returned.

    Parameters
    ----------
    gnps_network : pd.DataFrame
        Table with columns 'G1'..'G6' and 'UniqueFileSources'.
    sample_types : pd.DataFrame
        Frame indexed by filename with a 'sample_type_group4' column.
    groups_included : list of str
        Group columns that must all be non-zero.

    Returns
    -------
    pd.Series
        One value per selected row, with a fresh 0..n-1 index; rows without
        any matched file yield 0.
    """
    groups_included = list(groups_included)
    all_groups = {f'G{i}' for i in range(1, 7)}
    # pandas >= 2.0 rejects a set as a column indexer, so use a list.
    groups_excluded = sorted(all_groups - set(groups_included))
    df_selected = gnps_network[
        (gnps_network[groups_included] > 0).all(axis=1) &
        (gnps_network[groups_excluded] == 0).all(axis=1)]
    return (df_selected['UniqueFileSources'].str.split('|').apply(
        lambda filenames: sample_types.reindex(filenames).dropna()
        ['sample_type_group4'].nunique())).reset_index(drop=True)

In [None]:
# Long format: one row per (study, per-study column) with its value.
# NOTE(review): `get_dataset_sample_type_counts` pads studies with fewer
# entries using fillna(0), so some zeros here may be padding rather than
# observed values - confirm this is intended before interpreting the plot.
sample_counts = (
    get_dataset_sample_type_counts(data_dir, datasets, sample_types)
    .reset_index()
    .melt(id_vars='study'))
# The sample type is the second ' - '-separated part of the study label.
num_foods = pd.DataFrame({
    'Sample type': np.stack(sample_counts['study'].str.split(' - '))[:, 1],
    'Number of foods matched': sample_counts['value']})

In [None]:
width = 7
height = width / 1.618
fig, ax = plt.subplots(figsize=(width, height))

# Draw on the axes created above instead of relying on the implicit
# current axes.
sns.violinplot(x='Sample type', y='Number of foods matched', data=num_foods,
               ax=ax)

ax.set_ylabel('Number of unique foods matched per cluster')

sns.despine()

# Explicit extension, consistent with the other figures; without it the
# output format depends on rcParams['savefig.format'].
plt.savefig('overview_num_foods.png', dpi=300, bbox_inches='tight')
plt.show()
plt.close()