In [None]:
import numpy as np
import pandas as pd

### GFOP sample type metadata 

In [None]:
def get_sample_types(gfop_metadata, simple_complex=None):
    """
    Extract the per-file sample type columns from the GFOP metadata.

    Parameters
    ----------
    gfop_metadata : pd.DataFrame
        Metadata table containing a 'filename' column, the columns
        'sample_type_group1' through 'sample_type_group6' and, when
        `simple_complex` is given, a 'simple_complex' column.
    simple_complex : optional
        If not None, only rows whose 'simple_complex' value equals this
        argument are retained.

    Returns
    -------
    pd.DataFrame
        The six sample type columns, indexed by 'filename'.
    """
    columns = ['filename']
    columns.extend(f'sample_type_group{i}' for i in range(1, 7))
    # Optionally restrict the rows before selecting the columns.
    if simple_complex is not None:
        row_mask = gfop_metadata['simple_complex'] == simple_complex
        gfop_metadata = gfop_metadata[row_mask]
    return gfop_metadata[columns].set_index('filename')

In [None]:
# Load the tab-separated GFOP metadata and clean it up in a single chain.
gfop_metadata = (
    pd.read_csv('../data/11442_foodomics_multiproject_metadata.txt',
                sep='\t')
    # The first row (index label 0) is empty, so drop it.
    .drop(index=0)
    # Strip leading and trailing whitespace from every object-dtype
    # column; all other columns are passed through untouched.
    .apply(lambda column: column if column.dtype != 'object'
           else column.str.strip()))

### Food count per file

In [None]:
def get_file_food_counts(gnps_network, sample_types, groups_included,
                         filenames_included, level):
    """
    Count the food sample types that share clusters with the given files.

    Parameters
    ----------
    gnps_network : pd.DataFrame
        Cluster table with the numeric group columns 'G1' through 'G6' and
        a 'UniqueFileSources' column of '|'-separated filenames.
    sample_types : pd.DataFrame
        Table indexed by filename that contains the column
        'sample_type_group{level}'.
    groups_included : iterable of str
        Group columns that must all be greater than 0 for a cluster to be
        selected. Every other group out of 'G1'-'G6' must be exactly 0.
    filenames_included : iterable of str
        A cluster is selected if at least one of these strings occurs in
        its 'UniqueFileSources' value.
    level : int
        Number of the 'sample_type_group' column to count.

    Returns
    -------
    pd.Series
        Number of occurrences per sample type, restricted to the sample
        types that occur strictly more often than 'water'. Empty if
        nothing matches.
    """
    # Select GNPS job groups. The column selectors have to be lists
    # because pandas >= 2.0 raises a TypeError for set indexers.
    groups_included = list(groups_included)
    groups_excluded = sorted(
        {f'G{i}' for i in range(1, 7)} - set(groups_included))
    in_groups = ((gnps_network[groups_included] > 0).all(axis=1) &
                 (gnps_network[groups_excluded] == 0).all(axis=1))
    df_selected = gnps_network[in_groups]
    # Select the clusters that contain any of the requested files.
    # NOTE(review): this is a substring test on the raw '|'-joined string,
    # so a name that is contained in a longer filename also matches;
    # confirm that this is intended.
    filenames_included = tuple(filenames_included)
    # The explicit bool cast keeps this a boolean mask even when no
    # cluster is left, in which case `apply` yields an empty object Series.
    has_file = df_selected['UniqueFileSources'].apply(
        lambda cluster_fn: any(fn in cluster_fn
                               for fn in filenames_included)).astype(bool)
    df_selected = df_selected[has_file]
    filenames = (df_selected['UniqueFileSources'].str.split('|')
                 .explode())
    # Select food hierarchy levels.
    sample_types = sample_types[f'sample_type_group{level}']
    # Match the GNPS job results to the food sample types. Files without
    # a sample type entry become NaN and are dropped.
    sample_types_selected = sample_types.reindex(filenames).dropna()
    # Keep only the sample types that occur strictly more often than
    # water (blank); water itself is consequently discarded as well.
    sample_counts = sample_types_selected.value_counts()
    water_count = (sample_types_selected == 'water').sum()
    return sample_counts[sample_counts > water_count]

In [None]:
# Sample type tables for the 'simple' and the 'complex' GFOP entries,
# computed in that order.
sample_types_simple, sample_types_complex = (
    get_sample_types(gfop_metadata, kind) for kind in ('simple', 'complex'))

In [None]:
# Load the tab-separated metadata of the AD study files.
metadata = pd.read_table(
    '../data/27_33_AD Plasma - CSF - FoodOmics 3500 FDR 0.01 tol 0.01 2 mincluster/'
    'AD_metadata_JMG.txt')

In [None]:
# Load the tab-separated GNPS cluster table of the molecular networking job.
gnps_network = pd.read_table(
    '../data/27_33_AD Plasma - CSF - FoodOmics 3500 FDR 0.01 tol 0.01 2 mincluster/'
    'METABOLOMICS-SNETS-V2-aac78e9d-view_all_clusters_withID_beta-main.tsv')

In [None]:
# Food hierarchy level at which the sample types are counted.
level = 4
# Per-file food counts and the corresponding filenames, kept in parallel.
food_counts, filenames = [], []
# Each specimen type is paired with the GNPS groups passed on as
# `groups_included`.
for sample_type, groups in (('Plasma', ['G1', 'G4']),
                            ('CSF', ['G2', 'G4'])):
    for filename in metadata.loc[metadata['Specimen_Type'] == sample_type,
                                 'filename']:
        file_food_counts = get_file_food_counts(
            gnps_network, sample_types_simple, groups, [filename], level)
        # Files without any food count are left out.
        if file_food_counts.empty:
            continue
        food_counts.append(file_food_counts)
        filenames.append(filename)

In [None]:
# Combine the per-file counts into one table; sample types missing for a
# file are filled with 0.
# NOTE: `food_counts` is rebound from a list to a DataFrame here, so this
# cell can only be run once after the list has been (re)built.
food_counts = pd.concat(food_counts, axis=1, sort=True).fillna(0)
# One row per file, one column per sample type.
food_counts = food_counts.astype(int).transpose()
food_counts.index = pd.Index(filenames, name='filename')

In [None]:
# Export the file-by-food count table to the working directory.
food_counts.to_csv(path_or_buf='ad_file_food_count.csv')