In [None]:
import numpy as np
import pandas as pd

### GFOP sample type metadata 

In [None]:
# Load the tab-separated GFOP metadata table and clean it in one chain.
gfop_metadata = (
    pd.read_csv(
        '../GlobalFoodomics/data/11442_foodomics_multiproject_metadata.txt',
        sep='\t')
    # The first row is empty, so discard it.
    .drop(index=0)
    # Strip leading/trailing whitespace from every object-dtype column;
    # all other columns are passed through untouched.
    .apply(lambda column: (column.str.strip()
                           if column.dtype == 'object' else column))
)

### Food count per file

In [None]:
def get_file_food_counts(gnps_network, sample_types, all_groups, some_groups,
                         filenames_included):
    """
    Count the food sample names that share GNPS clusters with given files.

    Clusters are first selected by their group columns ('G1'..'G6'), then
    restricted to clusters whose file list mentions at least one of
    `filenames_included`. Every filename in the remaining clusters is looked
    up in `sample_types`, and the matched sample names are tallied.

    Parameters
    ----------
    gnps_network : pd.DataFrame
        Cluster table with the columns 'G1'..'G6' and a 'UniqueFileSources'
        column holding '|'-separated filenames.
    sample_types : pd.DataFrame
        Table indexed by filename with a 'sample_name' column. The index is
        used as the lookup key for `Series.reindex`, which needs it to be
        free of duplicates.
    all_groups : list of str
        Group columns whose values must all be > 0.
    some_groups : list of str
        Group columns of which at least one value must be > 0.
    filenames_included : iterable of str
        A cluster is kept if any of these strings occurs as a substring of
        its 'UniqueFileSources' value.

    Returns
    -------
    pd.Series
        Occurrence count per sample name (the result of `value_counts`);
        empty if nothing matches.
    """
    all_groups = list(all_groups)
    some_groups = list(some_groups)
    filenames_included = list(filenames_included)

    # Select GNPS job groups. Groups that are in neither `all_groups` nor
    # `some_groups` must be exactly zero. A sorted list is used because
    # pandas does not accept a set as a column indexer.
    groups = {f'G{i}' for i in range(1, 7)}
    groups_excluded = sorted(groups - {*all_groups, *some_groups})
    in_selected_groups = (
        (gnps_network[all_groups] > 0).all(axis=1) &
        (gnps_network[some_groups] > 0).any(axis=1) &
        (gnps_network[groups_excluded] == 0).all(axis=1))
    df_selected = gnps_network[in_selected_groups]

    # Keep only clusters that mention one of the requested files. The
    # explicit bool cast keeps this a boolean mask even when `df_selected`
    # is empty (an empty object-dtype Series would otherwise be interpreted
    # as a list of column labels).
    has_included_file = df_selected['UniqueFileSources'].apply(
        lambda cluster_fn: any(fn in cluster_fn for fn in filenames_included)
    ).astype(bool)
    df_selected = df_selected[has_included_file]

    # One entry per filename per selected cluster.
    filenames = (df_selected['UniqueFileSources'].str.split('|')
                 .explode())

    # Select food sample names (values match terminal leaves of ontology).
    sample_names = sample_types['sample_name']
    # Match the GNPS job results to the food sample types; filenames that
    # are not present in the food metadata become NaN and are dropped.
    sample_names_selected = sample_names.reindex(filenames).dropna()

    # Discard samples that occur less frequent than water (blank).
    # water_count = sample_names_selected.isin(
    #     ['11442.G74065', '11442.G74067', '11442.G74122', '11442.G83366',
    #      'G96583', 'G96584', 'G96585', 'G96586', 'G96611', 'G96612',
    #      'G96613', 'G96618', 'G96619', 'G96639', 'G96684', 'G96713']).sum()
    # Leave this out for now: water_count is too high compared to the range
    # of sample counts - need to think of how to filter out noise.
    # TODO: first remove any molecules found in water?
    water_count = 0

    # Get sample counts, keeping only those above the water threshold.
    sample_counts = sample_names_selected.value_counts()
    return sample_counts[sample_counts > water_count]

In [None]:
# Key the GFOP metadata by its 'filename' column.
sample_types = gfop_metadata.set_index(keys='filename')

In [None]:
# Column header that holds the file names in the metadata table.
filename_col = 'Metabolomics.FileName.Run1'
# Tab-separated metadata table.
metadata = pd.read_csv('data/IBD200_revamped_metadata_20200707_KAW.txt',
                       sep='\t')

In [None]:
# Tab-separated GNPS cluster table.
gnps_network = pd.read_csv('data/IBD_2-view_all_clusters_withID_beta-main.tsv',
                           sep='\t')

In [None]:
# Per-file food counts and the filenames they belong to (kept in step).
food_counts, filenames = [], []

# Explicit selection of categories: every group in `all_groups` must be
# present in a cluster, and at least one group in `some_groups`.
some_groups = ['G4']
all_groups = ['G1']
# Rows without a filename are skipped; a NaN cannot be matched against the
# cluster file lists.
for filename in metadata[filename_col].dropna():
    # `sample_types` is the filename-indexed GFOP metadata built above.
    file_food_counts = get_file_food_counts(
        gnps_network, sample_types, all_groups, some_groups, [filename])
    # Only keep files for which at least one food sample was matched.
    if len(file_food_counts) > 0:
        food_counts.append(file_food_counts)
        filenames.append(filename)

In [None]:
# Put the per-file counts side by side (one column per file); sample names
# form the sorted union of all row labels.
food_counts = pd.concat(food_counts, axis=1, sort=True)
# Samples not seen for a file count as zero; store counts as integers.
food_counts = food_counts.fillna(0).astype(int)
# Transpose so that each row is a file, and label the rows by filename.
food_counts = food_counts.T
food_counts.index = pd.Index(filenames, name='filename')

In [None]:
# Write the file-by-food count table to CSV in the working directory.
food_counts.to_csv(path_or_buf='IBD_2_file_food_count.csv')