In [None]:
import math

import numpy as np
import pandas as pd

### GFOP sample type metadata 

In [None]:
def get_sample_types(gfop_metadata, simple_complex=None):
    """Return the six sample type hierarchy columns indexed by filename.

    Parameters
    ----------
    gfop_metadata : pd.DataFrame
        Metadata table with a 'filename' column and the columns
        'sample_type_group1' through 'sample_type_group6'. A
        'simple_complex' column is required only when filtering.
    simple_complex : optional
        If not None, keep only rows whose 'simple_complex' value equals
        this argument.

    Returns
    -------
    pd.DataFrame
        The 'sample_type_group1'..'sample_type_group6' columns, indexed by
        'filename'.
    """
    rows = gfop_metadata
    if simple_complex is not None:
        rows = rows[rows['simple_complex'] == simple_complex]
    columns = ['filename']
    columns.extend(f'sample_type_group{i}' for i in range(1, 7))
    return rows[columns].set_index('filename')

In [None]:
# Load the tab-separated GFOP metadata table and tidy it in one chain.
gfop_metadata = (
    pd.read_csv('../data/11442_foodomics_multiproject_metadata.txt',
                sep='\t')
    # First row is empty.
    .drop(index=0)
    # Remove leading and trailing whitespace from text (object) columns.
    .apply(lambda col: col.str.strip() if col.dtype == 'object' else col)
)

### Investigate coffee/tea consumption

In [None]:
def get_file_food_counts(gnps_network, sample_types, groups_included,
                         filenames_included, level):
    """Count food sample types that share GNPS clusters with the given files.

    Parameters
    ----------
    gnps_network : pd.DataFrame
        Cluster table with the group columns 'G1'..'G6' and the columns
        'UniqueFileSources' ('|'-separated filenames) and 'LibraryID'.
    sample_types : pd.DataFrame
        Table indexed by filename with 'sample_type_group{level}' columns.
    groups_included : iterable of str
        Group columns that must all be > 0 for a cluster to be selected;
        every other group among G1..G6 must be exactly 0.
    filenames_included : iterable of str
        A cluster is kept if its 'UniqueFileSources' string contains at
        least one of these filenames (substring match).
    level : int
        Food hierarchy level, selecting column 'sample_type_group{level}'.

    Returns
    -------
    pd.Series
        Occurrence counts per sample type (a file is counted once for every
        selected cluster it appears in), followed by the entries 'caffeine'
        and 'theophylline' holding the number of selected clusters whose
        'LibraryID' contains that word (case-insensitive).
    """
    # Select GNPS job groups. Use lists as column indexers: recent pandas
    # versions reject sets.
    groups_included = list(groups_included)
    groups_excluded = sorted(
        {f'G{i}' for i in range(1, 7)} - set(groups_included))
    df_selected = gnps_network[
        (gnps_network[groups_included] > 0).all(axis=1) &
        (gnps_network[groups_excluded] == 0).all(axis=1)].copy()
    df_selected = df_selected[
        df_selected['UniqueFileSources'].apply(lambda cluster_fn:
            any(fn in cluster_fn for fn in filenames_included))]
    filenames = df_selected['UniqueFileSources'].str.split('|').explode()
    # Select food hierarchy levels.
    sample_types = sample_types[f'sample_type_group{level}']
    # Match the GNPS job results to the food sample types. Filenames without
    # a sample type (e.g. the query files themselves) are dropped.
    sample_types_selected = sample_types.reindex(filenames).dropna()
    # Discard samples that occur no more frequently than water (blank).
    water_count = (sample_types_selected == 'water').sum()
    sample_counts = sample_types_selected.value_counts()
    sample_counts_valid = sample_counts.index[sample_counts > water_count]
    sample_types_selected = sample_types_selected[
        sample_types_selected.isin(sample_counts_valid)]
    # Get sample counts at the specified level.
    counts = sample_types_selected.value_counts()
    # Clusters without a library hit (missing LibraryID) count as no match.
    library_ids = df_selected['LibraryID'].str.lower()
    annotation_counts = pd.Series(
        {'caffeine': library_ids.str.contains('caffeine', na=False).sum(),
         'theophylline': (library_ids.str.contains('theophylline', na=False)
                          .sum())})
    # `Series.append` was removed in pandas 2.0; `pd.concat` is equivalent.
    return pd.concat([counts, annotation_counts])

In [None]:
def get_cooccurring_foods(gnps_network, sample_types, groups_included,
                          cooccurring_foods, level):
    """Find the food files that share GNPS clusters with the given foods.

    Parameters
    ----------
    gnps_network : pd.DataFrame
        Cluster table with the group columns 'G1'..'G6' and a
        'UniqueFileSources' column of '|'-separated filenames.
    sample_types : pd.DataFrame
        Table indexed by filename with 'sample_type_group{level}' columns.
    groups_included : iterable of str
        Group columns that must all be > 0 for a cluster to be selected;
        every other group among G1..G6 must be exactly 0.
    cooccurring_foods : list-like of str
        Sample type labels (at the requested level) of interest.
    level : int
        Food hierarchy level, selecting column 'sample_type_group{level}'.

    Returns
    -------
    pd.Series
        Sample type per filename, sorted by sample type, for every file
        that appears in a selected cluster also containing at least one
        file of the requested foods. Files without a known sample type are
        omitted. Empty if no selected cluster contains a requested food.
    """
    # Select GNPS job groups. The caller's `groups_included` is honoured
    # (it used to be overwritten by a hard-coded list), and lists are used as
    # column indexers because recent pandas versions reject sets.
    groups_included = list(groups_included)
    groups_excluded = sorted(
        {f'G{i}' for i in range(1, 7)} - set(groups_included))
    df_selected = gnps_network[
        (gnps_network[groups_included] > 0).all(axis=1) &
        (gnps_network[groups_excluded] == 0).all(axis=1)]
    # Select food hierarchy levels.
    sample_types = sample_types[f'sample_type_group{level}']
    # One entry per (cluster, filename) pair. The index is reset first so it
    # identifies each selected cluster by position, even if the input index
    # contains duplicates.
    cluster_files = (df_selected['UniqueFileSources']
                     .reset_index(drop=True).str.split('|').explode())
    # Find filenames whose food type is one of the specified foods.
    file_foods = sample_types.reindex(cluster_files.unique())
    filenames_cooccurring = file_foods.index[
        file_foods.isin(cooccurring_foods)]
    # Keep the clusters that contain at least one of those files. Filenames
    # are compared as exact tokens instead of building a regular expression:
    # an unescaped pattern treats '.' as a wildcard, matches substrings of
    # longer filenames, and, when empty, matches every cluster.
    clusters_cooccurring = cluster_files.index[
        cluster_files.isin(filenames_cooccurring)].unique()
    filenames_selected = cluster_files[
        cluster_files.index.isin(clusters_cooccurring)]
    return (sample_types.reindex(filenames_selected.unique())
            .dropna().sort_values())

In [None]:
# Sample type hierarchy tables, split on the 'simple_complex' metadata column.
sample_types_simple = get_sample_types(
    gfop_metadata, simple_complex='simple')
sample_types_complex = get_sample_types(
    gfop_metadata, simple_complex='complex')

In [None]:
# Sample type hierarchy, indexed by descriptor and ordered by 'order_num'.
sample_type_hierarchy = pd.read_csv('../data/sample_type_hierarchy.csv')
sample_type_hierarchy = (sample_type_hierarchy.set_index('descriptor')
                         .sort_values('order_num'))

In [None]:
# Load the GNPS job's cluster table (tab-separated). The analysis functions in
# this notebook read its 'G1'..'G6', 'UniqueFileSources' and 'LibraryID'
# columns.
gnps_network = pd.read_csv(
    '../data/3_22_ONR Fecal match - ONR Plasma match - ONR Food - FoodOmics 3500 (no ONR) FDR 0.01/'
    'METABOLOMICS-SNETS-V2-9a90bd12-view_all_clusters_withID_beta-main.tsv',
    sep='\t')

In [None]:
# Food hierarchy level: selects the 'sample_type_group{level}' column.
level = 5
# GNPS groups that must be present (> 0) in a cluster; all other groups among
# G1..G6 must be absent (== 0).
groups = ['G2', 'G4']
# Sample kind; interpolated into the metadata input filename and the CSV output
# filenames.
sample = 'plasma'

In [None]:
# Load the tab-separated metadata for the selected sample kind; later cells
# use its 'Study_DayAsReported' and 'filename' columns.
metadata = pd.read_csv(
    '../data/3_22_ONR Fecal match - ONR Plasma match - ONR Food - FoodOmics 3500 (no ONR) FDR 0.01/'
    + f'onr_{sample}_metadata.txt',
    sep='\t')

In [None]:
# Food counts per reported study day, pooling all files of the same day.
food_counts, index = [], []
for timepoint, filenames in (metadata.groupby('Study_DayAsReported')
                             ['filename'].agg(list).items()):
    file_food_counts = get_file_food_counts(
        gnps_network, sample_types_simple, groups, filenames, level)
    if len(file_food_counts) == 0:
        continue
    food_counts.append(file_food_counts)
    index.append(timepoint)

# One row per day and one column per food type; absent foods count as zero.
food_counts = pd.concat(food_counts, axis=1, sort=True).fillna(0).astype(int).T
food_counts.index = pd.Index(index, name='day')
# Put the two annotation columns first, keeping the order of the rest.
food_counts = food_counts[
    ['caffeine', 'theophylline'] +
    food_counts.columns.drop(['caffeine', 'theophylline']).tolist()]
food_counts.to_csv(f'onr_{sample}_timepoints.csv')

In [None]:
# Food counts per individual file, labelled with its reported study day.
food_counts, index = [], []
for timepoint, filenames in (metadata.groupby('Study_DayAsReported')
                             ['filename'].agg(list).items()):
    for filename in filenames:
        file_food_counts = get_file_food_counts(
            gnps_network, sample_types_simple, groups, [filename], level)
        if len(file_food_counts) == 0:
            continue
        food_counts.append(file_food_counts)
        index.append((filename, timepoint))

# One row per file and one column per food type; absent foods count as zero.
food_counts = (
    pd.concat(food_counts, axis=1, sort=True).fillna(0).astype(int).T
    .assign(filename=[fn for fn, _ in index],
            day=[day for _, day in index])
    .set_index(['filename', 'day']))
# Put the two annotation columns first, keeping the order of the rest.
food_counts = food_counts[
    ['caffeine', 'theophylline'] +
    food_counts.columns.drop(['caffeine', 'theophylline']).tolist()]
food_counts.to_csv(f'onr_{sample}_timepoints_filename.csv')

In [None]:
# Foods found in the same clusters as coffee samples, written without a
# header row.
cooccurring_coffee = get_cooccurring_foods(
    gnps_network=gnps_network,
    sample_types=sample_types_simple,
    groups_included=groups,
    cooccurring_foods=['coffee'],
    level=level)
cooccurring_coffee.to_csv(
    f'onr_{sample}_cooccurring_coffee.csv', header=False)