In [1]:
import exmp
import os.path
import qiime2
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

from qiime2.plugins.diversity.actions import filter_distance_matrix
from qiime2.plugins.longitudinal.actions import first_distances
import scipy.stats

from statsmodels.stats.multitest import multipletests

In [7]:

def microbiome_performance_correlations(project, 
                                        time_column, 
                                        baseline_time_value,
                                        performance_metrics,
                                        week,
                                        sample_metadata,
                                        data_dir,
                                        output_dir,
                                        gender=None):

    results = []
    uu = ("unweighted UniFrac", os.path.join(data_dir, "unweighted_unifrac_distance_matrix.qza"))
    wu = ("weighted UniFrac", os.path.join(data_dir, "weighted_unifrac_distance_matrix.qza"))
    bc = ("Bray-Curtis", os.path.join(data_dir, "bray_curtis_distance_matrix.qza"))
    bj = ("Jaccard", os.path.join(data_dir, "jaccard_distance_matrix.qza"))
    
    where = "[project]='%s' and [exclude]='no'" % project
    if gender is not None:
        where = "%s and [gender]='%s'" % (where, gender)
    else:
        gender = 'mf'
    
    ids_to_keep = sample_metadata.get_ids(where=where)
    sample_metadata = sample_metadata.filter_ids(ids_to_keep=ids_to_keep)

    metadata_to_merge = []
    distance_columns = []

    for metric, dm_fp in [uu, wu, bc, bj]:
        dm = qiime2.Artifact.load(dm_fp)
        dm = filter_distance_matrix(dm, metadata=sample_metadata).filtered_distance_matrix
        # add distances to baseline to sample metadata
        dists_to_baselines = first_distances(distance_matrix=dm, metadata=sample_metadata, state_column=time_column, 
                                             individual_id_column='subject-id', baseline=baseline_time_value, 
                                             replicate_handling='random').first_distances
        dists_to_baselines = dists_to_baselines.view(qiime2.Metadata).get_column('Distance').to_dataframe()
        column_name = '%s distance (%s %d to %s)' % (metric, time_column, baseline_time_value, week)
        dists_to_baselines = dists_to_baselines.rename(columns = {'Distance' : column_name})

        metadata_to_merge.append(qiime2.Metadata(dists_to_baselines))
        distance_columns.append(column_name)

    for e in metadata_to_merge:
        sample_metadata = sample_metadata.merge(e)
    data = sample_metadata.to_dataframe()

    for distance_column in distance_columns:
        for performance_metric in performance_metrics:
            where = "[%s]='%s'" % (time_column, week)
            ids_to_keep = sample_metadata.get_ids(where=where)
            sample_metadata_subsample = sample_metadata.filter_ids(ids_to_keep=ids_to_keep).to_dataframe()
            sample_metadata_subsample = sample_metadata_subsample[[distance_column, performance_metric]].dropna().astype(np.float)
            tau, p = scipy.stats.kendalltau(sample_metadata_subsample[[distance_column, performance_metric]])
            results.append((project, distance_column, performance_metric, tau, p, sample_metadata_subsample.shape[0]))
            fig_fn = '%s-%s-%s-%s.pdf' % (project, distance_column, performance_metric, gender)
            fig_fp = '%s/%s' % (output_dir, fig_fn)
            sns.scatterplot(sample_metadata_subsample[distance_column], 
                            sample_metadata_subsample[performance_metric]).get_figure().savefig(fig_fp)
            plt.clf()
    df = pd.DataFrame(results, columns=['project', 'distance', 'performance metric', 'Spearman rho', 'p-value', 'sample size'])
    df['q-value'] = multipletests(df['p-value'])[1]
    output_fn = '%s-%s-%s.csv' % (project, week, gender) 
    df.to_csv('%s/%s' % (output_dir, output_fn))
    return df

In [8]:
output_dir = '../data/exmp1-and-exmp2/cm/microbiome-performance-correlations/'

df_exmp1 = microbiome_performance_correlations(
    'exmp1', 'week', 1.0, ['RER-change', 'VO2max-change'], 
    '5.0', exmp.load_sample_metadata(), exmp.cm_path, output_dir)
df_exmp2 = microbiome_performance_correlations(
    'exmp2', 'week', 1.0, ['bench-press-change', 'row-change', '3RM-squat-change'], 
    '5.0', exmp.load_sample_metadata(), exmp.cm_path, output_dir)
df_exmp1 = microbiome_performance_correlations(
    'exmp1', 'week', 1.0, ['RER-change', 'VO2max-change'], 
    '6.0', exmp.load_sample_metadata(), exmp.cm_path, output_dir)
df_exmp2 = microbiome_performance_correlations(
    'exmp2', 'week', 1.0, ['bench-press-change', 'row-change', '3RM-squat-change'], 
    '6.0', exmp.load_sample_metadata(), exmp.cm_path, output_dir)

df_exmp1 = microbiome_performance_correlations(
    'exmp1', 'week', 1.0, ['RER-change', 'VO2max-change'], 
    '5.0', exmp.load_sample_metadata(), exmp.cm_path, output_dir, gender='m')
df_exmp2 = microbiome_performance_correlations(
    'exmp2', 'week', 1.0, ['bench-press-change', 'row-change', '3RM-squat-change'], 
    '5.0', exmp.load_sample_metadata(), exmp.cm_path, output_dir, gender='m')
df_exmp1 = microbiome_performance_correlations(
    'exmp1', 'week', 1.0, ['RER-change', 'VO2max-change'], 
    '6.0', exmp.load_sample_metadata(), exmp.cm_path, output_dir, gender='m')
df_exmp2 = microbiome_performance_correlations(
    'exmp2', 'week', 1.0, ['bench-press-change', 'row-change', '3RM-squat-change'], 
    '6.0', exmp.load_sample_metadata(), exmp.cm_path, output_dir, gender='m')

df_exmp1 = microbiome_performance_correlations(
    'exmp1', 'week', 1.0, ['RER-change', 'VO2max-change'], 
    '5.0', exmp.load_sample_metadata(), exmp.cm_path, output_dir, gender='f')
df_exmp2 = microbiome_performance_correlations(
    'exmp2', 'week', 1.0, ['bench-press-change', 'row-change', '3RM-squat-change'], 
    '5.0', exmp.load_sample_metadata(), exmp.cm_path, output_dir, gender='f')
df_exmp1 = microbiome_performance_correlations(
    'exmp1', 'week', 1.0, ['RER-change', 'VO2max-change'], 
    '6.0', exmp.load_sample_metadata(), exmp.cm_path, output_dir, gender='f')
df_exmp2 = microbiome_performance_correlations(
    'exmp2', 'week', 1.0, ['bench-press-change', 'row-change', '3RM-squat-change'], 
    '6.0', exmp.load_sample_metadata(), exmp.cm_path, output_dir, gender='f')

TypeError: kendalltau() missing 1 required positional argument: 'y'