diff --git a/.travis.yml b/.travis.yml index cd6b6ea..63ead4f 100644 --- a/.travis.yml +++ b/.travis.yml @@ -5,7 +5,7 @@ env: addons: apt: packages: - - texlive + - texlive-latex-extra before_install: - wget https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh -O miniconda.sh - chmod +x miniconda.sh @@ -18,6 +18,11 @@ install: - source activate env_name - pip install -r pip_requirements.txt - pip install -e . --no-deps + +before_script: + - "export DISPLAY=:99.0" + - "sh -e /etc/init.d/xvfb start" + - sleep 3 # gives time to start script: - nosetests --with-doctest - flake8 americangut/*.py diff --git a/americangut/notebook_environment.py b/americangut/notebook_environment.py index 497249e..3d0a862 100644 --- a/americangut/notebook_environment.py +++ b/americangut/notebook_environment.py @@ -638,6 +638,7 @@ # collapsed samples 'collapsed': { '100nt': { + 'alpha-map': '08-collapsed/100nt/alpha_map.txt', '1k': { 'ag-biom': '08-collapsed/100nt/1k/ag.biom', 'ag-fecal-biom': '08-collapsed/100nt/1k/ag-fecal.biom', @@ -697,6 +698,7 @@ } }, 'notrim': { + 'alpha-map': '08-collapsed/notrim/alpha-map.txt', '1k': { 'ag-biom': '08-collapsed/notrim/1k/ag.biom', 'ag-fecal-biom': '08-collapsed/notrim/1k/ag-fecal.biom', @@ -2083,6 +2085,12 @@ }, } +alpha_metrics = {'pd': 'PD_whole_tree', + 'chao1': 'chao1', + 'shannon': 'shannon', + 'observedotus': 'observed_otus' + } + def _assert_environment(): if qiime.__version__ != '1.9.1': diff --git a/americangut/parallel.py b/americangut/parallel.py index 004e12a..8f581ca 100644 --- a/americangut/parallel.py +++ b/americangut/parallel.py @@ -4,6 +4,9 @@ import sys from functools import partial +import matplotlib +matplotlib.use('Agg') # noqa + import americangut as ag import americangut.results_utils as agru import americangut.notebook_environment as agenv diff --git a/americangut/per_sample.py b/americangut/per_sample.py index d7b90ba..0043685 100644 --- a/americangut/per_sample.py +++ b/americangut/per_sample.py @@ -1,12 +1,23 @@ import os +from matplotlib import use, rcParams +use('Agg') # noqa + import biom +import matplotlib.pyplot as plt +import pandas as pd from qiime.util import qiime_system_call +import seaborn as sn import americangut.util as agu import americangut.notebook_environment as agenv import americangut.results_utils as agru +# Sets up plotting parameters so that the default setting is use to Helvetica +# in plots +rcParams['font.family'] = 'sans-serif' +rcParams['font.sans-serif'] = ['Arial'] + def create_opts(sample_type, chp_path, gradient_color_by, barchart_categories): """Create a dict of options for processing functions @@ -247,6 +258,66 @@ def taxa_summaries(opts, sample_ids): return results +def alpha_plot(opts, sample_ids): + """Produces digestable alpha diversity distribution plots per sample + + Parameters + ---------- + opts : dict + A dict of relevant opts. + sample_ids : Iterable of str + A list of sample IDs of interest + + Returns + ------- + dict + A dict containing each sample ID and any errors observed or None if + no error was observed for the sample. {str: str or None} + """ + + results = {} + alpha_map = pd.read_csv( + agu.get_existing_path(opts['collapsed']['100nt']['alpha-map']), + sep='\t', + dtype=str, + ) + + alpha_metrics = ['shannon_1k', 'PD_whole_tree_1k'] + + # Checks the alpha_field is in the mapping file + for metric in alpha_metrics: + if metric not in alpha_map.columns: + raise ValueError('%s is not a valid alpha diversity field name.' + % metric) + # Checks the group_field is in the mapping file + if 'SIMPLE_BODY_SITE' not in alpha_map.columns: + raise ValueError('SIMPLE_BODY_SITE is not a valid field name.') + + alpha_map[alpha_metrics] = alpha_map[alpha_metrics].astype(float) + alpha_map.set_index('#SampleID', inplace=True) + + results = {} + for id_ in sample_ids: + if id_ not in alpha_map.index: + results[id_] = 'ID not found' + else: + results[id_] = None + shannon_path = os.path.join(_result_path(opts, id_), + 'shannon_%s.pdf' % id_) + _plot_alpha(id_, alpha_map, 'shannon_1k', + xlabel='Shannon Diversity', + fp=shannon_path) + + # Generates the pd whole tree diversity figure + pd_path = os.path.join(_result_path(opts, id_), + 'pd_%s.pdf' % id_) + _plot_alpha(id_, alpha_map, 'PD_whole_tree_1k', + xlabel='PD Whole Tree Diversity', + fp=pd_path) + + return results + + def sufficient_sequence_counts(opts, sample_ids): """Errors if the sequence counts post filtering are < 1000 @@ -515,3 +586,112 @@ def stage_per_sample_specific_statics(opts, sample_ids): result[id_] = "Cannot symlink for statics." return result + + +def _plot_alpha(sample, alpha_map, alpha_field, group_field='SIMPLE_BODY_SITE', + output_dir=None, xlabel=None, fp=None, debug=False): + """Generates a distrbution plot for the data + + Parameters + ---------- + sample : str + The sample ID to be plotted + alpha_map_fp : pandas DataFrame + A pandas dataframe containing the sample metadata. The sample ID + should be given in the `'#SampleID'` column, a column with + the name given by `alpha_field` contains alpha diversity values, + and the `group_field` column specifying the groups which should be + used to seperate the data for making the distribution plot. + alpha_field : str + The name of the column in `alpha_map` which includes the alpha + diversity values. + group_field : str + Default is 'SIMPLE_BODY_SITE'. The name of the column in `alpha_map` + which provides the grouping for generating distribution plots. + output_dir : str + The location where the alpha diversity figures should be saved. + xlabel : str + Text describing the quantity on the x-axis. + + Returns + ------- + If the sample is present, a matplotlib figure with the alpha diversity + distribution and a line indicating the sample value is returned. If a + file path is specified, the figure will be saved at the filepath instead + of returning. + + If debug is passed, the following parameters are returned: + group : str + The value of the `group_field` for the sample + group_alpha : ndarray + The alpha diversity values associated with the group + sample_alpha : float + The alpha diversity for the sample + xlabel : str + The label used for the x-axis of the plot. + + """ + + # Explicitly casts the alpha diversity to a float + alpha_map[alpha_field] = alpha_map[alpha_field].astype(float) + + # Draws the observations and group + group = alpha_map.loc[sample, group_field] + group_alpha = alpha_map.loc[alpha_map[group_field] == group, alpha_field] + sample_alpha = alpha_map.loc[sample, alpha_field] + + if xlabel is None: + xlabel = '%sdiversity' % alpha_field.split('1')[0].replace('_', ' ') + + if debug: + return group, group_alpha, sample_alpha, xlabel + + # Defines the group color. This is currently hardcoded, although the + # longer term plan is to substitute in function which will define the color + # based on the relationship between the sample and a yet to be written + # predicted value. + group_color = '#1f78b4' + sample_color = '#525252' + + with sn.axes_style('ticks', {'axes.facecolor': 'none'}): + # Sets up the axis for plotting + ax = plt.axes() + + # Plots the distribution + sn.kdeplot(group_alpha, + ax=ax, + legend=False, + color=group_color) + ylim = ax.get_ylim() + + # Plots the individual line + ax.plot([sample_alpha, sample_alpha], [-1, 1], color=sample_color) + + # Returns the y-limits to the original value and removes the ticks + ax.set_ylim(ylim) + ax.set_yticks([]) + # Removes the spine + sn.despine(offset=5, trim=True, top=True, left=True, right=True) + # Updates the xticks to match the correct font + ax.set_xticklabels(map(int, ax.get_xticks()), size=11) + ax.set_xlabel(xlabel, size=13) + + # Adds text describing the sample + ax.text(x=ax.get_xticks().max(), + y=ax.get_ylim()[1]*0.85, + s='Your Sample:\t%1.1f\nAverage:\t%1.1f' + % (sample_alpha, group_alpha.mean()), + ha='right', + size=11, + ) + + # Sets the figure size + fig = ax.figure + fig.set_size_inches((5, 2.5)) + ax.set_position((0.125, 0.375, 0.75, 0.5)) + + if fp is None: + return fig + else: + fig.savefig(fp, dpi=300) + fig.clear() diff --git a/ipynb/primary-processing/08-collapse_samples.md b/ipynb/primary-processing/08-collapse_samples.md index fb00c95..3164682 100644 --- a/ipynb/primary-processing/08-collapse_samples.md +++ b/ipynb/primary-processing/08-collapse_samples.md @@ -3,6 +3,8 @@ Some of the per-results figures require various slices and perspectives of the d ```python >>> import os ... +>>> import pandas as pd +... >>> import americangut.notebook_environment as agenv >>> import americangut.util as agu ... @@ -17,23 +19,46 @@ Let's make sure we have the paths we need. >>> ag_cleaned_md = agu.get_existing_path(agenv.paths['meta']['ag-cleaned-md']) ``` -First, we're going to operate on rarefied data again. +We're going to start by generating per-bodysite mapping files with alpha diversity attached. ```python >>> low_depth, high_depth = agenv.get_rarefaction_depth() ``` +```python +>>> for trim in ['ag-pgp-hmp-gg-100nt', 'ag-notrim']: +... trimpath = os.path.join(chp_path, trim.split('-')[-1]) +... alpha = {} +... if not os.path.exists(trimpath): +... os.mkdir(trimpath) +... for depth, rarefaction in zip([low_depth, high_depth], ['1k', '10k']): +... rarepath = os.path.join(trimpath, rarefaction) +... if not os.path.exists(rarepath): +... os.mkdir(rarepath) +... for met in agenv.alpha_metrics.keys(): +... alpha_fp = agu.get_existing_path( +... agenv.paths['alpha'][rarefaction]['%s-%s' % (trim, met)] +... ) +... alpha_df = pd.read_csv(alpha_fp, sep='\t', dtype=str).set_index('Unnamed: 0').transpose() +... alpha['%s_%s' % (agenv.alpha_metrics[met], rarefaction)] = alpha_df +... map_ = pd.read_csv(ag_cleaned_md, sep='\t', dtype=str).set_index('#SampleID') +... alpha_map = agu.add_alpha_diversity(map_, alpha) +... alpha_map.to_csv( +... agu.get_new_path(agenv.paths['collapsed'][trim.split('-')[-1]]['alpha-map']), +... sep='\t', +... index_label='#SampleID' +... ) +``` + +Next, we're going to operate on rarefied data again. + ```python >>> def rarefaction_parameters(): ... for trim in ['100nt', 'notrim']: ... trimpath = os.path.join(chp_path, trim) -... if not os.path.exists(trimpath): -... os.mkdir(trimpath) ... ... for depth, rarefaction in zip([low_depth, high_depth], ['1k', '10k']): ... rarepath = os.path.join(trimpath, rarefaction) -... if not os.path.exists(rarepath): -... os.mkdir(rarepath) ... ... table = agu.get_existing_path(agenv.paths['otus'][trim]['ag-biom']) ... output = agu.get_new_path(agenv.paths['collapsed'][trim][rarefaction]['ag-biom']) @@ -48,7 +73,7 @@ First, we're going to operate on rarefied data again. ... -d $depth ``` -Next, we're going to partition the data into per-body site tables. +Then, we're going to partition the data into per-body site tables. ```python >>> def filter_parameters(): diff --git a/ipynb/primary-processing/09-per_participant_results.md b/ipynb/primary-processing/09-per_participant_results.md index c38974b..4ede414 100644 --- a/ipynb/primary-processing/09-per_participant_results.md +++ b/ipynb/primary-processing/09-per_participant_results.md @@ -3,6 +3,9 @@ Now that we've done all the bulk processing, let's generate the per-sample resul ```python >>> import os >>> from functools import partial +... +>>> from matplotlib import use +>>> use('Agg') >>> import pandas as pd ... >>> import americangut as ag @@ -52,10 +55,12 @@ And finally, these next blocks of code support the per-sample type processing. F ... agps.per_sample_directory, ... agps.stage_per_sample_specific_statics, ... agps.taxa_summaries, +... agps.alpha_plot, ... agps.taxon_significance, ... agps.body_site_pcoa, ... agps.gradient_pcoa, -... agps.bar_chart] +... agps.bar_chart, +... ] ... >>> fecal_functions = common_functions + [agps.country_pcoa] >>> oral_functions = common_functions + [agps.pie_plot] @@ -89,9 +94,9 @@ And now, let's start mass generating figures! ```python >>> site_to_functions = [('FECAL', process_fecal), ... ('ORAL', process_oral), -... ('SKIN', process_skin)] +... ('SKIN', process_skin) +... ] >>> partitions = agps.partition_samples_by_bodysite(ag_cleaned_df, site_to_functions) -... >>> with open(successful_ids, 'w') as successful_ids_fp, open(unsuccessful_ids, 'w') as unsuccessful_ids_fp: ... agpar.dispatcher(successful_ids_fp, unsuccessful_ids_fp, partitions) ``` diff --git a/tests/files/test_mapping_alpha.txt b/tests/files/test_mapping_alpha.txt new file mode 100644 index 0000000..282e0f4 --- /dev/null +++ b/tests/files/test_mapping_alpha.txt @@ -0,0 +1 @@ +#SampleID TEST_CATEGORY AWESOME_CATEGORY PD_whole_tree_1k shannon_1k sample_a 1 super 5 2 sample_b 2 totally 12 4 \ No newline at end of file diff --git a/tests/test_per_sample.py b/tests/test_per_sample.py index d21c01c..53d369a 100644 --- a/tests/test_per_sample.py +++ b/tests/test_per_sample.py @@ -2,9 +2,11 @@ import shutil from unittest import TestCase, main +import numpy as np import pandas as pd import numpy.testing as npt +import americangut as ag import americangut.notebook_environment as agenv import americangut.per_sample as agps @@ -208,6 +210,62 @@ def test_stage_per_sample_specific_statics(self): obs = agps.stage_per_sample_specific_statics(opts, ids) self.assertEqual(obs, exp) + def test_plot_alpha(self): + map_ = pd.DataFrame( + data=np.array([ + ['skin', '1990', 'female', 'Verity', 'US', 12.5], + ['fecal', '1990', 'female', 'Verity', 'US', 8.6], + ['fecal', '1987', 'male', 'Alex', 'US', 7.9], + ['fecal', '1993', 'female', 'Annie', 'US', 7.5], + ['skin', '1989', 'male', 'Dominic', 'UK', 14.0], + ['fecal', '1986', 'female', 'Sarah', 'US', 15.0], + ['oral', '1988', 'female', 'Shelby', 'AUS', 4.2], + ]), + index=['VeP0', 'VeP1', 'AxP0', 'AnP0', 'DoD0', 'SaZ0', 'ShT0'], + columns=['SIMPLE_BODY_SITE', 'BIRTH_YEAR', 'SEX', + 'HOST_SUBJECT_ID', 'NATIONALITY', 'alpha'], + ) + sample = 'VeP0' + kgroup = 'skin' + kgalpha = pd.Series([12.5, 14.0], + index=['VeP0', 'DoD0'], + name='alpha', + dtype=object) + ksalpha = 12.5 + kxlabel = 'alphadiversity' + + (tgroup, tgalpha, tsalpha, txlabel) = \ + agps._plot_alpha(sample=sample, + alpha_map=map_, + alpha_field='alpha', + debug=True) + + self.assertEqual(tgroup, kgroup) + npt.assert_array_equal(tgalpha.values, kgalpha.values) + self.assertEqual(tsalpha, ksalpha) + self.assertEqual(txlabel, kxlabel) + + def test_alpha_plot_metric_error_field_error(self): + opts = {'collapsed': { + '100nt': { + 'alpha-map': + os.path.join(ag.WORKING_DIR.split('American-Gut')[0], + 'American-Gut/tests/files/' + 'test_mapping.txt')}}, + 'sample_type': 'fecal'} + with self.assertRaises(ValueError): + agps.alpha_plot(opts, ['sample_a', 'sample_b']) + + def test_alpha_plot_group_field_error(self): + opts = {'collapsed': { + '100nt': { + 'alpha-map': + os.path.join(ag.WORKING_DIR.split('American-Gut')[0], + 'American-Gut/tests/files/' + 'test_mapping_alpha.txt')}}, + 'sample_type': 'fecal'} + with self.assertRaises(ValueError): + agps.alpha_plot(opts, ['sample_a', 'sample_b']) if __name__ == '__main__': main()