biocore · mortonjt · Jan 20, 2016 · Dec 22, 2015 · Dec 30, 2015 · Dec 30, 2015
diff --git a/.travis.yml b/.travis.yml
@@ -5,7 +5,7 @@ env:
 addons:
     apt:
         packages:
-            - texlive
+            - texlive-latex-extra
 before_install:
   - wget https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh -O miniconda.sh
   - chmod +x miniconda.sh
@@ -18,6 +18,11 @@ install:
   - source activate env_name
   - pip install -r pip_requirements.txt
   - pip install -e . --no-deps
+
+before_script:
+  - "export DISPLAY=:99.0"
+  - "sh -e /etc/init.d/xvfb start"
+  - sleep 3 # gives time to start
 script:
   - nosetests --with-doctest
   - flake8 americangut/*.py

diff --git a/americangut/notebook_environment.py b/americangut/notebook_environment.py
@@ -638,6 +638,7 @@
     # collapsed samples
     'collapsed': {
         '100nt': {
+            'alpha-map': '08-collapsed/100nt/alpha_map.txt',
             '1k': {
                 'ag-biom': '08-collapsed/100nt/1k/ag.biom',
                 'ag-fecal-biom': '08-collapsed/100nt/1k/ag-fecal.biom',
@@ -697,6 +698,7 @@
                 }
             },
         'notrim': {
+            'alpha-map': '08-collapsed/notrim/alpha-map.txt',
             '1k': {
                 'ag-biom': '08-collapsed/notrim/1k/ag.biom',
                 'ag-fecal-biom': '08-collapsed/notrim/1k/ag-fecal.biom',
@@ -2083,6 +2085,12 @@
         },
     }
 
+alpha_metrics = {'pd': 'PD_whole_tree',
+                 'chao1': 'chao1',
+                 'shannon': 'shannon',
+                 'observedotus': 'observed_otus'
+                 }
+
 
 def _assert_environment():
     if qiime.__version__ != '1.9.1':

diff --git a/americangut/parallel.py b/americangut/parallel.py
@@ -4,6 +4,9 @@
 import sys
 from functools import partial
 
+import matplotlib
+matplotlib.use('Agg')  # noqa
+
 import americangut as ag
 import americangut.results_utils as agru
 import americangut.notebook_environment as agenv

diff --git a/americangut/per_sample.py b/americangut/per_sample.py
@@ -1,12 +1,23 @@
 import os
 
+from matplotlib import use, rcParams
+use('Agg')  # noqa
+
 import biom
+import matplotlib.pyplot as plt
+import pandas as pd
 from qiime.util import qiime_system_call
+import seaborn as sn
 
 import americangut.util as agu
 import americangut.notebook_environment as agenv
 import americangut.results_utils as agru
 
+# Sets up plotting parameters so that the default setting is use to Helvetica
+# in plots
+rcParams['font.family'] = 'sans-serif'
+rcParams['font.sans-serif'] = ['Arial']
+
 
 def create_opts(sample_type, chp_path, gradient_color_by, barchart_categories):
     """Create a dict of options for processing functions
@@ -247,6 +258,66 @@ def taxa_summaries(opts, sample_ids):
     return results
 
 
+def alpha_plot(opts, sample_ids):
+    """Produces digestable alpha diversity distribution plots per sample
+
+    Parameters
+    ----------
+    opts : dict
+        A dict of relevant opts.
+    sample_ids : Iterable of str
+        A list of sample IDs of interest
+
+    Returns
+    -------
+    dict
+        A dict containing each sample ID and any errors observed or None if
+        no error was observed for the sample. {str: str or None}
+    """
+
+    results = {}
+    alpha_map = pd.read_csv(
+        agu.get_existing_path(opts['collapsed']['100nt']['alpha-map']),
+        sep='\t',
+        dtype=str,
+        )
+
+    alpha_metrics = ['shannon_1k', 'PD_whole_tree_1k']
+
+    # Checks the alpha_field is in the mapping file
+    for metric in alpha_metrics:
+        if metric not in alpha_map.columns:
+            raise ValueError('%s is not a valid alpha diversity field name.'
+                             % metric)
+    # Checks the group_field is in the mapping file
+    if 'SIMPLE_BODY_SITE' not in alpha_map.columns:
+        raise ValueError('SIMPLE_BODY_SITE is not a valid field name.')
+
+    alpha_map[alpha_metrics] = alpha_map[alpha_metrics].astype(float)
+    alpha_map.set_index('#SampleID', inplace=True)
+
+    results = {}
+    for id_ in sample_ids:
+        if id_ not in alpha_map.index:
+            results[id_] = 'ID not found'
+        else:
+            results[id_] = None
+            shannon_path = os.path.join(_result_path(opts, id_),
+                                        'shannon_%s.pdf' % id_)
+            _plot_alpha(id_, alpha_map, 'shannon_1k',
+                        xlabel='Shannon Diversity',
+                        fp=shannon_path)
+
+            # Generates the pd whole tree diversity figure
+            pd_path = os.path.join(_result_path(opts, id_),
+                                   'pd_%s.pdf' % id_)
+            _plot_alpha(id_, alpha_map, 'PD_whole_tree_1k',
+                        xlabel='PD Whole Tree Diversity',
+                        fp=pd_path)
+
+    return results
+
+
 def sufficient_sequence_counts(opts, sample_ids):
     """Errors if the sequence counts post filtering are < 1000
 
@@ -515,3 +586,112 @@ def stage_per_sample_specific_statics(opts, sample_ids):
             result[id_] = "Cannot symlink for statics."
 
     return result
+
+
+def _plot_alpha(sample, alpha_map, alpha_field, group_field='SIMPLE_BODY_SITE',
+                output_dir=None, xlabel=None, fp=None, debug=False):
+    """Generates a distrbution plot for the data
+
+    Parameters
+    ----------
+    sample : str
+        The sample ID to be plotted
+    alpha_map_fp : pandas DataFrame
+        A pandas dataframe containing the sample metadata. The sample ID
+        should be given in the `'#SampleID'` column, a column with
+        the name given by `alpha_field` contains alpha diversity values,
+        and the `group_field` column specifying the groups which should be
+        used to seperate the data for making the distribution plot.
+    alpha_field : str
+        The name of the column in `alpha_map` which includes the alpha
+        diversity values.
+    group_field : str
+        Default is 'SIMPLE_BODY_SITE'. The name of the column in `alpha_map`
+        which provides the grouping for generating distribution plots.
+    output_dir : str
+        The location where the alpha diversity figures should be saved.
+    xlabel : str
+        Text describing the quantity on the x-axis.
+
+    Returns
+    -------
+    If the sample is present, a matplotlib figure with the alpha diversity
+    distribution and a line indicating the sample value is returned. If a
+    file path is specified, the figure will be saved at the filepath instead
+    of returning.
+
+    If debug is passed, the following parameters are returned:
+        group : str
+            The value of the `group_field` for the sample
+        group_alpha : ndarray
+            The alpha diversity values associated with the group
+        sample_alpha : float
+            The alpha diversity for the sample
+        xlabel : str
+            The label used for the x-axis of the plot.
+
+    """
+
+    # Explicitly casts the alpha diversity to a float
+    alpha_map[alpha_field] = alpha_map[alpha_field].astype(float)
+
+    # Draws the observations and group
+    group = alpha_map.loc[sample, group_field]
+    group_alpha = alpha_map.loc[alpha_map[group_field] == group, alpha_field]
+    sample_alpha = alpha_map.loc[sample, alpha_field]
+
+    if xlabel is None:
+        xlabel = '%sdiversity' % alpha_field.split('1')[0].replace('_', ' ')
+
+    if debug:
+        return group, group_alpha, sample_alpha, xlabel
+
+    # Defines the group color. This is currently hardcoded, although the
+    # longer term plan is to substitute in function which will define the color
+    # based on the relationship between the sample and a yet to be written
+    # predicted value.
+    group_color = '#1f78b4'
+    sample_color = '#525252'
+
+    with sn.axes_style('ticks', {'axes.facecolor': 'none'}):
+        # Sets up the axis for plotting
+        ax = plt.axes()
+
+        # Plots the distribution
+        sn.kdeplot(group_alpha,
+                   ax=ax,
+                   legend=False,
+                   color=group_color)
+        ylim = ax.get_ylim()
+
+        # Plots the individual line
+        ax.plot([sample_alpha, sample_alpha], [-1, 1], color=sample_color)
+
+        # Returns the y-limits to the original value and removes the ticks
+        ax.set_ylim(ylim)
+        ax.set_yticks([])
+        # Removes the spine
+        sn.despine(offset=5, trim=True, top=True, left=True, right=True)
+        # Updates the xticks to match the correct font
+        ax.set_xticklabels(map(int, ax.get_xticks()), size=11)
+        ax.set_xlabel(xlabel, size=13)
+
+        # Adds text describing the sample
+        ax.text(x=ax.get_xticks().max(),
+                y=ax.get_ylim()[1]*0.85,
+                s='Your Sample:\t%1.1f\nAverage:\t%1.1f'
+                % (sample_alpha, group_alpha.mean()),
+                ha='right',
+                size=11,
+                )
+
+    # Sets the figure size
+    fig = ax.figure
+    fig.set_size_inches((5, 2.5))
+    ax.set_position((0.125, 0.375, 0.75, 0.5))
+
+    if fp is None:
+        return fig
+    else:
+        fig.savefig(fp, dpi=300)
+        fig.clear()
diff --git a/ipynb/primary-processing/08-collapse_samples.md b/ipynb/primary-processing/08-collapse_samples.md
@@ -3,6 +3,8 @@ Some of the per-results figures require various slices and perspectives of the d
 ```python
 >>> import os
 ...
+>>> import pandas as pd
+...
 >>> import americangut.notebook_environment as agenv
 >>> import americangut.util as agu
 ...
@@ -17,23 +19,46 @@ Let's make sure we have the paths we need.
 >>> ag_cleaned_md  = agu.get_existing_path(agenv.paths['meta']['ag-cleaned-md'])
 ```
 
-First, we're going to operate on rarefied data again.
+We're going to start by generating per-bodysite mapping files with alpha diversity attached.
 
 ```python
 >>> low_depth, high_depth = agenv.get_rarefaction_depth()
 ```
 
+```python
+>>> for trim in ['ag-pgp-hmp-gg-100nt', 'ag-notrim']:
+...     trimpath = os.path.join(chp_path, trim.split('-')[-1])
+...     alpha = {}
+...     if not os.path.exists(trimpath):
+...         os.mkdir(trimpath)
+...     for depth, rarefaction in zip([low_depth, high_depth], ['1k', '10k']):
+...         rarepath = os.path.join(trimpath, rarefaction)
+...         if not os.path.exists(rarepath):
+...             os.mkdir(rarepath)
+...         for met in agenv.alpha_metrics.keys():
+...             alpha_fp = agu.get_existing_path(
+...                 agenv.paths['alpha'][rarefaction]['%s-%s' % (trim, met)]
+...             )
+...             alpha_df = pd.read_csv(alpha_fp, sep='\t', dtype=str).set_index('Unnamed: 0').transpose()
+...             alpha['%s_%s' % (agenv.alpha_metrics[met], rarefaction)] = alpha_df
+...     map_ = pd.read_csv(ag_cleaned_md, sep='\t', dtype=str).set_index('#SampleID')
+...     alpha_map = agu.add_alpha_diversity(map_, alpha)
+...     alpha_map.to_csv(
+...         agu.get_new_path(agenv.paths['collapsed'][trim.split('-')[-1]]['alpha-map']),
+...         sep='\t',
+...         index_label='#SampleID'
+...     )
+```
+
+Next, we're going to operate on rarefied data again.
+
 ```python
 >>> def rarefaction_parameters():
 ...     for trim in ['100nt', 'notrim']:
 ...         trimpath = os.path.join(chp_path, trim)
-...         if not os.path.exists(trimpath):
-...             os.mkdir(trimpath)
 ...
 ...         for depth, rarefaction in zip([low_depth, high_depth], ['1k', '10k']):
 ...             rarepath = os.path.join(trimpath, rarefaction)
-...             if not os.path.exists(rarepath):
-...                 os.mkdir(rarepath)
 ...
 ...             table  = agu.get_existing_path(agenv.paths['otus'][trim]['ag-biom'])
 ...             output = agu.get_new_path(agenv.paths['collapsed'][trim][rarefaction]['ag-biom'])
@@ -48,7 +73,7 @@ First, we're going to operate on rarefied data again.
 ...                            -d $depth
 ```
 
-Next, we're going to partition the data into per-body site tables.
+Then, we're going to partition the data into per-body site tables.
 
 ```python
 >>> def filter_parameters():

diff --git a/ipynb/primary-processing/09-per_participant_results.md b/ipynb/primary-processing/09-per_participant_results.md
@@ -3,6 +3,9 @@ Now that we've done all the bulk processing, let's generate the per-sample resul
 ```python
 >>> import os
 >>> from functools import partial
+...
+>>> from matplotlib import use
+>>> use('Agg')
 >>> import pandas as pd
 ...
 >>> import americangut as ag
@@ -52,10 +55,12 @@ And finally, these next blocks of code support the per-sample type processing. F
 ...                     agps.per_sample_directory,
 ...                     agps.stage_per_sample_specific_statics,
 ...                     agps.taxa_summaries,
+...                     agps.alpha_plot,
 ...                     agps.taxon_significance,
 ...                     agps.body_site_pcoa,
 ...                     agps.gradient_pcoa,
-...                     agps.bar_chart]
+...                     agps.bar_chart,
+...                    ]
 ...
 >>> fecal_functions = common_functions + [agps.country_pcoa]
 >>> oral_functions  = common_functions + [agps.pie_plot]
@@ -89,9 +94,9 @@ And now, let's start mass generating figures!
 ```python
 >>> site_to_functions = [('FECAL', process_fecal),
 ...                      ('ORAL', process_oral),
-...                      ('SKIN', process_skin)]
+...                      ('SKIN', process_skin)
+...                     ]
 >>> partitions = agps.partition_samples_by_bodysite(ag_cleaned_df, site_to_functions)
-...
 >>> with open(successful_ids, 'w') as successful_ids_fp, open(unsuccessful_ids, 'w') as unsuccessful_ids_fp:
 ...     agpar.dispatcher(successful_ids_fp, unsuccessful_ids_fp, partitions)
 ```

diff --git a/tests/files/test_mapping_alpha.txt b/tests/files/test_mapping_alpha.txt
@@ -0,0 +1 @@
+#SampleID	TEST_CATEGORY	AWESOME_CATEGORY	PD_whole_tree_1k	shannon_1ksample_a	1	super	5	2sample_b	2	totally	12	4
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1 @@
		#SampleID TEST_CATEGORY AWESOME_CATEGORY PD_whole_tree_1k shannon_1ksample_a 1 super 5 2sample_b 2 totally 12 4
Expand Down