From 155444f8031d57576b2349843cc01a4211f913c1 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Yoshiki=20V=C3=A1zquez=20Baeza?= <yoshiki89@gmail.com>
Date: Mon, 22 Sep 2014 14:41:46 -0600
Subject: [PATCH] ENH: Fix problem with --colory_by

When nothing was passed in to --color_by, the default behaviour was to
keep all the columns in the mapping file. This was inconsistent with the
documented behavior, to remove unique columns, which has now been
corrected. See the comments in this patch.

Fixes #271
---
 emperor/util.py         | 31 +++++++++++++++++++++++++++----
 scripts/make_emperor.py |  2 +-
 tests/test_util.py      | 18 ++++++++++++++++++
 3 files changed, 46 insertions(+), 5 deletions(-)

diff --git a/emperor/util.py b/emperor/util.py
index 5ba92802..3ccb1e6e 100644
--- a/emperor/util.py
+++ b/emperor/util.py
@@ -134,7 +134,8 @@ def preprocess_mapping_file(data, headers, columns, unique=False, single=False,
     # process concatenated columns if needed
     merge = []
     for column in columns:
-        if '&&' in column:
+        # the list can contain None so check "if column" before treating as str
+        if column and '&&' in column:
             merge.append(column)
     # each element needs several columns to be merged
     for new_column in merge:
@@ -152,16 +153,38 @@ def preprocess_mapping_file(data, headers, columns, unique=False, single=False,
         columns_to_remove = []
         metadata = MetadataMap(mapping_file_to_dict(data, headers), [])
 
+        # the --coloy_by option in the script interface allows the user to
+        # specify the categories you want to use in the generated plot, this
+        # the default behaviour is to color by all categories that are not
+        # unique. If the user specifies a category with with the --color_by
+        # option and this category contains a unique values, this category must
+        # still be added thus the structure of the next few lines that
+        # form the structure for the two different routes. (1) where no value
+        # is specified in the CLI (the value of columns will be [None, x1, x2,
+        # x3] where x{1,2,3} are categories requested in other CLI options) and
+        # (2) where a value is specified in the CLI.
+        #
+        # TL;DR
+        # see https://github.com/biocore/emperor/issues/271
+        if None in columns:
+            columns = headers[:]
+            f_unique = metadata.hasUniqueCategoryValues
+            f_single = metadata.hasSingleCategoryValue
+        else:
+            f_unique = lambda x: metadata.hasUniqueCategoryValues(x) and\
+                                 x not in columns
+            f_single = lambda x: metadata.hasSingleCategoryValue(x) and\
+                                 x not in columns
+
         # find columns that have values that are all unique
         if unique:
             for c in headers[1::]:
-                if metadata.hasUniqueCategoryValues(c) and c not in columns:
+                if f_unique(c):
                     columns_to_remove.append(c)
-
         # remove categories where there is only one value
         if single:
             for c in headers[1::]:
-                if metadata.hasSingleCategoryValue(c) and c not in columns:
+                if f_single(c):
                     columns_to_remove.append(c)
         columns_to_remove = list(set(columns_to_remove))
 
diff --git a/scripts/make_emperor.py b/scripts/make_emperor.py
index bb1ce1e0..9e780f22 100755
--- a/scripts/make_emperor.py
+++ b/scripts/make_emperor.py
@@ -500,7 +500,7 @@ def main():
                 offending_fields.append(col)
     else:
         # if the user didn't specify the header names display everything
-        color_by_column_names = header[:]
+        color_by_column_names = [None]
 
     # extract a list of the custom axes provided and each element is numeric
     if custom_axes:
diff --git a/tests/test_util.py b/tests/test_util.py
index 877aaebc..9de40fe9 100755
--- a/tests/test_util.py
+++ b/tests/test_util.py
@@ -178,6 +178,14 @@ def test_preprocess_mapping_file(self):
             'LinkerPrimerSequence', 'Treatment', 'DOB', 'Description'])
         self.assertEquals(out_data, MAPPING_FILE_DATA_CAT_G)
 
+        # check it remove columns that are unique and keeps all the other ones
+        out_data, out_headers = preprocess_mapping_file(self.mapping_file_data,
+                ['SampleID', 'BarcodeSequence', 'LinkerPrimerSequence',
+                 'Treatment', 'DOB', 'Description'], [None], unique=True)
+        self.assertEquals(out_headers, ['SampleID', 'LinkerPrimerSequence',
+            'Treatment', 'DOB'])
+        self.assertEquals(out_data, MAPPING_FILE_DATA_CAT_H)
+
     def test_keep_columns_from_mapping_file(self):
         """Check correct selection of metadata is being done"""
 
@@ -495,6 +503,16 @@ def test_sanitize_mapping_file(self):
 ['PC.636', 'ACGGTGAGTGTC', 'YATGCTGCCTCCCGTAGGAGT', 'Fast', '20080116',
 'Fasting_mouse_I.D._636']]
 
+MAPPING_FILE_DATA_CAT_H = [['PC.354', 'YATGCTGCCTCCCGTAGGAGT', 'Control',
+'20061218'], ['PC.355', 'YATGCTGCCTCCCGTAGGAGT', 'Control', '20061218'],
+['PC.356', 'YATGCTGCCTCCCGTAGGAGT', 'Control', '20061126'], ['PC.481',
+'YATGCTGCCTCCCGTAGGAGT', 'Control', '20070314'], ['PC.593',
+'YATGCTGCCTCCCGTAGGAGT', 'Control', '20071210'], ['PC.607',
+'YATGCTGCCTCCCGTAGGAGT', 'Fast', '20071112' ], ['PC.634',
+'YATGCTGCCTCCCGTAGGAGT', 'Fast', '20080116'], ['PC.635',
+'YATGCTGCCTCCCGTAGGAGT', 'Fast', '20080116'], ['PC.636',
+'YATGCTGCCTCCCGTAGGAGT', 'Fast', '20080116' ]]
+
 MAPPING_FILE_DATA_GRADIENT = [
     ['PC.354', 'Control','3', '40', 'Control20061218'],
     ['PC.355', 'Control','9', '44', 'Control20061218'],