From bc0b27d1969d1ec072a5e7b62c98b6072de5afa4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Yoshiki=20V=C3=A1zquez=20Baeza?= Date: Fri, 5 Sep 2014 17:47:18 -0600 Subject: [PATCH 1/3] BUG: Fix problem with unique/single-valued columns If a category in the mapping file had a unique/single-valued column, the column would automatically be removed even if the user had asked to keep it. Some tests were fixed to reflect the case they were checking. Fixes #271 --- emperor/util.py | 17 ++++++++++------- tests/test_util.py | 6 +++--- 2 files changed, 13 insertions(+), 10 deletions(-) diff --git a/emperor/util.py b/emperor/util.py index cd7fa06d..53171686 100644 --- a/emperor/util.py +++ b/emperor/util.py @@ -146,20 +146,23 @@ def preprocess_mapping_file(data, headers, columns, unique=False, single=False, line.append(''.join([line[index] for index in indices])) headers.append(new_column) - # remove all unique or singled valued columns + # remove all unique or singled valued columns that are not included in + # the list of categories that should be kept i. e. 
columns if unique or single: columns_to_remove = [] metadata = MetadataMap(mapping_file_to_dict(data, headers), []) # find columns that have values that are all unique - if unique == True: - columns_to_remove += [column_name for column_name in headers[1::] - if metadata.hasUniqueCategoryValues(column_name)] + if unique: + for c in headers[1::]: + if metadata.hasUniqueCategoryValues(c) and c not in columns: + columns_to_remove.append(c) # remove categories where there is only one value - if single == True: - columns_to_remove += [column_name for column_name in headers[1::] - if metadata.hasSingleCategoryValue(column_name)] + if single: + for c in headers[1::]: + if metadata.hasSingleCategoryValue(c) and c not in columns: + columns_to_remove.append(c) columns_to_remove = list(set(columns_to_remove)) # remove the single or unique columns diff --git a/tests/test_util.py b/tests/test_util.py index 209f05c1..ce754a30 100755 --- a/tests/test_util.py +++ b/tests/test_util.py @@ -121,8 +121,8 @@ def test_preprocess_mapping_file(self): # check it removes columns with unique values out_data, out_headers = preprocess_mapping_file(self.mapping_file_data, - self.mapping_file_headers, ['SampleID', 'BarcodeSequence', - 'LinkerPrimerSequence', 'Treatment', 'DOB', 'Description'], + self.mapping_file_headers, ['SampleID', 'LinkerPrimerSequence', + 'Treatment', 'DOB'], unique=True) self.assertEquals(out_headers, ['SampleID', 'LinkerPrimerSequence', 'Treatment', 'DOB']) @@ -131,7 +131,7 @@ def test_preprocess_mapping_file(self): # check it removes columns where there is only one value out_data, out_headers = preprocess_mapping_file(self.mapping_file_data, self.mapping_file_headers, ['SampleID', 'BarcodeSequence', - 'LinkerPrimerSequence', 'Treatment', 'DOB', 'Description'], + 'Treatment', 'DOB', 'Description'], single=True) self.assertEquals(out_headers,['SampleID', 'BarcodeSequence', 'Treatment', 'DOB', 'Description']) From fa232fd711a913daafc63df024dd31019952faaa Mon Sep 17 00:00:00 
2001 From: =?UTF-8?q?Yoshiki=20V=C3=A1zquez=20Baeza?= Date: Fri, 5 Sep 2014 17:56:52 -0600 Subject: [PATCH 2/3] TST: Add three more test cases related to #271 --- tests/test_util.py | 40 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 40 insertions(+) diff --git a/tests/test_util.py b/tests/test_util.py index ce754a30..9ad8d158 100755 --- a/tests/test_util.py +++ b/tests/test_util.py @@ -151,6 +151,32 @@ def test_preprocess_mapping_file(self): self.assertEquals(out_data, MAPPING_FILE_DATA_DUPLICATED) self.assertEquals(out_headers, ['SampleID', 'Treatment', 'DOB']) + # check it doesn't remove columns because all are included in the list + out_data, out_headers = preprocess_mapping_file(self.mapping_file_data, + self.mapping_file_headers, ['SampleID', 'BarcodeSequence', + 'LinkerPrimerSequence', 'Treatment', 'DOB', 'Description'], + unique=True) + self.assertEquals(out_headers, ['SampleID', 'BarcodeSequence', + 'LinkerPrimerSequence', 'Treatment', 'DOB', 'Description']) + self.assertEquals(out_data, MAPPING_FILE_DATA_CAT_G) + + # check it doesn't remove columns because all are included in the list + out_data, out_headers = preprocess_mapping_file(self.mapping_file_data, + self.mapping_file_headers, ['SampleID', 'BarcodeSequence', + 'LinkerPrimerSequence', 'Treatment', 'DOB', 'Description'], + single=True) + self.assertEquals(out_headers, ['SampleID', 'BarcodeSequence', + 'LinkerPrimerSequence', 'Treatment', 'DOB', 'Description']) + self.assertEquals(out_data, MAPPING_FILE_DATA_CAT_G) + + # check it doesn't remove columns because all are included in the list + out_data, out_headers = preprocess_mapping_file(self.mapping_file_data, + self.mapping_file_headers, ['SampleID', 'BarcodeSequence', + 'LinkerPrimerSequence', 'Treatment', 'DOB', 'Description'], + unique=True, single=True) + self.assertEquals(out_headers, ['SampleID', 'BarcodeSequence', + 'LinkerPrimerSequence', 'Treatment', 'DOB', 'Description']) + self.assertEquals(out_data, MAPPING_FILE_DATA_CAT_G) 
def test_keep_columns_from_mapping_file(self): """Check correct selection of metadata is being done""" @@ -438,6 +464,20 @@ def test_sanitize_mapping_file(self): ['PC.635', 'Fast', 'Fast20080116'], ['PC.636', 'Fast', 'Fast20080116']] +MAPPING_FILE_DATA_CAT_G = [['PC.354', 'AGCACGAGCCTA', 'YATGCTGCCTCCCGTAGGAGT', +'Control', '20061218', 'Control_mouse_I.D._354'], ['PC.355', 'AACTCGTCGATG', +'YATGCTGCCTCCCGTAGGAGT', 'Control', '20061218', 'Control_mouse_I.D._355'], +['PC.356', 'ACAGACCACTCA', 'YATGCTGCCTCCCGTAGGAGT', 'Control', '20061126', +'Control_mouse_I.D._356'], ['PC.481', 'ACCAGCGACTAG', 'YATGCTGCCTCCCGTAGGAGT', +'Control', '20070314', 'Control_mouse_I.D._481'], ['PC.593', 'AGCAGCACTTGT', +'YATGCTGCCTCCCGTAGGAGT', 'Control', '20071210', 'Control_mouse_I.D._593'], +['PC.607', 'AACTGTGCGTAC', 'YATGCTGCCTCCCGTAGGAGT', 'Fast', '20071112', +'Fasting_mouse_I.D._607'], ['PC.634', 'ACAGAGTCGGCT', 'YATGCTGCCTCCCGTAGGAGT', +'Fast', '20080116', 'Fasting_mouse_I.D._634'], ['PC.635', 'ACCGCAGAGTCA', +'YATGCTGCCTCCCGTAGGAGT', 'Fast', '20080116', 'Fasting_mouse_I.D._635'], +['PC.636', 'ACGGTGAGTGTC', 'YATGCTGCCTCCCGTAGGAGT', 'Fast', '20080116', +'Fasting_mouse_I.D._636']] + MAPPING_FILE_DATA_GRADIENT = [ ['PC.354', 'Control','3', '40', 'Control20061218'], ['PC.355', 'Control','9', '44', 'Control20061218'], From 76da557d7cd1ae04d4932522b9a548dee88683ea Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Yoshiki=20V=C3=A1zquez=20Baeza?= Date: Fri, 5 Sep 2014 18:00:04 -0600 Subject: [PATCH 3/3] DOC: Add note about the fix of #271 --- ChangeLog.md | 1 + 1 file changed, 1 insertion(+) diff --git a/ChangeLog.md b/ChangeLog.md index 4242957d..4af2d8c5 100644 --- a/ChangeLog.md +++ b/ChangeLog.md @@ -19,6 +19,7 @@ Emperor 0.9.3-dev (changes since Emperor 0.9.2 go here) * Fixed problem where coordinate files with large values (greater than 100) would not be displayed on screen. * Fixed problem that prevented the user from scrolling through the categories in the user interface. 
+* Fixed problem that removed unique/single-valued categories in the mapping file even if these were selected with `--color_by`. * Clean-up the layout of the user interface so it's cleaner and consistent. * Fix problem where long category names would alter the layout of the interface. * Fix inability to write an 'E' character in the Filename field when exporting an svg.