Merge a8743ff into 103e0e5

qiita-spots · Dec 28, 2016 · d23ada6 · d23ada6
2 parents 103e0e5 + a8743ff
commit d23ada6
Show file tree

Hide file tree

Showing 11 changed files with 161 additions and 90 deletions.
diff --git a/qiita_db/handlers/tests/test_prep_template.py b/qiita_db/handlers/tests/test_prep_template.py
@@ -122,7 +122,8 @@ def test_get(self):
             'sequencing_meth': 'Sequencing by synthesis',
             'study_center': 'CCME',
             'target_gene': '16S rRNA',
-            'target_subfragment': 'V4'}
+            'target_subfragment': 'V4',
+            'qiita_prep_id': '1'}
         self.assertEqual(obs, exp)
 
 

diff --git a/qiita_db/metadata_template/base_metadata_template.py b/qiita_db/metadata_template/base_metadata_template.py
@@ -514,21 +514,9 @@ def _clean_validate_template(cls, md_template, study_id,
         md_template.columns = [c.lower() for c in md_template.columns]
         # validating pgsql reserved words not to be column headers
         current_headers = set(md_template.columns.values)
-        reserved_words = qdb.metadata_template.util.get_pgsql_reserved_words()
-        overlap = reserved_words & current_headers
-        if overlap:
-            raise qdb.exceptions.QiitaDBColumnError(
-                "The following column names in the template contain PgSQL "
-                "reserved words: %s. You need to modify them." % ", ".join(
-                    overlap))
-        # validating invalid column names
-        invalid_ids = qdb.metadata_template.util.get_invalid_column_names(
+
+        qdb.metadata_template.util.validate_invalid_column_names(
             current_headers)
-        if invalid_ids:
-            raise qdb.exceptions.QiitaDBColumnError(
-                "The following column names in the template contain invalid "
-                "chars: %s. You need to modify them." % ", ".join(
-                    invalid_ids))
 
         # Prefix the sample names with the study_id
         qdb.metadata_template.util.prefix_sample_names_with_id(md_template,
@@ -1074,6 +1062,8 @@ def to_dataframe(self):
             # Make sure that we are changing np.NaN by Nones
             df.where((pd.notnull(df)), None)
             df.set_index('sample_id', inplace=True, drop=True)
+            id_column_name = 'qiita_%sid' % (self._table_prefix)
+            df[id_column_name] = str(self.id)
 
             return df
 

diff --git a/qiita_db/metadata_template/test/test_prep_template.py b/qiita_db/metadata_template/test/test_prep_template.py
@@ -656,7 +656,7 @@ def test_to_dataframe(self):
             u'experiment_design_description', u'experiment_title', u'platform',
             u'instrument_model', u'samp_size', u'sequencing_meth',
             u'illumina_technology', u'sample_center', u'pcr_primers',
-            u'study_center'})
+            u'study_center', 'qiita_prep_id'})
 
     def test_clean_validate_template_error_bad_chars(self):
         """Raises an error if there are invalid characters in the sample names
@@ -1081,7 +1081,7 @@ def test_to_file(self):
         self._clean_up_files.append(fp)
         with open(fp, 'U') as f:
             obs = f.read()
-        self.assertEqual(obs, EXP_PREP_TEMPLATE)
+        self.assertEqual(obs, EXP_PREP_TEMPLATE.format(pt.id))
 
     def test_investigation_type_setter(self):
         """Able to update the investigation type"""
@@ -1494,15 +1494,15 @@ def test_delete_sample(self):
     'sample_name\tbarcode\tcenter_name\tcenter_project_name\t'
     'ebi_submission_accession\temp_status\texperiment_design_description\t'
     'instrument_model\tlibrary_construction_protocol\tplatform\tprimer\t'
-    'run_prefix\tstr_column\n'
+    'qiita_prep_id\trun_prefix\tstr_column\n'
     '1.SKB7.640196\tCCTCTGAGAGCT\tANL\tTest Project\t\tEMP\tBBBB\t'
-    'Illumina MiSeq\tAAAA\tILLUMINA\tGTGCCAGCMGCCGCGGTAA\t'
+    'Illumina MiSeq\tAAAA\tILLUMINA\tGTGCCAGCMGCCGCGGTAA\t{0}\t'
     's_G1_L002_sequences\tValue for sample 3\n'
     '1.SKB8.640193\tGTCCGCAAGTTA\tANL\tTest Project\t\tEMP\tBBBB\t'
-    'Illumina MiSeq\tAAAA\tILLUMINA\tGTGCCAGCMGCCGCGGTAA\t'
+    'Illumina MiSeq\tAAAA\tILLUMINA\tGTGCCAGCMGCCGCGGTAA\t{0}\t'
     's_G1_L001_sequences\tValue for sample 1\n'
     '1.SKD8.640184\tCGTAGAGCTCTC\tANL\tTest Project\t\tEMP\tBBBB\t'
-    'Illumina MiSeq\tAAAA\tILLUMINA\tGTGCCAGCMGCCGCGGTAA\t'
+    'Illumina MiSeq\tAAAA\tILLUMINA\tGTGCCAGCMGCCGCGGTAA\t{0}\t'
     's_G1_L001_sequences\tValue for sample 2\n')
 
 

diff --git a/qiita_db/metadata_template/test/test_sample_template.py b/qiita_db/metadata_template/test/test_sample_template.py
@@ -1806,8 +1806,9 @@ def test_to_dataframe(self):
             self.metadata, self.new_study)
         obs = st.to_dataframe()
 
+        new_id = self.new_study.id
         exp_dict = {
-            '%s.Sample1' % self.new_study.id: {
+            '%s.Sample1' % new_id: {
                 'physical_specimen_location': 'location1',
                 'physical_specimen_remaining': 'true',
                 'dna_extracted': 'true',
@@ -1819,8 +1820,9 @@ def test_to_dataframe(self):
                 'latitude': '42.42',
                 'longitude': '41.41',
                 'taxon_id': '9606',
+                'qiita_sample_id': str(new_id),
                 'scientific_name': 'homo sapiens'},
-            '%s.Sample2' % self.new_study.id: {
+            '%s.Sample2' % new_id: {
                 'physical_specimen_location': 'location1',
                 'physical_specimen_remaining': 'true',
                 'dna_extracted': 'true',
@@ -1832,8 +1834,9 @@ def test_to_dataframe(self):
                 'latitude': '4.2',
                 'longitude': '1.1',
                 'taxon_id': '9606',
+                'qiita_sample_id': str(new_id),
                 'scientific_name': 'homo sapiens'},
-            '%s.Sample3' % self.new_study.id: {
+            '%s.Sample3' % new_id: {
                 'physical_specimen_location': 'location1',
                 'physical_specimen_remaining': 'true',
                 'dna_extracted': 'true',
@@ -1845,6 +1848,7 @@ def test_to_dataframe(self):
                 'latitude': '4.8',
                 'longitude': '4.41',
                 'taxon_id': '9606',
+                'qiita_sample_id': str(new_id),
                 'scientific_name': 'homo sapiens'},
             }
         exp = pd.DataFrame.from_dict(exp_dict, orient='index', dtype=str)
@@ -1881,7 +1885,7 @@ def test_to_dataframe(self):
             'water_content_soil', 'elevation', 'temp', 'tot_nitro',
             'samp_salinity', 'altitude', 'env_biome', 'country', 'ph',
             'anonymized_name', 'tot_org_carb', 'description_duplicate',
-            'env_feature', 'scientific_name'})
+            'env_feature', 'scientific_name', 'qiita_sample_id'})
 
     def test_check_restrictions(self):
         obs = self.tester.check_restrictions(
@@ -2166,22 +2170,24 @@ def test_delete_sample(self):
 EXP_SAMPLE_TEMPLATE = (
     "sample_name\tcollection_timestamp\tdescription\tdna_extracted\t"
     "host_subject_id\tlatitude\tlongitude\tphysical_specimen_location\t"
-    "physical_specimen_remaining\tsample_type\tscientific_name\ttaxon_id\n"
+    "physical_specimen_remaining\tqiita_sample_id\tsample_type\t"
+    "scientific_name\ttaxon_id\n"
     "{0}.Sample1\t05/29/2014 12:24:15\tTest Sample 1\ttrue\tNotIdentified\t"
-    "42.42\t41.41\tlocation1\ttrue\ttype1\thomo sapiens\t9606\n"
+    "42.42\t41.41\tlocation1\ttrue\t{0}\ttype1\thomo sapiens\t9606\n"
     "{0}.Sample2\t05/29/2014 12:24:15\tTest Sample 2\ttrue\tNotIdentified\t"
-    "4.2\t1.1\tlocation1\ttrue\ttype1\thomo sapiens\t9606\n"
+    "4.2\t1.1\tlocation1\ttrue\t{0}\ttype1\thomo sapiens\t9606\n"
     "{0}.Sample3\t05/29/2014 12:24:15\tTest Sample 3\ttrue\tNotIdentified\t"
-    "4.8\t4.41\tlocation1\ttrue\ttype1\thomo sapiens\t9606\n")
+    "4.8\t4.41\tlocation1\ttrue\t{0}\ttype1\thomo sapiens\t9606\n")
 
 EXP_SAMPLE_TEMPLATE_FEWER_SAMPLES = (
     "sample_name\tcollection_timestamp\tdescription\tdna_extracted\t"
     "host_subject_id\tlatitude\tlongitude\tphysical_specimen_location\t"
-    "physical_specimen_remaining\tsample_type\tscientific_name\ttaxon_id\n"
+    "physical_specimen_remaining\tqiita_sample_id\tsample_type\t"
+    "scientific_name\ttaxon_id\n"
     "{0}.Sample1\t05/29/2014 12:24:15\tTest Sample 1\ttrue\tNotIdentified\t"
-    "42.42\t41.41\tlocation1\ttrue\ttype1\thomo sapiens\t9606\n"
+    "42.42\t41.41\tlocation1\ttrue\t{0}\ttype1\thomo sapiens\t9606\n"
     "{0}.Sample3\t05/29/2014 12:24:15\tTest Sample 3\ttrue\tNotIdentified\t"
-    "4.8\t4.41\tlocation1\ttrue\ttype1\thomo sapiens\t9606\n")
+    "4.8\t4.41\tlocation1\ttrue\t{0}\ttype1\thomo sapiens\t9606\n")
 
 
 if __name__ == '__main__':

diff --git a/qiita_db/metadata_template/test/test_util.py b/qiita_db/metadata_template/test/test_util.py
@@ -207,12 +207,48 @@ def test_get_get_invalid_sample_names_mixed(self):
         obs = qdb.metadata_template.util.get_invalid_sample_names(one_invalid)
         self.assertItemsEqual(obs, [' ', ' ', ' '])
 
-    def test_get_invalid_column_names(self):
-        invalid = ['tax on', 'bla.', '.', '{', 'this|is', '4column']
-        valid = ['fine', 'select']
-        obs = qdb.metadata_template.util.get_invalid_column_names(
-            invalid + valid)
-        self.assertEqual(obs, invalid)
+    def test_validate_invalid_column_names(self):
+        # testing just pgsql
+        pgsql = ['select', 'column', 'just_fine1']
+        with self.assertRaises(qdb.exceptions.QiitaDBColumnError) as error:
+            qdb.metadata_template.util.validate_invalid_column_names(pgsql)
+        self.assertEqual(
+            str(error.exception),
+            'The following column names in the template contain PgSQL '
+            'reserved words: column, select.\nYou need to modify them.')
+
+        # testing just wrong chars
+        invalid = ['tax on', 'bla.', '.', '{', 'this|is',
+                   '4column', 'just_fine2']
+        with self.assertRaises(qdb.exceptions.QiitaDBColumnError) as error:
+            qdb.metadata_template.util.validate_invalid_column_names(invalid)
+        self.assertEqual(
+            str(error.exception),
+            'The following column names in the template contain invalid '
+            'chars: bla., ., tax on, this|is, {, 4column.\nYou need to '
+            'modify them.')
+
+        # testing just forbidden
+        forbidden = ['sampleid', 'just_fine3']
+        with self.assertRaises(qdb.exceptions.QiitaDBColumnError) as error:
+            qdb.metadata_template.util.validate_invalid_column_names(forbidden)
+        self.assertEqual(
+            str(error.exception),
+            'The following column names in the template contain invalid '
+            'values: sampleid.\nYou need to modify them.')
+
+        # testing all
+        _all = pgsql + invalid + forbidden
+        with self.assertRaises(qdb.exceptions.QiitaDBColumnError) as error:
+            qdb.metadata_template.util.validate_invalid_column_names(_all)
+        self.assertEqual(
+            str(error.exception),
+            'The following column names in the template contain PgSQL '
+            'reserved words: column, select.\n'
+            'The following column names in the template contain invalid '
+            'chars: this|is, ., tax on, bla., {, 4column.\n'
+            'The following column names in the template contain invalid '
+            'values: sampleid.\nYou need to modify them.')
 
     def test_looks_like_qiime_mapping_file(self):
         obs = qdb.metadata_template.util.looks_like_qiime_mapping_file(

diff --git a/qiita_db/metadata_template/util.py b/qiita_db/metadata_template/util.py
@@ -238,34 +238,68 @@ def get_invalid_sample_names(sample_names):
     return inv
 
 
-def get_invalid_column_names(column_names):
-    """Get a list of column names that are not SQL compliant
+def validate_invalid_column_names(column_names):
+    """Validate that the given column names are SQL compliant and allowed
 
     Parameters
     ----------
     column_names : iterable
         Iterable containing the column names to check.
 
-    Returns
-    -------
-    list
-        List of str objects where each object is an invalid column name.
+    Raises
+    ------
+    QiitaDBColumnError
+        If any name in column_names is a PgSQL reserved word, contains
+        invalid chars or is one of the forbidden values
 
     References
     ----------
     .. [1] postgresql SQL-SYNTAX-IDENTIFIERS: https://goo.gl/EF0cUV.
     """
+    column_names = set(column_names)
+
+    # testing for specific column names that are forbidden but are not
+    # caught by the other checks.
+    forbidden_values = {
+        # https://github.com/biocore/qiita/issues/2026
+        'sampleid',
+        # https://github.com/biocore/qiita/issues/1866
+        'qiita_study_id',
+        'qiita_prep_id'
+    }
+    forbidden = forbidden_values & column_names
+
+    # pgsql reserved words
+    pgsql_reserved = (
+        qdb.metadata_template.util.get_pgsql_reserved_words() & column_names)
+
+    # invalid characters in headers
     valid_initial_char = letters
     valid_rest = set(letters+digits+'_')
-    inv = []
-
+    invalid = []
     for s in column_names:
         if s[0] not in valid_initial_char:
-            inv.append(s)
+            invalid.append(s)
         elif set(s) - valid_rest:
-            inv.append(s)
-
-    return inv
+            invalid.append(s)
+
+    error = []
+    if pgsql_reserved:
+        error.append(
+            "The following column names in the template contain PgSQL "
+            "reserved words: %s." % ", ".join(pgsql_reserved))
+    if invalid:
+        error.append(
+            "The following column names in the template contain invalid "
+            "chars: %s." % ", ".join(invalid))
+    if forbidden:
+        error.append(
+            "The following column names in the template contain invalid "
+            "values: %s." % ", ".join(forbidden))
+
+    if error:
+        raise qdb.exceptions.QiitaDBColumnError(
+            "%s\nYou need to modify them." % '\n'.join(error))
 
 
 def looks_like_qiime_mapping_file(fp):