Merge pull request #1494 from josenavas/ebi-improvements
Solving merge conflicts
antgonza committed Oct 10, 2015
2 parents 0a47d89 + e17520a commit 1cee0a3
Showing 8 changed files with 290 additions and 33 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
@@ -4,6 +4,7 @@ Version 0.2.0-dev (Changes since version 0.2.0 go here)
-------------------------------------------------------

* Users can now change values and add samples and/or columns to sample and prep templates using the <kbd>Update</kbd> button (see the prep template and sample template tabs).
* The raw files of a RawData can now be updated using the `qiita db update_raw_data` CLI command.
* instrument_model is now a required prep template column for EBI submissions.

Version 0.2.0 (2015-08-25)
72 changes: 70 additions & 2 deletions qiita_db/commands.py
@@ -18,7 +18,7 @@
from .study import Study, StudyPerson
from .user import User
from .util import (get_filetypes, get_filepath_types, compute_checksum,
convert_to_id)
convert_to_id, move_filepaths_to_upload_folder)
from .data import RawData, PreprocessedData, ProcessedData
from .metadata_template import (SampleTemplate, PrepTemplate,
load_template_to_dataframe)
@@ -302,6 +302,73 @@ def load_parameters_from_cmd(name, fp, table):
return constructor.create(name, **params)


def update_raw_data_from_cmd(filepaths, filepath_types, study_id, rd_id=None):
"""Updates the raw data of the study 'study_id'
Parameters
----------
filepaths : iterable of str
Paths to the raw data files
filepath_types : iterable of str
Describes the contents of the files
study_id : int
The study_id of the study to be updated
rd_id : int, optional
The id of the raw data to be updated. If not provided, the raw data
with lowest id in the study will be updated
Returns
-------
qiita_db.data.RawData
Raises
------
ValueError
If 'filepaths' and 'filepath_types' do not have the same length
If the study does not have any raw data
If rd_id is provided and it does not belong to the given study
"""
if len(filepaths) != len(filepath_types):
raise ValueError("Please provide exactly one filepath_type for each "
"and every filepath")
with TRN:
study = Study(study_id)
raw_data_ids = study.raw_data()
if not raw_data_ids:
raise ValueError("Study %d does not have any raw data" % study_id)

if rd_id:
if rd_id not in raw_data_ids:
raise ValueError(
"The raw data %d does not exist in the study %d. Available"
" raw data: %s"
% (rd_id, study_id, ', '.join(map(str, raw_data_ids))))
raw_data = RawData(rd_id)
else:
raw_data = RawData(sorted(raw_data_ids)[0])

filepath_types_dict = get_filepath_types()
try:
filepath_types = [filepath_types_dict[x] for x in filepath_types]
except KeyError:
supported_types = filepath_types_dict.keys()
unsupported_types = set(filepath_types).difference(supported_types)
raise ValueError(
"Some filepath types provided are not recognized (%s). "
"Please choose from: %s"
% (', '.join(unsupported_types), ', '.join(supported_types)))

fps = raw_data.get_filepaths()
sql = "DELETE FROM qiita.raw_filepath WHERE raw_data_id = %s"
TRN.add(sql, [raw_data.id])
TRN.execute()
move_filepaths_to_upload_folder(study_id, fps)

raw_data.add_filepaths(list(zip(filepaths, filepath_types)))

return raw_data

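For reference, a minimal sketch of driving this helper directly from Python, assuming a configured Qiita environment; the file paths and the study id below are hypothetical, and the filepath types mirror the ones exercised in the tests:

```python
from qiita_db.commands import update_raw_data_from_cmd

# Hypothetical inputs: two replacement files for study 1
filepaths = ['/tmp/new_seqs.fastq', '/tmp/new_barcodes.fastq']
filepath_types = ['raw_forward_seqs', 'raw_barcodes']

# Replaces the files of the lowest-id raw data of study 1; the old files
# are moved back to the study's upload folder
rd = update_raw_data_from_cmd(filepaths, filepath_types, 1)
```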

def update_preprocessed_data_from_cmd(sl_out_dir, study_id, ppd_id=None):
"""Updates the preprocessed data of the study 'study_id'
@@ -346,7 +413,8 @@ def update_preprocessed_data_from_cmd(sl_out_dir, study_id, ppd_id=None):
study = Study(study_id)
ppds = study.preprocessed_data()
if not ppds:
raise ValueError("Study %s does not have any preprocessed data")
raise ValueError("Study %s does not have any preprocessed data",
study_id)

if ppd_id:
if ppd_id not in ppds:
12 changes: 4 additions & 8 deletions qiita_db/metadata_template/base_metadata_template.py
@@ -1162,19 +1162,15 @@ def update(self, md_template):
% ', '.join(columns_diff))

# In order to speed up some computation, let's compare only the
# common columns. current_map.columns is a superset of
# new_map.columns, so this will not fail
current_map = current_map[new_map.columns]
# common columns and rows. current_map.columns and
# current_map.index are supersets of new_map.columns and
# new_map.index, respectively, so this will not fail
current_map = current_map[new_map.columns].loc[new_map.index]

# Get the values that we need to change
# diff_map is a DataFrame that holds boolean values. If a cell is
# True, it means that the new_map is different from the current_map
# while False means that the cell has the same value
# In order to compare them, they have to be identically labeled, so
# we need to sort the 'index' axis to be identically labeled. The
# 'column' axis is already the same given the previous line of code
current_map.sort_index(axis='index', inplace=True)
new_map.sort_index(axis='index', inplace=True)
diff_map = current_map != new_map
# ne_stacked holds a MultiIndexed DataFrame in which the first
# level of indexing is the sample_name and the second one is the
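A toy pandas sketch (hypothetical frames) of the subsetting used above, where the update frame covers fewer samples and columns than the stored map:

```python
import pandas as pd

# The DB holds two samples and two columns; the update touches one of each
current_map = pd.DataFrame({'a': [1, 2], 'b': [3, 4]}, index=['s1', 's2'])
new_map = pd.DataFrame({'a': [9]}, index=['s1'])

# Restrict to the common columns and rows, then compare cell by cell
current_map = current_map[new_map.columns].loc[new_map.index]
diff_map = current_map != new_map  # True where a value changed
print(diff_map)  # 'a' is True for 's1'
```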
13 changes: 13 additions & 0 deletions qiita_db/metadata_template/test/test_sample_template.py
@@ -1227,6 +1227,19 @@ def test_update(self):
with self.assertRaises(QiitaDBError):
st.update(self.metadata_dict_updated_column_error)

def test_update_fewer_samples(self):
"""Updates using a dataframe with less samples that in the DB"""
st = SampleTemplate.create(self.metadata, self.new_study)
new_metadata = pd.DataFrame.from_dict(
{'Sample1': {'physical_specimen_location': 'CHANGE'}},
orient='index')
exp = {s_id: st[s_id]._to_dict() for s_id in st}
s_id = '%d.Sample1' % self.new_study.id
exp[s_id]['physical_specimen_location'] = 'CHANGE'
npt.assert_warns(QiitaDBWarning, st.update, new_metadata)
obs = {s_id: st[s_id]._to_dict() for s_id in st}
self.assertEqual(obs, exp)

def test_update_numpy(self):
"""Update values in existing mapping file with numpy values"""
metadata_dict = {
112 changes: 110 additions & 2 deletions qiita_db/test/test_commands.py
@@ -14,6 +14,7 @@
from future.utils.six import StringIO
from future import standard_library
from functools import partial
from operator import itemgetter

import pandas as pd

@@ -23,13 +24,15 @@
load_processed_data_cmd,
load_preprocessed_data_from_cmd,
load_parameters_from_cmd,
update_raw_data_from_cmd,
update_preprocessed_data_from_cmd)
from qiita_db.environment_manager import patch
from qiita_db.study import Study, StudyPerson
from qiita_db.user import User
from qiita_db.data import PreprocessedData
from qiita_db.data import PreprocessedData, RawData
from qiita_db.util import (get_count, check_count, get_db_files_base_dir,
get_mountpoint)
get_mountpoint, compute_checksum,
get_files_from_uploads_folders)
from qiita_db.metadata_template import PrepTemplate
from qiita_core.util import qiita_test_checker
from qiita_ware.processing_pipeline import generate_demux_file
@@ -453,6 +456,111 @@ def test_python_patch(self):
self._assert_current_patch('10.sql')


@qiita_test_checker()
class TestUpdateRawDataFromCmd(TestCase):
def setUp(self):
fd, seqs_fp = mkstemp(suffix='_seqs.fastq')
close(fd)
fd, barcodes_fp = mkstemp(suffix='_barcodes.fastq')
close(fd)
self.filepaths = [seqs_fp, barcodes_fp]
self.checksums = []
for fp in sorted(self.filepaths):
with open(fp, 'w') as f:
f.write("%s\n" % fp)
self.checksums.append(compute_checksum(fp))
self.filepaths_types = ["raw_forward_seqs", "raw_barcodes"]
self._clean_up_files = [seqs_fp, barcodes_fp]

info = {
"timeseries_type_id": 1,
"metadata_complete": True,
"mixs_compliant": True,
"number_samples_collected": 25,
"number_samples_promised": 28,
"study_alias": "FCM",
"study_description": "Microbiome of people who eat nothing but "
"fried chicken",
"study_abstract": "Exploring how a high fat diet changes the "
"gut microbiome",
"emp_person_id": StudyPerson(2),
"principal_investigator_id": StudyPerson(3),
"lab_person_id": StudyPerson(1)
}
self.new_study = Study.create(User("test@foo.bar"),
"Update raw data test",
efo=[1], info=info)
self.study = Study(1)
# The files for the RawData object attached to study 1 do not exist.
# Create them so we can actually perform the tests
for _, fp, _ in RawData(1).get_filepaths():
with open(fp, 'w') as f:
f.write('\n')
self._clean_up_files.append(fp)

self.uploaded_files = get_files_from_uploads_folders(
str(self.study.id))

def tearDown(self):
new_uploaded_files = get_files_from_uploads_folders(str(self.study.id))
new_files = set(new_uploaded_files).difference(self.uploaded_files)
path_builder = partial(join, get_mountpoint("uploads")[0][1], '1')
for _, fp in new_files:
self._clean_up_files.append(path_builder(fp))
for f in self._clean_up_files:
if exists(f):
remove(f)

def test_update_raw_data_from_cmd_diff_length(self):
with self.assertRaises(ValueError):
update_raw_data_from_cmd(self.filepaths[1:], self.filepaths_types,
self.study.id)
with self.assertRaises(ValueError):
update_raw_data_from_cmd(self.filepaths, self.filepaths_types[1:],
self.study.id)

def test_update_raw_data_from_cmd_no_raw_data(self):
with self.assertRaises(ValueError):
update_raw_data_from_cmd(self.filepaths, self.filepaths_types,
self.new_study.id)

def test_update_raw_data_from_cmd_wrong_raw_data_id(self):
# Using max(raw_data_ids) + 1 to make sure that the raw data id
# passed does not belong to the study
with self.assertRaises(ValueError):
update_raw_data_from_cmd(self.filepaths, self.filepaths_types,
self.study.id,
max(self.study.raw_data()) + 1)

def test_update_raw_data_from_cmd(self):
rd = update_raw_data_from_cmd(self.filepaths, self.filepaths_types,
self.study.id)
# Make sure that we are cleaning the environment
for _, fp, _ in rd.get_filepaths():
self._clean_up_files.append(fp)

# The checksums are in filepath order. If we sort the rd.get_filepaths()
# result by the filepath (itemgetter(1)) we will get them in the same
# order, so the checksum comparison will not fail
for obs, exp in zip(sorted(rd.get_filepaths(), key=itemgetter(1)),
self.checksums):
self.assertEqual(compute_checksum(obs[1]), exp)

def test_update_raw_data_from_cmd_rd_id(self):
rd = update_raw_data_from_cmd(self.filepaths, self.filepaths_types,
self.study.id, self.study.raw_data()[0])
# Make sure that we are cleaning the environment
for _, fp, _ in rd.get_filepaths():
self._clean_up_files.append(fp)

# The checksums are in filepath order. If we sort the rd.get_filepaths()
# result by the filepath (itemgetter(1)) we will get them in the same
# order, so the checksum comparison will not fail
for obs, exp in zip(sorted(rd.get_filepaths(), key=itemgetter(1)),
self.checksums):
self.assertEqual(compute_checksum(obs[1]), exp)


@qiita_test_checker()
class TestUpdatePreprocessedDataFromCmd(TestCase):
def setUp(self):
40 changes: 22 additions & 18 deletions qiita_ware/processing_pipeline.py
@@ -24,12 +24,13 @@ def _get_qiime_minimal_mapping(prep_template, out_dir):
"""Generates a minimal QIIME-compliant mapping file for split libraries
The columns of the generated file are, in order: SampleID, BarcodeSequence,
LinkerPrimerSequence, Description. All values are taken from the prep
template except for Description, which always receives the value "Qiita MMF"
LinkerPrimerSequence, [ReverseLinkerPrimer], Description. All values are
taken from the prep template except for Description, which always receives
the value "Qiita MMF"

Parameters
----------
prep_template : PrepTemplate
prep_template : qiita_db.metadata_template.PrepTemplate
The prep template from which we need to generate the minimal mapping
out_dir : str
Path to the output directory
@@ -40,42 +41,45 @@
The paths to the qiime minimal mapping files
"""
from functools import partial
from collections import defaultdict
from os.path import join
import pandas as pd

# The prep template has a QIIME mapping file, get it
qiime_map = pd.read_csv(prep_template.qiime_map_fp, sep='\t',
keep_default_na=False, na_values=['unknown'],
index_col=False,
converters=defaultdict(lambda: str))
qiime_map.set_index('#SampleID', inplace=True, drop=True)
pt_df = prep_template.to_dataframe()

# We use our own description to avoid potential processing problems
qiime_map['Description'] = pd.Series(['Qiita MMF'] * len(qiime_map.index),
index=qiime_map.index)
rename_cols = {
'barcode': 'BarcodeSequence',
'primer': 'LinkerPrimerSequence',
}

# We ensure the order of the columns as QIIME is expecting
if 'ReverseLinkerPrimer' in qiime_map:
# Ensure the order of the columns as QIIME is expecting
if 'reverselinkerprimer' in pt_df:
rename_cols['reverselinkerprimer'] = 'ReverseLinkerPrimer'
cols = ['BarcodeSequence', 'LinkerPrimerSequence',
'ReverseLinkerPrimer', 'Description']
else:
cols = ['BarcodeSequence', 'LinkerPrimerSequence', 'Description']

pt_df.rename(columns=rename_cols, inplace=True)

# Sometimes, the Description column can generate some problems in QIIME,
# depending on its values. We set it up to read Qiita MMF for all rows
pt_df['Description'] = pd.Series(['Qiita MMF'] * len(pt_df.index),
index=pt_df.index)

path_builder = partial(join, out_dir)
if 'run_prefix' in qiime_map:
if 'run_prefix' in pt_df:
# The study potentially has more than 1 lane, so we should generate a
# qiime MMF for each of the lanes. We know how to split the prep
# template based on the run_prefix column
output_fps = []
for prefix, df in qiime_map.groupby('run_prefix'):
for prefix, df in pt_df.groupby('run_prefix'):
df = df[cols]
out_fp = path_builder("%s_MMF.txt" % prefix)
output_fps.append(out_fp)
df.to_csv(out_fp, index_label="#SampleID", sep='\t')
else:
# The study only has one lane, just write the MMF
df = qiime_map[cols]
df = pt_df[cols]
out_fp = path_builder("prep_%d_MMF.txt" % prep_template.id)
output_fps = [out_fp]
df.to_csv(out_fp, index_label="#SampleID", sep='\t')
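A self-contained sketch of the per-run_prefix split performed above; the sample ids, barcodes, primer, and run prefixes are hypothetical:

```python
import pandas as pd

pt_df = pd.DataFrame(
    {'BarcodeSequence': ['GTCCGCAAGTTA', 'CGTAGAGCTCTC'],
     'LinkerPrimerSequence': ['GTGCCAGCMGCCGCGGTAA'] * 2,
     'Description': ['Qiita MMF'] * 2,
     'run_prefix': ['s_G1_L001', 's_G1_L002']},
    index=['1.SKB1.640202', '1.SKB2.640194'])

cols = ['BarcodeSequence', 'LinkerPrimerSequence', 'Description']
# One minimal mapping file per run prefix, e.g. s_G1_L001_MMF.txt
for prefix, df in pt_df.groupby('run_prefix'):
    df[cols].to_csv('%s_MMF.txt' % prefix, index_label='#SampleID', sep='\t')
```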
