Skip to content

Commit

Permalink
Merge pull request #129 from adamrp/load_processed_data
Browse files Browse the repository at this point in the history
Load processed data
  • Loading branch information
antgonza committed Jun 20, 2014
2 parents ed57160 + 067767b commit 2dc78a3
Show file tree
Hide file tree
Showing 8 changed files with 1,708 additions and 1,658 deletions.
50 changes: 49 additions & 1 deletion qiita_db/commands.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
# The full license is in the file LICENSE, distributed with this software.
# -----------------------------------------------------------------------------

from dateutil.parser import parse
import pandas as pd
from functools import partial
try:
Expand All @@ -18,7 +19,7 @@
from .study import Study, StudyPerson
from .user import User
from .util import get_filetypes, get_filepath_types
from .data import RawData
from .data import RawData, PreprocessedData, ProcessedData
from .metadata_template import SampleTemplate


Expand Down Expand Up @@ -108,3 +109,50 @@ def load_raw_data_cmd(filepaths, filepath_types, filetype, study_ids):

return RawData.create(filetype_id, list(zip(filepaths, filepath_types)),
studies)


def load_processed_data_cmd(fps, fp_types, processed_params_table_name,
                            processed_params_id, preprocessed_data_id=None,
                            processed_date=None):
    """Add a new processed data entry

    Parameters
    ----------
    fps : list of str
        Paths to the processed data files to associate with the ProcessedData
        object
    fp_types : list of str
        The types of files, one per fp
    processed_params_table_name : str
        The name of the processed_params_ table to use
    processed_params_id : int
        The ID of the row in the processed_params_ table
    preprocessed_data_id : int, optional
        Defaults to ``None``. The ID of the row in the preprocessed_data
        table.
    processed_date : str, optional
        Defaults to ``None``. The date and time to use as the processing
        date. Must be interpretable as a datetime object

    Returns
    -------
    qiita_db.ProcessedData
        The newly created `qiita_db.ProcessedData` object

    Raises
    ------
    ValueError
        If the number of elements in `fps` and `fp_types` do not match
    """
    if len(fps) != len(fp_types):
        raise ValueError("Please pass exactly one fp_type for each "
                         "and every fp")

    # Translate the human-readable filepath type names into their
    # database IDs before handing them to ProcessedData.create
    fp_types_dict = get_filepath_types()
    fp_types = [fp_types_dict[x] for x in fp_types]

    if preprocessed_data_id is not None:
        preprocessed_data = PreprocessedData(preprocessed_data_id)
    else:
        preprocessed_data = None

    if processed_date is not None:
        # dateutil.parser.parse accepts any string interpretable as a
        # datetime; a None date is passed through for the DB default
        processed_date = parse(processed_date)

    return ProcessedData.create(processed_params_table_name,
                                processed_params_id, list(zip(fps, fp_types)),
                                preprocessed_data, processed_date)
6 changes: 2 additions & 4 deletions qiita_db/support_files/qiita-db.dbs
Original file line number Diff line number Diff line change
Expand Up @@ -635,10 +635,8 @@ Linked by y being raw_data_id from raw data table.</comment>
<table name="processed_filepath" >
<column name="processed_data_id" type="bigint" jt="-5" mandatory="y" />
<column name="filepath_id" type="bigint" jt="-5" mandatory="y" />
<index name="pk_processed_data_filepath" unique="UNIQUE" >
<index name="idx_processed_filepath" unique="PRIMARY_KEY" >
<column name="processed_data_id" />
</index>
<index name="idx_processed_data_filepath" unique="NORMAL" >
<column name="filepath_id" />
</index>
<fk name="fk_processed_data_filepath" to_schema="qiita" to_table="processed_data" >
Expand Down Expand Up @@ -1262,8 +1260,8 @@ Controlled Vocabulary]]></comment>
<entity schema="qiita" name="raw_data" color="d0def5" x="1230" y="480" />
<entity schema="qiita" name="raw_preprocessed_data" color="b2cdf7" x="1230" y="585" />
<entity schema="qiita" name="preprocessed_filepath" color="c0d4f3" x="990" y="705" />
<entity schema="qiita" name="processed_filepath" color="c0d4f3" x="1005" y="930" />
<entity schema="qiita" name="preprocessed_data" color="c0d4f3" x="1200" y="690" />
<entity schema="qiita" name="processed_filepath" color="c0d4f3" x="1005" y="930" />
<group name="Group_analyses" color="c4e0f9" >
<comment>analysis tables</comment>
<entity schema="qiita" name="analysis" />
Expand Down
3,191 changes: 1,545 additions & 1,646 deletions qiita_db/support_files/qiita-db.html

Large diffs are not rendered by default.

4 changes: 1 addition & 3 deletions qiita_db/support_files/qiita-db.sql
Original file line number Diff line number Diff line change
Expand Up @@ -456,13 +456,11 @@ CREATE INDEX idx_preprocessed_processed_data_1 ON qiita.preprocessed_processed_d
CREATE TABLE qiita.processed_filepath (
processed_data_id bigint NOT NULL,
filepath_id bigint NOT NULL,
CONSTRAINT pk_processed_data_filepath UNIQUE ( processed_data_id ) ,
CONSTRAINT idx_processed_filepath PRIMARY KEY ( processed_data_id, filepath_id ),
CONSTRAINT fk_processed_data_filepath FOREIGN KEY ( processed_data_id ) REFERENCES qiita.processed_data( processed_data_id ) ,
CONSTRAINT fk_processed_data_filepath_0 FOREIGN KEY ( filepath_id ) REFERENCES qiita.filepath( filepath_id )
);

CREATE INDEX idx_processed_data_filepath ON qiita.processed_filepath ( filepath_id );

CREATE TABLE qiita.processed_params_uclust (
processed_params_id bigserial NOT NULL,
reference_id bigint NOT NULL,
Expand Down
61 changes: 60 additions & 1 deletion qiita_db/test/test_commands.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,10 +19,11 @@
from configparser import NoOptionError

from qiita_db.commands import (make_study_from_cmd, load_raw_data_cmd,
sample_template_adder)
sample_template_adder, load_processed_data_cmd)
from qiita_db.study import Study, StudyPerson
from qiita_db.user import User
from qiita_db.util import get_count, check_count, get_db_files_base_dir
from qiita_db.data import PreprocessedData
from qiita_core.util import qiita_test_checker


Expand Down Expand Up @@ -148,6 +149,64 @@ def test_load_data_from_cmd(self):
study_ids)


@qiita_test_checker()
class TestLoadProcessedDataFromCmd(TestCase):
    """Tests for qiita_db.commands.load_processed_data_cmd

    Runs against the qiita test database (set up by the
    @qiita_test_checker decorator) and the test filesystem layout.
    """

    def setUp(self):
        # Create two placeholder .biom files on disk; the command under
        # test only registers/moves files, so their contents are
        # irrelevant — a single newline is enough to have a real file.
        fd, self.otu_table_fp = mkstemp(suffix='_otu_table.biom')
        close(fd)
        fd, self.otu_table_2_fp = mkstemp(suffix='_otu_table2.biom')
        close(fd)

        with open(self.otu_table_fp, "w") as f:
            f.write("\n")
        with open(self.otu_table_2_fp, "w") as f:
            f.write("\n")

        # Any path appended to this list is removed in tearDown
        self.files_to_remove = []
        self.files_to_remove.append(self.otu_table_fp)
        self.files_to_remove.append(self.otu_table_2_fp)

        # Directory into which ProcessedData.create copies the inputs;
        # the copies are registered for cleanup inside the test itself
        self.db_test_processed_data_dir = join(get_db_files_base_dir(),
                                               'processed_data')

    def tearDown(self):
        # Best-effort cleanup: only remove files that still exist
        for fp in self.files_to_remove:
            if exists(fp):
                remove(fp)

    def test_load_processed_data_from_cmd(self):
        filepaths = [self.otu_table_fp, self.otu_table_2_fp]
        filepath_types = ['biom', 'biom']

        # Snapshot row counts so assertions are relative to the
        # pre-existing test-database contents
        initial_processed_data_count = get_count('qiita.processed_data')
        initial_processed_fp_count = get_count('qiita.processed_filepath')
        initial_fp_count = get_count('qiita.filepath')

        new = load_processed_data_cmd(filepaths, filepath_types,
                                      'processed_params_uclust', 1, 1, None)
        processed_data_id = new.id
        # The command copies inputs into the processed_data dir under a
        # "<id>_<basename>" name; register those copies for cleanup
        self.files_to_remove.append(
            join(self.db_test_processed_data_dir,
                 '%d_%s' % (processed_data_id, basename(self.otu_table_fp))))
        self.files_to_remove.append(
            join(self.db_test_processed_data_dir,
                 '%d_%s' % (processed_data_id,
                            basename(self.otu_table_2_fp))))

        # One new processed_data row; one filepath row (and linking row)
        # per input file
        self.assertTrue(check_count('qiita.processed_data',
                                    initial_processed_data_count + 1))
        self.assertTrue(check_count('qiita.processed_filepath',
                                    initial_processed_fp_count + 2))
        self.assertTrue(check_count('qiita.filepath',
                                    initial_fp_count + 2))

        # Ensure that the ValueError is raised when a filepath_type is not
        # provided for each and every filepath
        with self.assertRaises(ValueError):
            load_processed_data_cmd(filepaths, filepath_types[:-1],
                                    'processed_params_uclust', 1, 1, None)


CONFIG_1 = """[required]
timeseries_type_id = 1
metadata_complete = True
Expand Down
6 changes: 5 additions & 1 deletion qiita_db/test/test_util.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@
compute_checksum, check_table_cols,
check_required_columns, convert_to_id,
get_table_cols, get_filetypes, get_filepath_types,
get_count, check_count)
get_count, check_count, get_processed_params_tables)


@qiita_test_checker()
Expand Down Expand Up @@ -150,6 +150,10 @@ def test_check_count(self):
self.assertTrue(check_count('qiita.study_person', 3))
self.assertFalse(check_count('qiita.study_person', 2))

def test_get_processed_params_tables(self):
obs = get_processed_params_tables()
self.assertEqual(obs, ['processed_params_uclust'])


class UtilTests(TestCase):
"""Tests for the util functions that do not need to access the DB"""
Expand Down
14 changes: 14 additions & 0 deletions qiita_db/util.py
Original file line number Diff line number Diff line change
Expand Up @@ -467,3 +467,17 @@ def check_count(table, exp_count):
"""
obs_count = get_count(table)
return obs_count == exp_count


def get_processed_params_tables():
    """Returns a list of all tables starting with "processed_params_"

    Returns
    -------
    list of str
        The matching table names, sorted alphabetically
    """
    # Select table_name explicitly instead of SELECT * with a positional
    # index, so the result does not depend on information_schema column
    # ordering; ORDER BY makes the returned list deterministic
    sql = ("SELECT table_name FROM information_schema.tables "
           "WHERE table_schema = 'qiita' "
           "AND SUBSTR(table_name, 1, 17) = 'processed_params_' "
           "ORDER BY table_name")

    conn = SQLConnectionHandler()
    return [x[0] for x in conn.execute_fetchall(sql)]
34 changes: 32 additions & 2 deletions scripts/qiita_db
Original file line number Diff line number Diff line change
Expand Up @@ -10,9 +10,10 @@

import click

from qiita_db.util import get_filetypes, get_filepath_types
from qiita_db.util import (get_filetypes, get_filepath_types,
get_processed_params_tables)
from qiita_db.commands import (sample_template_adder, make_study_from_cmd,
load_raw_data_cmd)
load_raw_data_cmd, load_processed_data_cmd)


@click.group()
Expand All @@ -39,6 +40,35 @@ def load_raw_data(fp, fp_type, filetype, study):
load_raw_data_cmd(fp, fp_type, filetype, study)


@qiita_db.command()
@click.option('--fp', required=True, type=click.Path(resolve_path=True,
              readable=True, exists=True), multiple=True, help='Path to the '
              'processed data. This option can be used multiple times if '
              'there are multiple processed data files.')
@click.option('--fp_type', required=True, multiple=True, help='Describes the '
              'contents of the file. Pass one fp_type per fp.',
              type=click.Choice(get_filepath_types().keys()))
@click.option('--processed_params_table', required=True,
              type=click.Choice(get_processed_params_tables()),
              help='The table containing the processed parameters used to '
              'generate this file')
@click.option('--processed_params_id', required=True, type=int,
              help='The ID of the row in the processed_params table')
@click.option('--preprocessed_data_id', type=int, default=None, help='The '
              'ID of the row in the preprocessed_data table from which '
              'this processed data was created')
@click.option('--processed_date', type=str, default=None,
              help='The date to use as the processed_date. Must be '
              'interpretable as a datetime. If None, then the current date '
              'and time will be used.')
def load_processed_data(fp, fp_type, processed_params_table,
                        processed_params_id, preprocessed_data_id,
                        processed_date):
    """Loads processed data files into the database"""
    # Thin CLI wrapper: all validation and DB work happens in
    # qiita_db.commands.load_processed_data_cmd
    load_processed_data_cmd(fp, fp_type, processed_params_table,
                            processed_params_id, preprocessed_data_id,
                            processed_date)


@qiita_db.command()
@click.option('--owner', help="The email address of the owner of the study")
@click.option('--title', help="The title of the study")
Expand Down

0 comments on commit 2dc78a3

Please sign in to comment.