Skip to content

Commit

Permalink
Merge pull request #129 from adamrp/load_processed_data
Browse files Browse the repository at this point in the history
Load processed data
  • Loading branch information
antgonza committed Jun 20, 2014
2 parents ed57160 + 067767b commit 2dc78a3
Show file tree
Hide file tree
Showing 8 changed files with 1,708 additions and 1,658 deletions.
50 changes: 49 additions & 1 deletion qiita_db/commands.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
# The full license is in the file LICENSE, distributed with this software.
# -----------------------------------------------------------------------------

from dateutil.parser import parse
import pandas as pd
from functools import partial
try:
Expand All @@ -18,7 +19,7 @@
from .study import Study, StudyPerson
from .user import User
from .util import get_filetypes, get_filepath_types
from .data import RawData
from .data import RawData, PreprocessedData, ProcessedData
from .metadata_template import SampleTemplate


Expand Down Expand Up @@ -108,3 +109,50 @@ def load_raw_data_cmd(filepaths, filepath_types, filetype, study_ids):

return RawData.create(filetype_id, list(zip(filepaths, filepath_types)),
studies)


def load_processed_data_cmd(fps, fp_types, processed_params_table_name,
                            processed_params_id, preprocessed_data_id=None,
                            processed_date=None):
    """Add a new processed data entry

    Parameters
    ----------
    fps : list of str
        Paths to the processed data files to associate with the ProcessedData
        object
    fp_types : list of str
        The types of files, one per fp
    processed_params_table_name : str
        The name of the processed_params_ table to use
    processed_params_id : int
        The ID of the row in the processed_params_ table
    preprocessed_data_id : int, optional
        Defaults to ``None``. The ID of the row in the preprocessed_data
        table.
    processed_date : str, optional
        Defaults to ``None``. The date and time to use as the processing
        date. Must be interpretable as a datetime object

    Returns
    -------
    qiita_db.ProcessedData
        The newly created `qiita_db.ProcessedData` object

    Raises
    ------
    ValueError
        If the number of elements in `fps` and `fp_types` do not match
    """
    if len(fps) != len(fp_types):
        raise ValueError("Please pass exactly one fp_type for each "
                         "and every fp")

    # Translate the human-readable filepath type names into their
    # database IDs before handing them to ProcessedData.create
    fp_types_dict = get_filepath_types()
    fp_types = [fp_types_dict[x] for x in fp_types]

    if preprocessed_data_id is not None:
        preprocessed_data = PreprocessedData(preprocessed_data_id)
    else:
        preprocessed_data = None

    if processed_date is not None:
        # dateutil.parser.parse accepts any string interpretable as a
        # datetime; a None date is passed through for the DB default
        processed_date = parse(processed_date)

    return ProcessedData.create(processed_params_table_name,
                                processed_params_id, list(zip(fps, fp_types)),
                                preprocessed_data, processed_date)
6 changes: 2 additions & 4 deletions qiita_db/support_files/qiita-db.dbs
Original file line number Diff line number Diff line change
Expand Up @@ -635,10 +635,8 @@ Linked by y being raw_data_id from raw data table.</comment>
<table name="processed_filepath" >
<column name="processed_data_id" type="bigint" jt="-5" mandatory="y" />
<column name="filepath_id" type="bigint" jt="-5" mandatory="y" />
<index name="pk_processed_data_filepath" unique="UNIQUE" >
<index name="idx_processed_filepath" unique="PRIMARY_KEY" >
<column name="processed_data_id" />
</index>
<index name="idx_processed_data_filepath" unique="NORMAL" >
<column name="filepath_id" />
</index>
<fk name="fk_processed_data_filepath" to_schema="qiita" to_table="processed_data" >
Expand Down Expand Up @@ -1262,8 +1260,8 @@ Controlled Vocabulary]]></comment>
<entity schema="qiita" name="raw_data" color="d0def5" x="1230" y="480" />
<entity schema="qiita" name="raw_preprocessed_data" color="b2cdf7" x="1230" y="585" />
<entity schema="qiita" name="preprocessed_filepath" color="c0d4f3" x="990" y="705" />
<entity schema="qiita" name="processed_filepath" color="c0d4f3" x="1005" y="930" />
<entity schema="qiita" name="preprocessed_data" color="c0d4f3" x="1200" y="690" />
<entity schema="qiita" name="processed_filepath" color="c0d4f3" x="1005" y="930" />
<group name="Group_analyses" color="c4e0f9" >
<comment>analysis tables</comment>
<entity schema="qiita" name="analysis" />
Expand Down
3,191 changes: 1,545 additions & 1,646 deletions qiita_db/support_files/qiita-db.html

Large diffs are not rendered by default.

4 changes: 1 addition & 3 deletions qiita_db/support_files/qiita-db.sql
Original file line number Diff line number Diff line change
Expand Up @@ -456,13 +456,11 @@ CREATE INDEX idx_preprocessed_processed_data_1 ON qiita.preprocessed_processed_d
CREATE TABLE qiita.processed_filepath (
processed_data_id bigint NOT NULL,
filepath_id bigint NOT NULL,
CONSTRAINT pk_processed_data_filepath UNIQUE ( processed_data_id ) ,
CONSTRAINT idx_processed_filepath PRIMARY KEY ( processed_data_id, filepath_id ),
CONSTRAINT fk_processed_data_filepath FOREIGN KEY ( processed_data_id ) REFERENCES qiita.processed_data( processed_data_id ) ,
CONSTRAINT fk_processed_data_filepath_0 FOREIGN KEY ( filepath_id ) REFERENCES qiita.filepath( filepath_id )
);

CREATE INDEX idx_processed_data_filepath ON qiita.processed_filepath ( filepath_id );

CREATE TABLE qiita.processed_params_uclust (
processed_params_id bigserial NOT NULL,
reference_id bigint NOT NULL,
Expand Down
61 changes: 60 additions & 1 deletion qiita_db/test/test_commands.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,10 +19,11 @@
from configparser import NoOptionError

from qiita_db.commands import (make_study_from_cmd, load_raw_data_cmd,
sample_template_adder)
sample_template_adder, load_processed_data_cmd)
from qiita_db.study import Study, StudyPerson
from qiita_db.user import User
from qiita_db.util import get_count, check_count, get_db_files_base_dir
from qiita_db.data import PreprocessedData
from qiita_core.util import qiita_test_checker


Expand Down Expand Up @@ -148,6 +149,64 @@ def test_load_data_from_cmd(self):
study_ids)


@qiita_test_checker()
class TestLoadProcessedDataFromCmd(TestCase):
    """Tests for qiita_db.commands.load_processed_data_cmd

    Runs against the qiita test database (set up by the
    @qiita_test_checker decorator) and the test filesystem layout.
    """

    def setUp(self):
        # Create two placeholder .biom files on disk; the command under
        # test only registers/moves files, so their contents are
        # irrelevant — a single newline is enough to have a real file.
        fd, self.otu_table_fp = mkstemp(suffix='_otu_table.biom')
        close(fd)
        fd, self.otu_table_2_fp = mkstemp(suffix='_otu_table2.biom')
        close(fd)

        with open(self.otu_table_fp, "w") as f:
            f.write("\n")
        with open(self.otu_table_2_fp, "w") as f:
            f.write("\n")

        # Any path appended to this list is removed in tearDown
        self.files_to_remove = []
        self.files_to_remove.append(self.otu_table_fp)
        self.files_to_remove.append(self.otu_table_2_fp)

        # Directory into which ProcessedData.create copies the inputs;
        # the copies are registered for cleanup inside the test itself
        self.db_test_processed_data_dir = join(get_db_files_base_dir(),
                                               'processed_data')

    def tearDown(self):
        # Best-effort cleanup: only remove files that still exist
        for fp in self.files_to_remove:
            if exists(fp):
                remove(fp)

    def test_load_processed_data_from_cmd(self):
        filepaths = [self.otu_table_fp, self.otu_table_2_fp]
        filepath_types = ['biom', 'biom']

        # Snapshot row counts so assertions are relative to the
        # pre-existing test-database contents
        initial_processed_data_count = get_count('qiita.processed_data')
        initial_processed_fp_count = get_count('qiita.processed_filepath')
        initial_fp_count = get_count('qiita.filepath')

        new = load_processed_data_cmd(filepaths, filepath_types,
                                      'processed_params_uclust', 1, 1, None)
        processed_data_id = new.id
        # The command copies inputs into the processed_data dir under a
        # "<id>_<basename>" name; register those copies for cleanup
        self.files_to_remove.append(
            join(self.db_test_processed_data_dir,
                 '%d_%s' % (processed_data_id, basename(self.otu_table_fp))))
        self.files_to_remove.append(
            join(self.db_test_processed_data_dir,
                 '%d_%s' % (processed_data_id,
                            basename(self.otu_table_2_fp))))

        # One new processed_data row; one filepath row (and linking row)
        # per input file
        self.assertTrue(check_count('qiita.processed_data',
                                    initial_processed_data_count + 1))
        self.assertTrue(check_count('qiita.processed_filepath',
                                    initial_processed_fp_count + 2))
        self.assertTrue(check_count('qiita.filepath',
                                    initial_fp_count + 2))

        # Ensure that the ValueError is raised when a filepath_type is not
        # provided for each and every filepath
        with self.assertRaises(ValueError):
            load_processed_data_cmd(filepaths, filepath_types[:-1],
                                    'processed_params_uclust', 1, 1, None)


CONFIG_1 = """[required]
timeseries_type_id = 1
metadata_complete = True
Expand Down
6 changes: 5 additions & 1 deletion qiita_db/test/test_util.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@
compute_checksum, check_table_cols,
check_required_columns, convert_to_id,
get_table_cols, get_filetypes, get_filepath_types,
get_count, check_count)
get_count, check_count, get_processed_params_tables)


@qiita_test_checker()
Expand Down Expand Up @@ -150,6 +150,10 @@ def test_check_count(self):
self.assertTrue(check_count('qiita.study_person', 3))
self.assertFalse(check_count('qiita.study_person', 2))

def test_get_processed_params_tables(self):
obs = get_processed_params_tables()
self.assertEqual(obs, ['processed_params_uclust'])


class UtilTests(TestCase):
"""Tests for the util functions that do not need to access the DB"""
Expand Down
14 changes: 14 additions & 0 deletions qiita_db/util.py
Original file line number Diff line number Diff line change
Expand Up @@ -467,3 +467,17 @@ def check_count(table, exp_count):
"""
obs_count = get_count(table)
return obs_count == exp_count


def get_processed_params_tables():
    """Returns a list of all tables starting with "processed_params_"

    Returns
    -------
    list of str
        The matching table names, sorted alphabetically
    """
    # Select table_name explicitly instead of SELECT * with a positional
    # index, so the result does not depend on information_schema column
    # ordering; ORDER BY makes the returned list deterministic
    sql = ("SELECT table_name FROM information_schema.tables "
           "WHERE table_schema = 'qiita' "
           "AND SUBSTR(table_name, 1, 17) = 'processed_params_' "
           "ORDER BY table_name")

    conn = SQLConnectionHandler()
    return [x[0] for x in conn.execute_fetchall(sql)]
34 changes: 32 additions & 2 deletions scripts/qiita_db
Original file line number Diff line number Diff line change
Expand Up @@ -10,9 +10,10 @@

import click

from qiita_db.util import get_filetypes, get_filepath_types
from qiita_db.util import (get_filetypes, get_filepath_types,
get_processed_params_tables)
from qiita_db.commands import (sample_template_adder, make_study_from_cmd,
load_raw_data_cmd)
load_raw_data_cmd, load_processed_data_cmd)


@click.group()
Expand All @@ -39,6 +40,35 @@ def load_raw_data(fp, fp_type, filetype, study):
load_raw_data_cmd(fp, fp_type, filetype, study)


@qiita_db.command()
@click.option('--fp', required=True, type=click.Path(resolve_path=True,
              readable=True, exists=True), multiple=True, help='Path to the '
              'processed data. This option can be used multiple times if '
              'there are multiple processed data files.')
@click.option('--fp_type', required=True, multiple=True, help='Describes the '
              'contents of the file. Pass one fp_type per fp.',
              type=click.Choice(get_filepath_types().keys()))
@click.option('--processed_params_table', required=True,
              type=click.Choice(get_processed_params_tables()),
              help='The table containing the processed parameters used to '
              'generate this file')
@click.option('--processed_params_id', required=True, type=int,
              help='The ID of the row in the processed_params table')
@click.option('--preprocessed_data_id', type=int, default=None, help='The '
              'ID of the row in the preprocessed_data table from which '
              'this processed data was created')
@click.option('--processed_date', type=str, default=None,
              help='The date to use as the processed_date. Must be '
              'interpretable as a datetime. If None, then the current date '
              'and time will be used.')
def load_processed_data(fp, fp_type, processed_params_table,
                        processed_params_id, preprocessed_data_id,
                        processed_date):
    """Loads processed data files into the database"""
    # Thin CLI wrapper: all validation and DB work happens in
    # qiita_db.commands.load_processed_data_cmd
    load_processed_data_cmd(fp, fp_type, processed_params_table,
                            processed_params_id, preprocessed_data_id,
                            processed_date)


@qiita_db.command()
@click.option('--owner', help="The email address of the owner of the study")
@click.option('--title', help="The title of the study")
Expand Down

0 comments on commit 2dc78a3

Please sign in to comment.