Merge pull request #1494 from josenavas/ebi-improvements
Solving merge conflicts
antgonza committed Oct 10, 2015
2 parents 0a47d89 + e17520a commit 1cee0a3
Showing 8 changed files with 290 additions and 33 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
@@ -4,6 +4,7 @@ Version 0.2.0-dev (Changes since version 0.2.0 go here)
-------------------------------------------------------

* Users can now change values and add samples and/or columns to sample and prep templates using the <kbd>Update</kbd> button (see the prep template and sample template tabs).
* The raw files of a RawData can now be updated using the `qiita db update_raw_data` CLI command.
* instrument_model is now a required prep template column for EBI submissions.

Version 0.2.0 (2015-08-25)
72 changes: 70 additions & 2 deletions qiita_db/commands.py
@@ -18,7 +18,7 @@
from .study import Study, StudyPerson
from .user import User
from .util import (get_filetypes, get_filepath_types, compute_checksum,
convert_to_id)
convert_to_id, move_filepaths_to_upload_folder)
from .data import RawData, PreprocessedData, ProcessedData
from .metadata_template import (SampleTemplate, PrepTemplate,
load_template_to_dataframe)
@@ -302,6 +302,73 @@ def load_parameters_from_cmd(name, fp, table):
return constructor.create(name, **params)


def update_raw_data_from_cmd(filepaths, filepath_types, study_id, rd_id=None):
"""Updates the raw data of the study 'study_id'
Parameters
----------
filepaths : iterable of str
Paths to the raw data files
filepath_types : iterable of str
Describes the contents of the files
study_id : int
The study_id of the study to be updated
rd_id : int, optional
The id of the raw data to be updated. If not provided, the raw data
with lowest id in the study will be updated
Returns
-------
qiita_db.data.RawData
Raises
------
ValueError
If 'filepaths' and 'filepath_types' do not have the same length
If the study does not have any raw data
If rd_id is provided and it does not belong to the given study
"""
if len(filepaths) != len(filepath_types):
raise ValueError("Please provide exactly one filepath_type for each "
"and every filepath")
with TRN:
study = Study(study_id)
raw_data_ids = study.raw_data()
if not raw_data_ids:
raise ValueError("Study %d does not have any raw data" % study_id)

if rd_id:
if rd_id not in raw_data_ids:
raise ValueError(
"The raw data %d does not exist in the study %d. Available"
" raw data: %s"
% (rd_id, study_id, ', '.join(map(str, raw_data_ids))))
raw_data = RawData(rd_id)
else:
raw_data = RawData(sorted(raw_data_ids)[0])

filepath_types_dict = get_filepath_types()
try:
filepath_types = [filepath_types_dict[x] for x in filepath_types]
except KeyError:
supported_types = filepath_types_dict.keys()
unsupported_types = set(filepath_types).difference(supported_types)
raise ValueError(
"Some filepath types provided are not recognized (%s). "
"Please choose from: %s"
% (', '.join(unsupported_types), ', '.join(supported_types)))

fps = raw_data.get_filepaths()
sql = "DELETE FROM qiita.raw_filepath WHERE raw_data_id = %s"
TRN.add(sql, [raw_data.id])
TRN.execute()
move_filepaths_to_upload_folder(study_id, fps)

raw_data.add_filepaths(list(zip(filepaths, filepath_types)))

return raw_data

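For reference, a minimal sketch of driving this helper directly from Python, assuming a configured Qiita environment; the file paths and the study id below are hypothetical, and the filepath types mirror the ones exercised in the tests:

```python
from qiita_db.commands import update_raw_data_from_cmd

# Hypothetical inputs: two replacement files for study 1
filepaths = ['/tmp/new_seqs.fastq', '/tmp/new_barcodes.fastq']
filepath_types = ['raw_forward_seqs', 'raw_barcodes']

# Replaces the files of the lowest-id raw data of study 1; the old files
# are moved back to the study's upload folder
rd = update_raw_data_from_cmd(filepaths, filepath_types, 1)
```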

def update_preprocessed_data_from_cmd(sl_out_dir, study_id, ppd_id=None):
"""Updates the preprocessed data of the study 'study_id'
@@ -346,7 +413,8 @@ def update_preprocessed_data_from_cmd(sl_out_dir, study_id, ppd_id=None):
study = Study(study_id)
ppds = study.preprocessed_data()
if not ppds:
raise ValueError("Study %s does not have any preprocessed data")
raise ValueError("Study %s does not have any preprocessed data",
study_id)

if ppd_id:
if ppd_id not in ppds:
12 changes: 4 additions & 8 deletions qiita_db/metadata_template/base_metadata_template.py
@@ -1162,19 +1162,15 @@ def update(self, md_template):
% ', '.join(columns_diff))

# In order to speed up some computation, let's compare only the
# common columns. current_map.columns is a superset of
# new_map.columns, so this will not fail
current_map = current_map[new_map.columns]
# common columns and rows. current_map.columns and
# current_map.index are supersets of new_map.columns and
# new_map.index, respectively, so this will not fail
current_map = current_map[new_map.columns].loc[new_map.index]

# Get the values that we need to change
# diff_map is a DataFrame that holds boolean values. If a cell is
# True, it means that the new_map is different from the current_map
# while False means that the cell has the same value
# In order to compare them, they have to be identically labeled, so
# we need to sort the 'index' axis to be identically labeled. The
# 'column' axis is already the same given the previous line of code
current_map.sort_index(axis='index', inplace=True)
new_map.sort_index(axis='index', inplace=True)
diff_map = current_map != new_map
# ne_stacked holds a MultiIndexed DataFrame in which the first
# level of indexing is the sample_name and the second one is the
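A toy pandas sketch (hypothetical frames) of the subsetting used above, where the update frame covers fewer samples and columns than the stored map:

```python
import pandas as pd

# The DB holds two samples and two columns; the update touches one of each
current_map = pd.DataFrame({'a': [1, 2], 'b': [3, 4]}, index=['s1', 's2'])
new_map = pd.DataFrame({'a': [9]}, index=['s1'])

# Restrict to the common columns and rows, then compare cell by cell
current_map = current_map[new_map.columns].loc[new_map.index]
diff_map = current_map != new_map  # True where a value changed
print(diff_map)  # 'a' is True for 's1'
```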
13 changes: 13 additions & 0 deletions qiita_db/metadata_template/test/test_sample_template.py
@@ -1227,6 +1227,19 @@ def test_update(self):
with self.assertRaises(QiitaDBError):
st.update(self.metadata_dict_updated_column_error)

def test_update_fewer_samples(self):
"""Updates using a dataframe with less samples that in the DB"""
st = SampleTemplate.create(self.metadata, self.new_study)
new_metadata = pd.DataFrame.from_dict(
{'Sample1': {'physical_specimen_location': 'CHANGE'}},
orient='index')
exp = {s_id: st[s_id]._to_dict() for s_id in st}
s_id = '%d.Sample1' % self.new_study.id
exp[s_id]['physical_specimen_location'] = 'CHANGE'
npt.assert_warns(QiitaDBWarning, st.update, new_metadata)
obs = {s_id: st[s_id]._to_dict() for s_id in st}
self.assertEqual(obs, exp)

def test_update_numpy(self):
"""Update values in existing mapping file with numpy values"""
metadata_dict = {
112 changes: 110 additions & 2 deletions qiita_db/test/test_commands.py
@@ -14,6 +14,7 @@
from future.utils.six import StringIO
from future import standard_library
from functools import partial
from operator import itemgetter

import pandas as pd

@@ -23,13 +24,15 @@
load_processed_data_cmd,
load_preprocessed_data_from_cmd,
load_parameters_from_cmd,
update_raw_data_from_cmd,
update_preprocessed_data_from_cmd)
from qiita_db.environment_manager import patch
from qiita_db.study import Study, StudyPerson
from qiita_db.user import User
from qiita_db.data import PreprocessedData
from qiita_db.data import PreprocessedData, RawData
from qiita_db.util import (get_count, check_count, get_db_files_base_dir,
get_mountpoint)
get_mountpoint, compute_checksum,
get_files_from_uploads_folders)
from qiita_db.metadata_template import PrepTemplate
from qiita_core.util import qiita_test_checker
from qiita_ware.processing_pipeline import generate_demux_file
@@ -453,6 +456,111 @@ def test_python_patch(self):
self._assert_current_patch('10.sql')


@qiita_test_checker()
class TestUpdateRawDataFromCmd(TestCase):
def setUp(self):
fd, seqs_fp = mkstemp(suffix='_seqs.fastq')
close(fd)
fd, barcodes_fp = mkstemp(suffix='_barcodes.fastq')
close(fd)
self.filepaths = [seqs_fp, barcodes_fp]
self.checksums = []
for fp in sorted(self.filepaths):
with open(fp, 'w') as f:
f.write("%s\n" % fp)
self.checksums.append(compute_checksum(fp))
self.filepaths_types = ["raw_forward_seqs", "raw_barcodes"]
self._clean_up_files = [seqs_fp, barcodes_fp]

info = {
"timeseries_type_id": 1,
"metadata_complete": True,
"mixs_compliant": True,
"number_samples_collected": 25,
"number_samples_promised": 28,
"study_alias": "FCM",
"study_description": "Microbiome of people who eat nothing but "
"fried chicken",
"study_abstract": "Exploring how a high fat diet changes the "
"gut microbiome",
"emp_person_id": StudyPerson(2),
"principal_investigator_id": StudyPerson(3),
"lab_person_id": StudyPerson(1)
}
self.new_study = Study.create(User("test@foo.bar"),
"Update raw data test",
efo=[1], info=info)
self.study = Study(1)
# The files for the RawData object attached to study 1 do not exist.
# Create them so we can actually perform the tests
for _, fp, _ in RawData(1).get_filepaths():
with open(fp, 'w') as f:
f.write('\n')
self._clean_up_files.append(fp)

self.uploaded_files = get_files_from_uploads_folders(
str(self.study.id))

def tearDown(self):
new_uploaded_files = get_files_from_uploads_folders(str(self.study.id))
new_files = set(new_uploaded_files).difference(self.uploaded_files)
path_builder = partial(join, get_mountpoint("uploads")[0][1], '1')
for _, fp in new_files:
self._clean_up_files.append(path_builder(fp))
for f in self._clean_up_files:
if exists(f):
remove(f)

def test_update_raw_data_from_cmd_diff_length(self):
with self.assertRaises(ValueError):
update_raw_data_from_cmd(self.filepaths[1:], self.filepaths_types,
self.study.id)
with self.assertRaises(ValueError):
update_raw_data_from_cmd(self.filepaths, self.filepaths_types[1:],
self.study.id)

def test_update_raw_data_from_cmd_no_raw_data(self):
with self.assertRaises(ValueError):
update_raw_data_from_cmd(self.filepaths, self.filepaths_types,
self.new_study.id)

def test_update_raw_data_from_cmd_wrong_raw_data_id(self):
# Using max(raw_data_ids) + 1 to make sure that the raw data id
# passed does not belong to the study
with self.assertRaises(ValueError):
update_raw_data_from_cmd(self.filepaths, self.filepaths_types,
self.study.id,
max(self.study.raw_data()) + 1)

def test_update_raw_data_from_cmd(self):
rd = update_raw_data_from_cmd(self.filepaths, self.filepaths_types,
self.study.id)
# Make sure that we are cleaning the environment
for _, fp, _ in rd.get_filepaths():
self._clean_up_files.append(fp)

# The checksums are in filepath order. If we sort the rd.get_filepaths()
# result by the filepath (itemgetter(1)) we will get them in the same
# order, so the checksum comparison will not fail
for obs, exp in zip(sorted(rd.get_filepaths(), key=itemgetter(1)),
self.checksums):
self.assertEqual(compute_checksum(obs[1]), exp)

def test_update_raw_data_from_cmd_rd_id(self):
rd = update_raw_data_from_cmd(self.filepaths, self.filepaths_types,
self.study.id, self.study.raw_data()[0])
# Make sure that we are cleaning the environment
for _, fp, _ in rd.get_filepaths():
self._clean_up_files.append(fp)

# The checksums are in filepath order. If we sort the rd.get_filepaths()
# result by the filepath (itemgetter(1)) we will get them in the same
# order, so the checksum comparison will not fail
for obs, exp in zip(sorted(rd.get_filepaths(), key=itemgetter(1)),
self.checksums):
self.assertEqual(compute_checksum(obs[1]), exp)


@qiita_test_checker()
class TestUpdatePreprocessedDataFromCmd(TestCase):
def setUp(self):
40 changes: 22 additions & 18 deletions qiita_ware/processing_pipeline.py
@@ -24,12 +24,13 @@ def _get_qiime_minimal_mapping(prep_template, out_dir):
"""Generates a minimal QIIME-compliant mapping file for split libraries
The columns of the generated file are, in order: SampleID, BarcodeSequence,
LinkerPrimerSequence, Description. All values are taken from the prep
template except for Description, which always receives the value "Qiita MMF"
LinkerPrimerSequence, [ReverseLinkerPrimer], Description. All values are
taken from the prep template except for Description, which always receives
the value "Qiita MMF"

Parameters
----------
prep_template : PrepTemplate
prep_template : qiita_db.metadata_template.PrepTemplate
The prep template from which we need to generate the minimal mapping
out_dir : str
Path to the output directory
@@ -40,42 +41,45 @@
The paths to the qiime minimal mapping files
"""
from functools import partial
from collections import defaultdict
from os.path import join
import pandas as pd

# The prep template has a QIIME mapping file, get it
qiime_map = pd.read_csv(prep_template.qiime_map_fp, sep='\t',
keep_default_na=False, na_values=['unknown'],
index_col=False,
converters=defaultdict(lambda: str))
qiime_map.set_index('#SampleID', inplace=True, drop=True)
pt_df = prep_template.to_dataframe()

# We use our own description to avoid potential processing problems
qiime_map['Description'] = pd.Series(['Qiita MMF'] * len(qiime_map.index),
index=qiime_map.index)
rename_cols = {
'barcode': 'BarcodeSequence',
'primer': 'LinkerPrimerSequence',
}

# We ensure the order of the columns as QIIME is expecting
if 'ReverseLinkerPrimer' in qiime_map:
# Ensure the order of the columns as QIIME is expecting
if 'reverselinkerprimer' in pt_df:
rename_cols['reverselinkerprimer'] = 'ReverseLinkerPrimer'
cols = ['BarcodeSequence', 'LinkerPrimerSequence',
'ReverseLinkerPrimer', 'Description']
else:
cols = ['BarcodeSequence', 'LinkerPrimerSequence', 'Description']

pt_df.rename(columns=rename_cols, inplace=True)

# Sometimes, the Description column can generate some problems in QIIME,
# depending on its values. We set it up to read Qiita MMF for all rows
pt_df['Description'] = pd.Series(['Qiita MMF'] * len(pt_df.index),
index=pt_df.index)

path_builder = partial(join, out_dir)
if 'run_prefix' in qiime_map:
if 'run_prefix' in pt_df:
# The study potentially has more than 1 lane, so we should generate a
# qiime MMF for each of the lanes. We know how to split the prep
# template based on the run_prefix column
output_fps = []
for prefix, df in qiime_map.groupby('run_prefix'):
for prefix, df in pt_df.groupby('run_prefix'):
df = df[cols]
out_fp = path_builder("%s_MMF.txt" % prefix)
output_fps.append(out_fp)
df.to_csv(out_fp, index_label="#SampleID", sep='\t')
else:
# The study only has one lane, just write the MMF
df = qiime_map[cols]
df = pt_df[cols]
out_fp = path_builder("prep_%d_MMF.txt" % prep_template.id)
output_fps = [out_fp]
df.to_csv(out_fp, index_label="#SampleID", sep='\t')
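A self-contained sketch of the per-run_prefix split performed above; the sample ids, barcodes, primer, and run prefixes are hypothetical:

```python
import pandas as pd

pt_df = pd.DataFrame(
    {'BarcodeSequence': ['GTCCGCAAGTTA', 'CGTAGAGCTCTC'],
     'LinkerPrimerSequence': ['GTGCCAGCMGCCGCGGTAA'] * 2,
     'Description': ['Qiita MMF'] * 2,
     'run_prefix': ['s_G1_L001', 's_G1_L002']},
    index=['1.SKB1.640202', '1.SKB2.640194'])

cols = ['BarcodeSequence', 'LinkerPrimerSequence', 'Description']
# One minimal mapping file per run prefix, e.g. s_G1_L001_MMF.txt
for prefix, df in pt_df.groupby('run_prefix'):
    df[cols].to_csv('%s_MMF.txt' % prefix, index_label='#SampleID', sep='\t')
```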
