Permalink
Browse files

Merge pull request #2812 from antgonza/0119-fixes

fix #2809, fix #2806
  • Loading branch information...
charles-cowart committed Feb 4, 2019
2 parents 36b3ccc + 6f10604 commit bc4eed2e4e2cdc7c42321a7195a9a3df559c7200
@@ -823,7 +823,7 @@ def _common_extend_steps(self, md_template):
new_cols = set(headers).difference(self.categories())

if not new_cols and not new_samples:
return
return None, None

is_extendable, error_msg = self.can_be_extended(new_samples,
new_cols)
@@ -1723,7 +1723,7 @@ def _identify_column_names_with_invalid_characters(cls, column_names):
set of words containing invalid (illegal) characters.
"""
valid_initial_char = letters
valid_rest = set(letters+digits+'_')
valid_rest = set(letters+digits+'_:|')
invalid = []
for s in column_names:
if s[0] not in valid_initial_char:
@@ -84,14 +84,17 @@ def test_identify_invalid_characters(self):
'sampleid',
'sample_id',
'{',
'this|is',
'bla:1',
'bla|2',
'bla1:2|3',
'this&is',
'4column',
'just_fine2'])
self.assertItemsEqual(set(results), {'tax on',
'bla.',
'.',
'{',
'this|is',
'this&is',
'4column'})


@@ -838,7 +838,7 @@ def test_clean_validate_template_no_invalid_chars2(self):

def test_clean_validate_template_no_invalid_chars3(self):
ST = qdb.metadata_template.sample_template.SampleTemplate
self.metadata.rename(columns={'taxon_id': 'this|is'}, inplace=True)
self.metadata.rename(columns={'taxon_id': 'this&is'}, inplace=True)
with self.assertRaises(qdb.exceptions.QiitaDBColumnError):
ST._clean_validate_template(self.metadata, 2)

@@ -1779,6 +1779,10 @@ def test_extend_update(self):
st = qdb.metadata_template.sample_template.SampleTemplate.create(
self.metadata, self.new_study)

# test updating with same data, none of the rest of the code/tests
# should change
st.extend_and_update(self.metadata)

self.metadata_dict['Sample4'] = {
'physical_specimen_location': 'location1',
'physical_specimen_remaining': 'true',
@@ -1,8 +1,44 @@
Checklist to send data to EBI-ENA
=================================
.. role:: red

Send data to EBI-ENA
====================

Qiita allows users to deposit their study, sample, experiment and sequence data to the
`European Nucleotide Archive (ENA) <https://www.ebi.ac.uk/ena>`__, which is the permanent data
repository of the `European Bioinformatics Institute (EBI) <https://www.ebi.ac.uk/>`__. Submitting to
this repository will provide you with a unique identifier for your study, which is generally a
requirement for publications. Your study will be housed with all other Qiita submissions
and so we require adherence to the `MIxS standard <http://gensc.org/mixs/>`__.

`Here <https://knightlab.ucsd.edu/wordpress/wp-content/uploads/2016/04/QiitaTemplate_20181218.xlsx>`__ you will find a document outlining these requirements, with examples where possible.

Note that submissions are time-consuming and need full collaboration from the user.
:red:`Do not wait until the last minute to request help.` In general, the best
time to request a submission is when you are writing your paper. Remember that the
data can be submitted to EBI and can be kept private and simply made public when
the paper is accepted. Note that EBI/ENA takes up to 15 days to change the status
from private to public, so consider this when submitting data and your manuscript.

.. note::
For convenience, Qiita allows you to upload a QIIME mapping file to process your data. However,
the QIIME mapping file, in general, does not have all the EBI/ENA fields. Thus, you will need to
update your information files (sample or preparation) via the update option. To simplify this process,
you can download the system-generated files and add/modify these fields for each file.


EBI-ENA NULL values vocabulary
------------------------------

We support only the following values: *not applicable*, *not collected*, *not provided*, *restricted access*.

For the latest definitions and explanations, visit the `EBI/ENA Missing value reporting <http://www.ebi.ac.uk/ena/about/missing-values-reporting>`__.

.. warning::
Column names in your information files cannot be Postgres reserved words. For example, a column cannot be named `CONDITION`, but could instead be named `DISEASE_CONDITION`. For a full list of these reserved words, see this `link <https://www.postgresql.org/docs/9.3/static/sql-keywords-appendix.html>`__.

Checklist
---------

For each preparation that needs to be uploaded to EBI-ENA we will check:

1. Data processing
@@ -27,6 +63,40 @@ For each preparation that needs to be uploaded to EBI-ENA we will check:
7. *elevation*, *latitude*, *longitude*
8. *empo_1*, *empo_2*, *empo_3*

.. table::
:widths: auto

=============== ================= ======================= ================================================================================
empo_1 empo_2 empo_3 Examples
Free-living Non-saline Water (non-saline) fresh water from lake, pond, or river (<5 psu)
Free-living Non-saline Sediment (non-saline) sediment from lake, pond, or river (<5 psu)
Free-living Non-saline Soil (non-saline) soil from forest, grassland, tundra, desert, etc.
Free-living Non-saline Surface (non-saline) biofilm from wet (<5 psu) or dry surface, wood, dust, or microbial mat
Free-living Non-saline Subsurface (non-saline) deep or subsurface environment
Free-living Non-saline Aerosol (non-saline) aerosolized dust or liquid
Free-living Saline Water (saline) salt water from ocean, sea, estuary, mangrove, or coral reef (>5 psu)
Free-living Saline Sediment (saline) sediment from ocean, sea, estuary, mangrove, or beach (>5 psu)
Free-living Saline Hypersaline (saline) water from hypersaline sample or brine (>50 psu)
Free-living Saline Surface (saline) biofilm from wet or underwater surface or microbial mat (>5 psu)
Free-living Saline Aerosol (saline) seaspray or other aerosolized saline material (>5 psu)
Host-associated Animal-associated Animal distal gut feces, stool
Host-associated Animal-associated Animal proximal gut digesta
Host-associated Animal-associated Animal secretion gut intestine, gizzard, crop, lumen, or mucosa
Host-associated Animal-associated Animal surface skin, sebum, mucus, slime
Host-associated Animal-associated Animal corpus tissue of sponge, coral, gill, siphon, carcass, etc. or whole small animal
Host-associated Fungus-associated Fungus corpus tissue of mushroom or other fungi
Host-associated Fungus-associated Fungus surface biofilm of mushroom
Host-associated Plant-associated Plant secretion pollen or sap
Host-associated Plant-associated Plant surface leaf or kelp surface biofilm
Host-associated Plant-associated Plant rhizosphere plant root system, may include some soil
Host-associated Plant-associated Plant corpus tissue of leaf, stem, fruit, or algae
Control Negative Sterile water blank sterile water blank used as negative control for extraction, PCR, and sequencing
Control Positive Mock community known mixed community used as positive control
Control Positive Single strain known single strain control culture
Unknown Contradictory Unknown (contradictory) unknown sample type because other metadata is contradictory
Unknown Missing Unknown (missing) unknown sample type because metadata is unavailable
=============== ================= ======================= ================================================================================

c. Extra minimal information for host associated studies:

1. *host_body_habitat*, *host_body_site*, *host_body_product*

This file was deleted.

Oops, something went wrong.
@@ -38,7 +38,6 @@ Looking for information about submitting your files to EBI? Please see the docum

.. toctree::

europeanbioinformaticsinstitute.rst
checklist-for-ebi-ena-submission

Looking for comparable studies? Please see the document here:
@@ -212,35 +212,26 @@ def submit_EBI(artifact_id, action, send, test=False, test_size=False):
LogEntry.create(
'Runtime', 'The submission: %d is larger than allowed (%d), will '
'try to fix: %d' % (artifact_id, max_size, total_size))
# let's confirm that we are only dealing with the latest samples and
# then convert them to a DataFrame for easier cleanup
new_samples = {
sample for sample, accession in viewitems(
ebi_submission.prep_template.ebi_experiment_accessions)
if accession is None}
new_samples = new_samples.intersection(ebi_submission.samples)
# transform current metadata to dataframe for easier curation
rows = {k: dict(v) for k, v in viewitems(ebi_submission.samples)}
df = pd.DataFrame.from_dict(rows, orient='index')
# remove unique columns and same value in all columns
nunique = df.apply(pd.Series.nunique)
nsamples = len(df.index)
cols_to_drop = set(
nunique[(nunique == 1) | (nunique == nsamples)].index)
# maximize deletion by removing also columns that are almost all the
# same or almost all unique
cols_to_drop = set(
nunique[(nunique <= int(nsamples * .01)) |
(nunique >= int(nsamples * .5))].index)
cols_to_drop = cols_to_drop - {'taxon_id', 'scientific_name',
'description'}
df.drop(columns=cols_to_drop, inplace=True)
# let's overwrite samples
ebi_submission.samples = {k: r.to_dict() for k, r in df.iterrows()}
all_samples = ebi_submission.sample_template.ebi_sample_accessions
samples = {k: all_samples[k] for k in ebi_submission.samples}
ebi_submission.write_xml_file(
ebi_submission.generate_sample_xml(new_samples),
ebi_submission.generate_sample_xml(samples, cols_to_drop),
ebi_submission.sample_xml_fp)
# let's do the same with the prep
ebi_submission.write_xml_file(
ebi_submission.generate_experiment_xml(new_samples),
ebi_submission.experiment_xml_fp)

# now let's recalculate the size to make sure it's fine
new_total_size = sum([stat(tr).st_size
Oops, something went wrong.

0 comments on commit bc4eed2

Please sign in to comment.