Skip to content

Commit

Permalink
Merge 2b8d608 into a60cec2
Browse files Browse the repository at this point in the history
  • Loading branch information
antgonza committed Apr 10, 2017
2 parents a60cec2 + 2b8d608 commit 129f0f6
Show file tree
Hide file tree
Showing 12 changed files with 388 additions and 226 deletions.
18 changes: 17 additions & 1 deletion qiita_core/tests/test_util.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,8 @@

from qiita_core.util import (
send_email, qiita_test_checker, execute_as_transaction, get_qiita_version,
is_test_environment)
is_test_environment, get_release_info)
from qiita_db.meta_util import generate_biom_and_metadata_release
import qiita_db as qdb


Expand Down Expand Up @@ -64,6 +65,21 @@ def test_get_qiita_version(self):
# testing just the version
self.assertEqual(exp_version, qdb.__version__)

def test_get_release_info(self):
    # guarantee there is a release to query before asking for its info
    generate_biom_and_metadata_release('private')

    # the MD5 (and thus filepath/timestamp) changes on every run, so we
    # can only assert the stored values are non-empty
    for obs in get_release_info('private'):
        self.assertNotEqual(obs, '')

    # no public release was generated, so every value must be empty
    for obs in get_release_info('public'):
        self.assertEqual(obs, '')


if __name__ == '__main__':
main()
30 changes: 30 additions & 0 deletions qiita_core/util.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
from os.path import dirname
from git import Repo
from git.exc import InvalidGitRepositoryError
from moi import r_client

from qiita_core.qiita_settings import qiita_config
from qiita_pet import __version__ as qiita_pet_lib_version
Expand Down Expand Up @@ -141,3 +142,32 @@ def get_qiita_version():
sha = ''

return (qiita_pet_lib_version, sha)


def get_release_info(study_status='public'):
    """Returns the study status release MD5, filepath and timestamp

    Parameters
    ----------
    study_status : str, optional
        The study status to search for. Note that this should always be set
        to 'public' but having this exposed helps with testing. The other
        options are 'private' and 'sandbox'

    Returns
    -------
    str, str, str
        The release MD5, filepath and timestamp; each value is '' when the
        corresponding redis key has not been set for this portal/status
    """
    portal = qiita_config.portal
    # redis returns None for missing keys; normalize every value to '' so
    # callers can simply test for emptiness
    vals = []
    for key in ('md5sum', 'filepath', 'time'):
        val = r_client.get('%s:release:%s:%s' % (portal, study_status, key))
        vals.append('' if val is None else val)
    md5sum, filepath, timestamp = vals

    return md5sum, filepath, timestamp
102 changes: 101 additions & 1 deletion qiita_db/meta_util.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,8 @@
from __future__ import division

from moi import r_client
from os import stat
from os import stat, makedirs, rename
from os.path import join, relpath, exists
from time import strftime, localtime
import matplotlib.pyplot as plt
import matplotlib as mpl
Expand All @@ -34,8 +35,11 @@
from StringIO import StringIO
from future.utils import viewitems
from datetime import datetime
from tarfile import open as topen, TarInfo
from hashlib import md5

from qiita_core.qiita_settings import qiita_config
from qiita_core.configuration_manager import ConfigurationManager
import qiita_db as qdb


Expand Down Expand Up @@ -332,3 +336,99 @@ def get_lat_longs():
qdb.sql_connection.TRN.add(sql)

return qdb.sql_connection.TRN.execute_fetchindex()


def generate_biom_and_metadata_release(study_status='public'):
    """Generate a list of biom/metadata filepaths and a tgz of those files

    Builds a tgz containing, for every study with the given status, its
    sample template, prep template(s) and BIOM artifact files, plus a txt
    manifest; the tgz location, md5 and creation time are stored in redis
    under '<portal>:release:<study_status>:{filepath,md5sum,time}'.

    Parameters
    ----------
    study_status : str, optional
        The study status to search for. Note that this should always be set
        to 'public' but having this exposed helps with testing. The other
        options are 'private' and 'sandbox'
    """
    studies = qdb.study.Study.get_by_status(study_status)
    qiita_config = ConfigurationManager()
    working_dir = qiita_config.working_dir
    portal = qiita_config.portal
    bdir = qdb.util.get_db_files_base_dir()
    # human-readable timestamp, stored later under the redis 'time' key
    time = datetime.now().strftime('%m-%d-%y %H:%M:%S')

    data = []
    for s in studies:
        # [0] latest is first, [1] only getting the filepath
        sample_fp = relpath(s.sample_template.get_filepaths()[0][1], bdir)

        for a in s.artifacts(artifact_type='BIOM'):
            # artifacts without processing parameters carry no command info
            # to report, so they are skipped
            if a.processing_parameters is None:
                continue

            cmd_name = a.processing_parameters.command.name

            # this loop is necessary as in theory an artifact can be
            # generated from multiple prep info files
            human_cmd = []
            for p in a.parents:
                pp = p.processing_parameters
                pp_cmd_name = pp.command.name
                if pp_cmd_name == 'Trimming':
                    human_cmd.append('%s @ %s' % (
                        cmd_name, str(pp.values['length'])))
                else:
                    human_cmd.append('%s, %s' % (cmd_name, pp_cmd_name))
            human_cmd = ', '.join(human_cmd)

            for _, fp, fp_type in a.filepaths:
                # only biom filepaths are released; 'only-16s' files are
                # deliberately excluded
                if fp_type != 'biom' or 'only-16s' in fp:
                    continue
                fp = relpath(fp, bdir)
                # format: (biom_fp, sample_fp, prep_fp, qiita_artifact_id,
                # human readable name)
                for pt in a.prep_templates:
                    # take the first non-qiime prep filepath; the break
                    # leaves prep_fp bound to that entry
                    for _, prep_fp in pt.get_filepaths():
                        if 'qiime' not in prep_fp:
                            break
                    prep_fp = relpath(prep_fp, bdir)
                    data.append((fp, sample_fp, prep_fp, a.id, human_cmd))

    # writing text and tgz file
    ts = datetime.now().strftime('%m%d%y-%H%M%S')
    tgz_dir = join(working_dir, 'releases')
    if not exists(tgz_dir):
        makedirs(tgz_dir)
    # build under a '-building' name so readers never see a partial tgz;
    # it is renamed to the final name once fully written
    tgz_name = join(tgz_dir, '%s-%s-building.tgz' % (portal, study_status))
    tgz_name_final = join(tgz_dir, '%s-%s.tgz' % (portal, study_status))
    txt_hd = StringIO()
    with topen(tgz_name, "w|gz") as tgz:
        # writing header for txt
        txt_hd.write(
            "biom_fp\tsample_fp\tprep_fp\tqiita_artifact_id\tcommand\n")
        for biom_fp, sample_fp, prep_fp, artifact_id, human_cmd in data:
            txt_hd.write("%s\t%s\t%s\t%s\t%s\n" % (
                biom_fp, sample_fp, prep_fp, artifact_id, human_cmd))
            tgz.add(join(bdir, biom_fp), arcname=biom_fp, recursive=False)
            tgz.add(join(bdir, sample_fp), arcname=sample_fp, recursive=False)
            tgz.add(join(bdir, prep_fp), arcname=prep_fp, recursive=False)

        txt_hd.seek(0)
        # the timestamped manifest is appended as an in-memory tar member;
        # NOTE(review): .buf is a py2 StringIO.StringIO internal that is
        # only consolidated after the seek(0) above — confirm if StringIO
        # implementation ever changes
        info = TarInfo(name='%s-%s-%s.txt' % (portal, study_status, ts))
        info.size = len(txt_hd.buf)
        tgz.addfile(tarinfo=info, fileobj=txt_hd)

    # md5 of the finished tgz, read in 4K chunks to bound memory use
    with open(tgz_name, "rb") as f:
        md5sum = md5()
        for c in iter(lambda: f.read(4096), b""):
            md5sum.update(c)

    rename(tgz_name, tgz_name_final)

    # store filepath (relative to working_dir), md5sum and timestamp
    vals = [
        ('filepath', tgz_name_final[len(working_dir):], r_client.set),
        ('md5sum', md5sum.hexdigest(), r_client.set),
        ('time', time, r_client.set)]
    for k, v, f in vals:
        redis_key = '%s:release:%s:%s' % (portal, study_status, k)
        # important to "flush" variables to avoid errors
        r_client.delete(redis_key)
        f(redis_key, v)
165 changes: 165 additions & 0 deletions qiita_db/test/test_meta_util.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,9 @@

from unittest import TestCase, main
import numpy.testing as npt
from tarfile import open as topen
from os import remove
from os.path import exists, join

import pandas as pd

Expand All @@ -22,9 +25,13 @@
class MetaUtilTests(TestCase):
def setUp(self):
    # files registered here are removed during tearDown
    self.files_to_remove = []
    # remember the configured portal so tearDown can restore it
    self.old_portal = qiita_config.portal

def tearDown(self):
    # restore whichever portal was configured before the test ran
    qiita_config.portal = self.old_portal
    # remove any files the test left behind, skipping missing ones
    for path in self.files_to_remove:
        if exists(path):
            remove(path)

def _set_artifact_private(self):
self.conn_handler.execute(
Expand Down Expand Up @@ -227,6 +234,164 @@ def test_update_redis_stats(self):
redis_key = '%s:stats:%s' % (portal, k)
self.assertEqual(f(redis_key), exp)

def _assert_release_contents(self, tgz):
    """Validates the contents of a release tgz for the test study

    Parameters
    ----------
    tgz : str
        Path of the release tgz to inspect

    Notes
    -----
    Asserts that the expected biom, prep and sample template members are
    present (with their expected multiplicities) and that the embedded txt
    manifest lists them correctly.
    """
    tmp = topen(tgz, "r:gz")
    tgz_obs = [ti.name for ti in tmp]
    tmp.close()
    # file names might change due to updates and patches so just check
    # that the prefix exists
    fn = 'processed_data/1_study_1001_closed_reference_otu_table.biom'
    # yes, this file is there twice
    for _ in range(2):
        self.assertTrue(fn in tgz_obs)
        tgz_obs.remove(fn)
    # let's check the next biom
    fn = ('processed_data/1_study_1001_closed_reference_otu_table_Silva.'
          'biom')
    self.assertTrue(fn in tgz_obs)
    tgz_obs.remove(fn)
    # now let's check prep info files based on their prefix, just take
    # the first one and check/remove the 3 occurrences of that file
    fn_prep = [f for f in tgz_obs
               if f.startswith('templates/1_prep_1_')][0]
    for _ in range(3):
        self.assertTrue(fn_prep in tgz_obs)
        tgz_obs.remove(fn_prep)
    # with the prep files removed, the remaining 'templates/1_' entries
    # are the sample template, also present 3 times
    fn_sample = [f for f in tgz_obs if f.startswith('templates/1_')][0]
    for _ in range(3):
        self.assertTrue(fn_sample in tgz_obs)
        tgz_obs.remove(fn_sample)
    # now we should only have the text file
    txt = tgz_obs.pop()
    # now it should be empty
    self.assertEqual(tgz_obs, [])

    tmp = topen(tgz, "r:gz")
    fhd = tmp.extractfile(txt)
    txt_obs = fhd.readlines()
    tmp.close()
    txt_exp = [
        'biom_fp\tsample_fp\tprep_fp\tqiita_artifact_id\tcommand\n',
        'processed_data/1_study_1001_closed_reference_otu_table.biom\t'
        '%s\t%s\t4\tPick closed-reference OTUs, Split libraries FASTQ\n'
        % (fn_sample, fn_prep),
        'processed_data/1_study_1001_closed_reference_otu_table.biom\t'
        '%s\t%s\t5\tPick closed-reference OTUs, Split libraries FASTQ\n'
        % (fn_sample, fn_prep),
        'processed_data/1_study_1001_closed_reference_otu_table_Silva.bio'
        'm\t%s\t%s\t6\tPick closed-reference OTUs, Split libraries FASTQ\n'
        % (fn_sample, fn_prep)]
    self.assertEqual(txt_obs, txt_exp)

def test_generate_biom_and_metadata_release(self):
    level = 'private'
    qdb.meta_util.generate_biom_and_metadata_release(level)
    portal = qiita_config.portal
    working_dir = qiita_config.working_dir

    # the release stores filepath, md5sum and time in redis but only the
    # filepath contents are checked here; the others change on every run
    redis_key = '%s:release:%s:filepath' % (portal, level)
    tgz = join(working_dir, r_client.get(redis_key))
    self.files_to_remove.extend([tgz])
    self._assert_release_contents(tgz)

    # whatever the configuration was, flip the trailing '/' of
    # base_data_dir so both forms of the relpath handling are exercised
    with qdb.sql_connection.TRN:
        qdb.sql_connection.TRN.add(
            "SELECT base_data_dir FROM settings")
        obdr = qdb.sql_connection.TRN.execute_fetchlast()
        if obdr[-1] == '/':
            bdr = obdr[:-1]
        else:
            bdr = obdr + '/'

        qdb.sql_connection.TRN.add(
            "UPDATE settings SET base_data_dir = '%s'" % bdr)
        qdb.sql_connection.TRN.execute()

    # regenerate with the flipped base_data_dir; the tgz path is the same
    # so it is already registered for removal
    qdb.meta_util.generate_biom_and_metadata_release(level)
    tgz = join(working_dir, r_client.get(redis_key))
    self._assert_release_contents(tgz)

    # returning configuration
    with qdb.sql_connection.TRN:
        qdb.sql_connection.TRN.add(
            "UPDATE settings SET base_data_dir = '%s'" % obdr)
        qdb.sql_connection.TRN.execute()


EXP_LAT_LONG = (
'[[60.1102854322, 74.7123248382], [23.1218032799, 42.838497795],'
Expand Down

0 comments on commit 129f0f6

Please sign in to comment.