Skip to content

Commit

Permalink
Merge 2b8d608 into a60cec2
Browse files Browse the repository at this point in the history
  • Loading branch information
antgonza committed Apr 10, 2017
2 parents a60cec2 + 2b8d608 commit 129f0f6
Show file tree
Hide file tree
Showing 12 changed files with 388 additions and 226 deletions.
18 changes: 17 additions & 1 deletion qiita_core/tests/test_util.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,8 @@

from qiita_core.util import (
send_email, qiita_test_checker, execute_as_transaction, get_qiita_version,
is_test_environment)
is_test_environment, get_release_info)
from qiita_db.meta_util import generate_biom_and_metadata_release
import qiita_db as qdb


Expand Down Expand Up @@ -64,6 +65,21 @@ def test_get_qiita_version(self):
# testing just the version
self.assertEqual(exp_version, qdb.__version__)

def test_get_release_info(self):
    # guarantee there is a release to query before asking for its info
    generate_biom_and_metadata_release('private')

    # the MD5 (and thus filepath/timestamp) changes on every run, so we
    # can only assert the stored values are non-empty
    for obs in get_release_info('private'):
        self.assertNotEqual(obs, '')

    # no public release was generated, so every value must be empty
    for obs in get_release_info('public'):
        self.assertEqual(obs, '')


if __name__ == '__main__':
main()
30 changes: 30 additions & 0 deletions qiita_core/util.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
from os.path import dirname
from git import Repo
from git.exc import InvalidGitRepositoryError
from moi import r_client

from qiita_core.qiita_settings import qiita_config
from qiita_pet import __version__ as qiita_pet_lib_version
Expand Down Expand Up @@ -141,3 +142,32 @@ def get_qiita_version():
sha = ''

return (qiita_pet_lib_version, sha)


def get_release_info(study_status='public'):
    """Returns the study status release MD5, filepath and timestamp

    Parameters
    ----------
    study_status : str, optional
        The study status to search for. Note that this should always be set
        to 'public' but having this exposed helps with testing. The other
        options are 'private' and 'sandbox'

    Returns
    -------
    str, str, str
        The release MD5, filepath and timestamp; each value is '' when the
        corresponding redis key has not been set for this portal/status
    """
    portal = qiita_config.portal
    # redis returns None for missing keys; normalize every value to '' so
    # callers can simply test for emptiness
    vals = []
    for key in ('md5sum', 'filepath', 'time'):
        val = r_client.get('%s:release:%s:%s' % (portal, study_status, key))
        vals.append('' if val is None else val)
    md5sum, filepath, timestamp = vals

    return md5sum, filepath, timestamp
102 changes: 101 additions & 1 deletion qiita_db/meta_util.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,8 @@
from __future__ import division

from moi import r_client
from os import stat
from os import stat, makedirs, rename
from os.path import join, relpath, exists
from time import strftime, localtime
import matplotlib.pyplot as plt
import matplotlib as mpl
Expand All @@ -34,8 +35,11 @@
from StringIO import StringIO
from future.utils import viewitems
from datetime import datetime
from tarfile import open as topen, TarInfo
from hashlib import md5

from qiita_core.qiita_settings import qiita_config
from qiita_core.configuration_manager import ConfigurationManager
import qiita_db as qdb


Expand Down Expand Up @@ -332,3 +336,99 @@ def get_lat_longs():
qdb.sql_connection.TRN.add(sql)

return qdb.sql_connection.TRN.execute_fetchindex()


def generate_biom_and_metadata_release(study_status='public'):
    """Generate a list of biom/metadata filepaths and a tgz of those files

    Builds a tgz containing, for every study with the given status, its
    sample template, prep template(s) and BIOM artifact files, plus a txt
    manifest; the tgz location, md5 and creation time are stored in redis
    under '<portal>:release:<study_status>:{filepath,md5sum,time}'.

    Parameters
    ----------
    study_status : str, optional
        The study status to search for. Note that this should always be set
        to 'public' but having this exposed helps with testing. The other
        options are 'private' and 'sandbox'
    """
    studies = qdb.study.Study.get_by_status(study_status)
    qiita_config = ConfigurationManager()
    working_dir = qiita_config.working_dir
    portal = qiita_config.portal
    bdir = qdb.util.get_db_files_base_dir()
    # human-readable timestamp, stored later under the redis 'time' key
    time = datetime.now().strftime('%m-%d-%y %H:%M:%S')

    data = []
    for s in studies:
        # [0] latest is first, [1] only getting the filepath
        sample_fp = relpath(s.sample_template.get_filepaths()[0][1], bdir)

        for a in s.artifacts(artifact_type='BIOM'):
            # artifacts without processing parameters carry no command info
            # to report, so they are skipped
            if a.processing_parameters is None:
                continue

            cmd_name = a.processing_parameters.command.name

            # this loop is necessary as in theory an artifact can be
            # generated from multiple prep info files
            human_cmd = []
            for p in a.parents:
                pp = p.processing_parameters
                pp_cmd_name = pp.command.name
                if pp_cmd_name == 'Trimming':
                    human_cmd.append('%s @ %s' % (
                        cmd_name, str(pp.values['length'])))
                else:
                    human_cmd.append('%s, %s' % (cmd_name, pp_cmd_name))
            human_cmd = ', '.join(human_cmd)

            for _, fp, fp_type in a.filepaths:
                # only biom filepaths are released; 'only-16s' files are
                # deliberately excluded
                if fp_type != 'biom' or 'only-16s' in fp:
                    continue
                fp = relpath(fp, bdir)
                # format: (biom_fp, sample_fp, prep_fp, qiita_artifact_id,
                # human readable name)
                for pt in a.prep_templates:
                    # take the first non-qiime prep filepath; the break
                    # leaves prep_fp bound to that entry
                    for _, prep_fp in pt.get_filepaths():
                        if 'qiime' not in prep_fp:
                            break
                    prep_fp = relpath(prep_fp, bdir)
                    data.append((fp, sample_fp, prep_fp, a.id, human_cmd))

    # writing text and tgz file
    ts = datetime.now().strftime('%m%d%y-%H%M%S')
    tgz_dir = join(working_dir, 'releases')
    if not exists(tgz_dir):
        makedirs(tgz_dir)
    # build under a '-building' name so readers never see a partial tgz;
    # it is renamed to the final name once fully written
    tgz_name = join(tgz_dir, '%s-%s-building.tgz' % (portal, study_status))
    tgz_name_final = join(tgz_dir, '%s-%s.tgz' % (portal, study_status))
    txt_hd = StringIO()
    with topen(tgz_name, "w|gz") as tgz:
        # writing header for txt
        txt_hd.write(
            "biom_fp\tsample_fp\tprep_fp\tqiita_artifact_id\tcommand\n")
        for biom_fp, sample_fp, prep_fp, artifact_id, human_cmd in data:
            txt_hd.write("%s\t%s\t%s\t%s\t%s\n" % (
                biom_fp, sample_fp, prep_fp, artifact_id, human_cmd))
            tgz.add(join(bdir, biom_fp), arcname=biom_fp, recursive=False)
            tgz.add(join(bdir, sample_fp), arcname=sample_fp, recursive=False)
            tgz.add(join(bdir, prep_fp), arcname=prep_fp, recursive=False)

        txt_hd.seek(0)
        # the timestamped manifest is appended as an in-memory tar member;
        # NOTE(review): .buf is a py2 StringIO.StringIO internal that is
        # only consolidated after the seek(0) above — confirm if StringIO
        # implementation ever changes
        info = TarInfo(name='%s-%s-%s.txt' % (portal, study_status, ts))
        info.size = len(txt_hd.buf)
        tgz.addfile(tarinfo=info, fileobj=txt_hd)

    # md5 of the finished tgz, read in 4K chunks to bound memory use
    with open(tgz_name, "rb") as f:
        md5sum = md5()
        for c in iter(lambda: f.read(4096), b""):
            md5sum.update(c)

    rename(tgz_name, tgz_name_final)

    # store filepath (relative to working_dir), md5sum and timestamp
    vals = [
        ('filepath', tgz_name_final[len(working_dir):], r_client.set),
        ('md5sum', md5sum.hexdigest(), r_client.set),
        ('time', time, r_client.set)]
    for k, v, f in vals:
        redis_key = '%s:release:%s:%s' % (portal, study_status, k)
        # important to "flush" variables to avoid errors
        r_client.delete(redis_key)
        f(redis_key, v)
165 changes: 165 additions & 0 deletions qiita_db/test/test_meta_util.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,9 @@

from unittest import TestCase, main
import numpy.testing as npt
from tarfile import open as topen
from os import remove
from os.path import exists, join

import pandas as pd

Expand All @@ -22,9 +25,13 @@
class MetaUtilTests(TestCase):
def setUp(self):
    # files registered here are removed during tearDown
    self.files_to_remove = []
    # remember the configured portal so tearDown can restore it
    self.old_portal = qiita_config.portal

def tearDown(self):
    # restore whichever portal was configured before the test ran
    qiita_config.portal = self.old_portal
    # remove any files the test left behind, skipping missing ones
    for path in self.files_to_remove:
        if exists(path):
            remove(path)

def _set_artifact_private(self):
self.conn_handler.execute(
Expand Down Expand Up @@ -227,6 +234,164 @@ def test_update_redis_stats(self):
redis_key = '%s:stats:%s' % (portal, k)
self.assertEqual(f(redis_key), exp)

def _assert_release_contents(self, tgz):
    """Validates the contents of a release tgz for the test study

    Parameters
    ----------
    tgz : str
        Path of the release tgz to inspect

    Notes
    -----
    Asserts that the expected biom, prep and sample template members are
    present (with their expected multiplicities) and that the embedded txt
    manifest lists them correctly.
    """
    tmp = topen(tgz, "r:gz")
    tgz_obs = [ti.name for ti in tmp]
    tmp.close()
    # file names might change due to updates and patches so just check
    # that the prefix exists
    fn = 'processed_data/1_study_1001_closed_reference_otu_table.biom'
    # yes, this file is there twice
    for _ in range(2):
        self.assertTrue(fn in tgz_obs)
        tgz_obs.remove(fn)
    # let's check the next biom
    fn = ('processed_data/1_study_1001_closed_reference_otu_table_Silva.'
          'biom')
    self.assertTrue(fn in tgz_obs)
    tgz_obs.remove(fn)
    # now let's check prep info files based on their prefix, just take
    # the first one and check/remove the 3 occurrences of that file
    fn_prep = [f for f in tgz_obs
               if f.startswith('templates/1_prep_1_')][0]
    for _ in range(3):
        self.assertTrue(fn_prep in tgz_obs)
        tgz_obs.remove(fn_prep)
    # with the prep files removed, the remaining 'templates/1_' entries
    # are the sample template, also present 3 times
    fn_sample = [f for f in tgz_obs if f.startswith('templates/1_')][0]
    for _ in range(3):
        self.assertTrue(fn_sample in tgz_obs)
        tgz_obs.remove(fn_sample)
    # now we should only have the text file
    txt = tgz_obs.pop()
    # now it should be empty
    self.assertEqual(tgz_obs, [])

    tmp = topen(tgz, "r:gz")
    fhd = tmp.extractfile(txt)
    txt_obs = fhd.readlines()
    tmp.close()
    txt_exp = [
        'biom_fp\tsample_fp\tprep_fp\tqiita_artifact_id\tcommand\n',
        'processed_data/1_study_1001_closed_reference_otu_table.biom\t'
        '%s\t%s\t4\tPick closed-reference OTUs, Split libraries FASTQ\n'
        % (fn_sample, fn_prep),
        'processed_data/1_study_1001_closed_reference_otu_table.biom\t'
        '%s\t%s\t5\tPick closed-reference OTUs, Split libraries FASTQ\n'
        % (fn_sample, fn_prep),
        'processed_data/1_study_1001_closed_reference_otu_table_Silva.bio'
        'm\t%s\t%s\t6\tPick closed-reference OTUs, Split libraries FASTQ\n'
        % (fn_sample, fn_prep)]
    self.assertEqual(txt_obs, txt_exp)

def test_generate_biom_and_metadata_release(self):
    level = 'private'
    qdb.meta_util.generate_biom_and_metadata_release(level)
    portal = qiita_config.portal
    working_dir = qiita_config.working_dir

    # the release stores filepath, md5sum and time in redis but only the
    # filepath contents are checked here; the others change on every run
    redis_key = '%s:release:%s:filepath' % (portal, level)
    tgz = join(working_dir, r_client.get(redis_key))
    self.files_to_remove.extend([tgz])
    self._assert_release_contents(tgz)

    # whatever the configuration was, flip the trailing '/' of
    # base_data_dir so both forms of the relpath handling are exercised
    with qdb.sql_connection.TRN:
        qdb.sql_connection.TRN.add(
            "SELECT base_data_dir FROM settings")
        obdr = qdb.sql_connection.TRN.execute_fetchlast()
        if obdr[-1] == '/':
            bdr = obdr[:-1]
        else:
            bdr = obdr + '/'

        qdb.sql_connection.TRN.add(
            "UPDATE settings SET base_data_dir = '%s'" % bdr)
        qdb.sql_connection.TRN.execute()

    # regenerate with the flipped base_data_dir; the tgz path is the same
    # so it is already registered for removal
    qdb.meta_util.generate_biom_and_metadata_release(level)
    tgz = join(working_dir, r_client.get(redis_key))
    self._assert_release_contents(tgz)

    # returning configuration
    with qdb.sql_connection.TRN:
        qdb.sql_connection.TRN.add(
            "UPDATE settings SET base_data_dir = '%s'" % obdr)
        qdb.sql_connection.TRN.execute()


EXP_LAT_LONG = (
'[[60.1102854322, 74.7123248382], [23.1218032799, 42.838497795],'
Expand Down

0 comments on commit 129f0f6

Please sign in to comment.