Automatic jobs & new stats (#2057)
* fix #814, fix #1636

* fixing error in test-env

* fixing stats.html call

* adding img

* addressing @josenavas comments

* rm for loops

* addressing @ElDeveloper comments
antgonza authored and ElDeveloper committed Jan 27, 2017
1 parent 9eb9dbb commit 19889f9
Showing 8 changed files with 379 additions and 87 deletions.
3 changes: 2 additions & 1 deletion .travis.yml
@@ -40,9 +40,10 @@ script:
- qiita-env start_cluster qiita-general
- qiita-env make --no-load-ontologies
- if [ ${TEST_ADD_STUDIES} == "True" ]; then test_data_studies/commands.sh ; fi
- if [ ${TEST_ADD_STUDIES} == "True" ]; then qiita-cron-job ; fi
- if [ ${TEST_ADD_STUDIES} == "False" ]; then qiita-test-install ; fi
- if [ ${TEST_ADD_STUDIES} == "False" ]; then nosetests --with-doctest --with-coverage -v --cover-package=qiita_db,qiita_pet,qiita_core,qiita_ware; fi
- flake8 qiita_* setup.py scripts/qiita scripts/qiita-env scripts/qiita-test-install
- flake8 qiita_* setup.py scripts/*
- ls -R /home/travis/miniconda3/envs/qiita/lib/python2.7/site-packages/qiita_pet/support_files/doc/
- qiita pet webserver
addons:
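
The qiita-cron-job entry point exercised in the Travis script above is not among the files shown in this diff. As rough orientation, here is a minimal, hypothetical sketch of what such a maintenance runner could look like, assuming it simply chains the helpers added in this commit; the qdb functions are taken from the diff below, while the runner itself is an assumption, not the actual scripts/qiita-cron-job.

# Hypothetical sketch; NOT the real scripts/qiita-cron-job (not shown in
# this diff). Assumes the cron job simply chains the new maintenance helpers.
import qiita_db as qdb


def run_maintenance():
    # refresh the cached portal stats in redis
    qdb.meta_util.update_redis_stats()
    # drop filepath rows (and their files) that nothing references any more
    qdb.util.purge_filepaths()
    # clear the per-study uploads/<study_id>/trash folders
    qdb.util.empty_trash_upload_folder()


if __name__ == '__main__':
    run_maintenance()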
156 changes: 155 additions & 1 deletion qiita_db/meta_util.py
@@ -25,6 +25,17 @@
# -----------------------------------------------------------------------------
from __future__ import division

from moi import r_client
from os import stat
from time import strftime, localtime
import matplotlib.pyplot as plt
import matplotlib as mpl
from base64 import b64encode
from urllib import quote
from StringIO import StringIO
from future.utils import viewitems
from datetime import datetime

from qiita_core.qiita_settings import qiita_config
import qiita_db as qdb

@@ -122,6 +133,147 @@ def get_accessible_filepath_ids(user):
        return filepath_ids


def update_redis_stats():
    """Generate the system stats and save them in redis

    Returns
    -------
    list of str
        artifact filepaths that are not present in the file system
    """
    STUDY = qdb.study.Study
    studies = {'public': STUDY.get_by_status('public'),
               'private': STUDY.get_by_status('private'),
               'sandbox': STUDY.get_by_status('sandbox')}
    number_studies = {k: len(v) for k, v in viewitems(studies)}

    number_of_samples = {}
    ebi_samples_prep = {}
    num_samples_ebi = 0
    for k, sts in viewitems(studies):
        number_of_samples[k] = 0
        for s in sts:
            st = s.sample_template
            if st is not None:
                number_of_samples[k] += len(list(st.keys()))

            ebi_samples_prep_count = 0
            for pt in s.prep_templates():
                ebi_samples_prep_count += len([
                    1 for _, v in viewitems(pt.ebi_experiment_accessions)
                    if v is not None and v != ''])
            ebi_samples_prep[s.id] = ebi_samples_prep_count

            if s.sample_template is not None:
                num_samples_ebi += len([
                    1 for _, v in viewitems(
                        s.sample_template.ebi_sample_accessions)
                    if v is not None and v != ''])

    num_users = qdb.util.get_count('qiita.qiita_user')

    lat_longs = get_lat_longs()

    num_studies_ebi = len(ebi_samples_prep)
    number_samples_ebi_prep = sum([v for _, v in viewitems(ebi_samples_prep)])

    # generating file size stats
    stats = []
    missing_files = []
    for k, sts in viewitems(studies):
        for s in sts:
            for a in s.artifacts():
                for _, fp, dt in a.filepaths:
                    try:
                        fs = stat(fp)
                        stats.append((dt, fs.st_size, strftime(
                            '%Y-%m', localtime(fs.st_ctime))))
                    except OSError:
                        missing_files.append(fp)

    summary = {}
    all_dates = []
    for ft, size, ym in stats:
        if ft not in summary:
            summary[ft] = {}
        if ym not in summary[ft]:
            summary[ft][ym] = 0
            all_dates.append(ym)
        summary[ft][ym] += size
    all_dates = sorted(set(all_dates))

    # sorting summaries
    rm_from_data = ['html_summary', 'tgz', 'directory', 'raw_fasta', 'log',
                    'biom', 'raw_sff', 'raw_qual']
    ordered_summary = {}
    for dt in summary:
        if dt in rm_from_data:
            continue
        new_list = []
        current_value = 0
        for ad in all_dates:
            if ad in summary[dt]:
                current_value += summary[dt][ad]
            new_list.append(current_value)
        ordered_summary[dt] = new_list

    plot_order = sorted([(k, ordered_summary[k][-1]) for k in ordered_summary],
                        key=lambda x: x[1])

    # helper function to generate y axis, modified from:
    # http://stackoverflow.com/a/1094933
    def sizeof_fmt(value, position):
        number = None
        for unit in ['', 'K', 'M', 'G', 'T', 'P', 'E', 'Z']:
            if abs(value) < 1024.0:
                number = "%3.1f%s" % (value, unit)
                break
            value /= 1024.0
        if number is None:
            number = "%.1f%s" % (value, 'Yi')
        return number

    all_dates_axis = range(len(all_dates))
    plt.locator_params(axis='y', nbins=10)
    plt.figure(figsize=(20, 10))
    for k, v in plot_order:
        plt.plot(all_dates_axis, ordered_summary[k], linewidth=2, label=k)

    plt.xticks(all_dates_axis, all_dates)
    plt.legend()
    plt.grid()
    ax = plt.gca()
    ax.yaxis.set_major_formatter(mpl.ticker.FuncFormatter(sizeof_fmt))
    plt.xlabel('Date')
    plt.ylabel('Storage space per data type')

    plot = StringIO()
    plt.savefig(plot, format='png')
    plot.seek(0)
    img = 'data:image/png;base64,' + quote(b64encode(plot.buf))

    time = datetime.now().strftime('%m-%d-%y %H:%M:%S')

    portal = qiita_config.portal
    vals = [
        ('number_studies', number_studies, r_client.hmset),
        ('number_of_samples', number_of_samples, r_client.hmset),
        ('num_users', num_users, r_client.set),
        ('lat_longs', lat_longs, r_client.set),
        ('num_studies_ebi', num_studies_ebi, r_client.set),
        ('num_samples_ebi', num_samples_ebi, r_client.set),
        ('number_samples_ebi_prep', number_samples_ebi_prep, r_client.set),
        ('img', img, r_client.set),
        ('time', time, r_client.set)]
    for k, v, f in vals:
        redis_key = '%s:stats:%s' % (portal, k)
        # important to "flush" variables to avoid errors
        r_client.delete(redis_key)
        f(redis_key, v)

    return missing_files
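
update_redis_stats only writes the cache; nothing here reads it back. As a minimal sketch of how a consumer (for instance the stats.html page mentioned in the commit message) might read the cached values, assuming only the key layout and redis client used above (the load_cached_stats name itself is hypothetical):

# Minimal consumer sketch (assumed, not part of this diff): hashes written
# with hmset are read back with hgetall, plain string values with get.
from moi import r_client
from qiita_core.qiita_settings import qiita_config


def load_cached_stats():
    portal = qiita_config.portal

    def key(name):
        return '%s:stats:%s' % (portal, name)

    return {
        'number_studies': r_client.hgetall(key('number_studies')),
        'number_of_samples': r_client.hgetall(key('number_of_samples')),
        'num_users': r_client.get(key('num_users')),
        'lat_longs': r_client.get(key('lat_longs')),
        'num_studies_ebi': r_client.get(key('num_studies_ebi')),
        'num_samples_ebi': r_client.get(key('num_samples_ebi')),
        'number_samples_ebi_prep': r_client.get(
            key('number_samples_ebi_prep')),
        'img': r_client.get(key('img')),
        'time': r_client.get(key('time'))}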


def get_lat_longs():
    """Retrieve the latitude and longitude of all the samples in the DB
@@ -146,7 +298,9 @@ def get_lat_longs():
        sql = [('SELECT CAST(latitude AS FLOAT), '
                ' CAST(longitude AS FLOAT) '
                'FROM qiita.%s '
                'WHERE isnumeric(latitude) AND isnumeric(latitude)' % s)
                'WHERE isnumeric(latitude) AND isnumeric(longitude) '
                "AND latitude <> 'NaN' "
                "AND longitude <> 'NaN' " % s)
               for s in qdb.sql_connection.TRN.execute_fetchflatten()]
        sql = ' UNION '.join(sql)
        qdb.sql_connection.TRN.add(sql)
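
The added predicates matter because the string 'NaN' passes a plain "is it numeric?" test: both PostgreSQL and Python convert it to a valid float NaN, which would then surface as a bogus coordinate on the map (isnumeric() is presumably a helper defined in Qiita's schema; it is not a PostgreSQL built-in). A tiny Python illustration, with nothing Qiita-specific assumed:

# Why the explicit NaN filter: 'NaN' converts to a perfectly valid float,
# so a numeric check alone does not keep NaN coordinates out.
import math

value = float('NaN')       # no error is raised
print(math.isnan(value))   # True; exactly the rows the new predicates drop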
38 changes: 38 additions & 0 deletions qiita_db/test/test_meta_util.py
@@ -10,6 +10,7 @@

import pandas as pd

from moi import r_client
from qiita_core.qiita_settings import qiita_config
from qiita_core.util import qiita_test_checker

@@ -180,6 +181,43 @@ def test_get_lat_longs_EMP_portal(self):

        self.assertItemsEqual(obs, exp)

    def test_update_redis_stats(self):
        qdb.meta_util.update_redis_stats()

        portal = qiita_config.portal
        vals = [
            ('number_studies', {'sandbox': '2', 'public': '1',
                                'private': '0'}, r_client.hgetall),
            ('number_of_samples', {'sandbox': '1', 'public': '27',
                                   'private': '0'}, r_client.hgetall),
            ('num_users', '4', r_client.get),
            ('lat_longs', EXP_LAT_LONG, r_client.get),
            ('num_studies_ebi', '3', r_client.get),
            ('num_samples_ebi', '27', r_client.get),
            ('number_samples_ebi_prep', '54', r_client.get)
            # not testing img/time for simplicity
            # ('img', r_client.get),
            # ('time', r_client.get)
        ]
        for k, exp, f in vals:
            redis_key = '%s:stats:%s' % (portal, k)
            self.assertEqual(f(redis_key), exp)
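
Every expected value above is a string: redis stores plain strings, so counts such as 4 and 27 come back from the client as '4' and '27', and the hash values are strings as well. A tiny illustration, assuming a running redis and the same client (the example_counter key is hypothetical):

# redis returns strings, which is why the expected values above are quoted
r_client.set('example_counter', 4)             # hypothetical key
print(repr(r_client.get('example_counter')))   # prints '4' (a string), not 4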


EXP_LAT_LONG = (
    '[[0.291867635913, 68.5945325743], [68.0991287718, 34.8360987059],'
    ' [10.6655599093, 70.784770579], [40.8623799474, 6.66444220187],'
    ' [13.089194595, 92.5274472082], [84.0030227585, 66.8954849864],'
    ' [12.7065957714, 84.9722975792], [78.3634273709, 74.423907894],'
    ' [82.8302905615, 86.3615778099], [53.5050692395, 31.6056761814],'
    ' [43.9614715197, 82.8516734159], [29.1499460692, 82.1270418227],'
    ' [23.1218032799, 42.838497795], [12.6245524972, 96.0693176066],'
    ' [38.2627021402, 3.48274264219], [74.0894932572, 65.3283470202],'
    ' [35.2374368957, 68.5041623253], [4.59216095574, 63.5115213108],'
    ' [95.2060749748, 27.3592668624], [68.51099627, 2.35063674718],'
    ' [85.4121476399, 15.6526750776], [60.1102854322, 74.7123248382],'
    ' [3.21190859967, 26.8138925876], [57.571893782, 32.5563076447],'
    ' [44.9725384282, 66.1920014699], [42.42, 41.41]]')

if __name__ == '__main__':
    main()
16 changes: 15 additions & 1 deletion qiita_db/test/test_util.py
@@ -8,7 +8,7 @@

from unittest import TestCase, main
from tempfile import mkstemp
from os import close, remove
from os import close, remove, mkdir
from os.path import join, exists, basename
from shutil import rmtree
from datetime import datetime
@@ -365,6 +365,20 @@ def _common_purge_filpeaths_test(self):
    def test_purge_filepaths(self):
        self._common_purge_filpeaths_test()

    def test_empty_trash_upload_folder(self):
        # creating file to delete so we know it actually works
        study_id = '1'
        uploads_fp = join(qdb.util.get_mountpoint("uploads")[0][1], study_id)
        trash = join(uploads_fp, 'trash')
        if not exists(trash):
            mkdir(trash)
        fp = join(trash, 'my_file_to_delete.txt')
        open(fp, 'w').close()

        self.assertTrue(exists(fp))
        qdb.util.empty_trash_upload_folder()
        self.assertFalse(exists(fp))

    def test_purge_filepaths_null_cols(self):
        # For more details about the source of the issue that motivates this
        # test: http://www.depesz.com/2008/08/13/nulls-vs-not-in/
79 changes: 61 additions & 18 deletions qiita_db/util.py
@@ -714,9 +714,24 @@ def path_builder(db_dir, filepath, mountpoint, subdirectory, obj_id):
                for fpid, fp, fp_type_, m, s in results]


def purge_filepaths():
def _rm_files(TRN, fp):
    # Remove the data
    if exists(fp):
        if isdir(fp):
            func = rmtree
        else:
            func = remove
        TRN.add_post_commit_func(func, fp)


def purge_filepaths(delete_files=True):
    r"""Go over the filepath table and remove all the filepaths that are
    not referenced anywhere else in the database

    Parameters
    ----------
    delete_files : bool
        If True, actually delete the files; if False, only print them
    """
    with qdb.sql_connection.TRN:
        # Get all the (table, column) pairs that reference to the filepath
@@ -739,30 +754,58 @@ def purge_filepaths():
        union_str = " UNION ".join(
            ["SELECT %s FROM qiita.%s WHERE %s IS NOT NULL" % (col, table, col)
             for table, col in qdb.sql_connection.TRN.execute_fetchindex()])
        # Get all the filepaths from the filepath table that are not
        # referenced from any place in the database
        sql = """SELECT filepath_id, filepath, filepath_type, data_directory_id
            FROM qiita.filepath FP JOIN qiita.filepath_type FPT
                ON FP.filepath_type_id = FPT.filepath_type_id
            WHERE filepath_id NOT IN (%s)""" % union_str
        qdb.sql_connection.TRN.add(sql)
        if union_str:
            # Get all the filepaths from the filepath table that are not
            # referenced from any place in the database
            sql = """SELECT filepath_id, filepath, filepath_type, data_directory_id
                FROM qiita.filepath FP JOIN qiita.filepath_type FPT
                    ON FP.filepath_type_id = FPT.filepath_type_id
                WHERE filepath_id NOT IN (%s)""" % union_str
            qdb.sql_connection.TRN.add(sql)

        # We can now go over and remove all the filepaths
        sql = "DELETE FROM qiita.filepath WHERE filepath_id=%s"
        db_results = qdb.sql_connection.TRN.execute_fetchindex()
        for fp_id, fp, fp_type, dd_id in db_results:
            qdb.sql_connection.TRN.add(sql, [fp_id])
            if delete_files:
                qdb.sql_connection.TRN.add(sql, [fp_id])
                fp = join(get_mountpoint_path_by_id(dd_id), fp)
                _rm_files(qdb.sql_connection.TRN, fp)
            else:
                print fp, fp_type

            # Remove the data
            fp = join(get_mountpoint_path_by_id(dd_id), fp)
            if exists(fp):
                if fp_type is 'directory':
                    func = rmtree
                else:
                    func = remove
                qdb.sql_connection.TRN.add_post_commit_func(func, fp)
        if delete_files:
            qdb.sql_connection.TRN.execute()

        qdb.sql_connection.TRN.execute()
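
A minimal usage sketch for the new delete_files flag (assumed workflow, not part of this diff): do a dry run first to see which filepaths are orphaned, then purge them for real.

# Assumed workflow sketch: dry run first, then purge for real.
import qiita_db as qdb

# dry run: only prints "<filepath> <filepath_type>" for each orphaned entry
qdb.util.purge_filepaths(delete_files=False)

# real run: deletes the orphaned rows and removes the files after commit
qdb.util.purge_filepaths()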

def empty_trash_upload_folder(delete_files=True):
    r"""Delete all files in the trash folder inside each of the upload
    folders

    Parameters
    ----------
    delete_files : bool
        If True, actually delete the files; if False, only print them
    """
    gfp = partial(join, get_db_files_base_dir())
    with qdb.sql_connection.TRN:
        sql = """SELECT mountpoint
                 FROM qiita.data_directory
                 WHERE data_type = 'uploads'"""
        qdb.sql_connection.TRN.add(sql)

        for mp in qdb.sql_connection.TRN.execute_fetchflatten():
            for path, dirs, files in walk(gfp(mp)):
                if path.endswith('/trash'):
                    if delete_files:
                        for f in files:
                            fp = join(path, f)
                            _rm_files(qdb.sql_connection.TRN, fp)
                    else:
                        print files

        if delete_files:
            qdb.sql_connection.TRN.execute()
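
Both helpers defer the actual filesystem deletions: _rm_files only registers them with TRN.add_post_commit_func, so files disappear only if the enclosing transaction commits cleanly, and a rollback leaves the disk untouched. A short sketch of that pattern, with a hypothetical filepath id and path used purely for illustration:

# Sketch of the deferred-deletion pattern used above; the id and the path
# are hypothetical. The file is removed only after the DELETE commits.
import qiita_db as qdb
from qiita_db.util import _rm_files

some_filepath_id = 42                      # hypothetical
some_path = '/tmp/example_file_to_remove'  # hypothetical

with qdb.sql_connection.TRN:
    qdb.sql_connection.TRN.add(
        "DELETE FROM qiita.filepath WHERE filepath_id = %s",
        [some_filepath_id])
    # queued via add_post_commit_func; nothing is deleted yet
    _rm_files(qdb.sql_connection.TRN, some_path)
    # on a clean commit the queued removal runs; on rollback it never does
    qdb.sql_connection.TRN.execute()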


def move_filepaths_to_upload_folder(study_id, filepaths):