Commit

Merge 2ff1213 into 64b0f1d
Aidavhd committed Aug 27, 2020
Parents: 64b0f1d + 2ff1213
Commit: f9a6d18
Showing 11 changed files with 517 additions and 239 deletions.
41 changes: 25 additions & 16 deletions boutiques/bosh.py 100755 → 100644
@@ -389,23 +389,28 @@ def data(*params):
parser.parse_known_args(params + ('--help',))
raise_error(DataHandlerError,
"Missing data mode {delete, inspect, publish}.")
elif results.mode == "inspect":
from boutiques.dataHandler import DataHandler
dataHandler = DataHandler()
return dataHandler.inspect(results.example)
elif results.mode == "publish":
from boutiques.dataHandler import DataHandler
dataHandler = DataHandler()
return dataHandler.publish(results.file, results.zenodo_token,
results.author, results.nexus_token,
results.nexus_org, results.nexus_project,
results.individually, results.sandbox,
results.no_int, results.verbose,
results.nexus)
elif results.mode == "delete":
else:
from boutiques.dataHandler import DataHandler
dataHandler = DataHandler()
return dataHandler.delete(results.file, results.no_int)

if results.mode == "inspect":
return dataHandler.inspect(results.example)
elif results.mode == "publish":
return dataHandler.publish(results.file, results.zenodo_token,
results.author, results.nexus_token,
results.nexus_org, results.nexus_project,
results.individually, results.sandbox,
results.no_int, results.verbose,
results.nexus)
elif results.mode == "delete":
return dataHandler.delete(results.file, results.no_int)

elif results.mode == "search":
return dataHandler.search(results.verbose, results.sandbox)

elif results.mode == "pull":
return dataHandler.pull(results.zids, results.verbose,
results.sandbox)


def deprecate(*params):
@@ -487,6 +492,9 @@ def bosh_return(val, code=0, hide=False, formatted=None):
return bosh_return(out, hide=True)
elif func == "data":
out = data(*params)
if "search" in params:
return bosh_return(out, formatted=tabulate(out, headers='keys',
tablefmt='plain'))
return bosh_return(out)
elif func == "version":
from boutiques.__version__ import VERSION
@@ -506,7 +514,8 @@ def bosh_return(val, code=0, hide=False, formatted=None):
ValidationError,
ExportError,
ImportError,
ExecutorError) as e:
ExecutorError,
DataHandlerError) as e:
# We don't want to raise an exception when function is called
# from CLI.'
if runs_as_cli():
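For reference, the tabulate call above expects the search results to be an iterable of mappings: headers='keys' takes the column names from the dict keys and tablefmt='plain' prints borderless aligned columns. A minimal, self-contained sketch of that formatting, with hypothetical record fields (not the actual shape returned by bosh data search):

from tabulate import tabulate

# Hypothetical search output: one dict per published execution-data record.
records = [
    {"ID": "zenodo.123456", "TITLE": "bet-execution-records", "DOWNLOADS": 12},
    {"ID": "zenodo.123457", "TITLE": "flirt-execution-records", "DOWNLOADS": 3},
]

# headers='keys' -> column names from dict keys; tablefmt='plain' -> no borders.
print(tabulate(records, headers="keys", tablefmt="plain"))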
38 changes: 33 additions & 5 deletions boutiques/boshParsers.py
@@ -60,15 +60,17 @@ def add_subparser_create(subparsers):

def add_subparser_data(subparsers):
parser_data = subparsers.add_parser(
"data", description="Manage execution data collection.")
"data", description="Manage execution data collection.",
formatter_class=RawTextHelpFormatter)
parser_data.set_defaults(function='data')
data_subparsers = parser_data.add_subparsers(
help="Manage execution data records. Inspect: displays "
"the unpublished records currently in the cache. "
help="Delete: remove one or more records from the cache.\n"
"Inspect: displays the unpublished records currently in the cache.\n"
"Publish: publishes contents of cache to Zenodo as "
"a public data set. Requires a Zenodo access token, "
"see http://developers.zenodo.org/#authentication. "
"Delete: remove one or more records from the cache.")
"see http://developers.zenodo.org/#authentication.\n"
"Pull: pull one or more execution data records from Zenodo.\n"
"Search: search for published execution data records on Zenodo.\n")

parser_data_delete = data_subparsers.add_parser(
"delete", description="Delete data record(s) in cache.")
@@ -134,6 +136,32 @@ def add_subparser_data(subparsers):
parser_data_publish.add_argument("--nexus-project", action="store",
help="Nexus project to publish to. ")

parser_data_pull = data_subparsers.add_parser(
"pull", description="Ensures that execution data records from Zenodo"
"are locally cached, downloading them if needed.")
parser_data_pull.set_defaults(mode='pull')
parser_data_pull.add_argument("zids", nargs="+", action="store",
help="One or more Zenodo IDs for the excution"
" record(s) to pull, prefixed by 'zenodo.',"
" e.g. zenodo.123456 zenodo.123457")
parser_data_pull.add_argument("-v", "--verbose", action="store_true",
help="Print information messages")
parser_data_pull.add_argument("--sandbox", action="store_true",
help="pull from Zenodo's sandbox instead of "
"production server. Recommended for tests.")

parser_data_search = data_subparsers.add_parser(
"search", description="Search on Zenodo for"
" execution data records. When no term is"
" supplied, will search for all execution"
" data records.")
parser_data_search.set_defaults(mode='search')
parser_data_search.add_argument("-v", "--verbose", action="store_true",
help="Print information messages")
parser_data_search.add_argument("--sandbox", action="store_true",
help="search Zenodo's sandbox instead of "
"production server. Recommended for tests.")


def add_subparser_deprecate(subparsers):
parser_deprecate = subparsers.add_parser(
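Taken together, the new subparsers give bosh data two additional modes, pull and search. Below is a stand-alone argparse sketch of the same interface, useful for seeing how the arguments parse; it is only an illustration, the real definitions are the ones in boshParsers.py above:

import argparse

# Stand-alone sketch mirroring the new "data pull" / "data search" modes.
parser = argparse.ArgumentParser(prog="bosh data")
subparsers = parser.add_subparsers(dest="mode")

pull = subparsers.add_parser("pull")
pull.add_argument("zids", nargs="+", help="Zenodo IDs, e.g. zenodo.123456")
pull.add_argument("-v", "--verbose", action="store_true")
pull.add_argument("--sandbox", action="store_true")

search = subparsers.add_parser("search")
search.add_argument("-v", "--verbose", action="store_true")
search.add_argument("--sandbox", action="store_true")

# Example invocation, roughly equivalent to: bosh data pull zenodo.123456 --sandbox
args = parser.parse_args(["pull", "zenodo.123456", "--sandbox"])
print(args.mode, args.zids, args.sandbox)  # pull ['zenodo.123456'] True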
40 changes: 32 additions & 8 deletions boutiques/dataHandler.py
@@ -36,13 +36,17 @@ def inspect(self, example=False):
else:
print("No records in the cache at the moment.")
# Print information about files in cache
# and the directory of caching data
else:
print("There are {} unpublished records in the cache"
.format(len(self.record_files)))
print("There are {} unpublished descriptors in the cache"
.format(len(self.descriptor_files)))
for i in range(len(self.cache_files)):
print(self.cache_files[i])
print("Execution records are stored in: " +
os.path.join(os.path.expanduser('~'),
".cache", "boutiques", "data"))

# Private function to print a file to console
def _display_file(self, file_path):
@@ -82,10 +86,7 @@ def publish(self, file, zenodo_token, author, nexus_token,
# Verify publishing
if not self.no_int:
prompt = self._get_publishing_prompt()
try:
ret = raw_input(prompt) # Python 2
except NameError:
ret = input(prompt) # Python 3
ret = input(prompt)
if ret.upper() != "Y":
return

@@ -199,6 +200,7 @@ def _create_metadata(self, records_dict):
# Add tool name(s) to keywords
data['metadata']['keywords'] = [v for v in unique_names]
data['metadata']['keywords'].insert(0, 'Boutiques')
data['metadata']['keywords'].insert(1, 'Boutiques-execution-record')
# Add descriptor link(s) to related identifiers
data['metadata']['related_identifiers'] = \
[{'identifier': url.format(v.split('.')[2]),
@@ -250,10 +252,7 @@ def delete(self, file=None, no_int=False):
# Verify deletion
if not self.no_int:
prompt = self._get_delete_prompt()
try:
ret = raw_input(prompt) # Python 2
except NameError:
ret = input(prompt) # Python 3
ret = input(prompt)
if ret.upper() != "Y":
return

@@ -272,6 +271,31 @@
for f in self.cache_files]
print_info("All files have been removed from the data cache")

def search(self, verbose=False, sandbox=False):
firstKeyWord = "Boutiques"
secondKeyWord = "boutiques-execution-record"
searchType = "dataset"
query = ''
query_line = ''

from boutiques.zenodoHelper import ZenodoHelper
zenodoHelper = ZenodoHelper(verbose=verbose, sandbox=sandbox)

return zenodoHelper.search(query, query_line, firstKeyWord,
secondKeyWord, searchType)

def pull(self, zids, verbose=False, sandbox=False):
dataPull = True
firstKeyWord = "Boutiques"
secondKeyWord = "boutiques-execution-record"
searchType = "dataset"

from boutiques.zenodoHelper import ZenodoHelper
zenodoHelper = ZenodoHelper(verbose=verbose, sandbox=sandbox)

return zenodoHelper.zenodo_pull(zids, firstKeyWord,
secondKeyWord, searchType, dataPull)

def _file_exists_in_cache(self, filename):
file_path = os.path.join(self.cache_dir, filename)
# Incorrect filename input
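Both search() and pull() delegate the Zenodo interaction to ZenodoHelper, filtering on the Boutiques and boutiques-execution-record keywords and the dataset record type. The request below is only a rough illustration of that kind of keyword-filtered query against Zenodo's public records API; the exact parameters and query syntax ZenodoHelper uses are assumptions here, not its implementation:

import requests

# Query the public records API (sandbox.zenodo.org when --sandbox is used).
base = "https://zenodo.org/api/records"
params = {
    # Assumed Elasticsearch-style keyword filter; ZenodoHelper may build this differently.
    "q": 'keywords:"Boutiques" AND keywords:"boutiques-execution-record"',
    "size": 10,
}
r = requests.get(base, params=params)
r.raise_for_status()
for hit in r.json()["hits"]["hits"]:
    print(hit["id"], hit["metadata"]["title"])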
7 changes: 5 additions & 2 deletions boutiques/publisher.py
@@ -93,8 +93,11 @@ def publish(self):
from boutiques.searcher import Searcher
searcher = Searcher(self.descriptor.get("name"), self.verbose,
self.sandbox, exact_match=True)
r = self.zenodo_helper.zenodo_search(searcher.query,
searcher.query_line)
zenodoHelper = ZenodoHelper(sandbox=self.sandbox,
verbose=self.verbose)
r = zenodoHelper.zenodo_search(searcher.query, searcher.query_line,
"Boutiques", "schema-version.*",
"software")

publish_update = False
for hit in r.json()["hits"]["hits"]:
92 changes: 12 additions & 80 deletions boutiques/puller.py 100755 → 100644
@@ -1,90 +1,22 @@
import requests
import urllib
import os
from boutiques.logger import raise_error, print_info
from boutiques.searcher import Searcher
from boutiques.zenodoHelper import ZenodoError, ZenodoHelper

try:
# Python 3
from urllib.request import urlopen
from urllib.request import urlretrieve
except ImportError:
# Python 2
from urllib2 import urlopen
from urllib import urlretrieve
from boutiques.zenodoHelper import ZenodoError
from urllib.request import urlopen
from urllib.request import urlretrieve


class Puller():

def __init__(self, zids, verbose=False, sandbox=False):
# remove zenodo prefix
self.zenodo_entries = []
self.cache_dir = os.path.join(
os.path.expanduser('~'), ".cache", "boutiques",
"sandbox" if sandbox else "production")
discarded_zids = zids
# This removes duplicates, should maintain order
zids = list(dict.fromkeys(zids))
for zid in zids:
discarded_zids.remove(zid)
try:
# Zenodo returns the full DOI, but for the purposes of
# Boutiques we just use the Zenodo-specific portion (as its the
# unique part). If the API updates on Zenodo to no longer
# provide the full DOI, this still works because it just grabs
# the last thing after the split.
zid = zid.split('/')[-1]
newzid = zid.split(".", 1)[1]
newfname = os.path.join(self.cache_dir,
"zenodo-{0}.json".format(newzid))
self.zenodo_entries.append({"zid": newzid, "fname": newfname})
except IndexError:
raise_error(ZenodoError, "Zenodo ID must be prefixed by "
"'zenodo', e.g. zenodo.123456")
self.verbose = verbose
self.sandbox = sandbox
if(self.verbose):
for zid in discarded_zids:
print_info("Discarded duplicate id {0}".format(zid))
self.zenodo_helper = ZenodoHelper(sandbox=self.sandbox,
verbose=self.verbose)
self.zids = zids

def pull(self):
# return cached file if it exists
json_files = []
for entry in self.zenodo_entries:
if os.path.isfile(entry["fname"]):
if(self.verbose):
print_info("Found cached file at %s"
% entry["fname"])
json_files.append(entry["fname"])
continue

searcher = Searcher(entry["zid"], self.verbose, self.sandbox,
exact_match=True)
r = self.zenodo_helper.zenodo_search(searcher.query,
searcher.query_line)
if not len(r.json()["hits"]["hits"]):
raise_error(ZenodoError, "Descriptor \"{0}\" "
"not found".format(entry["zid"]))
for hit in r.json()["hits"]["hits"]:
file_path = hit["files"][0]["links"]["self"]
file_name = file_path.split(os.sep)[-1]
if hit["id"] == int(entry["zid"]):
if not os.path.exists(self.cache_dir):
os.makedirs(self.cache_dir)
if(self.verbose):
print_info("Downloading descriptor %s"
% file_name)
downloaded = urlretrieve(file_path, entry["fname"])
if(self.verbose):
print_info("Downloaded descriptor to "
+ downloaded[0])
json_files.append(downloaded[0])
else:
raise_error(ZenodoError, "Searched-for descriptor \"{0}\" "
"does not match descriptor \"{1}\" returned "
"from Zenodo".format(entry["zid"], hit["id"]))
dataPull = False
firstKeyWord = "Boutiques"
secondKeyWord = "schema-version.*"
searchType = "software"
from boutiques.zenodoHelper import ZenodoHelper
zenodoHelper = ZenodoHelper(verbose=self.verbose, sandbox=self.sandbox)

return json_files
return zenodoHelper.zenodo_pull(self.zids, firstKeyWord,
secondKeyWord, searchType, dataPull)
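The Zenodo-ID normalisation and cache-file naming that the deleted Puller code handled (and that zenodo_pull now takes over) boils down to the following; this is a sketch of that logic for reference, not the ZenodoHelper implementation:

import os

def cache_path_for(zid, sandbox=False):
    # "10.5281/zenodo.123456" or "zenodo.123456" -> "123456"
    short = zid.split("/")[-1]         # keep only the Zenodo-specific part of a DOI
    numeric = short.split(".", 1)[1]   # drop the "zenodo." prefix
    cache_dir = os.path.join(os.path.expanduser("~"), ".cache", "boutiques",
                             "sandbox" if sandbox else "production")
    return os.path.join(cache_dir, "zenodo-{0}.json".format(numeric))

print(cache_path_for("zenodo.123456"))
# -> ~/.cache/boutiques/production/zenodo-123456.json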
