From b0b4d02db75b938deb41ca9e46c774ba210c9d3c Mon Sep 17 00:00:00 2001 From: "Adam Ginsburg (keflavich)" Date: Thu, 23 Apr 2015 18:12:07 +0200 Subject: [PATCH 1/9] WIP: start refactoring vizier to use tsv/csv --- astroquery/vizier/core.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/astroquery/vizier/core.py b/astroquery/vizier/core.py index 3575b31ffa..7cf39121a4 100644 --- a/astroquery/vizier/core.py +++ b/astroquery/vizier/core.py @@ -110,6 +110,10 @@ def _server_to_url(self, return_type='votable'): FITS binary table: asu-binfits plain text: asu-txt """ + # Only votable is supported now, but in case we try to support + # something in the future we should disallow invalid ones. + assert return_type in ('votable', 'asu-tsv', 'asu-fits', + 'asu-binfits', 'asu-txt') return "http://" + self.VIZIER_SERVER + "/viz-bin/" + return_type @property From 6f7087e4dac5b210835b0491da8a0d03bdf74523 Mon Sep 17 00:00:00 2001 From: "Adam Ginsburg (keflavich)" Date: Fri, 24 Apr 2015 11:10:37 +0200 Subject: [PATCH 2/9] get_query_payload --- astropy_helpers | 2 +- astroquery/vizier/core.py | 4 +++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/astropy_helpers b/astropy_helpers index 5fd32d0edc..161773fa72 160000 --- a/astropy_helpers +++ b/astropy_helpers @@ -1 +1 @@ -Subproject commit 5fd32d0edc34f94de9640fd20865cfe5d605e499 +Subproject commit 161773fa72d916c498e0a2a513ecc24460244ac8 diff --git a/astroquery/vizier/core.py b/astroquery/vizier/core.py index 7cf39121a4..e4423aa53a 100644 --- a/astroquery/vizier/core.py +++ b/astroquery/vizier/core.py @@ -212,7 +212,7 @@ def get_catalogs_async(self, catalog, verbose=False): return response def query_object_async(self, object_name, catalog=None, radius=None, - coordinate_frame=None): + coordinate_frame=None, get_query_payload=False): """ Serves the same purpose as `query_object` but only returns the HTTP response rather than the parsed result. @@ -252,6 +252,8 @@ def query_object_async(self, object_name, catalog=None, radius=None, data_payload = self._args_to_payload( center=center, catalog=catalog) + if get_query_payload: + return data_payload response = self._request(method='POST', url=self._server_to_url(), data=data_payload, From ecba4b514eb347743eae3c73c0806a814747494a Mon Sep 17 00:00:00 2001 From: "Adam Ginsburg (keflavich)" Date: Sat, 25 Apr 2015 13:28:44 +0200 Subject: [PATCH 3/9] minor whitespace --- astroquery/vizier/core.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/astroquery/vizier/core.py b/astroquery/vizier/core.py index e4423aa53a..d15672a549 100644 --- a/astroquery/vizier/core.py +++ b/astroquery/vizier/core.py @@ -176,8 +176,9 @@ def find_catalogs(self, keywords, include_obsolete=False, verbose=False, data=data_payload, timeout=self.TIMEOUT) if 'STOP, Max. number of RESOURCE reached' in response.text: - raise ValueError("Maximum number of catalogs exceeded. Try setting max_catalogs " - "to a large number and try again") + raise ValueError("Maximum number of catalogs exceeded. Try " + "setting max_catalogs to a large number and" + " try again") result = self._parse_result(response, verbose=verbose, get_catalog_names=True) # Filter out the obsolete catalogs, unless requested @@ -536,7 +537,8 @@ def _args_to_payload(self, *args, **kwargs): script += "\n" + str(self.keywords) return script - def _parse_result(self, response, get_catalog_names=False, verbose=False, invalid='warn'): + def _parse_result(self, response, get_catalog_names=False, verbose=False, + invalid='warn'): """ Parses the HTTP response to create a `~astropy.table.Table`. From 6bb8a25b45bff90203466c9587f7764275c44299 Mon Sep 17 00:00:00 2001 From: "Adam Ginsburg (keflavich)" Date: Sat, 25 Apr 2015 13:39:42 +0200 Subject: [PATCH 4/9] add cache and return_type keywords --- astroquery/vizier/core.py | 32 +++++++++++++++++++------------- 1 file changed, 19 insertions(+), 13 deletions(-) diff --git a/astroquery/vizier/core.py b/astroquery/vizier/core.py index d15672a549..a77cfed55c 100644 --- a/astroquery/vizier/core.py +++ b/astroquery/vizier/core.py @@ -130,7 +130,7 @@ def keywords(self): self._keywords = None def find_catalogs(self, keywords, include_obsolete=False, verbose=False, - max_catalogs=None): + max_catalogs=None, return_type='votable'): """ Search Vizier for catalogs based on a set of keywords, e.g. author name @@ -172,7 +172,7 @@ def find_catalogs(self, keywords, include_obsolete=False, verbose=False, if max_catalogs is not None: data_payload['-meta.max'] = max_catalogs response = self._request(method='POST', - url=self._server_to_url(), + url=self._server_to_url(return_type=return_type), data=data_payload, timeout=self.TIMEOUT) if 'STOP, Max. number of RESOURCE reached' in response.text: @@ -190,7 +190,7 @@ def find_catalogs(self, keywords, include_obsolete=False, verbose=False, return result - def get_catalogs_async(self, catalog, verbose=False): + def get_catalogs_async(self, catalog, verbose=False, return_type='votable'): """ Query the Vizier service for a specific catalog @@ -207,13 +207,14 @@ def get_catalogs_async(self, catalog, verbose=False): data_payload = self._args_to_payload(catalog=catalog) response = self._request(method='POST', - url=self._server_to_url(), + url=self._server_to_url(return_type=return_type), data=data_payload, timeout=self.TIMEOUT) return response def query_object_async(self, object_name, catalog=None, radius=None, - coordinate_frame=None, get_query_payload=False): + coordinate_frame=None, get_query_payload=False, + return_type='votable', cache=True): """ Serves the same purpose as `query_object` but only returns the HTTP response rather than the parsed result. @@ -256,14 +257,15 @@ def query_object_async(self, object_name, catalog=None, radius=None, if get_query_payload: return data_payload response = self._request(method='POST', - url=self._server_to_url(), + url=self._server_to_url(return_type=return_type), data=data_payload, - timeout=self.TIMEOUT) + timeout=self.TIMEOUT, + cache=cache) return response def query_region_async(self, coordinates, radius=None, inner_radius=None, width=None, height=None, catalog=None, - get_query_payload=False): + get_query_payload=False, cache=True): """ Serves the same purpose as `query_region` but only returns the HTTP response rather than the parsed result. @@ -381,12 +383,15 @@ def query_region_async(self, coordinates, radius=None, inner_radius=None, return data_payload response = self._request(method='POST', - url=self._server_to_url(), + url=self._server_to_url(return_type=return_type), data=data_payload, - timeout=self.TIMEOUT) + timeout=self.TIMEOUT, + cache=cache) return response - def query_constraints_async(self, catalog=None, **kwargs): + def query_constraints_async(self, catalog=None, return_type='votable', + cache=True, + **kwargs): """ Send a query to Vizier in which you specify constraints with keyword/value pairs. @@ -444,9 +449,10 @@ def query_constraints_async(self, catalog=None, **kwargs): column_filters=kwargs, center={'-c.rd': 180}) response = self._request(method='POST', - url=self._server_to_url(), + url=self._server_to_url(return_type=return_type), data=data_payload, - timeout=self.TIMEOUT) + timeout=self.TIMEOUT, + cache=cache) return response def _args_to_payload(self, *args, **kwargs): From 568f252a687002d3b1bd1df2cf58e268d9462995 Mon Sep 17 00:00:00 2001 From: "Adam Ginsburg (keflavich)" Date: Sat, 25 Apr 2015 15:24:25 +0200 Subject: [PATCH 5/9] add a tsv parser --- astroquery/vizier/core.py | 54 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 54 insertions(+) diff --git a/astroquery/vizier/core.py b/astroquery/vizier/core.py index a77cfed55c..7fc1cf013d 100644 --- a/astroquery/vizier/core.py +++ b/astroquery/vizier/core.py @@ -5,6 +5,7 @@ import warnings import json import copy +import re from astropy.extern import six import astropy.units as u @@ -13,6 +14,7 @@ import astropy.utils.data as aud from astropy.utils import OrderedDict import astropy.io.votable as votable +from astropy.io import ascii from ..query import BaseQuery from ..utils import commons @@ -110,10 +112,42 @@ def _server_to_url(self, return_type='votable'): FITS binary table: asu-binfits plain text: asu-txt """ + + """ + Quasi-private performance tests: + %timeit m83tsv = Vizier.query_object_async('M83', return_type='asu-tsv', cache=False) + 1 loops, best of 3: 7.11 s per loop + %timeit m83tsv = Vizier.query_object_async('M83', return_type='votable', cache=False) + 1 loops, best of 3: 6.79 s per loop + %timeit m83tsv = Vizier.query_object_async('M83', return_type='asu-fits', cache=False) + 1 loops, best of 3: 6.21 s per loop + %timeit m83tsv = Vizier.query_object_async('M83', return_type='asu-binfits', cache=False) + 1 loops, best of 3: 667 ms per loop + Looks like this one led to a segfault on their system? + + %timeit m83tsv = Vizier.query_object_async('M83', return_type='asu-txt', cache=False) + 1 loops, best of 3: 6.83 s per loop + %timeit m83tsv = Vizier.query_object_async('M83', return_type='asu-tsv', cache=False) + 1 loops, best of 3: 6.8 s per loop + + m83tsv = Vizier.query_object_async('M83', return_type='asu-tsv', cache=False) + m83votable = Vizier.query_object_async('M83', return_type='votable', cache=False) + m83fits = Vizier.query_object_async('M83', return_type='asu-fits', cache=False) + m83txt = Vizier.query_object_async('M83', return_type='asu-txt', cache=False) + #m83binfits = Vizier.query_object_async('M83', return_type='asu-binfits', cache=False) + """ # Only votable is supported now, but in case we try to support # something in the future we should disallow invalid ones. assert return_type in ('votable', 'asu-tsv', 'asu-fits', 'asu-binfits', 'asu-txt') + if return_type in ('asu-txt',): + # I had a look at the format of these "tables" and... they just + # aren't. They're quasi-fixed-width without schema. I think they + # follow the general philosophy of "consistency is overrated" + # The CDS reader chokes on it. + raise TypeError("asu-txt is not and cannot be supported: the " + "returned tables are not and cannot be made " + "parseable.") return "http://" + self.VIZIER_SERVER + "/viz-bin/" + return_type @property @@ -625,6 +659,26 @@ def valid_keywords(self): return self._valid_keyword_dict +def parse_vizier_tsvfile(data): + """ + Parse a Vizier-generated list of tsv data tables into a list of astropy + Tables. + + Parameters + ---------- + data : ascii str + An ascii string containing the vizier-formatted list of tables + """ + + # http://stackoverflow.com/questions/4664850/find-all-occurrences-of-a-substring-in-python + split_indices = [m.start() for m in re.finditer('\n\n#', data)] + # we want to slice out chunks of the file each time + split_limits = zip(split_indices[:-1], split_indices[1:]) + tables = [ascii.read(BytesIO(data[a:b]), format='tab', delimiter='\t', + header_start=0, comment="#") for + a,b in split_limits] + return tables + def _parse_angle(angle): """ From 6d2554befea3c24bd1591cf138aefb32fd461c5d Mon Sep 17 00:00:00 2001 From: "Adam Ginsburg (keflavich)" Date: Sat, 25 Apr 2015 15:46:54 +0200 Subject: [PATCH 6/9] factor out table parsers into tsv and votable --- astroquery/vizier/core.py | 98 +++++++++++++++++++++------------------ 1 file changed, 52 insertions(+), 46 deletions(-) diff --git a/astroquery/vizier/core.py b/astroquery/vizier/core.py index 7fc1cf013d..fa8795b28d 100644 --- a/astroquery/vizier/core.py +++ b/astroquery/vizier/core.py @@ -601,51 +601,18 @@ def _parse_result(self, response, get_catalog_names=False, verbose=False, table_list : `astroquery.utils.TableList` or str If there are errors in the parsing, then returns the raw results as a string. """ - if not verbose: - commons.suppress_vo_warnings() - try: - tf = six.BytesIO(response.content) - - if invalid == 'mask': - vo_tree = votable.parse(tf, pedantic=False, invalid='mask') - elif invalid == 'warn': - try: - vo_tree = votable.parse(tf, pedantic=False, invalid='raise') - except Exception as ex: - warnings.warn("VOTABLE parsing raised exception: {0}".format(ex)) - vo_tree = votable.parse(tf, pedantic=False, invalid='mask') - elif invalid == 'raise': - vo_tree = votable.parse(tf, pedantic=False, invalid='raise') - else: - raise ValueError("Invalid keyword 'invalid'. Must be raise, mask, or warn") - - if get_catalog_names: - return dict([(R.name, R) for R in vo_tree.resources]) - else: - table_dict = OrderedDict() - for t in vo_tree.iter_tables(): - if len(t.array) > 0: - if t.ref is not None: - name = vo_tree.get_table_by_id(t.ref).name - else: - name = t.name - if name not in table_dict.keys(): - table_dict[name] = [] - table_dict[name] += [t.to_table()] - for name in table_dict.keys(): - if len(table_dict[name]) > 1: - table_dict[name] = tbl.vstack(table_dict[name]) - else: - table_dict[name] = table_dict[name][0] - return commons.TableList(table_dict) - - except Exception as ex: - self.response = response - self.table_parse_error = ex - raise TableParseError("Failed to parse VIZIER result! The raw response can be found " - "in self.response, and the error in self.table_parse_error." - " The attempted parsed result is in self.parsed_result.\n" - "Exception: " + str(self.table_parse_error)) + if response.content[:5] == ' 0: + if t.ref is not None: + name = vo_tree.get_table_by_id(t.ref).name + else: + name = t.name + if name not in table_dict.keys(): + table_dict[name] = [] + table_dict[name] += [t.to_table()] + for name in table_dict.keys(): + if len(table_dict[name]) > 1: + table_dict[name] = tbl.vstack(table_dict[name]) + else: + table_dict[name] = table_dict[name][0] + return commons.TableList(table_dict) + def _parse_angle(angle): """ From 7a76890c9bba8e5ef5dfd92a12f0081f88243f7c Mon Sep 17 00:00:00 2001 From: "Adam Ginsburg (keflavich)" Date: Sat, 25 Apr 2015 16:01:37 +0200 Subject: [PATCH 7/9] fix votable parsing and add some performance notes --- astroquery/vizier/core.py | 30 ++++++++++++++++++++++++++---- 1 file changed, 26 insertions(+), 4 deletions(-) diff --git a/astroquery/vizier/core.py b/astroquery/vizier/core.py index fa8795b28d..14e4be55fc 100644 --- a/astroquery/vizier/core.py +++ b/astroquery/vizier/core.py @@ -115,6 +115,7 @@ def _server_to_url(self, return_type='votable'): """ Quasi-private performance tests: + It seems that these are dominated by table parsing time. %timeit m83tsv = Vizier.query_object_async('M83', return_type='asu-tsv', cache=False) 1 loops, best of 3: 7.11 s per loop %timeit m83tsv = Vizier.query_object_async('M83', return_type='votable', cache=False) @@ -135,6 +136,17 @@ def _server_to_url(self, return_type='votable'): m83fits = Vizier.query_object_async('M83', return_type='asu-fits', cache=False) m83txt = Vizier.query_object_async('M83', return_type='asu-txt', cache=False) #m83binfits = Vizier.query_object_async('M83', return_type='asu-binfits', cache=False) + + # many of these are invalid tables + %timeit fitstbls = fits.open(BytesIO(m83fits.content), ignore_missing_end=True) + 1 loops, best of 3: 541 ms per loop + + %timeit tbls = parse_vizier_tsvfile(m83tsv.content) + 1 loops, best of 3: 1.35 s per loop + + %timeit votbls = parse_vizier_votable(m83votable.content) + 1 loops, best of 3: 3.62 s per loop + """ # Only votable is supported now, but in case we try to support # something in the future we should disallow invalid ones. @@ -589,9 +601,11 @@ def _parse_result(self, response, get_catalog_names=False, verbose=False, response : `requests.Response` The response of the HTTP POST request get_catalog_names : bool + (only for VOTABLE queries) If specified, return only the table names (useful for table - discovery) + discovery). invalid : 'warn', 'mask' or 'raise' + (only for VOTABLE queries) The behavior if a VOTABLE cannot be parsed. Default is 'warn', which will try to parse the table, then if an exception is raised, it will be printent but the masked table will be returned @@ -603,7 +617,9 @@ def _parse_result(self, response, get_catalog_names=False, verbose=False, """ if response.content[:5] == ' Date: Sat, 25 Apr 2015 16:03:11 +0200 Subject: [PATCH 8/9] fix a missing return_type kwd --- astroquery/vizier/core.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/astroquery/vizier/core.py b/astroquery/vizier/core.py index 14e4be55fc..9a45935bbf 100644 --- a/astroquery/vizier/core.py +++ b/astroquery/vizier/core.py @@ -8,6 +8,7 @@ import re from astropy.extern import six +from astropy.extern.six import BytesIO import astropy.units as u import astropy.coordinates as coord import astropy.table as tbl @@ -311,7 +312,8 @@ def query_object_async(self, object_name, catalog=None, radius=None, def query_region_async(self, coordinates, radius=None, inner_radius=None, width=None, height=None, catalog=None, - get_query_payload=False, cache=True): + get_query_payload=False, cache=True, + return_type='votable'): """ Serves the same purpose as `query_region` but only returns the HTTP response rather than the parsed result. From f0aae4b222216c7c08ddb38ef5965c0d1278cecc Mon Sep 17 00:00:00 2001 From: "Adam Ginsburg (keflavich)" Date: Sat, 25 Apr 2015 16:17:54 +0200 Subject: [PATCH 9/9] python3: can't compare bytes and strings any more --- astroquery/vizier/core.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/astroquery/vizier/core.py b/astroquery/vizier/core.py index 9a45935bbf..6732c7c36e 100644 --- a/astroquery/vizier/core.py +++ b/astroquery/vizier/core.py @@ -617,7 +617,7 @@ def _parse_result(self, response, get_catalog_names=False, verbose=False, table_list : `astroquery.utils.TableList` or str If there are errors in the parsing, then returns the raw results as a string. """ - if response.content[:5] == '