Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[API] Draw random objects #306

Merged
merged 19 commits into from
Jun 7, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
80 changes: 79 additions & 1 deletion apps/api/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,13 +22,15 @@

from apps.api.doc import api_doc_summary, api_doc_object, api_doc_explorer
from apps.api.doc import api_doc_latests, api_doc_sso, api_doc_tracklets
from apps.api.doc import api_doc_cutout, api_doc_xmatch, api_doc_bayestar, api_doc_stats
from apps.api.doc import api_doc_cutout, api_doc_xmatch, api_doc_bayestar
from apps.api.doc import api_doc_stats, api_doc_random

from apps.api.utils import return_object_pdf, return_explorer_pdf
from apps.api.utils import return_latests_pdf, return_sso_pdf
from apps.api.utils import return_tracklet_pdf, format_and_send_cutout
from apps.api.utils import perform_xmatch, return_bayestar_pdf
from apps.api.utils import return_statistics_pdf, send_data
from apps.api.utils import return_random_pdf

import io
import requests
Expand Down Expand Up @@ -168,6 +170,17 @@ def layout(is_mobile):
),
], label="Statistics"
),
dbc.Tab(
[
dbc.Card(
dbc.CardBody(
dcc.Markdown(api_doc_random)
), style={
'backgroundColor': 'rgb(248, 248, 248, .7)'
}
),
], label="Random objects"
),
]
)
], className="mb-8", fluid=True, style={'width': width}
Expand Down Expand Up @@ -434,11 +447,39 @@ def layout(is_mobile):
'required': True,
'description': 'Observing date. This can be either a given night (YYYYMMDD), month (YYYYMM), year (YYYY), or eveything (empty string)'
},
{
'name': 'columns',
'required': False,
'description': 'Comma-separated data columns to transfer. Default is all columns.'
},
{
'name': 'output-format',
'required': False,
'description': 'Output format among json[default], csv, parquet, votable'
}
]

args_random = [
{
'name': 'n',
'required': True,
'description': 'Number of objects to return. Maximum is 16 for performance.'
},
{
'name': 'columns',
'required': False,
'description': 'Comma-separated data columns to transfer. Default is all columns. See {}/api/v1/columns for more information.'.format(APIURL)
},
{
'name': 'class',
'required': False,
'description': 'Fink derived class. Default is empty string, namely all classes are considered. See {}/api/v1/classes for more information'.format(APIURL)
},
{
'name': 'seed',
'required': False,
'description': 'Seed number for random number generator. By default, the seed is not fixed.'
},
{
'name': 'output-format',
'required': False,
Expand Down Expand Up @@ -829,3 +870,40 @@ def return_statistics(payload=None):

output_format = payload.get('output-format', 'json')
return send_data(pdf, output_format)

@api_bp.route('/api/v1/random', methods=['GET'])
def return_random_arguments():
    """ Describe the /api/v1/random arguments, or run the query from the URL

    Without query parameters, the list of accepted arguments (`args_random`)
    is returned as JSON. With query parameters, they are used as the payload
    of `return_random`, as if they had been POSTed.
    """
    # Guard clause: a bare GET only asks for the argument description
    if not request.args:
        return jsonify({'args': args_random})

    # POST from query URL
    return return_random(payload=request.args)

@api_bp.route('/api/v1/random', methods=['POST'])
def return_random(payload=None):
    """ Retrieve random object data from the Fink database

    Parameters
    ----------
    payload: dict-like, optional
        Query arguments, see `args_random`. If None (default), the
        arguments are read from the JSON body of the current request.

    Return
    ----------
    out: Response
        A 400 response describing the problem if the payload is missing or
        a required argument is absent, the error response propagated from
        `return_random_pdf` if any, otherwise the output of `send_data`
        for the requested `output-format` (json by default).
    """
    # get payload from the JSON
    if payload is None:
        payload = request.json

    # `request.json` can be None when the request carries no JSON body.
    # Without this guard the membership tests below raise a TypeError,
    # which surfaces as a 500 instead of a client error.
    if payload is None:
        rep = {
            'status': 'error',
            'text': "A JSON payload is required. Use GET to check arguments.\n"
        }
        return Response(str(rep), 400)

    # Check all required args are here
    required_args = [i['name'] for i in args_random if i['required'] is True]
    for required_arg in required_args:
        if required_arg not in payload:
            rep = {
                'status': 'error',
                'text': "A value for `{}` is required. Use GET to check arguments.\n".format(required_arg)
            }
            return Response(str(rep), 400)

    pdf = return_random_pdf(payload)

    # Error propagation
    if isinstance(pdf, Response):
        return pdf

    output_format = payload.get('output-format', 'json')
    return send_data(pdf, output_format)
66 changes: 65 additions & 1 deletion apps/api/doc.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,9 +34,10 @@
| POST/GET | {}/api/v1/xmatch | Cross-match user-defined catalog with Fink alert data| ☑️ |
| POST/GET | {}/api/v1/bayestar | Cross-match LIGO/Virgo sky map with Fink alert data| ☑️ |
| POST/GET | {}/api/v1/statistics | Statistics concerning Fink alert data| ☑️ |
| POST/GET | {}/api/v1/random | Draw random objects from the Fink database| ☑️ |
| GET | {}/api/v1/classes | Display all Fink derived classification | ☑️ |
| GET | {}/api/v1/columns | Display all available alert fields and their type | ☑️ |
""".format(APIURL, APIURL, APIURL, APIURL, APIURL, APIURL, APIURL, APIURL, APIURL, APIURL, APIURL)
""".format(APIURL, APIURL, APIURL, APIURL, APIURL, APIURL, APIURL, APIURL, APIURL, APIURL, APIURL, APIURL)

api_doc_object = """
## Retrieve object data
Expand Down Expand Up @@ -1080,3 +1081,66 @@

All other fields starting with `class:` are crossmatch from the SIMBAD database.
""".format(pd.DataFrame([dic_names]).T.rename(columns={0: 'description'}).to_markdown())

api_doc_random = """
## Draw random objects

This service lets you draw random objects (full lightcurve) from the Fink database (120+ million alerts). This is still largely experimental.

The list of arguments for retrieving object data can be found at https://fink-portal.org/api/v1/random.

In a unix shell, you would simply use

```bash
# Get the data for 8 *objects* randomly drawn from the +120 million alerts in Fink
curl -H "Content-Type: application/json" -X POST -d '{"n":8, "output-format":"csv"}' https://fink-portal.org/api/v1/random -o random.csv

# you can also specify parameters in the URL, e.g. with wget:
wget "https://fink-portal.org/api/v1/random?n=8&output-format=json" -O random.json
```

In python, you would use

```python
import requests
import pandas as pd

r = requests.post(
'https://fink-portal.org/api/v1/random',
json={
'n': integer, # Number of random objects to get. Maximum is 16.
'class': classname, # Optional, specify a Fink class.
'seed': integer, # Optional, the seed for reproducibility
'columns': str, # Optional, comma-separated column names
'output-format': output_format, # Optional [json[default], csv, parquet, votable]
}
)

# Format output in a DataFrame
pdf = pd.read_json(r.content)
```

As this service is experimental, the number of random objects returned for a single
call cannot be greater than 16. Concerning the classname, see https://fink-portal.org/api/v1/classes.
If you do not specify the parameter `class`, you will get random objects from all classes.
For better performance, we advise choosing a classname and limiting the columns to transfer, e.g.:

```
# random Early SN Ia candidate
r = requests.post(
'https://fink-portal.org/api/v1/random',
json={
'n': 16, # Number of random objects to get
'class': 'Early SN Ia candidate', # Optional, specify a Fink class.
'seed': 0, # Optional, the seed for reproducibility
'columns': 'i:objectId,i:jd,i:magpsf,i:fid', # Optional, comma-separated column names
}
)
```

Note that this returns data for *objects* (and not just alerts).

Note also that the `seed` is used to fix the date boundaries, hence it is valid only over a small period of time as the database is updated every day, and more dates are added...
So consider your seed valid over 24h (this might change in the future).

"""
104 changes: 103 additions & 1 deletion apps/api/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -361,7 +361,7 @@ def return_explorer_pdf(payload: dict, user_group: int) -> pd.DataFrame:

return pdfs

def return_latests_pdf(payload: dict) -> pd.DataFrame:
def return_latests_pdf(payload: dict, return_raw: bool = False) -> pd.DataFrame:
""" Extract data returned by HBase and format it in a Pandas dataframe

Data is from /api/v1/latests
Expand All @@ -370,6 +370,8 @@ def return_latests_pdf(payload: dict) -> pd.DataFrame:
----------
payload: dict
See https://fink-portal.org/api/v1/latests
return_raw: bool
If True, return the HBase output, else pandas DataFrame. Default is False.

Return
----------
Expand Down Expand Up @@ -464,6 +466,9 @@ def return_latests_pdf(payload: dict) -> pd.DataFrame:
# Restore default limits
clientT.setLimit(nlimit)

if return_raw:
return results

# We want to return alerts
# color computation is disabled
pdfs = format_hbase_output(
Expand Down Expand Up @@ -1014,3 +1019,100 @@ def send_data(pdf, output_format):
'text': "Output format `{}` is not supported. Choose among json, csv, or parquet\n".format(output_format)
}
return Response(str(rep), 400)

def return_random_pdf(payload: dict) -> pd.DataFrame:
    """ Extract data returned by HBase and format it in a Pandas dataframe

    Data is from /api/v1/random

    A random time window of 30 days is drawn between 2019-11-02 and now
    until it contains at least one alert (optionally restricted to a Fink
    class). Up to `n` distinct object IDs are then drawn from that window,
    and the data for each of them is read from the main table.

    Parameters
    ----------
    payload: dict
        See https://fink-portal.org/api/v1/random

    Return
    ----------
    out: pandas dataframe
        Alert data for the randomly drawn objects. A `Response` with
        status 400 is returned instead if `n` or `seed` is not valid.
    """
    if 'columns' in payload:
        cols = payload['columns'].replace(" ", "")
    else:
        cols = '*'
    truncated = cols != '*'

    classsearch = 'class' in payload and str(payload['class']) != ""

    # Reject a malformed `n` with a client error rather than a crash
    try:
        number = int(payload['n'])
    except (TypeError, ValueError):
        rep = {
            'status': 'error',
            'text': "`n` must be an integer, got `{}`.\n".format(payload['n'])
        }
        return Response(str(rep), 400)

    if number < 1:
        rep = {
            'status': 'error',
            'text': "`n` must be greater than or equal to 1, got `{}`.\n".format(number)
        }
        return Response(str(rep), 400)

    # Maximum is 16 for performance
    number = min(number, 16)

    # Use a generator local to this call: seeding the global numpy generator
    # would make the draws of later, unseeded, requests deterministic too.
    # RandomState(seed) yields the same stream as np.random.seed(seed).
    seed = payload.get('seed', None)
    try:
        rng = np.random.RandomState(None if seed is None else int(seed))
    except (TypeError, ValueError):
        rep = {
            'status': 'error',
            'text': "`seed` must be a non-negative 32-bit integer, got `{}`.\n".format(seed)
        }
        return Response(str(rep), 400)

    jd_low = Time('2019-11-02 03:00:00.0').jd
    jd_high = Time.now().jd

    # 1 month (30 days), expressed in minutes
    delta_min = 43200
    delta_jd = TimeDelta(delta_min * 60, format='sec').jd

    try:
        clientT.setLimit(1000)
        clientT.setRangeScan(True)

        # Draw time windows until one contains data.
        # NOTE(review): this never terminates if no window can match
        # (e.g. a class without any alert) -- consider bounding the attempts.
        results = []
        while len(results) == 0:
            jdstart = rng.uniform(jd_low, jd_high)
            jdstop = jdstart + delta_jd

            if classsearch:
                payload_data = {
                    'class': payload['class'],
                    'n': number,
                    'startdate': Time(jdstart, format='jd').iso,
                    'stopdate': Time(jdstop, format='jd').iso,
                    'columns': "",
                    'output-format': 'json'
                }
                results = return_latests_pdf(payload_data, return_raw=True)

                # presumably errors come back as a Response, as for the
                # other helpers -- propagate it instead of calling len() on it
                if isinstance(results, Response):
                    return results
            else:
                results = clientT.scan(
                    "",
                    "key:key:{},key:key:{}".format(jdstart, jdstop),
                    "", 0, False, False
                )

        # The object ID is the last `_`-separated field of the row key.
        # Several alerts can belong to the same object, so deduplicate and
        # sample without replacement: otherwise repeated IDs silently yield
        # fewer than `n` objects.
        oids = np.unique([str(key).split('_')[-1] for key in dict(results)])
        nobjects = min(number, len(oids))
        oid = rng.choice(oids, size=nobjects, replace=False)

        client.setLimit(2000)
        # Get data from the main table
        results = java.util.TreeMap()
        for oid_ in oid:
            result = client.scan(
                "",
                "key:key:{}".format(oid_),
                "{}".format(cols),
                0, False, False
            )
            results.putAll(result)

        pdf = format_hbase_output(
            results, client.schema(), group_alerts=False, truncated=truncated
        )
    finally:
        # Restore default limits, even if a scan failed
        clientT.setLimit(nlimit)
        client.setLimit(nlimit)

    return pdf
5 changes: 3 additions & 2 deletions tests/api_performance_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,12 +22,13 @@

APIURL = sys.argv[1]

def classsearch(myclass='Solar System MPC', n=100000, startdate='2022-03-03', stopdate='2022-03-04', output_format='json'):
def classsearch(myclass='Solar System MPC', n=100000, startdate='2022-03-03', stopdate='2022-03-04', output_format='json', columns='*'):
""" Perform a heavy class search in the Science Portal using the Fink REST API
"""
payload = {
'class': myclass,
'n': n,
'columns': columns,
'output-format': output_format
}

Expand Down Expand Up @@ -61,7 +62,7 @@ def test_heavy_classsearch() -> None:
>>> test_heavy_classsearch()
"""
t0 = time.time()
pdf = classsearch()
pdf = classsearch(columns='i:objectId,i:magpsf,i:jd,d:rf_snia_vs_nonia')
dt = time.time() - t0

# less than 45 seconds to get 21,000 objects
Expand Down