# Intake for Bluesky

## Setup: Acquire some sample data.

For data acquisition (but not for data access!) we assume that we have direct access to MongoDB (or some message queue that has a sink into MongoDB).

In [1]:
from bluesky import RunEngine
from bluesky.plans import scan
from bluesky.preprocessors import SupplementalData
from ophyd.sim import motor, det, direct_img, img
from suitcase.mongo_layout1 import Serializer

# Build the RunEngine (constructed here with an empty dict).
RE = RunEngine({})
# Ask for `motor` to be recorded as baseline data on every run; the runs
# acquired below end up with a 'baseline' stream next to 'primary'.
sd = SupplementalData(baseline=[motor])
RE.preprocessors.append(sd)

# This is just a simple callback that does MongoDB insert_one. No databroker.
# The same database URI is used for both the metadatastore and the assets.
metadatastore_uri = 'mongodb://localhost:27017/test1'
assets_uri = 'mongodb://localhost:27017/test1'
serializer = Serializer(metadatastore_uri, assets_uri)
RE.subscribe(serializer)


# Run two 20-point scans of `motor` from -1 to 1: one reading `det`, one
# reading `direct_img`. Each RE(...) call returns a one-element result here, so
# the trailing-comma unpacking keeps the single uid; later cells look runs up
# by these uids.
uid, = RE(scan([det], motor, -1, 1, 20))
direct_img_uid, = RE(scan([direct_img], motor, -1, 1, 20))

## Open an intake Catalog.

We could use intake to access the data _directly_ like this, though we will probably never do so at NSLS-II.

In [2]:
from intake_bluesky import MongoMetadataStoreCatalog

mds = MongoMetadataStoreCatalog(metadatastore_uri)
mds

<Intake catalog: mongodb://localhost:27017/test1>

In [3]:
run = mds[uid]

In [8]:
mds[direct_img_uid].primary().read().variables.keys()

include '[]'
exclude '[]'


KeysView(Frozen(OrderedDict([('time', <xarray.IndexVariable 'time' (time: 20)>
array([1.549388e+09, 1.549388e+09, 1.549388e+09, 1.549388e+09, 1.549388e+09,
       1.549388e+09, 1.549388e+09, 1.549388e+09, 1.549388e+09, 1.549388e+09,
       1.549388e+09, 1.549388e+09, 1.549388e+09, 1.549388e+09, 1.549388e+09,
       1.549388e+09, 1.549388e+09, 1.549388e+09, 1.549388e+09, 1.549388e+09])), ('motor', <xarray.Variable (time: 20)>
array([-1.      , -0.894737, -0.789474, -0.684211, -0.578947, -0.473684,
       -0.368421, -0.263158, -0.157895, -0.052632,  0.052632,  0.157895,
        0.263158,  0.368421,  0.473684,  0.578947,  0.684211,  0.789474,
        0.894737,  1.      ])), ('motor_setpoint', <xarray.Variable (time: 20)>
array([-1.      , -0.894737, -0.789474, -0.684211, -0.578947, -0.473684,
       -0.368421, -0.263158, -0.157895, -0.052632,  0.052632,  0.157895,
        0.263158,  0.368421,  0.473684,  0.578947,  0.684211,  0.789474,
        0.894737,  1.      ])), ('img', <xarray.Varia

In [4]:
mds[direct_img_uid].primary().read().to_dataframe()

include '[]'
exclude '[]'


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,motor,motor_setpoint,img,motor:motor_velocity,motor:motor_acceleration,img:img,seq_num,uid
dim_0,dim_1,time,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
0,0,1.549388e+09,-1.000000,-1.000000,1.0,1,1,1.0,1,b2c11348-8c37-4f6e-9361-a0bcaab56f80
0,0,1.549388e+09,-0.894737,-0.894737,1.0,1,1,1.0,2,9d200604-f1f3-461b-ae94-96b73dd6fcb5
0,0,1.549388e+09,-0.789474,-0.789474,1.0,1,1,1.0,3,48276490-666a-4939-9763-01c85b956455
0,0,1.549388e+09,-0.684211,-0.684211,1.0,1,1,1.0,4,1f3c4adc-e6ae-427d-8250-820ea83827e2
0,0,1.549388e+09,-0.578947,-0.578947,1.0,1,1,1.0,5,bc4d89a3-8065-4df3-9dcd-20233b4ec9de
0,0,1.549388e+09,-0.473684,-0.473684,1.0,1,1,1.0,6,151158f9-a5ff-4c3f-8d99-411cc6c9a1dc
0,0,1.549388e+09,-0.368421,-0.368421,1.0,1,1,1.0,7,a4bada03-cd57-453b-9b4f-ab05687e00f3
0,0,1.549388e+09,-0.263158,-0.263158,1.0,1,1,1.0,8,30e2094a-232a-4937-a9a0-a49e55beca27
0,0,1.549388e+09,-0.157895,-0.157895,1.0,1,1,1.0,9,7c63c40f-6baa-4ff0-afaf-10b9042d7f4f
0,0,1.549388e+09,-0.052632,-0.052632,1.0,1,1,1.0,10,45188685-4261-4ec3-8bf6-51a6c7a3958e


In [5]:
%cat facility_catalog.yml

plugins:
  source:
    - module: intake_bluesky
sources:
  xyz:
    description: Some imaginary beamline
    driver: mongo_metadatastore
    container: catalog
    args:
      uri: mongodb://localhost:27017/test1
    metadata:
      beamline: "00-ID"


In [6]:
import intake

# Open the catalog through an intake server rather than talking to MongoDB
# directly. This needs a server listening on localhost:5000; when nothing is
# listening there the call fails with a ConnectionError.
# NOTE(review): presumably the server is serving the facility_catalog.yml
# printed above -- confirm how it is launched.
facility_catalog = intake.Catalog("intake://localhost:5000", page_size=100)
facility_catalog

ConnectionError: HTTPConnectionPool(host='localhost', port=5000): Max retries exceeded with url: /v1/info?page_offset=0&page_size=0 (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x7f74687d4dd8>: Failed to establish a new connection: [Errno 111] Connection refused',))

A Catalog contains entries, which we can access by iteration:

```
for entry in catalog:
    ...
```

or individually by name:

```
entry = catalog[entry_name]
```

For small Catalogs, it is convenient to ``list`` their contents.

In [42]:
list(facility_catalog)

['xyz']

The ``facility_catalog`` contains a catalog for each beamline. Let's access the ``xyz`` entry, which is also a Catalog.

In [43]:
cat = facility_catalog['xyz']()
cat

<Intake catalog: xyz>

In [44]:
cat[uid]

<Catalog Entry: 96c8339e-f5b8-44d5-8c7a-d789a6e66c71>

Each entry in this Catalog represents one scan. There are too many to list them all. (We could _try_ but it would take a long time and probably run out of memory.)

We can find scans of interest in a couple of ways.

## Progressive Search

We can search ``cat`` by passing it a Mongo Query. The result is another Catalog, with a subset of the entries in ``cat``.

In [45]:
search_results = cat.search({'plan_name': 'scan'})
search_results

<Intake catalog: >

We can progressively search, generating yet another Catalog.

In [46]:
import time
# Narrow the previous results to runs whose 'time' is newer than 24 hours ago
# (60 * 60 * 24 seconds before now).
# NOTE(review): the name says "counts", but the parent search matched
# plan_name 'scan' -- consider renaming in a follow-up.
recent_counts = search_results.search({'time': {'$gt': time.time() - 60 * 60 * 24}})
recent_counts

<Intake catalog: >

Having narrowed the results to a small Catalog, we can list them.

In [47]:
list(recent_counts)

['6fb01404-ebe3-49bd-bd53-a0e2540c2589',
 '96c8339e-f5b8-44d5-8c7a-d789a6e66c71',
 '40e55919-0949-47cd-b127-d25bbdc29fa0',
 '42aa99a8-083a-48a1-9a0e-f126fa5ff0a4',
 '9c7ca894-af60-4c19-9f1a-88ff104a40bd',
 '92b2ff95-cbf8-4f4e-acf4-feb9c6095a2b',
 'b0528e64-3bf7-4e3d-b638-537024652115',
 'c5922a53-6272-4431-9a59-809ec0b6af1b',
 '89433f0d-a948-4151-b8c7-24ffc053df8b',
 '58684447-46c8-483a-b765-6fe09057384e']

## Random access by unique ID (`uid`), recency, and `scan_id`

We can access entries by their unique ID "name" as in:

In [48]:
entry = cat[uid]  # uid we captured above during data acquisition
entry

<Catalog Entry: 96c8339e-f5b8-44d5-8c7a-d789a6e66c71>

We can also access entries by *recency* with this syntactic sugar:

In [49]:
recent_counts[-1]

<Catalog Entry: -1>

A positive integer matches the most recent entry with the corresponding ``scan_id`` (not necessarily globally unique!)

In [50]:
cat[1]

<Catalog Entry: 1>

Both of these "tricks" are _not_ general features of intake Catalogs, but as shown we can support them, for the sake of convenience and of continuity with databroker usage patterns.

## Metadata

The entry's metadata is available via ``entry.metadata``. Notice that this includes ``entry.metadata['start']`` and ``entry.metadata['stop']``, the documents generated at the beginning and end of the corresponding scan.

In [51]:
entry.metadata

{'start': {'uid': '96c8339e-f5b8-44d5-8c7a-d789a6e66c71',
  'time': 1549326803.9287987,
  'plan_pattern_args': {'num': 20,
   'args': ["SynAxis(prefix='', name='motor', read_attrs=['readback', 'setpoint'], configuration_attrs=['velocity', 'acceleration'])",
    -1,
    1]},
  'scan_id': 1,
  'plan_type': 'generator',
  'plan_pattern_module': 'bluesky.plan_patterns',
  'plan_args': {'detectors': ["SynGauss(name='det', value=0.6065306597126334, timestamp=1549326743.887361)"],
   'num': 20,
   'args': ["SynAxis(prefix='', name='motor', read_attrs=['readback', 'setpoint'], configuration_attrs=['velocity', 'acceleration'])",
    -1,
    1],
   'per_step': 'None'},
  'num_intervals': 19,
  'hints': {'dimensions': [[['motor'], 'primary']]},
  'plan_name': 'scan',
  'detectors': ['det'],
  'num_points': 20,
  'plan_pattern': 'inner_product',
  'motors': ['motor']},
 'stop': {'run_start': '96c8339e-f5b8-44d5-8c7a-d789a6e66c71',
  'time': 1549326803.991711,
  'uid': '33227b60-1149-4ba1-8af9-2547

## Accessing Data

Calling an Entry like `entry()` or equivalently `entry.get()` returns the DataSource for that Entry. The DataSource corresponding to one scan is itself a Catalog, named with the scan's `uid`.

In [52]:
entry()

<Intake catalog: 96c8339e-f5b8-44d5-8c7a-d789a6e66c71>

That Catalog has one entry for each stream of data captured during that scan. (Typically there is a ``'primary'`` stream and potentially others, but this is just a convention.) As with all Catalogs, we can look at its contents.

In [53]:
list(entry())

['baseline', 'primary']

We can pull the data from the 'primary' stream all at once:

In [54]:
entry().primary().read()

<xarray.Dataset>
Dimensions:                   (time: 20)
Coordinates:
  * time                      (time) float64 1.549e+09 1.549e+09 ... 1.549e+09
Data variables:
    det                       (time) float64 0.6065 0.6701 ... 0.6701 0.6065
    det:det                   (time) float64 0.6065 0.6065 ... 0.6065 0.6065
    motor                     (time) float64 -1.0 -0.8947 -0.7895 ... 0.8947 1.0
    motor:motor_acceleration  (time) int64 1 1 1 1 1 1 1 1 1 ... 1 1 1 1 1 1 1 1
    motor:motor_velocity      (time) int64 1 1 1 1 1 1 1 1 1 ... 1 1 1 1 1 1 1 1
    motor_setpoint            (time) float64 -1.0 -0.8947 -0.7895 ... 0.8947 1.0
    seq_num                   (time) int64 1 2 3 4 5 6 7 ... 15 16 17 18 19 20
    uid                       (time) <U36 '3300a0ce-5cb9-48f2-a2c6-534cfc9658fa' ... 'c8bed238-0a0e-4fc4-8e31-d5c7c6c77d1d'

At this point, we have "left" intake. We have an ordinary `xarray.Dataset` object, which we can use to do any further slicing or drilling down. This `Dataset` contains numpy arrays. Alternatively, we can ask intake for a `Dataset` of _dask_ arrays, which will defer pulling the data from the server until called upon to compute a result.

In [55]:
entry().primary().to_dask()  # an xarray of dask.arrays

<xarray.Dataset>
Dimensions:                   (time: 20)
Coordinates:
  * time                      (time) float64 1.549e+09 1.549e+09 ... 1.549e+09
Data variables:
    det                       (time) float64 dask.array<shape=(20,), chunksize=(20,)>
    det:det                   (time) float64 dask.array<shape=(20,), chunksize=(20,)>
    motor                     (time) float64 dask.array<shape=(20,), chunksize=(20,)>
    motor:motor_acceleration  (time) int64 dask.array<shape=(20,), chunksize=(20,)>
    motor:motor_velocity      (time) int64 dask.array<shape=(20,), chunksize=(20,)>
    motor_setpoint            (time) float64 dask.array<shape=(20,), chunksize=(20,)>
    seq_num                   (time) int64 dask.array<shape=(20,), chunksize=(20,)>
    uid                       (time) object dask.array<shape=(20,), chunksize=(20,)>

For example, converting the `xarray.Dataset` to a `pandas.DataFrame` will prompt dask to materialize the data:

In [56]:
entry().primary().to_dask().to_dataframe()

Unnamed: 0_level_0,det,det:det,motor,motor:motor_acceleration,motor:motor_velocity,motor_setpoint,seq_num,uid
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1549327000.0,0.606531,0.606531,-1.0,1,1,-1.0,1,3300a0ce-5cb9-48f2-a2c6-534cfc9658fa
1549327000.0,0.670134,0.606531,-0.894737,1,1,-0.894737,2,b113a43c-7db4-41e9-bae7-68caa3f46fd8
1549327000.0,0.732249,0.606531,-0.789474,1,1,-0.789474,3,f1695f04-255d-4cfe-ab17-70fc78bdeaf7
1549327000.0,0.791305,0.606531,-0.684211,1,1,-0.684211,4,22cc8c0a-b226-4069-bba0-8e6e228a3910
1549327000.0,0.8457,0.606531,-0.578947,1,1,-0.578947,5,ef2946d4-e81a-44eb-9337-5f9f8df0690d
1549327000.0,0.893876,0.606531,-0.473684,1,1,-0.473684,6,32684675-09aa-44bd-924c-aec14801808f
1549327000.0,0.934385,0.606531,-0.368421,1,1,-0.368421,7,8e0cd13b-885f-48ce-a809-cd4402983907
1549327000.0,0.965967,0.606531,-0.263158,1,1,-0.263158,8,a0022f4a-5823-4347-b891-82e876a2401e
1549327000.0,0.987612,0.606531,-0.157895,1,1,-0.157895,9,84db1acd-6bca-4d43-bf01-fc985e04246d
1549327000.0,0.998616,0.606531,-0.052632,1,1,-0.052632,10,1fad2e75-11cc-4593-9e19-0ef39bfea063


We can look at the data from the other stream, 'baseline'.

In [57]:
entry().baseline().read()

<xarray.Dataset>
Dimensions:                   (time: 2)
Coordinates:
  * time                      (time) float64 1.549e+09 1.549e+09
Data variables:
    motor                     (time) float64 1.0 1.0
    motor:motor_acceleration  (time) int64 1 1
    motor:motor_velocity      (time) int64 1 1
    motor_setpoint            (time) float64 1.0 1.0
    seq_num                   (time) int64 1 2
    uid                       (time) <U36 '121c1eea-c106-4c12-a2ee-9a33741a9372' 'd15df9f1-89be-40f1-bf64-811840f1a513'

Or merge all the streams together into one `xarray.Dataset`:

In [58]:
import xarray

# Build the run's catalog once and reuse it, instead of calling entry() again
# for every stream; then read each stream and merge them on the shared 'time'
# coordinate.
run_streams = entry()
xarray.merge(run_streams[stream_name].read() for stream_name in run_streams)

<xarray.Dataset>
Dimensions:                   (time: 22)
Coordinates:
  * time                      (time) float64 1.549e+09 1.549e+09 ... 1.549e+09
Data variables:
    motor                     (time) float64 1.0 -1.0 -0.8947 ... 0.8947 1.0 1.0
    motor:motor_acceleration  (time) float64 1.0 1.0 1.0 1.0 ... 1.0 1.0 1.0 1.0
    motor:motor_velocity      (time) float64 1.0 1.0 1.0 1.0 ... 1.0 1.0 1.0 1.0
    motor_setpoint            (time) float64 1.0 -1.0 -0.8947 ... 0.8947 1.0 1.0
    seq_num                   (time) float64 1.0 1.0 2.0 3.0 ... 19.0 20.0 2.0
    uid                       (time) object '121c1eea-c106-4c12-a2ee-9a33741a9372' ... 'd15df9f1-89be-40f1-bf64-811840f1a513'
    det                       (time) float64 nan 0.6065 0.6701 ... 0.6065 nan
    det:det                   (time) float64 nan 0.6065 0.6065 ... 0.6065 nan

which creates a "block matrix" sorted on time, clearly visible when cast into a DataFrame:

In [59]:
# Build the run's catalog once and reuse it, instead of calling entry() again
# for every stream; merge the streams and flatten the result into a DataFrame.
run_streams = entry()
xarray.merge(
    run_streams[stream_name].read() for stream_name in run_streams
).to_dataframe()

Unnamed: 0_level_0,motor,motor:motor_acceleration,motor:motor_velocity,motor_setpoint,seq_num,uid,det,det:det
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1549327000.0,1.0,1.0,1.0,1.0,1.0,121c1eea-c106-4c12-a2ee-9a33741a9372,,
1549327000.0,-1.0,1.0,1.0,-1.0,1.0,3300a0ce-5cb9-48f2-a2c6-534cfc9658fa,0.606531,0.606531
1549327000.0,-0.894737,1.0,1.0,-0.894737,2.0,b113a43c-7db4-41e9-bae7-68caa3f46fd8,0.670134,0.606531
1549327000.0,-0.789474,1.0,1.0,-0.789474,3.0,f1695f04-255d-4cfe-ab17-70fc78bdeaf7,0.732249,0.606531
1549327000.0,-0.684211,1.0,1.0,-0.684211,4.0,22cc8c0a-b226-4069-bba0-8e6e228a3910,0.791305,0.606531
1549327000.0,-0.578947,1.0,1.0,-0.578947,5.0,ef2946d4-e81a-44eb-9337-5f9f8df0690d,0.8457,0.606531
1549327000.0,-0.473684,1.0,1.0,-0.473684,6.0,32684675-09aa-44bd-924c-aec14801808f,0.893876,0.606531
1549327000.0,-0.368421,1.0,1.0,-0.368421,7.0,8e0cd13b-885f-48ce-a809-cd4402983907,0.934385,0.606531
1549327000.0,-0.263158,1.0,1.0,-0.263158,8.0,a0022f4a-5823-4347-b891-82e876a2401e,0.965967,0.606531
1549327000.0,-0.157895,1.0,1.0,-0.157895,9.0,84db1acd-6bca-4d43-bf01-fc985e04246d,0.987612,0.606531


## N-dimensional Data (e.g. images)

Higher-dimensional data does not have to be treated specially. It can sit in an `xarray.Dataset` as well. As above, we can use `read()` to fetch the data immediately or `to_dask()` to fetch it lazily.

In [60]:
# NOTE: this rebinds `entry`, which earlier cells bound to the `det` run
# (cat[uid]); re-running those cells after this one would show the image run.
entry = cat[direct_img_uid]  # uid captured during data acquisition above
# read() fetches the whole 'primary' stream immediately as an xarray.Dataset.
dataset = entry().primary().read()
dataset

<xarray.Dataset>
Dimensions:                   (dim_0: 10, dim_1: 10, time: 20)
Coordinates:
  * time                      (time) float64 1.549e+09 1.549e+09 ... 1.549e+09
Dimensions without coordinates: dim_0, dim_1
Data variables:
    img                       (time, dim_0, dim_1) float64 1.0 1.0 ... 1.0 1.0
    img:img                   (time, dim_0, dim_1) float64 1.0 1.0 ... 1.0 1.0
    motor                     (time) float64 -1.0 -0.8947 -0.7895 ... 0.8947 1.0
    motor:motor_acceleration  (time) int64 1 1 1 1 1 1 1 1 1 ... 1 1 1 1 1 1 1 1
    motor:motor_velocity      (time) int64 1 1 1 1 1 1 1 1 1 ... 1 1 1 1 1 1 1 1
    motor_setpoint            (time) float64 -1.0 -0.8947 -0.7895 ... 0.8947 1.0
    seq_num                   (time) int64 1 2 3 4 5 6 7 ... 15 16 17 18 19 20
    uid                       (time) <U36 '273a9335-10c9-44dc-ab93-5011160483e8' ... 'da251f50-d6a6-4b14-8098-4bdd6191ae3c'

## The `xarray.Dataset` is a very useful container.

It has a nice string representation, as shown above. We can access specific variables by name:

In [61]:
dataset['img']

<xarray.DataArray 'img' (time: 20, dim_0: 10, dim_1: 10)>
array([[[1., 1., ..., 1., 1.],
        [1., 1., ..., 1., 1.],
        ...,
        [1., 1., ..., 1., 1.],
        [1., 1., ..., 1., 1.]],

       [[1., 1., ..., 1., 1.],
        [1., 1., ..., 1., 1.],
        ...,
        [1., 1., ..., 1., 1.],
        [1., 1., ..., 1., 1.]],

       ...,

       [[1., 1., ..., 1., 1.],
        [1., 1., ..., 1., 1.],
        ...,
        [1., 1., ..., 1., 1.],
        [1., 1., ..., 1., 1.]],

       [[1., 1., ..., 1., 1.],
        [1., 1., ..., 1., 1.],
        ...,
        [1., 1., ..., 1., 1.],
        [1., 1., ..., 1., 1.]]])
Coordinates:
  * time     (time) float64 1.549e+09 1.549e+09 ... 1.549e+09 1.549e+09
Dimensions without coordinates: dim_0, dim_1

Do math along named dimensions:

In [62]:
dataset['img'].sum('time')

<xarray.DataArray 'img' (dim_0: 10, dim_1: 10)>
array([[20., 20., 20., 20., 20., 20., 20., 20., 20., 20.],
       [20., 20., 20., 20., 20., 20., 20., 20., 20., 20.],
       [20., 20., 20., 20., 20., 20., 20., 20., 20., 20.],
       [20., 20., 20., 20., 20., 20., 20., 20., 20., 20.],
       [20., 20., 20., 20., 20., 20., 20., 20., 20., 20.],
       [20., 20., 20., 20., 20., 20., 20., 20., 20., 20.],
       [20., 20., 20., 20., 20., 20., 20., 20., 20., 20.],
       [20., 20., 20., 20., 20., 20., 20., 20., 20., 20.],
       [20., 20., 20., 20., 20., 20., 20., 20., 20., 20.],
       [20., 20., 20., 20., 20., 20., 20., 20., 20., 20.]])
Dimensions without coordinates: dim_0, dim_1

Slice along named dimensions:

In [63]:
# dim_0 and dim_1 carry no coordinate labels, so slice them by integer position
# with isel(). Positional slices are half-open: rows 0-2 and columns 5-9.
dataset['img'].isel(dim_0=slice(0, 3), dim_1=slice(5, 10))

<xarray.DataArray 'img' (time: 20, dim_0: 3, dim_1: 5)>
array([[[1., 1., ..., 1., 1.],
        [1., 1., ..., 1., 1.],
        [1., 1., ..., 1., 1.]],

       [[1., 1., ..., 1., 1.],
        [1., 1., ..., 1., 1.],
        [1., 1., ..., 1., 1.]],

       ...,

       [[1., 1., ..., 1., 1.],
        [1., 1., ..., 1., 1.],
        [1., 1., ..., 1., 1.]],

       [[1., 1., ..., 1., 1.],
        [1., 1., ..., 1., 1.],
        [1., 1., ..., 1., 1.]]])
Coordinates:
  * time     (time) float64 1.549e+09 1.549e+09 ... 1.549e+09 1.549e+09
Dimensions without coordinates: dim_0, dim_1