TODO List:

1) mongo dump / reload on a local machine

2) get unique set of ObjectIDs in event.descriptor_id

3) get all of the run_start/run_stop pairs and their start times

4) for each set of events for a given unique ObjectId compute the average time

5) sort out which start/stop window the average falls in (might have to pad the start-stops out a bit in time)

6) use that association to build new event descriptors

7) look at the events in each set, sort out what the data keys are and, if it can be done easily, extract the shape

8) get a list of all of the keys that were used (which will probably be <30) and work with the beamline to get a mapping back to PVs to reconstruct the source information

In [1]:
from pymongo import MongoClient
from collections import deque

In [2]:
# 1) mongo dump / reload on a local machine
# Connection settings for the locally restored copy of the database.
MONGO_HOST = 'localhost'
MONGO_PORT = 27017
MIGRATION_DB = 'datastore2'

pymongo_client = MongoClient(MONGO_HOST, MONGO_PORT)
# Select the database through the constant so its name is defined in
# exactly one place.
database = pymongo_client[MIGRATION_DB]

In [3]:
# 2) get unique set of ObjectIDs in event.descriptor_id
# distinct() already returns the collected values, so no placeholder
# list needs to be created beforehand.
desc_oids = database.event.distinct('descriptor_id')
print(len(desc_oids))

13801


In [4]:
# 3) get all of the run start/run_stop pairs and their start times
# Maps run_start _id -> (run_start time, run_stop time).
pairs = {}
for start_doc in database.run_start.find():
    # find_one gives None when no run_stop document references this
    # run_start; such runs are left out of the mapping.
    stop_doc = database.run_stop.find_one({'run_start_id': start_doc['_id']})
    if not stop_doc:
        continue
    pairs[start_doc['_id']] = (start_doc['time'], stop_doc['time'])

In [None]:
# 4) for each set of events for a given unique
# ObjectId compute the average time
def compute_average(descriptor_ids):
    """Compute the mean event time for each descriptor id.

    Parameters
    ----------
    descriptor_ids : iterable
        Values to match against the ``descriptor_id`` field of the
        ``event`` collection of the module-level ``database``.

    Returns
    -------
    dict
        Maps each descriptor id to the arithmetic mean of the ``time``
        field over all events that reference it.  Descriptor ids with
        no matching events are omitted.
    """
    averages = {}
    for descriptor_id in descriptor_ids:
        total, count = 0, 0
        for event in database.event.find({'descriptor_id': descriptor_id}):
            total += event['time']
            count += 1
        if not count:
            # No events reference this id; there is nothing to average
            # and dividing would raise ZeroDivisionError.
            continue
        # Key by the descriptor id itself so every entry is kept rather
        # than each iteration overwriting one shared key.
        averages[descriptor_id] = total / count
    return averages

In [5]:
# 5) sort out which start/stop window the average
# falls in (might have to pad the start-stops out a bit in time)
#
# For every run_start this collects the distinct descriptor ids of all
# events whose time lies strictly between the run's start and stop times.
# TODO: Figure out what to do for the orphan descriptors
rs_desc_pairs = {}
for run_start_id, (window_open, window_close) in pairs.items():
    in_window = {'time': {'$gt': window_open, '$lt': window_close}}
    matching_events = database.event.find(in_window)
    rs_desc_pairs[run_start_id] = matching_events.distinct(key='descriptor_id')

In [6]:
# 6) use that association to build new event descriptors
from pymongo.errors import OperationFailure
for run_start_id, descriptor_ids in rs_desc_pairs.items():
    for descriptor_id in descriptor_ids:
        new_descriptor = {'run_start_id': run_start_id, '_id': descriptor_id}
        try:
            database.event_descriptor.insert(new_descriptor)
        except OperationFailure:
            # The insert was rejected: this descriptor id overlaps with
            # one already stored. It seems to happen when two consecutive
            # run_starts were created right after one another. Print the
            # run_start being processed next to the stored document.
            already_stored = next(
                database.event_descriptor.find({'_id': descriptor_id}))
            print(run_start_id, already_stored)

558366de7368e3b3b857fbf2 {'run_start_id': ObjectId('558368220712a64e78d58c53'), '_id': ObjectId('558366e77368e3b3b857fbf3')}
558366de7368e3b3b857fbf2 {'run_start_id': ObjectId('558368220712a64e78d58c53'), '_id': ObjectId('558368250712a64e78d58c54')}
558366de7368e3b3b857fbf2 {'run_start_id': ObjectId('558367ea0712a64e78d58c4b'), '_id': ObjectId('558367ed0712a64e78d58c4c')}
558366de7368e3b3b857fbf2 {'run_start_id': ObjectId('5583676b0712a64e78d58c43'), '_id': ObjectId('5583676e0712a64e78d58c44')}
54e64ce324467976d380b00e {'run_start_id': ObjectId('54e64ce324467976d380afec'), '_id': ObjectId('54e64ce324467976d380b00f')}
54e64ce324467976d380b00e {'run_start_id': ObjectId('54e64ce324467976d380afec'), '_id': ObjectId('54e64ce324467976d380afed')}
558334e97368e3a5578ab81d {'run_start_id': ObjectId('5583352b0712a63780f7ccdb'), '_id': ObjectId('558334ed7368e3a5578ab81e')}
558334e97368e3a5578ab81d {'run_start_id': ObjectId('5583352b0712a63780f7ccdb'), '_id': ObjectId('5583352e0712a63780f7ccdc')}


In [7]:
# 7) look at the events in each set and sort out what the data keys are and if you can (easily) extract the shape

from bson import ObjectId

# Inspect one sample 'data' payload from the events of a single descriptor.
sample_filter = {"descriptor_id": ObjectId("54e3c8537368e3237a36b299")}
distinct_payloads = database.event.find(sample_filter).distinct(key='data')
distinct_payloads[0]

{'sclr_ch1': [100000005.0, 1424213954.194121],
 'sclr_ch2': [10097.0, 1424213954.194121],
 'sclr_ch3': [7665.0, 1424213954.194121],
 'sclr_ch4': [2735.0, 1424213954.194121],
 'sclr_ch5': [9645.0, 1424213954.194121],
 'sclr_ch6': [3930.0, 1424213954.194121],
 'theta': [-1.0000950000000017, 1424214097.306621]}

In [None]:
# 8) get a list of all of the keys that were used (which will probably be <30) and work with the beamline to get a mapping
# back to PVs to reconstruct the source information