In [1]:
from zipfile import ZipFile
import os
import pandas as pd
from otter import Otter
from astropy.coordinates import SkyCoord
import astropy.units as u

# Testing upload_zip
Use caution running this so we don't exceed the ADS query limits!

In [2]:
# Resolve the `data` directory that sits next to the current working
# directory, locate the test archive inside it, and unpack the archive there.
datadir = os.path.join(os.path.dirname(os.getcwd()), 'data')
zipfile = os.path.join(datadir, 'test-zip.zip')
# archive path with the '.zip' suffix removed
datapath = os.path.join(datadir, os.path.basename(zipfile).replace('.zip', ''))

with ZipFile(zipfile) as archive:
    archive.extractall(datadir)

In [3]:
#db = Otter(username='admin@otter', password='insecure')

In [4]:
#db.upload_zip(zipfile)

Found this object in the database already, merging the data...






# Testing merging

In [5]:
from otter import Otter, Transient
from copy import deepcopy
from collections import Counter
import awkward as ak
import warnings
import numpy as np
import re
from astropy.coordinates import SkyCoord
import json

# generate some test cases
# t1 is the first record returned by the query; t2 starts as a deep copy of it
# and is then modified so that t1 and t2 differ in the sections handled by the
# merge code.
db = Otter()
t1 = db.query(names='2022xkq')[0]
t2 = deepcopy(t1)
print(t1.keys())

# change t2 for testing
t2['name'] = {'default_name':'2022xkq',
             'alias': [{'value':'foo', 'reference': 'x'},
                      {'value': '2022xkq', 'reference': 'x'}]}
t2['reference_alias'].append({'name': 'x',
   'human_readable_name': 'test, name (year)'}) # add an extra value
del t2['photometry']  # replaced by hand-written photometry at the end of this cell
t2['for_test'] = {'test': 'bar'} # add a test key that isn't in t1
t2['coordinate/equitorial'][0]['reference'] = 'noah'
t2['filter_alias'].append({'filter_key': 'foo'})
t2['schema_version/value'] = 100

# NOTE: the original literal repeated the 'date_discovery' key, so Python kept
# only the last one and silently discarded the 56983 entry. Both discovery
# dates now live in a single list so that both are actually part of t2.
t2['epoch'] = {
    'date_peak': [{'value': 56983,
                   'date_format': 'MJD',
                   'reference': ['2016ApJ...819L..25A',
                                 '2016Sci...351...62V',
                                 '2016ApJ...832L..10R',
                                 '2018MNRAS.475.4011B'],
                   'computed': False}],

    'date_discovery': [{'value': 56983,
                        'date_format': 'MJD',
                        'reference': ['2016ApJ...819L..25A',
                                      '2016Sci...351...62V',
                                      '2016ApJ...832L..10R',
                                      '2018MNRAS.475.4011B'],
                        'computed': False},
                       {'value': 56984,
                        'date_format': 'MJD',
                        'reference': ['2016ApJ...819L..25A',
                                      '2016Sci...351...62V',
                                      '2016ApJ...832L..10R',
                                      '2018MNRAS.475.4011B'],
                        'computed': False}],
}

t2['distance'] = {
    "redshift": [
      {
        "value": "0.0207",
        "reference": [
          "Noah"
        ],
        "computed": False
      },
        {
        "value": "0.02",
        "reference": [
          "Noah"
        ],
        "computed": False
      }
    ],

    "dispersion_measure": [
      {
        "value": "0.0206",
        "reference": [
          "Noah"
        ],
        "computed": False
      }
    ]
  }

t2['classification'] = [{'object_class':'SN',
                        'confidence': 1,
                         'reference': 'Noah'
                        }]

# two photometry groups: one from a made-up telescope, one from CAHA
t2['photometry'] = {'phot_0': {'telescope': 'Noahs Telescope',
                               'reference': 'Noah',
                               'flux': [{'filter': 'z',
                                 'telescope': 'Noahs Telescope',
                                 'upperlimit': True,
                                 'date': 59864.4914116667,
                                 'date_format': 'MJD',
                                 'raw': 20.01,
                                 'raw_units': 'mag(AB)',
                                 'filter_key': 'NoahsTelescope.z',
                                 'obs_type': 'uvoir'}]},
                    'phot_1': {'telescope': 'CAHA',
                               'reference': 'Noah',
                               'flux': [{'filter': 'H',
                                 'telescope': 'CAHA',
                                 'upperlimit': False,
                                 'date': 59898.12077,
                                 'date_format': 'MJD',
                                 'raw': 14.87048,
                                 'raw_err': 0.0187,
                                 'raw_units': 'mag(AB)',
                                 'filter_key': 'CAHA.H',
                                 'obs_type': 'uvoir'}]}

                               }

dict_keys(['_key', '_id', '_rev', 'schema_version', 'name', 'coordinate', 'reference_alias', 'photometry', 'filter_alias'])


In [2]:
# Combine t2 with t1 using `+`, convert the result to a plain dict and
# pretty-print it as JSON with a 4-space indent.
merged_json = json.dumps(dict(t2 + t1), indent=4)
print(merged_json)

{
    "coordinate": {
        "equitorial": [
            {
                "ra": 76.348792,
                "dec": -11.88225,
                "ra_units": "deg",
                "dec_units": "deg",
                "computed": false,
                "uuid": "1fa868ef-6958-45f9-80a9-fa947352bf61",
                "default": true,
                "reference": [
                    "2023arXiv230910054P",
                    "noah"
                ]
            }
        ],
        "galactic": [
            {
                "l": 211.89010004494574,
                "b": -28.85761791359529,
                "l_units": "deg",
                "b_units": "deg",
                "reference": [
                    "1fa868ef-6958-45f9-80a9-fa947352bf61"
                ],
                "computed": true
            }
        ]
    },
    "name": {
        "default_name": "2022xkq",
        "alias": [
            {
                "value": "foo",
                "reference": [
                    "x

# Testing upload

In [6]:
# Credentials are taken from the environment (OTTER_USERNAME / OTTER_PASSWORD),
# falling back to an interactive prompt, so that no username or password is
# stored in the notebook itself.
import getpass

otter_username = os.environ.get('OTTER_USERNAME') or input('OTTER username: ')
otter_password = os.environ.get('OTTER_PASSWORD') or getpass.getpass('OTTER password: ')

db = Otter(username=otter_username, password=otter_password)
db.upload(t2)

Found this object in the database already, merging the data...


# EVERYTHING BELOW THIS LINE WAS JUST FOR DEVELOPMENT

In [7]:
def _merge_names(t1, t2, out):
    '''
    Private method to merge the name data in t1 and t2 and put it in out
    '''
    key = 'name'
    out[key] = {}
        
    # first deal with the default_name key
    # we are gonna need to use some regex magic to choose a preferred default_name
    if t1[key]['default_name'] == t2[key]['default_name']:
        out[key]['default_name'] = t1[key]['default_name']
    else:
        # we need to decide which default_name is better
        # it should be the one that matches the TNS style
        # let's use regex
        n1 = t1[key]['default_name']
        n2 = t2[key]['default_name']

        # write some discriminating regex expressions
        exp1 = '^[0-9]' # starts with a number, this is preferred because it is TNS style
        exp2 = '.$' # ends with any character, this is also preferred because it is TNS style
        exp3 = '^[0-9]{3}' # checks if first four characters are a number, like a year :), this is pretty strict though
        exp4 = '^AT' # checks if it starts with AT like TNS names
        exps = [exp1, exp2, exp3, exp4]

        # score each default_name based on this
        score1 = 0
        score2 = 0
        for e in exps:
            re1 = re.findall(e, n1)
            re2 = re.findall(e, n2)
            if re1:
                score1 += 1
            if re2:
                score2 += 1

        # assign a default_name based on the score
        if score1 > score2: 
            out[key]['default_name'] = t1[key]['default_name']
        elif score2 > score1:
            out[key]['default_name'] = t2[key]['default_name']
        else:
            warnings.warn('Names have the same score! Just using the existing default_name')
            out[key]['default_name'] = t1[key]['default_name']

    # now deal with aliases
    # create a reference mapping for each
    t1map = {}
    for val in t1[key]['alias']:
        ref = val['reference']
        if isinstance(ref, str):
            t1map[val['value']] = [ref]
        else:
            t1map[val['value']] = [ref]

    t2map = {}
    for val in t2[key]['alias']:
        ref = val['reference']
        if isinstance(ref, str):
            t2map[val['value']] = [ref]
        else:
            t2map[val['value']] = [ref]

    # figure out which ones we need to be careful with references in        
    inboth = list(t1map.keys() & t2map.keys()) # in both so we'll have to merge the reference key
    int1 = list(t1map.keys() - t2map.keys()) # only in t1
    int2 = list(t2map.keys() - t1map.keys()) # only in t2

    # add ones that are not in both first, these are easy
    L1 = [{'value':k, 'reference':t1map[k]} for k in int1]
    L2 = [{'value':k, 'reference':t2map[k]} for k in int2]
    Lboth = [{'value':k, 'reference':t1map[k]+t2map[k]} for k in inboth]
    out[key]['alias'] =  L1+L2+Lboth

In [8]:
def _merge_coords(t1, t2, out):
    '''
    Merge the coordinates subdictionaries for t1 and t2 and put it in out
    '''
    key = 'coordinate'
    out[key] = {}
    
    # first deal with equitorial and then galactic
    subkeys = ['equitorial', 'galactic']
    cnames = [('ra', 'dec', 'icrs'), ('l', 'b', 'galactic')]
    for subkey, c in zip(subkeys, cnames):
        
        c1, c2, frame = c
        c1_units, c2_units = f'{c1}_units', f'{c2}_units'
        
        if subkey in t1[key] and subkey in t2[key]:
            out[key][subkey] = t1[key][subkey]
            curr_coords = np.array([SkyCoord(val[c1], val[c2], unit=(val[c1_units], val[c2_units]), frame=frame) for val in t1[key][subkey]])
            for coord in t2[key][subkey]:
                coorddict = {c1:coord[c1],
                             c2:coord[c2],
                             'unit':(coord[c1_units], coord[c2_units]),
                             'frame': frame
                            }
                skycoord = SkyCoord(**coorddict)
                if skycoord not in curr_coords:
                    out[key][subkey].append(coord)
                else:
                    idx = np.where(skycoord == curr_coords)[0][0] # we only need the first value
                    ref = out[key][subkey][idx]['reference']
                    if not isinstance(ref, list):
                        out[key][subkey][idx]['reference'] = [ref]
                    
                    if not isinstance(coord['reference'], list):
                        coord['reference'] = [coord['reference']]
                    
                    newdata = list(np.unique(out[key][subkey][idx]['reference']+coord['reference']))
                    out[key][subkey][idx]['reference'] = newdata

        elif subkey in t1[key]:
            out[key][subkey] = t1[key][subkey]

        elif subkey in t2[key]:
            out[key][subkey] = t2[key][subkey]

In [9]:
def _merge_filter_alias(t1, t2, out):
    '''
    Combine the filter alias lists across the transient objects
    '''
    
    key = 'filter_alias'
    
    out[key] = deepcopy(t1[key])
    keys1 = {filt['filter_key'] for filt in t1[key]}
    for filt in t2[key]:
        if filt['filter_key'] not in keys1:
            out[key].append(filt)

In [10]:
def _merge_schema_version(t1, t2, out):
    '''
    Just keep whichever schema version is greater
    '''
    key = 'schema_version/value'
    if int(t1[key]) > int(t2[key]):
        out['schema_version'] = deepcopy(t1['schema_version'])
    else:
        out['schema_version'] = deepcopy(t2['schema_version'])

In [11]:
def _merge_photometry(t1, t2, out):
    '''
    Combine photometry sources
    '''
    
    key = 'photometry'
    
    out[key] = deepcopy(t1[key])
    
    idx = int(list(out[key].keys())[-1][-1])+1
    telescopes = np.array([phot['telescope'] for phot in out[key].values() if 'telescope' in phot])
    refs = np.array([phot['reference'] for phot in out[key].values() if 'reference' in phot])
    for phot in t2[key].values():

        if len(telescopes) > 0 and 'telescope' in phot and phot['telescope'] in telescopes:
            i = np.where(phot['telescope'] == telescopes)[0][0]
            toappend = out[key][f'phot_{i}']
        elif len(refs) > 0 and 'reference' in phot and phot['reference'] in refs:
            i = np.where(phot['reference'] == refs)[0][0]
            toappend = out[key][f'phot_{i}']
        else:
            # nothing with this telescope has been added
            out[key][f'phot_{idx}'] = phot
            idx += 1
            continue
            
        # if the code has gotten here we need to append to an existing list of photometry
        for point in phot['flux']:
            if point not in toappend['flux']:
                toappend['flux'].append(point)
            else:
                if not isinstance(toappend['reference'], list):
                    toappend['reference'] = [toappend['reference']]
                    
                if not isinstance(phot['reference'], list):
                    phot['reference'] = [phot['reference']]
                
                if phot['reference'] not in toappend['reference']:    
                    newdata = list(np.unique(toappend['reference']+phot['reference']))
                    toappend['reference'] = newdata

In [12]:
def _merge_spectra(t1, t2, out):
    '''
    Combine spectra sources
    '''
    pass

In [13]:
def _merge_class(t1, t2, out):
    '''
    Combine the classification attribute
    '''
    key = 'classification'
    out[key] = deepcopy(t1[key])
    classes = np.array([item['object_class'] for item in out[key]])
    for item in t2[key]:
        if item['object_class'] in classes:
            i = np.where(item['object_class'] == classes)[0][0]
            if int(item['confidence']) > int(out[key][i]['confidence']):
                out[key][i]['confidence'] = item['confidence'] # we are now more confident
            
            if not isinstance(out[key][i]['reference']):
                out[key][i]['reference'] = [out[key][i]['reference']]
            
            if not isinstance(item['reference']):
                item['reference'] = [item['reference']]
            
            newdata = list(np.unique(out[key][i]['reference']+item['reference']))
            out[key][i]['reference'] = newdata
            
        else:
            out[key].append(item)
            
    # now that we have all of them we need to figure out which one is the default
    maxconf = max(out[key], key=lambda d: d['confidence'])  
    for item in out[key]:
        if item == maxconf:
            item['default'] = True
        else:
            item['default'] = False

In [14]:
def _merge_epoch(t1, t2, out):
    '''
    Combine epoch data across two transients and write it to "out"
    '''
    key = 'epoch'
    subkeys = ['date_explosion', 'date_peak', 'date_discovery']
    
    out[key] = {}
    
    for subkey in subkeys:
        if subkey in t1[key] and subkey in t2[key]:
            out[key][subkey] = t1[key][subkey]
            values = np.array([val['value'] for val in out[key][subkey]])
            for item in t2[key][subkey]:
                if item['value'] in values:
                    i = np.where(item['value'] == values)[0][0]
                    if not isinstance(out[key][subkey][i]['reference'], list):
                        out[key][subkey][i]['reference'] = [out[key][subkey][i]['reference']]
                    if not isinstance(item['reference'], list):
                        item['reference'] = [item['reference']]
                    
                    out[key][subkey][i]['reference'] = list(np.unique(out[key][subkey][i]['reference']+item['reference']))
                else:
                    out[key][subkey].append(item)
                        
        elif subkey in t1[key]:
            out[key][subkey] = t1[key][subkey]
        
        elif subkey in t2[key]:
            out[key][subkey] = t2[key][subkey]

In [15]:
def _merge_distance(t1, t2, out):
    '''
    Combine distance information for these two transients
    '''
    key = 'distance'
    subkeys = ['redshift', 'luminosity_distance', 'dispersion_measure']
    out[key] = {}
    for subkey in subkeys:
        if subkey in t1[key] and subkey in t2[key]:
            out[key][subkey] = t1[key][subkey]
            values = np.array([val['value'] for val in out[key][subkey]])
            for item in t2[key][subkey]:
                if item['value'] in values:
                    i = np.where(item['value'] == values)[0][0]
                    if not isinstance(out[key][subkey][i]['reference'], list):
                        out[key][subkey][i]['reference'] = [out[key][subkey][i]['reference']]
                    if not isinstance(item['reference'], list):
                        item['reference'] = [item['reference']]
                    
                    out[key][subkey][i]['reference'] = list(np.unique(out[key][subkey][i]['reference']+item['reference']))
                else:
                    out[key][subkey].append(item)
        
        elif subkey in t1[key]:
            out[key][subkey] = t1[key][subkey]
        
        elif subkey in t2[key]:
            out[key][subkey] = t2[key][subkey]

In [16]:
# THIS IS WHERE THE ACTUAL SOFTWARE WILL START!!!!
# define an output dictionary
out = {}

# split the keys into three groups; sorting makes the processing order
# (and therefore the key order of `out`) deterministic
merge_keys = sorted(t1.keys() & t2.keys()) # in both t1 and t2 so we need to merge these keys
only_in_t1 = sorted(t1.keys() - t2.keys()) # only in t1
only_in_t2 = sorted(t2.keys() - t1.keys()) # only in t2

# the special keys that we are expecting, each with its merge routine
merge_handlers = {
    'name': _merge_names,
    'coordinate': _merge_coords,
    'epoch': _merge_epoch,
    'distance': _merge_distance,
    'filter_alias': _merge_filter_alias,
    'schema_version': _merge_schema_version,
    'photometry': _merge_photometry,
    'spectra': _merge_spectra,
    'classification': _merge_class,
}

# now let's handle the merge keys
for key in merge_keys:

    # reference_alias is special
    # we ALWAYS should combine these two
    if key == 'reference_alias':
        # copy first so that appending to out never modifies t1
        out[key] = deepcopy(t1[key])
        # only add t2 values if they aren't already in it
        bibcodes = {ref['name'] for ref in out[key]}
        for val in t2[key]:
            if val['name'] not in bibcodes:
                out[key].append(deepcopy(val))
                bibcodes.add(val['name'])
        continue

    # we can skip the merge process and just take the value from t1 if the
    # two are equal; references need no extra handling here because equality
    # implies the references are equal as well
    if t1[key] == t2[key]:
        out[key] = deepcopy(t1[key])
        continue

    handler = merge_handlers.get(key)
    if handler is not None:
        handler(t1, t2, out)
    else:
        # this is an unexpected key!
        # Throw a warning and only keep the old stuff
        warnings.warn(f'{key} was not expected! Only keeping the old information!')
        out[key] = deepcopy(t1[key])

# and now combining out with the stuff only in t1 and t2
# (relies on t1 / t2 accepting a list of keys as an index)
out = out | dict(t1[only_in_t1]) | dict(t2[only_in_t2])

out

{'schema_version': {'value': 100, 'comment': 'Original Dataset'},
 'name': {'default_name': '2022xkq',
  'alias': [{'value': 'ASASSN-14li', 'reference': ['ASASSN']},
   {'value': 'foo', 'reference': ['x']},
   {'value': '2022xkq', 'reference': ['x']}]},
 'epoch': {'date_peak': [{'value': 56983,
    'date_format': 'MJD',
    'reference': ['2016ApJ...819L..25A',
     '2016Sci...351...62V',
     '2016ApJ...832L..10R',
     '2018MNRAS.475.4011B'],
    'computed': False}],
  'date_discovery': [{'value': 56983,
    'date_format': 'MJD',
    'reference': ['2016ApJ...819L..25A',
     '2016Sci...351...62V',
     '2016ApJ...832L..10R',
     '2018MNRAS.475.4011B'],
    'computed': False},
   {'value': 56984,
    'date_format': 'MJD',
    'reference': ['2016ApJ...819L..25A',
     '2016Sci...351...62V',
     '2016ApJ...832L..10R',
     '2018MNRAS.475.4011B'],
    'computed': False}]},
 '_rev': '_g7YzVH6---',
 '_key': '2824631',
 '_id': 'tdes/2824631',
 'distance': {'redshift': [{'value': '0.0206',
