In [1]:
import os
from getpass import getpass

import numpy as np
import pandas as pd
from penquins import Kowalski
from scope.utils import write_parquet

# Authenticate kowalski, gloria (first time)

In [2]:
# Credentials are taken from environment variables when they are set and are
# prompted for otherwise, so they never have to be typed into (and saved
# with) the notebook. Passwords are read with getpass so they are not echoed.
kowalski = Kowalski(
    username=os.environ.get('KOWALSKI_USERNAME') or input('Kowalski username: '),
    password=os.environ.get('KOWALSKI_PASSWORD') or getpass('Kowalski password: '),
    host='kowalski.caltech.edu',
)

gloria = Kowalski(
    username=os.environ.get('GLORIA_USERNAME') or input('Gloria username: '),
    password=os.environ.get('GLORIA_PASSWORD') or getpass('Gloria password: '),
    host='gloria.caltech.edu',
)

In [3]:
# Generate tokens (no need to do this every time)
# Each client authenticates against its own host; the returned values are
# stored so they can be placed in the `tokens` mapping further down.
k_token = kowalski.authenticate()
g_token = gloria.authenticate()

# Print these tokens to add them to your config.yaml file

In [4]:
# Fill in tokens here after first authentication.
# Maps each instance name to the token used to connect to it.
tokens = dict(kowalski=k_token, gloria=g_token)

# Create kowalski, gloria instances using token

In [5]:
hosts = ['kowalski', 'gloria']

# One connection spec per instance, keyed by the instance name; every instance
# is reached over https on port 443 at <name>.caltech.edu with its own token.
instances = dict(
    (
        name,
        dict(
            protocol='https',
            port=443,
            host=f'{name}.caltech.edu',
            token=tokens[name],
        ),
    )
    for name in hosts
)

# A single client object that holds all configured instances
kowalski_instances = Kowalski(instances=instances)

In [6]:
# Test connection: ping every configured instance by name and report the result
for host in hosts:
    reachable = kowalski_instances.ping(name=host)
    print(host, reachable)

kowalski True
gloria True


# Get Kowalski, Gloria catalog names

In [7]:
# "info" query carrying the "catalog_names" command
catalog_names_command = {"command": "catalog_names"}
query = {"query_type": "info", "query": catalog_names_command}

# For queries without a catalog, specify the instance name.
# Each response is keyed by instance name; the result sits under "data".
response_k = kowalski_instances.query(query=query, name='kowalski')
kowalski_payload = response_k.get("kowalski")
data_k = kowalski_payload.get("data")

response_g = kowalski_instances.query(query=query, name='gloria')
gloria_payload = response_g.get("gloria")
data_g = gloria_payload.get("data")

print(response_g)

{'gloria': {'status': 'success', 'message': 'Successfully executed query', 'data': ['ZTF_sources_20210401', 'ZTF_source_features_DR5_20_fields', 'ZTF_source_features_DR5', 'ZTF_source_features_DR4_20_fields', 'ZTF_source_features_DR3', 'ZTF_source_features_20201201', 'ZTF_source_classifications_DR5', 'ZTF_public_sources_20210401', 'ZTF_ops', 'ZTF_exposures_86625709', 'ZTF_exposures_22993824', 'ZTF_exposures_20210401', 'ZTF_alerts', 'WNTR_alerts', 'VLASS_DR1', 'TURBO_alerts', 'TNS', 'PTF_sources', 'PTF_exposures', 'PS1_DR1', 'PGIR_alerts', 'IGAPS_DR2', 'Gaia_EDR3', 'GALEX', 'AllWISE']}}


In [8]:
# Display the "data" payloads of the two catalog-name queries
# (kowalski first, then gloria).
data_k, data_g

(['sdss_ellipticals',
  'mzls_ellipticals',
  'milliquas_v6',
  'legacysurveys_photoz_DR7',
  'legacysurveys_photoz_DR6',
  'galaxy_redshifts_20200522',
  'cfht_w3_photozs',
  'ZUDS_alerts_aux',
  'ZUDS_alerts',
  'ZTF_sources_20200401',
  'ZTF_source_features_20201201',
  'ZTF_source_features_20200401_20_fields',
  'ZTF_source_features_20191101_20_fields',
  'ZTF_source_features_20191101',
  'ZTF_source_classifications_dr2_20_fields',
  'ZTF_source_classifications_dr2',
  'ZTF_source_classifications_20191101',
  'ZTF_ops',
  'ZTF_exposures_20201201',
  'ZTF_exposures_20200401',
  'ZTF_exposures_20191101',
  'ZTF_alerts_aux',
  'ZTF_alerts',
  'VLASS_DR1',
  'TNS',
  'TGSS_ADR1',
  'RFC_2019d',
  'RFC_2019a',
  'PS1_STRM',
  'PS1_DR1',
  'PGIR_alerts_aux',
  'PGIR_alerts',
  'NVSS_41',
  'NED_BetaV3',
  'LAMOST_DR5_v3',
  'LAMOST_DR4_v2',
  'Known_lenses_20180901',
  'IPHAS_DR2',
  'IGAPS_DR2',
  'Gaia_EDR3',
  'Gaia_DR2_light_curves',
  'Gaia_DR2_WD',
  'Gaia_DR2_2MASS_best_neighbour'

# Query Kowalski/Gloria for ZTF high-confidence IDs

In [None]:
# For queries with a catalog, no need to specify instance name
#
# Ask for the _id of every source whose vnv score exceeds 0.9 for either
# classifier. Even with the projection applied this can exceed max_time_ms
# on a catalog of this size.
qry = kowalski_instances.query({
    "query_type": "find",  # find documents
    "query": {
        "catalog": "ZTF_source_classifications_DR5",  # catalog name
        "filter": {
            # filter by vnv_xgb > 0.9 or vnv_dnn > 0.9
            "$or": [
                {'vnv_xgb': {'$gt': 0.9}},
                {'vnv_dnn': {'$gt': 0.9}},
            ]
        },
        # The projection belongs inside the "query" document, next to the
        # filter; it was previously placed at the top level of the request.
        "projection": {"_id": 1},  # only return the ids
    },
    "kwargs": {
        "max_time_ms": 10000  # adjustable max time
    },
})
# The response is keyed by instance name, e.g.:
# data = qry.get('gloria').get('data')

#### Query times out - we have to query in batches 

# Query Kowalski/Gloria for ZTF DR5 classifications

## Get estimated document count

In [9]:
# Ask for the estimated number of documents in the DR5 classifications catalog
estimated_count_request = {
    "query_type": "estimated_document_count",
    "query": {"catalog": "ZTF_source_classifications_DR5"},
}
qry = kowalski_instances.query(estimated_count_request)

# The response is keyed by instance name; read the count from gloria's entry
gloria_payload = qry.get('gloria')
estimated_nlightcurves = gloria_payload.get('data')
estimated_nlightcurves

45038327

In [10]:
# Same estimated-count request, but read the answer from whichever
# instance(s) appear in the response instead of naming one explicitly.
estimated_count_request = {
    "query_type": "estimated_document_count",
    "query": {"catalog": "ZTF_source_classifications_DR5"},
}
qry = kowalski_instances.query(estimated_count_request)

for instance_name, payload in qry.items():
    result = payload.get('data')
    print(instance_name, result)

# Show the value from the last instance visited
result

gloria 45038327


45038327

## Get collection info

In [None]:
# "info" query with the "catalog_info" command for the DR5 classifications catalog
catalog_info_request = {
    "query_type": "info",
    "query": {
        "catalog": "ZTF_source_classifications_DR5",
        "command": "catalog_info",
    },
}
qry = kowalski_instances.query(catalog_info_request)

In [None]:
# Display the raw response of the most recent query
qry

## Count number of high-vnv-confidence documents

In [11]:
# Count the sources whose vnv score exceeds 0.95 for either classifier
# (vnv_xgb or vnv_dnn).
high_vnv_filter = {
    "$or": [{f'vnv_{algorithm}': {'$gt': 0.95}} for algorithm in ('xgb', 'dnn')]
}
count_request = {
    "query_type": "count_documents",
    "query": {
        "catalog": "ZTF_source_classifications_DR5",
        "filter": high_vnv_filter,
    },
}
qry = kowalski_instances.query(count_request)

# Read the count from gloria's entry in the response
gloria_payload = qry.get('gloria')
nlightcurves = gloria_payload.get('data')
nlightcurves

325451

## Note ability to loop over all instances without specifically naming them

In [12]:
# Same count as before (vnv_xgb > 0.95 or vnv_dnn > 0.95), accumulated over
# every instance present in the response.
high_vnv_filter = {
    "$or": [{f'vnv_{algorithm}': {'$gt': 0.95}} for algorithm in ('xgb', 'dnn')]
}
count_request = {
    "query_type": "count_documents",
    "query": {
        "catalog": "ZTF_source_classifications_DR5",
        "filter": high_vnv_filter,
    },
}
qry = kowalski_instances.query(count_request)

# If ZTF_source_classifications_DR5 were on multiple instances, queries could be split between them
nlightcurves = 0

# Loop over instances, updating quantity of interest
for instance_name, payload in qry.items():
    result = payload.get('data')
    nlightcurves += result
    print(instance_name, result)

nlightcurves

gloria 325451


325451

# Get scores using batch query

In [13]:
# Page through every source matching the high-confidence vnv filter
# (batch_size documents per request) and collect the classification scores
# into a single DataFrame, df_scores.
batch_size = 10000
nb = int(np.ceil(nlightcurves / batch_size))  # number of batches to request
df_all = []

# Same filter as the count query that produced nlightcurves
# (vnv_xgb > 0.95 or vnv_dnn > 0.95); built once and reused for every batch.
vnv_filter = {
    "$or": [
        {'vnv_xgb': {'$gt': 0.95}},
        {'vnv_dnn': {'$gt': 0.95}},
    ]
}

# NOTE(review): paging with skip/limit assumes the server returns documents in
# the same order for every request — confirm, or add an explicit sort.
print(f'{nb}:')
for n in range(nb):
    print(n)
    qry = kowalski_instances.query({
        "query_type": "find",
        "query": {
            "catalog": "ZTF_source_classifications_DR5",
            "filter": vnv_filter,
        },
        "kwargs": {
            "skip": int(n * batch_size),
            "limit": int(batch_size),
            "max_time_ms": 10000,
        },
    })
    data = qry.get('gloria').get('data')
    if data is None:
        # Stop on a batch that returned nothing rather than continue with a
        # gap in the results; show the raw response to help diagnose it.
        print(qry)
        raise ValueError(
            f"No data returned for batch {n} (skip={n * batch_size}, limit={batch_size})"
        )
    df_all.append(pd.DataFrame.from_records(data))

# pd.concat raises on an empty list, so fall back to an empty frame when
# there was nothing to fetch (nlightcurves == 0).
df_scores = pd.concat(df_all).reset_index(drop=True) if df_all else pd.DataFrame()
    

33:
0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32


In [14]:
# examine last entry of data
# (`data` still holds the records of the final batch fetched by the loop above)
data[-1]

{'_id': 10717372012093,
 'agn_xgb': 0.278889924287796,
 'e_xgb': 0.0066316258162260056,
 'bis_xgb': 0.0006812149658799171,
 'blyr_xgb': 2.987026164191775e-05,
 'ceph_xgb': 0.0033750066068023443,
 'dscu_xgb': 0.002485285745933652,
 'ea_xgb': 0.0013688580365851521,
 'eb_xgb': 3.5585457226261497e-05,
 'el_xgb': 3.955611191486241e-06,
 'ew_xgb': 0.0019701733253896236,
 'fla_xgb': 0.013243076391518116,
 'i_xgb': 0.02346765622496605,
 'lpv_xgb': 1.5801808331161737e-05,
 'msms_xgb': 1.0803507393575273e-05,
 'osarg_xgb': 2.5644334527896717e-05,
 'pnp_xgb': 0.42258286476135254,
 'puls_xgb': 0.4281094968318939,
 'rrlyr_xgb': 0.013531028293073177,
 'rrlyrab_xgb': 0.007637788541615009,
 'rrlyrc_xgb': 0.001014095963910222,
 'rrlyrd_xgb': 0.00027841716655530035,
 'rscvn_xgb': 0.003652181476354599,
 'saw_xgb': 0.08130893111228943,
 'sin_xgb': 0.003036600537598133,
 'srv_xgb': 0.13751941919326782,
 'vnv_xgb': 0.6603909134864807,
 'wuma_xgb': 0.0022214772179722786,
 'yso_xgb': 0.29807427525520325,
 'ag

In [15]:
# Display the concatenated classification scores (one row per source _id)
df_scores

Unnamed: 0,_id,agn_xgb,e_xgb,bis_xgb,blyr_xgb,ceph_xgb,dscu_xgb,ea_xgb,eb_xgb,el_xgb,...,ew_dnn,fla_dnn,i_dnn,pnp_dnn,puls_dnn,rrlyr_dnn,rscvn_dnn,srv_dnn,vnv_dnn,yso_dnn
0,10682173007525,0.002441,0.989391,0.995640,0.002024,0.000069,0.000601,0.001321,0.009229,0.000003,...,0.602747,0.076606,0.052485,0.929672,0.024490,0.071283,0.255771,0.004792,0.995866,0.016794
1,10563032034383,0.002441,0.863971,0.688059,0.003376,0.000466,0.000273,0.000646,0.001388,0.000008,...,0.600870,0.001860,0.040677,0.951871,0.465572,0.205893,0.391641,0.004092,0.994549,0.002346
2,10852041010293,0.002441,0.993425,0.997304,0.000126,0.000146,0.000541,0.000481,0.000244,0.000017,...,0.946621,0.004293,0.036726,0.980536,0.312045,0.113829,0.135175,0.001828,0.994974,0.022615
3,10487402158260,0.002441,0.998359,0.999413,0.000714,0.000032,0.000405,0.000535,0.000815,0.000015,...,0.927954,0.001444,0.030237,0.967157,0.072791,0.180559,0.070040,0.000192,0.996949,0.000157
4,10718391003193,0.002441,0.999245,0.999622,0.000678,0.000015,0.000275,0.000450,0.000521,0.000006,...,0.954129,0.002294,0.017033,0.983989,0.015397,0.036002,0.047459,0.000336,0.999765,0.001013
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
325446,10488201039668,0.003449,0.006585,0.008458,0.000434,0.000937,0.001166,0.003357,0.001712,0.000076,...,0.131830,0.258965,0.639029,0.270851,0.253857,0.051995,0.334634,0.889911,0.950017,0.946182
325447,10488412001443,0.002778,0.094560,0.052289,0.000126,0.000068,0.000701,0.000508,0.000181,0.000020,...,0.335419,0.131294,0.180966,0.198831,0.109239,0.268254,0.004740,0.005331,0.950015,0.204993
325448,10488432106226,0.002601,0.007265,0.018382,0.000013,0.000031,0.001085,0.000464,0.000026,0.000030,...,0.049861,0.084772,0.088852,0.139352,0.000260,0.020050,0.000313,0.002851,0.950010,0.002088
325449,10488421028424,0.002996,0.527637,0.418548,0.003332,0.002131,0.000486,0.005854,0.003614,0.005199,...,0.399582,0.266747,0.602369,0.630165,0.179062,0.063756,0.200599,0.907293,0.950005,0.948648


In [16]:
# List the score columns: '_id' plus one <class>_xgb / <class>_dnn column per classifier output
df_scores.columns

Index(['_id', 'agn_xgb', 'e_xgb', 'bis_xgb', 'blyr_xgb', 'ceph_xgb',
       'dscu_xgb', 'ea_xgb', 'eb_xgb', 'el_xgb', 'ew_xgb', 'fla_xgb', 'i_xgb',
       'lpv_xgb', 'msms_xgb', 'osarg_xgb', 'pnp_xgb', 'puls_xgb', 'rrlyr_xgb',
       'rrlyrab_xgb', 'rrlyrc_xgb', 'rrlyrd_xgb', 'rscvn_xgb', 'saw_xgb',
       'sin_xgb', 'srv_xgb', 'vnv_xgb', 'wuma_xgb', 'yso_xgb', 'agn_dnn',
       'bis_dnn', 'blyr_dnn', 'ceph_dnn', 'dscu_dnn', 'e_dnn', 'ea_dnn',
       'eb_dnn', 'ew_dnn', 'fla_dnn', 'i_dnn', 'pnp_dnn', 'puls_dnn',
       'rrlyr_dnn', 'rscvn_dnn', 'srv_dnn', 'vnv_dnn', 'yso_dnn'],
      dtype='object')

In [None]:
# Save the classification scores to CSV without the DataFrame index.
# NOTE(review): the file name says "gt0.9", but the batch query above selects
# scores with a 0.95 threshold — confirm which one is intended.
scores_csv_path = 'ZTF_DR5_vnv_gt0.9score_classifications.csv'
df_scores.to_csv(scores_csv_path, index=False)

# Batch query to get features

In [17]:
# Settings for the features query: which catalog to read and how many source
# ids to request at a time.
features_catalog = 'ZTF_source_features_DR5'
limit = 1000

# Plain Python list of the _id values of every source in df_scores
source_ids = df_scores['_id'].to_numpy().tolist()

In [18]:
# Get features and dmdt from Gloria
id = 0
df_collection = []
dmdt_collection = []

while 1:
    query = {
        "query_type": "find",
        "query": {
            "catalog": features_catalog,
            "filter": {"_id": {"$in": source_ids[id * limit : (id + 1) * limit]}},
        },
    }
    response = kowalski_instances.query(query=query)
    source_data = response.get('gloria').get("data")

    if source_data is None:
        print(response)
        raise ValueError(f"No data found for source ids {source_ids}")

    df_temp = pd.DataFrame.from_records(source_data)
    df_collection += [df_temp]
    try:
        dmdt_temp = np.expand_dims(
            np.array([d for d in df_temp['dmdt'].values]), axis=-1
        )
    except Exception as e:
        print("Error", e)
        print(df_temp)
    dmdt_collection += [dmdt_temp]

    if ((id + 1) * limit) > len(source_ids):
        break
    id += 1
    if (id * limit) % 5000 == 0:
        print(id * limit, "done")

df_features = pd.concat(df_collection, axis=0)
dmdt = np.vstack(dmdt_collection)

5000 done
10000 done
15000 done
20000 done
25000 done
30000 done
35000 done
40000 done
45000 done
50000 done
55000 done
60000 done
65000 done
70000 done
75000 done
80000 done
85000 done
90000 done
95000 done
100000 done
105000 done
110000 done
115000 done
120000 done
125000 done
130000 done
135000 done
140000 done
145000 done
150000 done
155000 done
160000 done
165000 done
170000 done
175000 done
180000 done
185000 done
190000 done
195000 done
200000 done
205000 done
210000 done
215000 done
220000 done
225000 done
230000 done
235000 done
240000 done
245000 done
250000 done
255000 done
260000 done
265000 done
270000 done
275000 done
280000 done
285000 done
290000 done
295000 done
300000 done
305000 done
310000 done
315000 done
320000 done
325000 done


In [19]:
# Display the concatenated features table
df_features 

Unnamed: 0,_id,AllWISE___id,AllWISE__ph_qual,AllWISE__w1mpro,AllWISE__w1sigmpro,AllWISE__w2mpro,AllWISE__w2sigmpro,AllWISE__w3mpro,AllWISE__w3sigmpro,AllWISE__w4mpro,...,roms,significance,skew,smallkurt,stetson_j,stetson_k,sw,welch_i,wmean,wstd
0,10296051000973,1.600197e+17,AABU,11.187,0.023,11.185,0.022,11.062,0.135,9.023,...,4.625000,97.765030,114.424396,2236.506294,38.249536,0.868645,0.906189,316.113600,13.000808,0.082622
1,10296052001893,1.600197e+17,AABU,11.187,0.023,11.185,0.022,11.062,0.135,9.023,...,6.125746,137.257889,69.310499,4469.140562,-79.369418,0.875632,0.705628,-509.791831,12.635400,0.092647
2,10296231003763,1.740182e+17,AABU,10.845,0.022,10.858,0.021,10.735,0.126,9.017,...,4.104021,75.630920,-7.845135,948.906288,15.611241,0.866148,0.950899,109.426372,12.289038,0.102297
3,10296312006200,1.420182e+17,AABU,13.256,0.025,13.275,0.030,11.970,0.273,8.691,...,5.397115,114.061401,70.303084,2579.232084,43.776211,0.884509,0.923027,330.052081,14.695383,0.078901
4,10296391001693,1.720167e+17,AACU,11.948,0.024,11.976,0.023,12.273,0.380,9.056,...,6.243889,82.965607,261.402524,7234.012488,-7.467630,0.851542,0.748122,-112.468225,13.745763,0.121387
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
446,10853271004077,3.520168e+18,AAAA,7.727,0.024,7.790,0.020,7.446,0.017,7.251,...,1.215031,9.049563,0.522118,5.682516,52.631995,0.844953,0.968652,92.412234,19.650478,0.136985
447,10853362051810,,,,,,,,,,...,2.815747,28.617287,7.106633,445.060646,-8.294161,0.787977,0.975120,11.793089,18.539521,0.132196
448,10853491004502,1.726014e+14,AAUU,13.195,0.024,13.213,0.026,12.406,,9.175,...,1.361410,23.021761,2.346961,16.510690,108.321521,0.819219,0.450020,222.233433,16.403430,0.032189
449,10853551000258,3.551173e+18,AAAA,6.769,0.072,6.826,0.020,6.237,0.015,5.496,...,3.787972,73.148140,-88.917075,1684.796000,355.857568,0.795914,0.611187,2214.134806,16.363313,0.086819


In [20]:
# List the feature columns returned by the features catalog
df_features.columns

Index(['_id', 'AllWISE___id', 'AllWISE__ph_qual', 'AllWISE__w1mpro',
       'AllWISE__w1sigmpro', 'AllWISE__w2mpro', 'AllWISE__w2sigmpro',
       'AllWISE__w3mpro', 'AllWISE__w3sigmpro', 'AllWISE__w4mpro',
       'AllWISE__w4sigmpro', 'Gaia_EDR3___id',
       'Gaia_EDR3__astrometric_excess_noise', 'Gaia_EDR3__parallax',
       'Gaia_EDR3__parallax_error', 'Gaia_EDR3__phot_bp_mean_mag',
       'Gaia_EDR3__phot_bp_rp_excess_factor', 'Gaia_EDR3__phot_g_mean_mag',
       'Gaia_EDR3__phot_rp_mean_mag', 'Gaia_EDR3__pmdec',
       'Gaia_EDR3__pmdec_error', 'Gaia_EDR3__pmra', 'Gaia_EDR3__pmra_error',
       'PS1_DR1___id', 'PS1_DR1__gMeanPSFMag', 'PS1_DR1__gMeanPSFMagErr',
       'PS1_DR1__iMeanPSFMag', 'PS1_DR1__iMeanPSFMagErr',
       'PS1_DR1__qualityFlag', 'PS1_DR1__rMeanPSFMag',
       'PS1_DR1__rMeanPSFMagErr', 'PS1_DR1__yMeanPSFMag',
       'PS1_DR1__yMeanPSFMagErr', 'PS1_DR1__zMeanPSFMag',
       'PS1_DR1__zMeanPSFMagErr', 'ad', 'ccd', 'chi2red', 'coordinates', 'dec',
       'dmdt', 'f

In [21]:
# Merge features, labels
# Scores and features are joined on their shared '_id' column.
df_merge = df_scores.merge(df_features, on='_id')

In [None]:
# Save the merged scores + features table to CSV without the DataFrame index
merged_csv_path = 'golden_merged_scores_features.csv'
df_merge.to_csv(merged_csv_path, index=False)

In [None]:
# Also save the merged table with the scope.utils write_parquet helper
# (presumably writes a Parquet file at the given path — TODO confirm).
write_parquet(df_merge, 'golden_merged_scores_features.parquet')